mirror of
https://github.com/minio/minio.git
synced 2025-01-13 16:03:21 -05:00
Add drive metrics in metrics-v3 (#19452)
Add the following drive metrics: used_inodes, total_inodes, healing, online, reads_per_sec, reads_kb_per_sec, reads_await, writes_per_sec, writes_kb_per_sec, writes_await, perc_util. To calculate the `per_sec` values, we capture the IOStats-related data at the beginning (along with the time at which it was captured) and subsequently compare it against the current values. This is needed because dividing by the time since server start ("uptime") does not work in k8s environments.
This commit is contained in:
parent
074febd9e1
commit
08d3d06a06
@ -227,15 +227,7 @@ func updateDriveIOStats(currentStats madmin.DiskIOStats, latestStats madmin.Disk
|
||||
// too soon to update the stats
|
||||
return
|
||||
}
|
||||
diffStats := madmin.DiskIOStats{
|
||||
ReadIOs: currentStats.ReadIOs - latestStats.ReadIOs,
|
||||
WriteIOs: currentStats.WriteIOs - latestStats.WriteIOs,
|
||||
ReadTicks: currentStats.ReadTicks - latestStats.ReadTicks,
|
||||
WriteTicks: currentStats.WriteTicks - latestStats.WriteTicks,
|
||||
TotalTicks: currentStats.TotalTicks - latestStats.TotalTicks,
|
||||
ReadSectors: currentStats.ReadSectors - latestStats.ReadSectors,
|
||||
WriteSectors: currentStats.WriteSectors - latestStats.WriteSectors,
|
||||
}
|
||||
diffStats := getDiffStats(latestStats, currentStats)
|
||||
|
||||
updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/diffInSeconds, labels, false)
|
||||
readKib := float64(diffStats.ReadSectors*sectorSize) / kib
|
||||
|
@ -18,6 +18,7 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/minio/madmin-go/v3"
|
||||
@ -61,8 +62,20 @@ func newNodesUpDownCache() *cachevalue.Cache[nodesOnline] {
|
||||
loadNodesUpDown)
|
||||
}
|
||||
|
||||
// driveIOStatMetrics holds the iostat-style values computed for one drive
// from the difference between two I/O counter samples.
type driveIOStatMetrics struct {
	// Read side: operations per second, KiB per second, and the average
	// number of read ticks spent per read operation.
	readsPerSec, readsKBPerSec, readsAwait float64

	// Write side: operations per second, KiB per second, and the average
	// number of write ticks spent per write operation.
	writesPerSec, writesKBPerSec, writesAwait float64

	// percUtil is the share of the sampling interval, in percent, during
	// which the drive was busy.
	percUtil float64
}
|
||||
|
||||
// storageMetrics - cached storage metrics.
|
||||
type storageMetrics struct {
|
||||
storageInfo madmin.StorageInfo
|
||||
ioStats map[string]driveIOStatMetrics
|
||||
onlineDrives, offlineDrives, totalDrives int
|
||||
}
|
||||
|
||||
@ -98,7 +111,48 @@ func newESetHealthResultCache() *cachevalue.Cache[HealthResult] {
|
||||
)
|
||||
}
|
||||
|
||||
func getDiffStats(initialStats, currentStats madmin.DiskIOStats) madmin.DiskIOStats {
|
||||
return madmin.DiskIOStats{
|
||||
ReadIOs: currentStats.ReadIOs - initialStats.ReadIOs,
|
||||
WriteIOs: currentStats.WriteIOs - initialStats.WriteIOs,
|
||||
ReadSectors: currentStats.ReadSectors - initialStats.ReadSectors,
|
||||
WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors,
|
||||
ReadTicks: currentStats.ReadTicks - initialStats.ReadTicks,
|
||||
WriteTicks: currentStats.WriteTicks - initialStats.WriteTicks,
|
||||
TotalTicks: currentStats.TotalTicks - initialStats.TotalTicks,
|
||||
}
|
||||
}
|
||||
|
||||
func getDriveIOStatMetrics(ioStats madmin.DiskIOStats, duration time.Duration) (m driveIOStatMetrics) {
|
||||
durationSecs := duration.Seconds()
|
||||
|
||||
m.readsPerSec = float64(ioStats.ReadIOs) / durationSecs
|
||||
m.readsKBPerSec = float64(ioStats.ReadSectors) * float64(sectorSize) / kib / durationSecs
|
||||
if ioStats.ReadIOs > 0 {
|
||||
m.readsAwait = float64(ioStats.ReadTicks) / float64(ioStats.ReadIOs)
|
||||
}
|
||||
|
||||
m.writesPerSec = float64(ioStats.WriteIOs) / durationSecs
|
||||
m.writesKBPerSec = float64(ioStats.WriteSectors) * float64(sectorSize) / kib / durationSecs
|
||||
if ioStats.WriteIOs > 0 {
|
||||
m.writesAwait = float64(ioStats.WriteTicks) / float64(ioStats.WriteIOs)
|
||||
}
|
||||
|
||||
// TotalTicks is in milliseconds
|
||||
m.percUtil = float64(ioStats.TotalTicks) * 100 / (durationSecs * 1000)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
|
||||
var (
|
||||
// prevDriveIOStats is used to calculate "per second"
|
||||
// values for IOStat related disk metrics e.g. reads/sec.
|
||||
prevDriveIOStats map[string]madmin.DiskIOStats
|
||||
prevDriveIOStatsMu sync.RWMutex
|
||||
prevDriveIOStatsRefreshedAt time.Time
|
||||
)
|
||||
|
||||
loadDriveMetrics := func() (v storageMetrics, err error) {
|
||||
objLayer := newObjectLayerFn()
|
||||
if objLayer == nil {
|
||||
@ -108,14 +162,37 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
|
||||
storageInfo := objLayer.LocalStorageInfo(GlobalContext, true)
|
||||
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
|
||||
totalDrives := onlineDrives.Merge(offlineDrives)
|
||||
|
||||
v = storageMetrics{
|
||||
storageInfo: storageInfo,
|
||||
onlineDrives: onlineDrives.Sum(),
|
||||
offlineDrives: offlineDrives.Sum(),
|
||||
totalDrives: totalDrives.Sum(),
|
||||
ioStats: map[string]driveIOStatMetrics{},
|
||||
}
|
||||
|
||||
currentStats := getCurrentDriveIOStats()
|
||||
now := time.Now().UTC()
|
||||
|
||||
prevDriveIOStatsMu.Lock()
|
||||
if prevDriveIOStats != nil {
|
||||
duration := now.Sub(prevDriveIOStatsRefreshedAt)
|
||||
if duration.Seconds() > 1 {
|
||||
for d, cs := range currentStats {
|
||||
if ps, found := prevDriveIOStats[d]; found {
|
||||
v.ioStats[d] = getDriveIOStatMetrics(getDiffStats(ps, cs), duration)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prevDriveIOStats = currentStats
|
||||
prevDriveIOStatsRefreshedAt = now
|
||||
prevDriveIOStatsMu.Unlock()
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
return cachevalue.NewFromFunc(1*time.Minute,
|
||||
cachevalue.Opts{ReturnLastGood: true},
|
||||
loadDriveMetrics)
|
||||
|
@ -20,6 +20,8 @@ package cmd
|
||||
import (
|
||||
"context"
|
||||
"strconv"
|
||||
|
||||
"github.com/minio/madmin-go/v3"
|
||||
)
|
||||
|
||||
// label constants
|
||||
@ -30,6 +32,9 @@ const (
|
||||
driveIndexL = "drive_index"
|
||||
|
||||
apiL = "api"
|
||||
|
||||
sectorSize = uint64(512)
|
||||
kib = float64(1 << 10)
|
||||
)
|
||||
|
||||
// allDriveLabels is the base set of label names given to the per-drive
// metric descriptors.
var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL}
|
||||
@ -38,15 +43,28 @@ const (
|
||||
driveUsedBytes = "used_bytes"
|
||||
driveFreeBytes = "free_bytes"
|
||||
driveTotalBytes = "total_bytes"
|
||||
driveUsedInodes = "used_inodes"
|
||||
driveFreeInodes = "free_inodes"
|
||||
driveTotalInodes = "total_inodes"
|
||||
driveTimeoutErrorsTotal = "timeout_errors_total"
|
||||
driveAvailabilityErrorsTotal = "availability_errors_total"
|
||||
driveWaitingIO = "waiting_io"
|
||||
driveAPILatencyMicros = "api_latency_micros"
|
||||
driveHealing = "healing"
|
||||
driveOnline = "online"
|
||||
|
||||
driveOfflineCount = "offline_count"
|
||||
driveOnlineCount = "online_count"
|
||||
driveCount = "count"
|
||||
|
||||
// iostat related
|
||||
driveReadsPerSec = "reads_per_sec"
|
||||
driveReadsKBPerSec = "reads_kb_per_sec"
|
||||
driveReadsAwait = "reads_await"
|
||||
driveWritesPerSec = "writes_per_sec"
|
||||
driveWritesKBPerSec = "writes_kb_per_sec"
|
||||
driveWritesAwait = "writes_await"
|
||||
drivePercUtil = "perc_util"
|
||||
)
|
||||
|
||||
var (
|
||||
@ -56,8 +74,12 @@ var (
|
||||
"Total storage free on a drive in bytes", allDriveLabels...)
|
||||
driveTotalBytesMD = NewGaugeMD(driveTotalBytes,
|
||||
"Total storage available on a drive in bytes", allDriveLabels...)
|
||||
driveUsedInodesMD = NewGaugeMD(driveUsedInodes,
|
||||
"Total used inodes on a drive", allDriveLabels...)
|
||||
driveFreeInodesMD = NewGaugeMD(driveFreeInodes,
|
||||
"Total free inodes on a drive", allDriveLabels...)
|
||||
driveTotalInodesMD = NewGaugeMD(driveTotalInodes,
|
||||
"Total inodes available on a drive", allDriveLabels...)
|
||||
driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal,
|
||||
"Total timeout errors on a drive", allDriveLabels...)
|
||||
driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal,
|
||||
@ -68,6 +90,10 @@ var (
|
||||
driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros,
|
||||
"Average last minute latency in µs for drive API storage operations",
|
||||
append(allDriveLabels, apiL)...)
|
||||
driveHealingMD = NewGaugeMD(driveHealing,
|
||||
"Is it healing?", allDriveLabels...)
|
||||
driveOnlineMD = NewGaugeMD(driveOnline,
|
||||
"Is it online?", allDriveLabels...)
|
||||
|
||||
driveOfflineCountMD = NewGaugeMD(driveOfflineCount,
|
||||
"Count of offline drives")
|
||||
@ -75,32 +101,71 @@ var (
|
||||
"Count of online drives")
|
||||
driveCountMD = NewGaugeMD(driveCount,
|
||||
"Count of all drives")
|
||||
|
||||
// iostat related
|
||||
driveReadsPerSecMD = NewGaugeMD(driveReadsPerSec,
|
||||
"Reads per second on a drive",
|
||||
allDriveLabels...)
|
||||
driveReadsKBPerSecMD = NewGaugeMD(driveReadsKBPerSec,
|
||||
"Kilobytes read per second on a drive",
|
||||
allDriveLabels...)
|
||||
driveReadsAwaitMD = NewGaugeMD(driveReadsAwait,
|
||||
"Average time for read requests served on a drive",
|
||||
allDriveLabels...)
|
||||
driveWritesPerSecMD = NewGaugeMD(driveWritesPerSec,
|
||||
"Writes per second on a drive",
|
||||
allDriveLabels...)
|
||||
driveWritesKBPerSecMD = NewGaugeMD(driveWritesKBPerSec,
|
||||
"Kilobytes written per second on a drive",
|
||||
allDriveLabels...)
|
||||
driveWritesAwaitMD = NewGaugeMD(driveWritesAwait,
|
||||
"Average time for write requests served on a drive",
|
||||
allDriveLabels...)
|
||||
drivePercUtilMD = NewGaugeMD(drivePercUtil,
|
||||
"Percentage of time the disk was busy",
|
||||
allDriveLabels...)
|
||||
)
|
||||
|
||||
// loadDriveMetrics - `MetricsLoaderFn` for node drive metrics.
|
||||
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
|
||||
driveMetrics, err := c.driveMetrics.Get()
|
||||
if err != nil {
|
||||
metricsLogIf(ctx, err)
|
||||
return nil
|
||||
func getCurrentDriveIOStats() map[string]madmin.DiskIOStats {
|
||||
var types madmin.MetricType = madmin.MetricsDisk
|
||||
driveRealtimeMetrics := collectLocalMetrics(types, collectMetricsOpts{
|
||||
hosts: map[string]struct{}{
|
||||
globalLocalNodeName: {},
|
||||
},
|
||||
})
|
||||
|
||||
stats := map[string]madmin.DiskIOStats{}
|
||||
for d, m := range driveRealtimeMetrics.ByDisk {
|
||||
stats[d] = m.IOStats
|
||||
}
|
||||
return stats
|
||||
}
|
||||
|
||||
storageInfo := driveMetrics.storageInfo
|
||||
func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) {
|
||||
m.Set(driveUsedBytes, float64(drive.UsedSpace), labels...)
|
||||
m.Set(driveFreeBytes, float64(drive.AvailableSpace), labels...)
|
||||
m.Set(driveTotalBytes, float64(drive.TotalSpace), labels...)
|
||||
m.Set(driveUsedInodes, float64(drive.UsedInodes), labels...)
|
||||
m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...)
|
||||
m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...)
|
||||
|
||||
for _, disk := range storageInfo.Disks {
|
||||
labels := []string{
|
||||
driveL, disk.DrivePath,
|
||||
poolIndexL, strconv.Itoa(disk.PoolIndex),
|
||||
setIndexL, strconv.Itoa(disk.SetIndex),
|
||||
driveIndexL, strconv.Itoa(disk.DiskIndex),
|
||||
var healing, online float64
|
||||
if drive.Healing {
|
||||
healing = 1
|
||||
}
|
||||
m.Set(driveHealing, healing, labels...)
|
||||
|
||||
if drive.State == "ok" {
|
||||
online = 1
|
||||
}
|
||||
m.Set(driveOnline, online, labels...)
|
||||
}
|
||||
|
||||
m.Set(driveUsedBytes, float64(disk.UsedSpace), labels...)
|
||||
m.Set(driveFreeBytes, float64(disk.AvailableSpace), labels...)
|
||||
m.Set(driveTotalBytes, float64(disk.TotalSpace), labels...)
|
||||
m.Set(driveFreeInodes, float64(disk.FreeInodes), labels...)
|
||||
func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) {
|
||||
if disk.Metrics == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if disk.Metrics != nil {
|
||||
m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...)
|
||||
m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...)
|
||||
m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...)
|
||||
@ -114,6 +179,44 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro
|
||||
labels...)
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MetricValues) setDriveIOStatMetrics(ioStats driveIOStatMetrics, labels []string) {
|
||||
m.Set(driveReadsPerSec, ioStats.readsPerSec, labels...)
|
||||
m.Set(driveReadsKBPerSec, ioStats.readsKBPerSec, labels...)
|
||||
if ioStats.readsPerSec > 0 {
|
||||
m.Set(driveReadsAwait, ioStats.readsAwait, labels...)
|
||||
}
|
||||
|
||||
m.Set(driveWritesPerSec, ioStats.writesPerSec, labels...)
|
||||
m.Set(driveWritesKBPerSec, ioStats.writesKBPerSec, labels...)
|
||||
if ioStats.writesPerSec > 0 {
|
||||
m.Set(driveWritesAwait, ioStats.writesAwait, labels...)
|
||||
}
|
||||
|
||||
m.Set(drivePercUtil, ioStats.percUtil, labels...)
|
||||
}
|
||||
|
||||
// loadDriveMetrics - `MetricsLoaderFn` for node drive metrics.
|
||||
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
|
||||
driveMetrics, err := c.driveMetrics.Get()
|
||||
if err != nil {
|
||||
metricsLogIf(ctx, err)
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, disk := range driveMetrics.storageInfo.Disks {
|
||||
labels := []string{
|
||||
driveL, disk.DrivePath,
|
||||
poolIndexL, strconv.Itoa(disk.PoolIndex),
|
||||
setIndexL, strconv.Itoa(disk.SetIndex),
|
||||
driveIndexL, strconv.Itoa(disk.DiskIndex),
|
||||
}
|
||||
|
||||
m.setDriveBasicMetrics(disk, labels)
|
||||
if dm, found := driveMetrics.ioStats[disk.DrivePath]; found {
|
||||
m.setDriveIOStatMetrics(dm, labels)
|
||||
}
|
||||
m.setDriveAPIMetrics(disk, labels)
|
||||
}
|
||||
|
||||
m.Set(driveOfflineCount, float64(driveMetrics.offlineDrives))
|
||||
|
@ -117,15 +117,28 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
|
||||
driveUsedBytesMD,
|
||||
driveFreeBytesMD,
|
||||
driveTotalBytesMD,
|
||||
driveUsedInodesMD,
|
||||
driveFreeInodesMD,
|
||||
driveTotalInodesMD,
|
||||
driveTimeoutErrorsMD,
|
||||
driveAvailabilityErrorsMD,
|
||||
driveWaitingIOMD,
|
||||
driveAPILatencyMD,
|
||||
driveHealingMD,
|
||||
driveOnlineMD,
|
||||
|
||||
driveOfflineCountMD,
|
||||
driveOnlineCountMD,
|
||||
driveCountMD,
|
||||
|
||||
// iostat related
|
||||
driveReadsPerSecMD,
|
||||
driveReadsKBPerSecMD,
|
||||
driveReadsAwaitMD,
|
||||
driveWritesPerSecMD,
|
||||
driveWritesKBPerSecMD,
|
||||
driveWritesAwaitMD,
|
||||
drivePercUtilMD,
|
||||
},
|
||||
loadDriveMetrics,
|
||||
)
|
||||
|
@ -105,7 +105,9 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
|
||||
| `minio_system_drive_used_bytes` | `gauge` | Total storage used on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_free_bytes` | `gauge` | Total storage free on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_total_bytes` | `gauge` | Total storage available on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_used_inodes` | `gauge` | Total used inodes on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, permission denied and timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
@ -113,6 +115,15 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
|
||||
| `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` |
|
||||
| `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` |
|
||||
| `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` |
|
||||
| `minio_system_drive_healing` | `gauge` | Is it healing? | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_online` | `gauge` | Is it online? | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_writes_per_sec` | `gauge` | Writes per second on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_writes_kb_per_sec` | `gauge` | Kilobytes written per second on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_writes_await` | `gauge` | Average time for write requests served on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_perc_util` | `gauge` | Percentage of time the disk was busy | `drive,set_index,drive_index,pool_index,server` |
|
||||
|
||||
### `/system/network/internode`
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user