diff --git a/cmd/metrics-resource.go b/cmd/metrics-resource.go index f2f54a637..9392231f5 100644 --- a/cmd/metrics-resource.go +++ b/cmd/metrics-resource.go @@ -227,15 +227,7 @@ func updateDriveIOStats(currentStats madmin.DiskIOStats, latestStats madmin.Disk // too soon to update the stats return } - diffStats := madmin.DiskIOStats{ - ReadIOs: currentStats.ReadIOs - latestStats.ReadIOs, - WriteIOs: currentStats.WriteIOs - latestStats.WriteIOs, - ReadTicks: currentStats.ReadTicks - latestStats.ReadTicks, - WriteTicks: currentStats.WriteTicks - latestStats.WriteTicks, - TotalTicks: currentStats.TotalTicks - latestStats.TotalTicks, - ReadSectors: currentStats.ReadSectors - latestStats.ReadSectors, - WriteSectors: currentStats.WriteSectors - latestStats.WriteSectors, - } + diffStats := getDiffStats(latestStats, currentStats) updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/diffInSeconds, labels, false) readKib := float64(diffStats.ReadSectors*sectorSize) / kib diff --git a/cmd/metrics-v3-cache.go b/cmd/metrics-v3-cache.go index 1fa22396c..8b1c7fd1c 100644 --- a/cmd/metrics-v3-cache.go +++ b/cmd/metrics-v3-cache.go @@ -18,6 +18,7 @@ package cmd import ( + "sync" "time" "github.com/minio/madmin-go/v3" @@ -61,8 +62,20 @@ func newNodesUpDownCache() *cachevalue.Cache[nodesOnline] { loadNodesUpDown) } +type driveIOStatMetrics struct { + readsPerSec float64 + readsKBPerSec float64 + readsAwait float64 + writesPerSec float64 + writesKBPerSec float64 + writesAwait float64 + percUtil float64 +} + +// storageMetrics - cached storage metrics. 
type storageMetrics struct { storageInfo madmin.StorageInfo + ioStats map[string]driveIOStatMetrics onlineDrives, offlineDrives, totalDrives int } @@ -98,7 +111,48 @@ func newESetHealthResultCache() *cachevalue.Cache[HealthResult] { ) } +func getDiffStats(initialStats, currentStats madmin.DiskIOStats) madmin.DiskIOStats { + return madmin.DiskIOStats{ + ReadIOs: currentStats.ReadIOs - initialStats.ReadIOs, + WriteIOs: currentStats.WriteIOs - initialStats.WriteIOs, + ReadSectors: currentStats.ReadSectors - initialStats.ReadSectors, + WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors, + ReadTicks: currentStats.ReadTicks - initialStats.ReadTicks, + WriteTicks: currentStats.WriteTicks - initialStats.WriteTicks, + TotalTicks: currentStats.TotalTicks - initialStats.TotalTicks, + } +} + +func getDriveIOStatMetrics(ioStats madmin.DiskIOStats, duration time.Duration) (m driveIOStatMetrics) { + durationSecs := duration.Seconds() + + m.readsPerSec = float64(ioStats.ReadIOs) / durationSecs + m.readsKBPerSec = float64(ioStats.ReadSectors) * float64(sectorSize) / kib / durationSecs + if ioStats.ReadIOs > 0 { + m.readsAwait = float64(ioStats.ReadTicks) / float64(ioStats.ReadIOs) + } + + m.writesPerSec = float64(ioStats.WriteIOs) / durationSecs + m.writesKBPerSec = float64(ioStats.WriteSectors) * float64(sectorSize) / kib / durationSecs + if ioStats.WriteIOs > 0 { + m.writesAwait = float64(ioStats.WriteTicks) / float64(ioStats.WriteIOs) + } + + // TotalTicks is in milliseconds + m.percUtil = float64(ioStats.TotalTicks) * 100 / (durationSecs * 1000) + + return +} + func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] { + var ( + // prevDriveIOStats is used to calculate "per second" + // values for IOStat related disk metrics e.g. reads/sec. 
+ prevDriveIOStats map[string]madmin.DiskIOStats + prevDriveIOStatsMu sync.RWMutex + prevDriveIOStatsRefreshedAt time.Time + ) + loadDriveMetrics := func() (v storageMetrics, err error) { objLayer := newObjectLayerFn() if objLayer == nil { @@ -108,14 +162,37 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] { storageInfo := objLayer.LocalStorageInfo(GlobalContext, true) onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks) totalDrives := onlineDrives.Merge(offlineDrives) + v = storageMetrics{ storageInfo: storageInfo, onlineDrives: onlineDrives.Sum(), offlineDrives: offlineDrives.Sum(), totalDrives: totalDrives.Sum(), + ioStats: map[string]driveIOStatMetrics{}, } + + currentStats := getCurrentDriveIOStats() + now := time.Now().UTC() + + prevDriveIOStatsMu.Lock() + if prevDriveIOStats != nil { + duration := now.Sub(prevDriveIOStatsRefreshedAt) + if duration.Seconds() > 1 { + for d, cs := range currentStats { + if ps, found := prevDriveIOStats[d]; found { + v.ioStats[d] = getDriveIOStatMetrics(getDiffStats(ps, cs), duration) + } + } + } + } + + prevDriveIOStats = currentStats + prevDriveIOStatsRefreshedAt = now + prevDriveIOStatsMu.Unlock() + return } + return cachevalue.NewFromFunc(1*time.Minute, cachevalue.Opts{ReturnLastGood: true}, loadDriveMetrics) diff --git a/cmd/metrics-v3-system-drive.go b/cmd/metrics-v3-system-drive.go index a4217b495..b231b1787 100644 --- a/cmd/metrics-v3-system-drive.go +++ b/cmd/metrics-v3-system-drive.go @@ -20,6 +20,8 @@ package cmd import ( "context" "strconv" + + "github.com/minio/madmin-go/v3" ) // label constants @@ -30,6 +32,9 @@ const ( driveIndexL = "drive_index" apiL = "api" + + sectorSize = uint64(512) + kib = float64(1 << 10) ) var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL} @@ -38,15 +43,28 @@ const ( driveUsedBytes = "used_bytes" driveFreeBytes = "free_bytes" driveTotalBytes = "total_bytes" + driveUsedInodes = "used_inodes" driveFreeInodes = "free_inodes" + 
driveTotalInodes = "total_inodes" driveTimeoutErrorsTotal = "timeout_errors_total" driveAvailabilityErrorsTotal = "availability_errors_total" driveWaitingIO = "waiting_io" driveAPILatencyMicros = "api_latency_micros" + driveHealing = "healing" + driveOnline = "online" driveOfflineCount = "offline_count" driveOnlineCount = "online_count" driveCount = "count" + + // iostat related + driveReadsPerSec = "reads_per_sec" + driveReadsKBPerSec = "reads_kb_per_sec" + driveReadsAwait = "reads_await" + driveWritesPerSec = "writes_per_sec" + driveWritesKBPerSec = "writes_kb_per_sec" + driveWritesAwait = "writes_await" + drivePercUtil = "perc_util" ) var ( @@ -56,8 +74,12 @@ var ( "Total storage free on a drive in bytes", allDriveLabels...) driveTotalBytesMD = NewGaugeMD(driveTotalBytes, "Total storage available on a drive in bytes", allDriveLabels...) + driveUsedInodesMD = NewGaugeMD(driveUsedInodes, + "Total used inodes on a drive", allDriveLabels...) driveFreeInodesMD = NewGaugeMD(driveFreeInodes, "Total free inodes on a drive", allDriveLabels...) + driveTotalInodesMD = NewGaugeMD(driveTotalInodes, + "Total inodes available on a drive", allDriveLabels...) driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal, "Total timeout errors on a drive", allDriveLabels...) driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal, @@ -68,6 +90,10 @@ var ( driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros, "Average last minute latency in µs for drive API storage operations", append(allDriveLabels, apiL)...) + driveHealingMD = NewGaugeMD(driveHealing, + "Is it healing?", allDriveLabels...) + driveOnlineMD = NewGaugeMD(driveOnline, + "Is it online?", allDriveLabels...) 
driveOfflineCountMD = NewGaugeMD(driveOfflineCount, "Count of offline drives") @@ -75,8 +101,101 @@ var ( "Count of online drives") driveCountMD = NewGaugeMD(driveCount, "Count of all drives") + + // iostat related + driveReadsPerSecMD = NewGaugeMD(driveReadsPerSec, + "Reads per second on a drive", + allDriveLabels...) + driveReadsKBPerSecMD = NewGaugeMD(driveReadsKBPerSec, + "Kilobytes read per second on a drive", + allDriveLabels...) + driveReadsAwaitMD = NewGaugeMD(driveReadsAwait, + "Average time for read requests served on a drive", + allDriveLabels...) + driveWritesPerSecMD = NewGaugeMD(driveWritesPerSec, + "Writes per second on a drive", + allDriveLabels...) + driveWritesKBPerSecMD = NewGaugeMD(driveWritesKBPerSec, + "Kilobytes written per second on a drive", + allDriveLabels...) + driveWritesAwaitMD = NewGaugeMD(driveWritesAwait, + "Average time for write requests served on a drive", + allDriveLabels...) + drivePercUtilMD = NewGaugeMD(drivePercUtil, + "Percentage of time the disk was busy", + allDriveLabels...) ) +func getCurrentDriveIOStats() map[string]madmin.DiskIOStats { + var types madmin.MetricType = madmin.MetricsDisk + driveRealtimeMetrics := collectLocalMetrics(types, collectMetricsOpts{ + hosts: map[string]struct{}{ + globalLocalNodeName: {}, + }, + }) + + stats := map[string]madmin.DiskIOStats{} + for d, m := range driveRealtimeMetrics.ByDisk { + stats[d] = m.IOStats + } + return stats +} + +func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) { + m.Set(driveUsedBytes, float64(drive.UsedSpace), labels...) + m.Set(driveFreeBytes, float64(drive.AvailableSpace), labels...) + m.Set(driveTotalBytes, float64(drive.TotalSpace), labels...) + m.Set(driveUsedInodes, float64(drive.UsedInodes), labels...) + m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...) + m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...) 
+ + var healing, online float64 + if drive.Healing { + healing = 1 + } + m.Set(driveHealing, healing, labels...) + + if drive.State == "ok" { + online = 1 + } + m.Set(driveOnline, online, labels...) +} + +func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) { + if disk.Metrics == nil { + return + } + + m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...) + m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...) + m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...) + + // Append the api label for the drive API latencies. + labels = append(labels, "api", "") + lastIdx := len(labels) - 1 + for apiName, latency := range disk.Metrics.LastMinute { + labels[lastIdx] = "storage." + apiName + m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()), + labels...) + } +} + +func (m *MetricValues) setDriveIOStatMetrics(ioStats driveIOStatMetrics, labels []string) { + m.Set(driveReadsPerSec, ioStats.readsPerSec, labels...) + m.Set(driveReadsKBPerSec, ioStats.readsKBPerSec, labels...) + if ioStats.readsPerSec > 0 { + m.Set(driveReadsAwait, ioStats.readsAwait, labels...) + } + + m.Set(driveWritesPerSec, ioStats.writesPerSec, labels...) + m.Set(driveWritesKBPerSec, ioStats.writesKBPerSec, labels...) + if ioStats.writesPerSec > 0 { + m.Set(driveWritesAwait, ioStats.writesAwait, labels...) + } + + m.Set(drivePercUtil, ioStats.percUtil, labels...) +} + // loadDriveMetrics - `MetricsLoaderFn` for node drive metrics. 
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error { driveMetrics, err := c.driveMetrics.Get() @@ -85,9 +204,7 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro return nil } - storageInfo := driveMetrics.storageInfo - - for _, disk := range storageInfo.Disks { + for _, disk := range driveMetrics.storageInfo.Disks { labels := []string{ driveL, disk.DrivePath, poolIndexL, strconv.Itoa(disk.PoolIndex), @@ -95,25 +212,11 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro driveIndexL, strconv.Itoa(disk.DiskIndex), } - m.Set(driveUsedBytes, float64(disk.UsedSpace), labels...) - m.Set(driveFreeBytes, float64(disk.AvailableSpace), labels...) - m.Set(driveTotalBytes, float64(disk.TotalSpace), labels...) - m.Set(driveFreeInodes, float64(disk.FreeInodes), labels...) - - if disk.Metrics != nil { - m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...) - m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...) - m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...) - - // Append the api label for the drive API latencies. - labels = append(labels, "api", "") - lastIdx := len(labels) - 1 - for apiName, latency := range disk.Metrics.LastMinute { - labels[lastIdx] = "storage." + apiName - m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()), - labels...) 
- } + m.setDriveBasicMetrics(disk, labels) + if dm, found := driveMetrics.ioStats[disk.DrivePath]; found { + m.setDriveIOStatMetrics(dm, labels) } + m.setDriveAPIMetrics(disk, labels) } m.Set(driveOfflineCount, float64(driveMetrics.offlineDrives)) diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index 5814f9c39..a8353882d 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -117,15 +117,28 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { driveUsedBytesMD, driveFreeBytesMD, driveTotalBytesMD, + driveUsedInodesMD, driveFreeInodesMD, + driveTotalInodesMD, driveTimeoutErrorsMD, driveAvailabilityErrorsMD, driveWaitingIOMD, driveAPILatencyMD, + driveHealingMD, + driveOnlineMD, driveOfflineCountMD, driveOnlineCountMD, driveCountMD, + + // iostat related + driveReadsPerSecMD, + driveReadsKBPerSecMD, + driveReadsAwaitMD, + driveWritesPerSecMD, + driveWritesKBPerSecMD, + driveWritesAwaitMD, + drivePercUtilMD, }, loadDriveMetrics, ) diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index e048cf44f..aebd6e32d 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -105,7 +105,9 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_drive_used_bytes` | `gauge` | Total storage used on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_free_bytes` | `gauge` | Total storage free on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_total_bytes` | `gauge` | Total storage available on a drive in bytes | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_used_inodes` | `gauge` | Total used inodes on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | 
`drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, permission denied and timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` | @@ -113,6 +115,15 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` | | `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` | | `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` | +| `minio_system_drive_healing` | `gauge` | Whether the drive is currently healing (1) or not (0) | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_online` | `gauge` | Whether the drive is online (1) or not (0) 
| `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_per_sec` | `gauge` | Writes per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_kb_per_sec` | `gauge` | Kilobytes written per second on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_writes_await` | `gauge` | Average time for write requests served on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_perc_util` | `gauge` | Percentage of time the disk was busy | `drive,set_index,drive_index,pool_index,server` | ### `/system/network/internode`