Add drive metrics in metrics-v3 (#19452)

Add the following metrics:

- used_inodes
- total_inodes
- healing
- online
- reads_per_sec
- reads_kb_per_sec
- reads_await
- writes_per_sec
- writes_kb_per_sec
- writes_await
- perc_util

To be able to calculate the `per_sec` values, we capture the IOStats-related
data at the start (along with the time at which it was captured) and compare
it against the current values on each subsequent refresh. This is because
dividing by "time since server uptime" doesn't work in k8s environments.
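
As an illustration of the approach (not the exact code in the diff below; names and numbers here are made up):

```go
// Sketch: derive per-second drive I/O rates from two counter snapshots taken
// `elapsed` apart, rather than dividing cumulative counters by server uptime.
package main

import (
	"fmt"
	"time"
)

// ioSnapshot holds the cumulative counters we care about (hypothetical type).
type ioSnapshot struct {
	readIOs, readSectors uint64
	takenAt              time.Time
}

func ratesPerSec(prev, curr ioSnapshot) (readsPerSec, readKBPerSec float64) {
	elapsed := curr.takenAt.Sub(prev.takenAt).Seconds()
	if elapsed <= 0 {
		return 0, 0
	}
	const sectorSize, kib = 512, 1024.0
	readsPerSec = float64(curr.readIOs-prev.readIOs) / elapsed
	readKBPerSec = float64((curr.readSectors-prev.readSectors)*sectorSize) / kib / elapsed
	return
}

func main() {
	prev := ioSnapshot{readIOs: 1000, readSectors: 80000, takenAt: time.Now().Add(-time.Minute)}
	curr := ioSnapshot{readIOs: 1600, readSectors: 176000, takenAt: time.Now()}
	r, kb := ratesPerSec(prev, curr)
	fmt.Printf("reads/sec=%.1f, read KB/sec=%.1f\n", r, kb) // ~10 reads/sec, ~800 KB/sec
}
```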
Shireesh Anjal 2024-04-11 23:16:34 +05:30 committed by GitHub
parent 074febd9e1
commit 08d3d06a06
5 changed files with 226 additions and 30 deletions

View File

@ -227,15 +227,7 @@ func updateDriveIOStats(currentStats madmin.DiskIOStats, latestStats madmin.Disk
// too soon to update the stats
return
}
diffStats := madmin.DiskIOStats{
ReadIOs: currentStats.ReadIOs - latestStats.ReadIOs,
WriteIOs: currentStats.WriteIOs - latestStats.WriteIOs,
ReadTicks: currentStats.ReadTicks - latestStats.ReadTicks,
WriteTicks: currentStats.WriteTicks - latestStats.WriteTicks,
TotalTicks: currentStats.TotalTicks - latestStats.TotalTicks,
ReadSectors: currentStats.ReadSectors - latestStats.ReadSectors,
WriteSectors: currentStats.WriteSectors - latestStats.WriteSectors,
}
diffStats := getDiffStats(latestStats, currentStats)
updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/diffInSeconds, labels, false)
readKib := float64(diffStats.ReadSectors*sectorSize) / kib

View File

@ -18,6 +18,7 @@
package cmd
import (
"sync"
"time"
"github.com/minio/madmin-go/v3"
@ -61,8 +62,20 @@ func newNodesUpDownCache() *cachevalue.Cache[nodesOnline] {
loadNodesUpDown)
}
type driveIOStatMetrics struct {
readsPerSec float64
readsKBPerSec float64
readsAwait float64
writesPerSec float64
writesKBPerSec float64
writesAwait float64
percUtil float64
}
// storageMetrics - cached storage metrics.
type storageMetrics struct {
storageInfo madmin.StorageInfo
ioStats map[string]driveIOStatMetrics
onlineDrives, offlineDrives, totalDrives int
}
@ -98,7 +111,48 @@ func newESetHealthResultCache() *cachevalue.Cache[HealthResult] {
)
}
func getDiffStats(initialStats, currentStats madmin.DiskIOStats) madmin.DiskIOStats {
return madmin.DiskIOStats{
ReadIOs: currentStats.ReadIOs - initialStats.ReadIOs,
WriteIOs: currentStats.WriteIOs - initialStats.WriteIOs,
ReadSectors: currentStats.ReadSectors - initialStats.ReadSectors,
WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors,
ReadTicks: currentStats.ReadTicks - initialStats.ReadTicks,
WriteTicks: currentStats.WriteTicks - initialStats.WriteTicks,
TotalTicks: currentStats.TotalTicks - initialStats.TotalTicks,
}
}
func getDriveIOStatMetrics(ioStats madmin.DiskIOStats, duration time.Duration) (m driveIOStatMetrics) {
durationSecs := duration.Seconds()
m.readsPerSec = float64(ioStats.ReadIOs) / durationSecs
m.readsKBPerSec = float64(ioStats.ReadSectors) * float64(sectorSize) / kib / durationSecs
if ioStats.ReadIOs > 0 {
m.readsAwait = float64(ioStats.ReadTicks) / float64(ioStats.ReadIOs)
}
m.writesPerSec = float64(ioStats.WriteIOs) / durationSecs
m.writesKBPerSec = float64(ioStats.WriteSectors) * float64(sectorSize) / kib / durationSecs
if ioStats.WriteIOs > 0 {
m.writesAwait = float64(ioStats.WriteTicks) / float64(ioStats.WriteIOs)
}
// TotalTicks is in milliseconds
m.percUtil = float64(ioStats.TotalTicks) * 100 / (durationSecs * 1000)
return
}
func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
var (
// prevDriveIOStats is used to calculate "per second"
// values for IOStat related disk metrics e.g. reads/sec.
prevDriveIOStats map[string]madmin.DiskIOStats
prevDriveIOStatsMu sync.RWMutex
prevDriveIOStatsRefreshedAt time.Time
)
loadDriveMetrics := func() (v storageMetrics, err error) {
objLayer := newObjectLayerFn()
if objLayer == nil {
@ -108,14 +162,37 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
storageInfo := objLayer.LocalStorageInfo(GlobalContext, true)
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
totalDrives := onlineDrives.Merge(offlineDrives)
v = storageMetrics{
storageInfo: storageInfo,
onlineDrives: onlineDrives.Sum(),
offlineDrives: offlineDrives.Sum(),
totalDrives: totalDrives.Sum(),
ioStats: map[string]driveIOStatMetrics{},
}
currentStats := getCurrentDriveIOStats()
now := time.Now().UTC()
prevDriveIOStatsMu.Lock()
if prevDriveIOStats != nil {
duration := now.Sub(prevDriveIOStatsRefreshedAt)
if duration.Seconds() > 1 {
for d, cs := range currentStats {
if ps, found := prevDriveIOStats[d]; found {
v.ioStats[d] = getDriveIOStatMetrics(getDiffStats(ps, cs), duration)
}
}
}
}
prevDriveIOStats = currentStats
prevDriveIOStatsRefreshedAt = now
prevDriveIOStatsMu.Unlock()
return
}
return cachevalue.NewFromFunc(1*time.Minute,
cachevalue.Opts{ReturnLastGood: true},
loadDriveMetrics)
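
To make the formulas in `getDriveIOStatMetrics` above concrete, here is a worked example with made-up counter deltas over a 60-second window (sample numbers only):

```go
// Worked example (hypothetical numbers) of the await and perc_util math.
// ReadTicks/WriteTicks/TotalTicks are in milliseconds, as in /proc/diskstats.
package main

import "fmt"

func main() {
	var (
		durationSecs = 60.0    // elapsed time between the two snapshots
		readIOs      = 600.0   // ReadIOs delta over the window
		readTicks    = 1800.0  // ReadTicks delta (ms spent on reads)
		totalTicks   = 30000.0 // TotalTicks delta (ms the device was busy)
	)

	readsPerSec := readIOs / durationSecs                // 600 / 60          = 10 reads/sec
	readsAwait := readTicks / readIOs                    // 1800 / 600        = 3 ms per read
	percUtil := totalTicks * 100 / (durationSecs * 1000) // 30000 ms / 60000 ms = 50% busy

	fmt.Println(readsPerSec, readsAwait, percUtil) // 10 3 50
}
```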

View File

@ -20,6 +20,8 @@ package cmd
import (
"context"
"strconv"
"github.com/minio/madmin-go/v3"
)
// label constants
@ -30,6 +32,9 @@ const (
driveIndexL = "drive_index"
apiL = "api"
sectorSize = uint64(512)
kib = float64(1 << 10)
)
var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL}
@ -38,15 +43,28 @@ const (
driveUsedBytes = "used_bytes"
driveFreeBytes = "free_bytes"
driveTotalBytes = "total_bytes"
driveUsedInodes = "used_inodes"
driveFreeInodes = "free_inodes"
driveTotalInodes = "total_inodes"
driveTimeoutErrorsTotal = "timeout_errors_total"
driveAvailabilityErrorsTotal = "availability_errors_total"
driveWaitingIO = "waiting_io"
driveAPILatencyMicros = "api_latency_micros"
driveHealing = "healing"
driveOnline = "online"
driveOfflineCount = "offline_count"
driveOnlineCount = "online_count"
driveCount = "count"
// iostat related
driveReadsPerSec = "reads_per_sec"
driveReadsKBPerSec = "reads_kb_per_sec"
driveReadsAwait = "reads_await"
driveWritesPerSec = "writes_per_sec"
driveWritesKBPerSec = "writes_kb_per_sec"
driveWritesAwait = "writes_await"
drivePercUtil = "perc_util"
)
var (
@ -56,8 +74,12 @@ var (
"Total storage free on a drive in bytes", allDriveLabels...)
driveTotalBytesMD = NewGaugeMD(driveTotalBytes,
"Total storage available on a drive in bytes", allDriveLabels...)
driveUsedInodesMD = NewGaugeMD(driveUsedInodes,
"Total used inodes on a drive", allDriveLabels...)
driveFreeInodesMD = NewGaugeMD(driveFreeInodes,
"Total free inodes on a drive", allDriveLabels...)
driveTotalInodesMD = NewGaugeMD(driveTotalInodes,
"Total inodes available on a drive", allDriveLabels...)
driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal,
"Total timeout errors on a drive", allDriveLabels...)
driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal,
@ -68,6 +90,10 @@ var (
driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros,
"Average last minute latency in µs for drive API storage operations",
append(allDriveLabels, apiL)...)
driveHealingMD = NewGaugeMD(driveHealing,
"Is it healing?", allDriveLabels...)
driveOnlineMD = NewGaugeMD(driveOnline,
"Is it online?", allDriveLabels...)
driveOfflineCountMD = NewGaugeMD(driveOfflineCount,
"Count of offline drives")
@ -75,32 +101,71 @@ var (
"Count of online drives")
driveCountMD = NewGaugeMD(driveCount,
"Count of all drives")
// iostat related
driveReadsPerSecMD = NewGaugeMD(driveReadsPerSec,
"Reads per second on a drive",
allDriveLabels...)
driveReadsKBPerSecMD = NewGaugeMD(driveReadsKBPerSec,
"Kilobytes read per second on a drive",
allDriveLabels...)
driveReadsAwaitMD = NewGaugeMD(driveReadsAwait,
"Average time for read requests served on a drive",
allDriveLabels...)
driveWritesPerSecMD = NewGaugeMD(driveWritesPerSec,
"Writes per second on a drive",
allDriveLabels...)
driveWritesKBPerSecMD = NewGaugeMD(driveWritesKBPerSec,
"Kilobytes written per second on a drive",
allDriveLabels...)
driveWritesAwaitMD = NewGaugeMD(driveWritesAwait,
"Average time for write requests served on a drive",
allDriveLabels...)
drivePercUtilMD = NewGaugeMD(drivePercUtil,
"Percentage of time the disk was busy",
allDriveLabels...)
)
// loadDriveMetrics - `MetricsLoaderFn` for node drive metrics.
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
driveMetrics, err := c.driveMetrics.Get()
if err != nil {
metricsLogIf(ctx, err)
return nil
func getCurrentDriveIOStats() map[string]madmin.DiskIOStats {
var types madmin.MetricType = madmin.MetricsDisk
driveRealtimeMetrics := collectLocalMetrics(types, collectMetricsOpts{
hosts: map[string]struct{}{
globalLocalNodeName: {},
},
})
stats := map[string]madmin.DiskIOStats{}
for d, m := range driveRealtimeMetrics.ByDisk {
stats[d] = m.IOStats
}
return stats
}
func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) {
m.Set(driveUsedBytes, float64(drive.UsedSpace), labels...)
m.Set(driveFreeBytes, float64(drive.AvailableSpace), labels...)
m.Set(driveTotalBytes, float64(drive.TotalSpace), labels...)
m.Set(driveUsedInodes, float64(drive.UsedInodes), labels...)
m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...)
m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...)
var healing, online float64
if drive.Healing {
healing = 1
}
m.Set(driveHealing, healing, labels...)
if drive.State == "ok" {
online = 1
}
m.Set(driveOnline, online, labels...)
}
func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) {
if disk.Metrics == nil {
return
}
storageInfo := driveMetrics.storageInfo
for _, disk := range storageInfo.Disks {
labels := []string{
driveL, disk.DrivePath,
poolIndexL, strconv.Itoa(disk.PoolIndex),
setIndexL, strconv.Itoa(disk.SetIndex),
driveIndexL, strconv.Itoa(disk.DiskIndex),
}
m.Set(driveUsedBytes, float64(disk.UsedSpace), labels...)
m.Set(driveFreeBytes, float64(disk.AvailableSpace), labels...)
m.Set(driveTotalBytes, float64(disk.TotalSpace), labels...)
m.Set(driveFreeInodes, float64(disk.FreeInodes), labels...)
if disk.Metrics != nil {
m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...)
m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...)
m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...)
@ -113,7 +178,45 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro
m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()),
labels...)
}
}
func (m *MetricValues) setDriveIOStatMetrics(ioStats driveIOStatMetrics, labels []string) {
m.Set(driveReadsPerSec, ioStats.readsPerSec, labels...)
m.Set(driveReadsKBPerSec, ioStats.readsKBPerSec, labels...)
if ioStats.readsPerSec > 0 {
m.Set(driveReadsAwait, ioStats.readsAwait, labels...)
}
m.Set(driveWritesPerSec, ioStats.writesPerSec, labels...)
m.Set(driveWritesKBPerSec, ioStats.writesKBPerSec, labels...)
if ioStats.writesPerSec > 0 {
m.Set(driveWritesAwait, ioStats.writesAwait, labels...)
}
m.Set(drivePercUtil, ioStats.percUtil, labels...)
}
// loadDriveMetrics - `MetricsLoaderFn` for node drive metrics.
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
driveMetrics, err := c.driveMetrics.Get()
if err != nil {
metricsLogIf(ctx, err)
return nil
}
for _, disk := range driveMetrics.storageInfo.Disks {
labels := []string{
driveL, disk.DrivePath,
poolIndexL, strconv.Itoa(disk.PoolIndex),
setIndexL, strconv.Itoa(disk.SetIndex),
driveIndexL, strconv.Itoa(disk.DiskIndex),
}
m.setDriveBasicMetrics(disk, labels)
if dm, found := driveMetrics.ioStats[disk.DrivePath]; found {
m.setDriveIOStatMetrics(dm, labels)
}
m.setDriveAPIMetrics(disk, labels)
}
m.Set(driveOfflineCount, float64(driveMetrics.offlineDrives))

View File

@ -117,15 +117,28 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
driveUsedBytesMD,
driveFreeBytesMD,
driveTotalBytesMD,
driveUsedInodesMD,
driveFreeInodesMD,
driveTotalInodesMD,
driveTimeoutErrorsMD,
driveAvailabilityErrorsMD,
driveWaitingIOMD,
driveAPILatencyMD,
driveHealingMD,
driveOnlineMD,
driveOfflineCountMD,
driveOnlineCountMD,
driveCountMD,
// iostat related
driveReadsPerSecMD,
driveReadsKBPerSecMD,
driveReadsAwaitMD,
driveWritesPerSecMD,
driveWritesKBPerSecMD,
driveWritesAwaitMD,
drivePercUtilMD,
},
loadDriveMetrics,
)

View File

@ -105,7 +105,9 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
| `minio_system_drive_used_bytes` | `gauge` | Total storage used on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_free_bytes` | `gauge` | Total storage free on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_total_bytes` | `gauge` | Total storage available on a drive in bytes | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_used_inodes` | `gauge` | Total used inodes on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, permission denied and timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` |
@ -113,6 +115,15 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
| `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` |
| `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` |
| `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` |
| `minio_system_drive_healing` | `gauge` | Is it healing? | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_online` | `gauge` | Is it online? | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_writes_per_sec` | `gauge` | Writes per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_writes_kb_per_sec` | `gauge` | Kilobytes written per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_writes_await` | `gauge` | Average time for write requests served on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_perc_util` | `gauge` | Percentage of time the disk was busy | `drive,set_index,drive_index,pool_index,server` |
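
After this change, the new gauges are scraped like any other metrics-v3 group. A minimal client sketch, assuming the drive metrics of a local test server are exposed at `http://localhost:9000/minio/metrics/v3/system/drive` (address, port, and auth-free access are assumptions here):

```go
// Sketch: fetch the system/drive metrics group and print the exposition text.
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Assumed local endpoint; real deployments typically require authentication.
	resp, err := http.Get("http://localhost:9000/minio/metrics/v3/system/drive")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}
	// Expect lines such as: minio_system_drive_reads_per_sec{...} 12.3
	fmt.Println(string(body))
}
```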
### `/system/network/internode`