Add drive metrics in metrics-v3 (#19452)

Add the following metrics:

- used_inodes
- total_inodes
- healing
- online
- reads_per_sec
- reads_kb_per_sec
- reads_await
- writes_per_sec
- writes_kb_per_sec
- writes_await
- perc_util

To calculate the `per_sec` values, we capture the IOStats-related data
at the beginning (along with the time at which it was captured) and
subsequently compare it against the current values. This is because
dividing by the "time since server uptime" doesn't work in k8s environments.
This commit is contained in:
Shireesh Anjal
2024-04-11 23:16:34 +05:30
committed by GitHub
parent 074febd9e1
commit 08d3d06a06
5 changed files with 226 additions and 30 deletions

View File

@@ -20,6 +20,8 @@ package cmd
import (
"context"
"strconv"
"github.com/minio/madmin-go/v3"
)
// label constants
@@ -30,6 +32,9 @@ const (
driveIndexL = "drive_index"
apiL = "api"
sectorSize = uint64(512)
kib = float64(1 << 10)
)
var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL}
@@ -38,15 +43,28 @@ const (
driveUsedBytes = "used_bytes"
driveFreeBytes = "free_bytes"
driveTotalBytes = "total_bytes"
driveUsedInodes = "used_inodes"
driveFreeInodes = "free_inodes"
driveTotalInodes = "total_inodes"
driveTimeoutErrorsTotal = "timeout_errors_total"
driveAvailabilityErrorsTotal = "availability_errors_total"
driveWaitingIO = "waiting_io"
driveAPILatencyMicros = "api_latency_micros"
driveHealing = "healing"
driveOnline = "online"
driveOfflineCount = "offline_count"
driveOnlineCount = "online_count"
driveCount = "count"
// iostat related
driveReadsPerSec = "reads_per_sec"
driveReadsKBPerSec = "reads_kb_per_sec"
driveReadsAwait = "reads_await"
driveWritesPerSec = "writes_per_sec"
driveWritesKBPerSec = "writes_kb_per_sec"
driveWritesAwait = "writes_await"
drivePercUtil = "perc_util"
)
var (
@@ -56,8 +74,12 @@ var (
"Total storage free on a drive in bytes", allDriveLabels...)
driveTotalBytesMD = NewGaugeMD(driveTotalBytes,
"Total storage available on a drive in bytes", allDriveLabels...)
driveUsedInodesMD = NewGaugeMD(driveUsedInodes,
"Total used inodes on a drive", allDriveLabels...)
driveFreeInodesMD = NewGaugeMD(driveFreeInodes,
"Total free inodes on a drive", allDriveLabels...)
driveTotalInodesMD = NewGaugeMD(driveTotalInodes,
"Total inodes available on a drive", allDriveLabels...)
driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal,
"Total timeout errors on a drive", allDriveLabels...)
driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal,
@@ -68,6 +90,10 @@ var (
driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros,
"Average last minute latency in µs for drive API storage operations",
append(allDriveLabels, apiL)...)
driveHealingMD = NewGaugeMD(driveHealing,
"Is it healing?", allDriveLabels...)
driveOnlineMD = NewGaugeMD(driveOnline,
"Is it online?", allDriveLabels...)
driveOfflineCountMD = NewGaugeMD(driveOfflineCount,
"Count of offline drives")
@@ -75,8 +101,101 @@ var (
"Count of online drives")
driveCountMD = NewGaugeMD(driveCount,
"Count of all drives")
// iostat related
driveReadsPerSecMD = NewGaugeMD(driveReadsPerSec,
"Reads per second on a drive",
allDriveLabels...)
driveReadsKBPerSecMD = NewGaugeMD(driveReadsKBPerSec,
"Kilobytes read per second on a drive",
allDriveLabels...)
driveReadsAwaitMD = NewGaugeMD(driveReadsAwait,
"Average time for read requests served on a drive",
allDriveLabels...)
driveWritesPerSecMD = NewGaugeMD(driveWritesPerSec,
"Writes per second on a drive",
allDriveLabels...)
driveWritesKBPerSecMD = NewGaugeMD(driveWritesKBPerSec,
"Kilobytes written per second on a drive",
allDriveLabels...)
driveWritesAwaitMD = NewGaugeMD(driveWritesAwait,
"Average time for write requests served on a drive",
allDriveLabels...)
drivePercUtilMD = NewGaugeMD(drivePercUtil,
"Percentage of time the disk was busy",
allDriveLabels...)
)
// getCurrentDriveIOStats collects the disk metrics for the local node and
// returns the IOStats of every reported drive, keyed by the same key that
// the collected `ByDisk` map uses.
func getCurrentDriveIOStats() map[string]madmin.DiskIOStats {
	// Only disk metrics are requested, and only from the local node.
	var diskOnly madmin.MetricType = madmin.MetricsDisk
	localOnly := collectMetricsOpts{
		hosts: map[string]struct{}{
			globalLocalNodeName: {},
		},
	}
	realtime := collectLocalMetrics(diskOnly, localOnly)

	ioStatsByDrive := make(map[string]madmin.DiskIOStats, len(realtime.ByDisk))
	for drive, diskMetrics := range realtime.ByDisk {
		ioStatsByDrive[drive] = diskMetrics.IOStats
	}
	return ioStatsByDrive
}
// setDriveBasicMetrics records the space, inode, healing and online gauges
// for a single drive, using the given label key/value pairs.
func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string) {
	// Space usage.
	m.Set(driveUsedBytes, float64(drive.UsedSpace), labels...)
	m.Set(driveFreeBytes, float64(drive.AvailableSpace), labels...)
	m.Set(driveTotalBytes, float64(drive.TotalSpace), labels...)

	// Inode usage; the total is derived as used + free.
	totalInodes := drive.UsedInodes + drive.FreeInodes
	m.Set(driveUsedInodes, float64(drive.UsedInodes), labels...)
	m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...)
	m.Set(driveTotalInodes, float64(totalInodes), labels...)

	// Boolean states are exported as 0/1 gauges.
	healingFlag := 0.0
	if drive.Healing {
		healingFlag = 1
	}
	m.Set(driveHealing, healingFlag, labels...)

	// A drive counts as online only when its state is exactly "ok".
	onlineFlag := 0.0
	if drive.State == "ok" {
		onlineFlag = 1
	}
	m.Set(driveOnline, onlineFlag, labels...)
}
// setDriveAPIMetrics records the error counters, the waiting-IO gauge and the
// per-API latency gauges for a single drive. It is a no-op when the drive
// carries no metrics (disk.Metrics is nil).
//
// labels holds the drive's label key/value pairs; it is never modified, and
// neither is its backing array.
func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) {
	if disk.Metrics == nil {
		return
	}

	m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...)
	m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...)
	m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...)

	// Build the label set for the drive API latencies in a fresh slice:
	// appending directly to `labels` could write into the caller's backing
	// array whenever it has spare capacity.
	apiLabels := make([]string, 0, len(labels)+2)
	apiLabels = append(apiLabels, labels...)
	// The value slot is a placeholder that is overwritten for each API below.
	apiLabels = append(apiLabels, apiL, "")
	lastIdx := len(apiLabels) - 1

	for apiName, latency := range disk.Metrics.LastMinute {
		apiLabels[lastIdx] = "storage." + apiName
		m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()),
			apiLabels...)
	}
}
// setDriveIOStatMetrics records the iostat-derived gauges (throughput, await
// and utilization) for a single drive, using the given label key/value pairs.
//
// The await gauges are emitted only when the matching per-second rate is
// positive; otherwise no value is set for them.
// NOTE(review): presumably an average wait is meaningless without any
// requests in the interval — confirm.
func (m *MetricValues) setDriveIOStatMetrics(ioStats driveIOStatMetrics, labels []string) {
	sawReads := ioStats.readsPerSec > 0
	sawWrites := ioStats.writesPerSec > 0

	// Read side.
	m.Set(driveReadsPerSec, ioStats.readsPerSec, labels...)
	m.Set(driveReadsKBPerSec, ioStats.readsKBPerSec, labels...)
	if sawReads {
		m.Set(driveReadsAwait, ioStats.readsAwait, labels...)
	}

	// Write side.
	m.Set(driveWritesPerSec, ioStats.writesPerSec, labels...)
	m.Set(driveWritesKBPerSec, ioStats.writesKBPerSec, labels...)
	if sawWrites {
		m.Set(driveWritesAwait, ioStats.writesAwait, labels...)
	}

	m.Set(drivePercUtil, ioStats.percUtil, labels...)
}
// loadDriveMetrics - `MetricsLoaderFn` for node drive metrics.
func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
driveMetrics, err := c.driveMetrics.Get()
@@ -85,9 +204,7 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro
return nil
}
storageInfo := driveMetrics.storageInfo
for _, disk := range storageInfo.Disks {
for _, disk := range driveMetrics.storageInfo.Disks {
labels := []string{
driveL, disk.DrivePath,
poolIndexL, strconv.Itoa(disk.PoolIndex),
@@ -95,25 +212,11 @@ func loadDriveMetrics(ctx context.Context, m MetricValues, c *metricsCache) erro
driveIndexL, strconv.Itoa(disk.DiskIndex),
}
m.Set(driveUsedBytes, float64(disk.UsedSpace), labels...)
m.Set(driveFreeBytes, float64(disk.AvailableSpace), labels...)
m.Set(driveTotalBytes, float64(disk.TotalSpace), labels...)
m.Set(driveFreeInodes, float64(disk.FreeInodes), labels...)
if disk.Metrics != nil {
m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...)
m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...)
m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...)
// Append the api label for the drive API latencies.
labels = append(labels, "api", "")
lastIdx := len(labels) - 1
for apiName, latency := range disk.Metrics.LastMinute {
labels[lastIdx] = "storage." + apiName
m.Set(driveAPILatencyMicros, float64(latency.Avg().Microseconds()),
labels...)
}
m.setDriveBasicMetrics(disk, labels)
if dm, found := driveMetrics.ioStats[disk.DrivePath]; found {
m.setDriveIOStatMetrics(dm, labels)
}
m.setDriveAPIMetrics(disk, labels)
}
m.Set(driveOfflineCount, float64(driveMetrics.offlineDrives))