Consolidate drive-health-related metrics into a single metric (#19706)

Instead of having "online" and "healing" as two metrics, replace them
with a single metric "health", which can have the following values:

0 = offline
1 = healthy
2 = healing
This commit is contained in:
Shireesh Anjal 2024-05-12 22:53:50 +05:30 committed by GitHub
parent e8d14c0d90
commit 074d70112d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 20 additions and 19 deletions

View File

@ -35,6 +35,10 @@ const (
sectorSize = uint64(512)
kib = float64(1 << 10)
driveHealthOffline = float64(0)
driveHealthOnline = float64(1)
driveHealthHealing = float64(2)
)
var allDriveLabels = []string{driveL, poolIndexL, setIndexL, driveIndexL}
@ -51,8 +55,7 @@ const (
driveAvailabilityErrorsTotal = "availability_errors_total"
driveWaitingIO = "waiting_io"
driveAPILatencyMicros = "api_latency_micros"
driveHealing = "healing"
driveOnline = "online"
driveHealth = "health"
driveOfflineCount = "offline_count"
driveOnlineCount = "online_count"
@ -93,10 +96,8 @@ var (
driveAPILatencyMD = NewGaugeMD(driveAPILatencyMicros,
"Average last minute latency in µs for drive API storage operations",
append(allDriveLabels, apiL)...)
driveHealingMD = NewGaugeMD(driveHealing,
"Is it healing?", allDriveLabels...)
driveOnlineMD = NewGaugeMD(driveOnline,
"Is it online?", allDriveLabels...)
driveHealthMD = NewGaugeMD(driveHealth,
"Drive health (0 = offline, 1 = healthy, 2 = healing)", allDriveLabels...)
driveOfflineCountMD = NewGaugeMD(driveOfflineCount,
"Count of offline drives")
@ -152,16 +153,18 @@ func (m *MetricValues) setDriveBasicMetrics(drive madmin.Disk, labels []string)
m.Set(driveFreeInodes, float64(drive.FreeInodes), labels...)
m.Set(driveTotalInodes, float64(drive.UsedInodes+drive.FreeInodes), labels...)
var healing, online float64
if drive.Healing {
healing = 1
var health float64
switch drive.Healing {
case true:
health = driveHealthHealing
case false:
if drive.State == "ok" {
health = driveHealthOnline
} else {
health = driveHealthOffline
}
}
m.Set(driveHealing, healing, labels...)
if drive.State == "ok" {
online = 1
}
m.Set(driveOnline, online, labels...)
m.Set(driveHealth, health, labels...)
}
func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) {

View File

@ -186,8 +186,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
driveAvailabilityErrorsMD,
driveWaitingIOMD,
driveAPILatencyMD,
driveHealingMD,
driveOnlineMD,
driveHealthMD,
driveOfflineCountMD,
driveOnlineCountMD,

View File

@ -132,8 +132,7 @@ The standard metrics group for GoCollector is not shown below.
| `minio_system_drive_offline_count` | `gauge` | Count of offline drives | `pool_index,server` |
| `minio_system_drive_online_count` | `gauge` | Count of online drives | `pool_index,server` |
| `minio_system_drive_count` | `gauge` | Count of all drives | `pool_index,server` |
| `minio_system_drive_healing` | `gauge` | Is it healing? | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_online` | `gauge` | Is it online? | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_health` | `gauge` | Drive health (0 = offline, 1 = healthy, 2 = healing) | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_per_sec` | `gauge` | Reads per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_kb_per_sec` | `gauge` | Kilobytes read per second on a drive | `drive,set_index,drive_index,pool_index,server` |
| `minio_system_drive_reads_await` | `gauge` | Average time for read requests served on a drive | `drive,set_index,drive_index,pool_index,server` |