Add more metrics to v3/cluster/erasure-set (#19714)

Metrics being added:

- read_tolerance: Number of drive failures that can be tolerated without
  disrupting read operations
- write_tolerance: Number of drive failures that can be tolerated without
  disrupting write operations
- read_health: Health of the erasure set in a pool for read operations
  (1=healthy, 0=unhealthy)
- write_health: Health of the erasure set in a pool for write operations
  (1=healthy, 0=unhealthy)
This commit is contained in:
Shireesh Anjal 2024-05-14 12:55:56 +05:30 committed by GitHub
parent b2a82248b1
commit 5808190398
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 55 additions and 19 deletions

View File

@ -30,6 +30,10 @@ const (
erasureSetOnlineDrivesCount = "online_drives_count"
erasureSetHealingDrivesCount = "healing_drives_count"
erasureSetHealth = "health"
erasureSetReadTolerance = "read_tolerance"
erasureSetWriteTolerance = "write_tolerance"
erasureSetReadHealth = "read_health"
erasureSetWriteHealth = "write_health"
)
const (
@ -53,6 +57,18 @@ var (
erasureSetHealthMD = NewGaugeMD(erasureSetHealth,
"Health of the erasure set in a pool (1=healthy, 0=unhealthy)",
poolIDL, setIDL)
erasureSetReadToleranceMD = NewGaugeMD(erasureSetReadTolerance,
"No of drive failures that can be tolerated without disrupting read operations",
poolIDL, setIDL)
erasureSetWriteToleranceMD = NewGaugeMD(erasureSetWriteTolerance,
"No of drive failures that can be tolerated without disrupting write operations",
poolIDL, setIDL)
erasureSetReadHealthMD = NewGaugeMD(erasureSetReadHealth,
"Health of the erasure set in a pool for read operations (1=healthy, 0=unhealthy)",
poolIDL, setIDL)
erasureSetWriteHealthMD = NewGaugeMD(erasureSetWriteHealth,
"Health of the erasure set in a pool for write operations (1=healthy, 0=unhealthy)",
poolIDL, setIDL)
)
func b2f(v bool) float64 {
@ -73,16 +89,28 @@ func loadClusterErasureSetMetrics(ctx context.Context, m MetricValues, c *metric
for _, h := range result.ESHealth {
poolLV := strconv.Itoa(h.PoolID)
setLV := strconv.Itoa(h.SetID)
m.Set(erasureSetReadQuorum, float64(h.ReadQuorum),
poolIDL, poolLV, setIDL, setLV)
m.Set(erasureSetWriteQuorum, float64(h.WriteQuorum),
poolIDL, poolLV, setIDL, setLV)
m.Set(erasureSetOnlineDrivesCount, float64(h.HealthyDrives),
poolIDL, poolLV, setIDL, setLV)
m.Set(erasureSetHealingDrivesCount, float64(h.HealingDrives),
poolIDL, poolLV, setIDL, setLV)
m.Set(erasureSetHealth, b2f(h.Healthy),
poolIDL, poolLV, setIDL, setLV)
labels := []string{poolIDL, poolLV, setIDL, setLV}
m.Set(erasureSetReadQuorum, float64(h.ReadQuorum), labels...)
m.Set(erasureSetWriteQuorum, float64(h.WriteQuorum), labels...)
m.Set(erasureSetOnlineDrivesCount, float64(h.HealthyDrives), labels...)
m.Set(erasureSetHealingDrivesCount, float64(h.HealingDrives), labels...)
m.Set(erasureSetHealth, b2f(h.Healthy), labels...)
readHealthy := true
readTolerance := float64(h.HealthyDrives - h.ReadQuorum)
if readTolerance < 0 {
readHealthy = false
}
m.Set(erasureSetReadTolerance, readTolerance, labels...)
m.Set(erasureSetReadHealth, b2f(readHealthy), labels...)
writeHealthy := true
writeTolerance := float64(h.HealthyDrives + h.HealingDrives - h.WriteQuorum)
if writeTolerance < 0 {
writeHealthy = false
}
m.Set(erasureSetWriteTolerance, writeTolerance, labels...)
m.Set(erasureSetWriteHealth, b2f(writeHealthy), labels...)
}
return nil

View File

@ -261,6 +261,10 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
erasureSetOnlineDrivesCountMD,
erasureSetHealingDrivesCountMD,
erasureSetHealthMD,
erasureSetReadToleranceMD,
erasureSetWriteToleranceMD,
erasureSetReadHealthMD,
erasureSetWriteHealthMD,
},
loadClusterErasureSetMetrics,
)

View File

@ -249,15 +249,19 @@ The standard metrics group for GoCollector is not shown below.
### `/cluster/erasure-set`
| Name | Type | Help | Labels |
|--------------------------------------------------|---------|---------------------------------------------------------------|------------------|
| `minio_cluster_erasure_set_overall_write_quorum` | `gauge` | Overall write quorum across pools and sets | |
| `minio_cluster_erasure_set_overall_health` | `gauge` | Overall health across pools and sets (1=healthy, 0=unhealthy) | |
| `minio_cluster_erasure_set_read_quorum` | `gauge` | Read quorum for the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_write_quorum` | `gauge` | Write quorum for the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_online_drives_count` | `gauge` | Count of online drives in the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_healing_drives_count` | `gauge` | Count of healing drives in the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_health` | `gauge` | Health of the erasure set in a pool (1=healthy, 0=unhealthy) | `pool_id,set_id` |
| Name | Type | Help | Labels |
|--------------------------------------------------|---------|-----------------------------------------------------------------------------------|------------------|
| `minio_cluster_erasure_set_overall_write_quorum` | `gauge` | Overall write quorum across pools and sets | |
| `minio_cluster_erasure_set_overall_health` | `gauge` | Overall health across pools and sets (1=healthy, 0=unhealthy) | |
| `minio_cluster_erasure_set_read_quorum` | `gauge` | Read quorum for the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_write_quorum` | `gauge` | Write quorum for the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_online_drives_count` | `gauge` | Count of online drives in the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_healing_drives_count` | `gauge` | Count of healing drives in the erasure set in a pool | `pool_id,set_id` |
| `minio_cluster_erasure_set_health` | `gauge` | Health of the erasure set in a pool (1=healthy, 0=unhealthy) | `pool_id,set_id` |
| `minio_cluster_erasure_set_read_tolerance` | `gauge` | No of drive failures that can be tolerated without disrupting read operations | `pool_id,set_id` |
| `minio_cluster_erasure_set_write_tolerance` | `gauge` | No of drive failures that can be tolerated without disrupting write operations | `pool_id,set_id` |
| `minio_cluster_erasure_set_read_health` | `gauge` | Health of the erasure set in a pool for read operations (1=healthy, 0=unhealthy) | `pool_id,set_id` |
| `minio_cluster_erasure_set_write_health` | `gauge` | Health of the erasure set in a pool for write operations (1=healthy, 0=unhealthy) | `pool_id,set_id` |
### `/cluster/notification`