remove local disk metrics from cluster metrics (#18886)

local disk metrics were polluting cluster metrics;
remove them from the cluster endpoint and serve them
from the node metrics endpoint instead.

- batch job metrics were incorrectly kept at the bucket
  metrics endpoint; move them to cluster metrics.

- add tier metrics to cluster peer metrics from the node.

- fix missing set-level cluster health metrics.
Harshavardhana, 2024-01-28 12:53:59 -08:00 (committed by GitHub)
parent 1d3bd02089
commit 944f3c1477
6 changed files with 1490 additions and 2534 deletions


@@ -2286,6 +2286,7 @@ type HealthResult struct {
 	ESHealth []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		Healthy       bool
 		HealthyDrives int
 		HealingDrives int
 		ReadQuorum    int
@@ -2409,23 +2410,25 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 			result.ESHealth = append(result.ESHealth, struct {
 				Maintenance                  bool
 				PoolID, SetID                int
+				Healthy                      bool
 				HealthyDrives, HealingDrives int
 				ReadQuorum, WriteQuorum      int
 			}{
 				Maintenance:   opts.Maintenance,
 				SetID:         setIdx,
 				PoolID:        poolIdx,
+				Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
 				HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
 				ReadQuorum:    poolReadQuorums[poolIdx],
 				WriteQuorum:   poolWriteQuorums[poolIdx],
 			})

-			if erasureSetUpCount[poolIdx][setIdx].online < poolWriteQuorums[poolIdx] {
+			result.Healthy = erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
+			if !result.Healthy {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.Healthy = false
 			}
 		}
 	}
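The core of the hunk above is a per-set check: an erasure set is healthy while its online drives meet or exceed the pool's write quorum. A minimal, self-contained sketch of that logic follows; the `setState` type, the sample numbers, and the `&&` aggregation into a cluster-wide flag are illustrative assumptions, not MinIO's actual code.

```go
package main

import "fmt"

// setState stands in for the online/healing drive tallies that
// Health() keeps per erasure set; the type name is hypothetical.
type setState struct{ online, healing int }

func main() {
	writeQuorum := 3
	sets := []setState{{online: 4}, {online: 2, healing: 1}}

	clusterHealthy := true
	for i, s := range sets {
		// The per-set check this commit adds: healthy while the
		// online drive count meets or exceeds the write quorum.
		healthy := s.online >= writeQuorum
		if !healthy {
			fmt.Printf("write quorum may be lost on set %d (online=%d, quorum=%d)\n",
				i, s.online, writeQuorum)
		}
		clusterHealthy = clusterHealthy && healthy
	}
	fmt.Println("cluster healthy:", clusterHealthy)
}
```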


@@ -60,13 +60,13 @@ func init() {
 		getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
+		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	peerMetricsGroups = []*MetricsGroup{
 		getGoMetrics(),
 		getHTTPMetrics(MetricsGroupOpts{}),
 		getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
-		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getMinioProcMetrics(),
 		getMinioVersionMetrics(),
 		getNetworkMetrics(),
@@ -77,7 +77,8 @@ func init() {
 		getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
 		getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
 		getWebhookMetrics(),
-		getReplicationClusterMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
+		getTierMetrics(),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	allMetricsGroups := func() (allMetrics []*MetricsGroup) {
@@ -97,13 +98,13 @@ func init() {
 		getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
+		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	bucketMetricsGroups := []*MetricsGroup{
 		getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
 		getBucketTTFBMetric(),
-		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	bucketPeerMetricsGroups = []*MetricsGroup{
@@ -2137,7 +2138,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 }

 // replication metrics for each node - published to the cluster endpoint with nodename as label
-func getReplicationClusterMetrics(opts MetricsGroupOpts) *MetricsGroup {
+func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval:    1 * time.Minute,
 		metricsGroupOpts: opts,
@@ -3375,6 +3376,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }

+func getClusterErasureSetHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_status",
+		Help:      "Get current health status for this erasure set",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterErasureSetReadQuorumMD() MetricDescription {
 	return MetricDescription{
 		Namespace: clusterMetricNamespace,
@@ -3468,6 +3479,17 @@ func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
 			VariableLabels: labels,
 			Value:          float64(h.HealingDrives),
 		})
+
+		health := 1
+		if !h.Healthy {
+			health = 0
+		}
+
+		metrics = append(metrics, Metric{
+			Description:    getClusterErasureSetHealthStatusMD(),
+			VariableLabels: labels,
+			Value:          float64(health),
+		})
 	}

 	return
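The new `minio_cluster_health_erasure_set_status` gauge encodes the per-set boolean as 1 (healthy) or 0 (write quorum lost), labeled by pool and set. The sketch below shows the same encoding using the stock Prometheus Go client rather than MinIO's internal `MetricsGroup` machinery; the label values and the `:2112` port are made-up for the example.

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Joins to minio_cluster_health_erasure_set_status, mirroring the
// MetricDescription added in the diff above.
var erasureSetStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: "minio_cluster",
	Subsystem: "health",
	Name:      "erasure_set_status",
	Help:      "Get current health status for this erasure set",
}, []string{"pool", "set"})

func main() {
	prometheus.MustRegister(erasureSetStatus)

	healthy := false // pretend pool 0 / set 0 lost write quorum
	v := 0.0
	if healthy {
		v = 1.0
	}
	erasureSetStatus.WithLabelValues("0", "0").Set(v)

	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":2112", nil) // hypothetical scrape port
}
```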


@@ -51,13 +51,13 @@ groups:
 - name: example
   rules:
   - alert: MinIOClusterTolerance
-    expr: minio_cluster_health_erasure_set_tolerance <= 0
+    expr: minio_cluster_health_erasure_set_status < 1
     for: 5m
     labels:
       severity: critical
     annotations:
-      summary: "Instance {{ $labels.server }} unable to tolerate node failures"
-      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has tolerance <=0 for more than 5 minutes."
+      summary: "Instance {{ $labels.server }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }}"
+      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }} for more than 5 minutes."
 ```

 ## Verify the configuration and alerts
@@ -65,7 +65,7 @@ To verify the above sample alert follow below steps
 1. Start a distributed MinIO instance (4 nodes setup)
 2. Start Prometheus server and AlertManager
-3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_tolerance`
+3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_status`
 4. Wait for 5 mins (as alert is configured to be firing after 5 mins), and verify that you see an entry in webhook for the alert as well as in Prometheus console as shown below

 ```json
@@ -90,7 +90,7 @@ To verify the above sample alert follow below steps
       },
       "startsAt": "2023-11-18T06:20:09.456Z",
       "endsAt": "0001-01-01T00:00:00Z",
-      "generatorURL": "http://fedora-shubhendu:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
+      "generatorURL": "http://fedora-minio:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
       "fingerprint": "2255608b0da28ca3"
     }
   ],
@@ -107,10 +107,10 @@ To verify the above sample alert follow below steps
     "severity": "critical"
   },
   "commonAnnotations": {
-    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has tolerance <=0 for more than 5 minutes.",
-    "summary": "Instance 127.0.0.1:9000 unable to tolerate node failures"
+    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has lost quorum on pool 0 on set 0 for more than 5 minutes.",
+    "summary": "Instance 127.0.0.1:9000 has lost quorum on pool 0 on set 0"
   },
-  "externalURL": "http://fedora-shubhendu:9093",
+  "externalURL": "http://fedora-minio:9093",
   "version": "4",
   "groupKey": "{}:{alertname=\"MinIOClusterTolerance\"}",
   "truncatedAlerts": 0
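The verification steps above use `mc admin prometheus metrics`; the same check can be done by scraping MinIO's cluster metrics endpoint directly. A rough sketch, assuming a local server at `localhost:9000` with metrics auth set to public (`MINIO_PROMETHEUS_AUTH_TYPE=public`); otherwise the request needs a bearer token.

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// MinIO serves cluster-level metrics at this v2 path.
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/cluster")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		// Print only the per-set health gauge; 1 means healthy,
		// 0 means the set has lost write quorum.
		if strings.HasPrefix(sc.Text(), "minio_cluster_health_erasure_set_status") {
			fmt.Println(sc.Text())
		}
	}
}
```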

(Binary image file changed but not shown: 469 KiB before, 213 KiB after.)

(File diff suppressed because it is too large.)


@@ -35,7 +35,7 @@ For deployments behind a load balancer, use the load balancer hostname instead o
 | `minio_cluster_usage_version_total`      | Total number of versions (includes delete marker) in a cluster |
 | `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster |
 | `minio_cluster_usage_total_bytes`        | Total cluster usage in bytes |
-| `minio_cluster_buckets_total`            | Total number of buckets in the cluster |
+| `minio_cluster_bucket_total`             | Total number of buckets in the cluster |

 ## Cluster Drive Metrics
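Note the rename in the table above: dashboards and recording rules written against `minio_cluster_buckets_total` need to switch to `minio_cluster_bucket_total`. A quick way to check which name a Prometheus server currently sees, sketched against the standard Prometheus instant-query HTTP API; the `localhost:9090` address is an assumption.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// Standard Prometheus instant-query endpoint.
	q := url.QueryEscape("minio_cluster_bucket_total")
	resp, err := http.Get("http://localhost:9090/api/v1/query?query=" + q)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // an empty "result" array means the series is absent
}
```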