remove local disk metrics from cluster metrics (#18886)

local disk metrics were polluting cluster metrics;
remove them from the cluster endpoint instead of adding more
(they remain available via the node metrics endpoint).

- batch job metrics were incorrectly published at the bucket
  metrics endpoint; move them to cluster metrics.

- add tier metrics to cluster peer metrics from the node.

- fix missing set-level cluster health metrics
Harshavardhana 2024-01-28 12:53:59 -08:00, committed by GitHub
parent 1d3bd02089
commit 944f3c1477
6 changed files with 1490 additions and 2534 deletions
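In effect, the change re-sorts which of MinIO's v2 Prometheus endpoints publishes what. A quick way to confirm the new layout after upgrading is to scrape the endpoints directly; a sketch assuming `MINIO_PROMETHEUS_AUTH_TYPE=public` (so no bearer token is required) and a server at `localhost:9000` — the `minio_node_drive` prefix is the usual name for the local drive metrics, stated here as an assumption:

```sh
# set-level health now appears on the cluster endpoint
curl -s http://localhost:9000/minio/v2/metrics/cluster | grep erasure_set_status

# local drive metrics are served only from the node endpoint
curl -s http://localhost:9000/minio/v2/metrics/node | grep minio_node_drive
```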

View File

@@ -2286,6 +2286,7 @@ type HealthResult struct {
 	ESHealth []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		Healthy       bool
 		HealthyDrives int
 		HealingDrives int
 		ReadQuorum    int
@@ -2409,23 +2410,25 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 			result.ESHealth = append(result.ESHealth, struct {
 				Maintenance                  bool
 				PoolID, SetID                int
+				Healthy                      bool
 				HealthyDrives, HealingDrives int
 				ReadQuorum, WriteQuorum      int
 			}{
 				Maintenance:   opts.Maintenance,
 				SetID:         setIdx,
 				PoolID:        poolIdx,
+				Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
 				HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
 				ReadQuorum:    poolReadQuorums[poolIdx],
 				WriteQuorum:   poolWriteQuorums[poolIdx],
 			})

-			if erasureSetUpCount[poolIdx][setIdx].online < poolWriteQuorums[poolIdx] {
+			result.Healthy = erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
+			if !result.Healthy {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.Healthy = false
 			}
 		}
 	}
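The new `Healthy` flag encodes nothing more than the write-quorum comparison already used for logging above. A minimal standalone sketch of that predicate (the function and variable names here are illustrative, not from the MinIO codebase):

```go
package main

import "fmt"

// setHealthy reports whether an erasure set still holds write quorum:
// the set counts as healthy while its online drives meet the quorum.
func setHealthy(onlineDrives, writeQuorum int) bool {
	return onlineDrives >= writeQuorum
}

func main() {
	// For example, a 12-drive set with write quorum 7:
	fmt.Println(setHealthy(12, 7)) // true  -> gauge value 1
	fmt.Println(setHealthy(6, 7))  // false -> gauge value 0
}
```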

View File

@@ -60,13 +60,13 @@ func init() {
 		getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
+		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	peerMetricsGroups = []*MetricsGroup{
 		getGoMetrics(),
 		getHTTPMetrics(MetricsGroupOpts{}),
 		getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
-		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getMinioProcMetrics(),
 		getMinioVersionMetrics(),
 		getNetworkMetrics(),
@@ -77,7 +77,8 @@ func init() {
 		getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
 		getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
 		getWebhookMetrics(),
-		getReplicationClusterMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
+		getTierMetrics(),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	allMetricsGroups := func() (allMetrics []*MetricsGroup) {
@@ -97,13 +98,13 @@ func init() {
 		getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	bucketMetricsGroups := []*MetricsGroup{
 		getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
 		getBucketTTFBMetric(),
-		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	bucketPeerMetricsGroups = []*MetricsGroup{
@@ -2137,7 +2138,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 }

 // replication metrics for each node - published to the cluster endpoint with nodename as label
-func getReplicationClusterMetrics(opts MetricsGroupOpts) *MetricsGroup {
+func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval:    1 * time.Minute,
 		metricsGroupOpts: opts,
@@ -3375,6 +3376,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }

+func getClusterErasureSetHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_status",
+		Help:      "Get current health status for this erasure set",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterErasureSetReadQuorumMD() MetricDescription {
 	return MetricDescription{
 		Namespace: clusterMetricNamespace,
@@ -3468,6 +3479,17 @@ func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
 			VariableLabels: labels,
 			Value:          float64(h.HealingDrives),
 		})
+
+		health := 1
+		if !h.Healthy {
+			health = 0
+		}
+
+		metrics = append(metrics, Metric{
+			Description:    getClusterErasureSetHealthStatusMD(),
+			VariableLabels: labels,
+			Value:          float64(health),
+		})
 	}

 	return
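Given the `MetricDescription` above (namespace `minio_cluster`, subsystem `health`, name `erasure_set_status`), the gauge should surface in the Prometheus exposition roughly as below; the exact label set is an assumption inferred from the `pool`, `set`, and `server` labels referenced by the alert annotations later in this commit:

```
# HELP minio_cluster_health_erasure_set_status Get current health status for this erasure set
# TYPE minio_cluster_health_erasure_set_status gauge
minio_cluster_health_erasure_set_status{pool="0",set="0",server="127.0.0.1:9000"} 1
```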

View File

@@ -51,13 +51,13 @@ groups:
 - name: example
   rules:
   - alert: MinIOClusterTolerance
-    expr: minio_cluster_health_erasure_set_tolerance <= 0
+    expr: minio_cluster_health_erasure_set_status < 1
     for: 5m
     labels:
       severity: critical
     annotations:
-      summary: "Instance {{ $labels.server }} unable to tolerate node failures"
-      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has tolerance <=0 for more than 5 minutes."
+      summary: "Instance {{ $labels.server }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }}"
+      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }} for more than 5 minutes."
 ```

 ## Verify the configuration and alerts
@@ -65,7 +65,7 @@ To verify the above sample alert follow below steps
 1. Start a distributed MinIO instance (4 nodes setup)
 2. Start Prometheus server and AlertManager
-3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_tolerance`
+3. Bring down a couple of MinIO instances so that an erasure set loses write quorum, and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_status`
 4. Wait for 5 mins (as alert is configured to be firing after 5 mins), and verify that you see an entry in webhook for the alert as well as in Prometheus console as shown below

 ```json
@@ -90,7 +90,7 @@ To verify the above sample alert follow below steps
       },
       "startsAt": "2023-11-18T06:20:09.456Z",
       "endsAt": "0001-01-01T00:00:00Z",
-      "generatorURL": "http://fedora-shubhendu:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
+      "generatorURL": "http://fedora-minio:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
       "fingerprint": "2255608b0da28ca3"
     }
   ],
@@ -107,10 +107,10 @@ To verify the above sample alert follow below steps
     "severity": "critical"
   },
   "commonAnnotations": {
-    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has tolerance <=0 for more than 5 minutes.",
-    "summary": "Instance 127.0.0.1:9000 unable to tolerate node failures"
+    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has lost quorum on pool 0 on set 0 for more than 5 minutes.",
+    "summary": "Instance 127.0.0.1:9000 has lost quorum on pool 0 on set 0"
   },
-  "externalURL": "http://fedora-shubhendu:9093",
+  "externalURL": "http://fedora-minio:9093",
   "version": "4",
   "groupKey": "{}:{alertname=\"MinIOClusterTolerance\"}",
   "truncatedAlerts": 0

Binary file not shown (image updated: 469 KiB before, 213 KiB after).

File diff suppressed because it is too large.

View File

@@ -35,7 +35,7 @@ For deployments behind a load balancer, use the load balancer hostname instead of
 | `minio_cluster_usage_version_total`      | Total number of versions (includes delete marker) in a cluster |
 | `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster |
 | `minio_cluster_usage_total_bytes`        | Total cluster usage in bytes |
-| `minio_cluster_buckets_total`            | Total number of buckets in the cluster |
+| `minio_cluster_bucket_total`             | Total number of buckets in the cluster |

 ## Cluster Drive Metrics
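For completeness, a minimal Prometheus scrape config for the cluster endpoint where these metrics (including the renamed `minio_cluster_bucket_total`) are published; the job name and target are placeholders:

```yaml
scrape_configs:
  - job_name: minio-job
    metrics_path: /minio/v2/metrics/cluster
    scheme: http
    # add a bearer_token here when MINIO_PROMETHEUS_AUTH_TYPE is not set to "public"
    static_configs:
      - targets: ["localhost:9000"]
```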