remove local disk metrics from cluster metrics (#18886)

local disk metrics were polluting cluster metrics;
remove them from the cluster endpoint and serve them
from the node metrics endpoint instead.

- batch job metrics were incorrectly kept at the bucket
  metrics endpoint; move them to cluster metrics.

- add tier metrics to cluster peer metrics from the node.

- fix missing set-level cluster health metrics.
Harshavardhana, 2024-01-28 12:53:59 -08:00 (committed by GitHub)
parent 1d3bd02089
commit 944f3c1477
6 changed files with 1490 additions and 2534 deletions


@@ -2286,6 +2286,7 @@ type HealthResult struct {
 	ESHealth []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		Healthy       bool
 		HealthyDrives int
 		HealingDrives int
 		ReadQuorum    int
@@ -2409,23 +2410,25 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 			result.ESHealth = append(result.ESHealth, struct {
 				Maintenance                  bool
 				PoolID, SetID                int
+				Healthy                      bool
 				HealthyDrives, HealingDrives int
 				ReadQuorum, WriteQuorum      int
 			}{
 				Maintenance:   opts.Maintenance,
 				SetID:         setIdx,
 				PoolID:        poolIdx,
+				Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
 				HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
 				ReadQuorum:    poolReadQuorums[poolIdx],
 				WriteQuorum:   poolWriteQuorums[poolIdx],
 			})

-			if erasureSetUpCount[poolIdx][setIdx].online < poolWriteQuorums[poolIdx] {
+			result.Healthy = erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
+			if !result.Healthy {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.Healthy = false
 			}
 		}
 	}
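The core of the hunk above is a per-set check: an erasure set is healthy while its online drives meet or exceed the pool's write quorum. A minimal, self-contained sketch of that logic follows; the `setState` type, the sample numbers, and the `&&` aggregation into a cluster-wide flag are illustrative assumptions, not MinIO's actual code.

```go
package main

import "fmt"

// setState stands in for the online/healing drive tallies that
// Health() keeps per erasure set; the type name is hypothetical.
type setState struct{ online, healing int }

func main() {
	writeQuorum := 3
	sets := []setState{{online: 4}, {online: 2, healing: 1}}

	clusterHealthy := true
	for i, s := range sets {
		// The per-set check this commit adds: healthy while the
		// online drive count meets or exceeds the write quorum.
		healthy := s.online >= writeQuorum
		if !healthy {
			fmt.Printf("write quorum may be lost on set %d (online=%d, quorum=%d)\n",
				i, s.online, writeQuorum)
		}
		clusterHealthy = clusterHealthy && healthy
	}
	fmt.Println("cluster healthy:", clusterHealthy)
}
```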


@@ -60,13 +60,13 @@ func init() {
 		getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
+		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	peerMetricsGroups = []*MetricsGroup{
 		getGoMetrics(),
 		getHTTPMetrics(MetricsGroupOpts{}),
 		getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
-		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getMinioProcMetrics(),
 		getMinioVersionMetrics(),
 		getNetworkMetrics(),
@@ -77,7 +77,8 @@ func init() {
 		getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
 		getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
 		getWebhookMetrics(),
-		getReplicationClusterMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
+		getTierMetrics(),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	allMetricsGroups := func() (allMetrics []*MetricsGroup) {
@@ -97,13 +98,13 @@ func init() {
 		getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
+		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}

 	bucketMetricsGroups := []*MetricsGroup{
 		getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
 		getBucketTTFBMetric(),
-		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}

 	bucketPeerMetricsGroups = []*MetricsGroup{
@@ -2137,7 +2138,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 }

 // replication metrics for each node - published to the cluster endpoint with nodename as label
-func getReplicationClusterMetrics(opts MetricsGroupOpts) *MetricsGroup {
+func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval:    1 * time.Minute,
 		metricsGroupOpts: opts,
@@ -3375,6 +3376,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }

+func getClusterErasureSetHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_status",
+		Help:      "Get current health status for this erasure set",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterErasureSetReadQuorumMD() MetricDescription {
 	return MetricDescription{
 		Namespace: clusterMetricNamespace,
@@ -3468,6 +3479,17 @@ func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
 			VariableLabels: labels,
 			Value:          float64(h.HealingDrives),
 		})
+
+		health := 1
+		if !h.Healthy {
+			health = 0
+		}
+
+		metrics = append(metrics, Metric{
+			Description:    getClusterErasureSetHealthStatusMD(),
+			VariableLabels: labels,
+			Value:          float64(health),
+		})
 	}

 	return
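The new `minio_cluster_health_erasure_set_status` gauge encodes the per-set boolean as 1 (healthy) or 0 (write quorum lost), labeled by pool and set. The sketch below shows the same encoding using the stock Prometheus Go client rather than MinIO's internal `MetricsGroup` machinery; the label values and the `:2112` port are made-up for the example.

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Joins to minio_cluster_health_erasure_set_status, mirroring the
// MetricDescription added in the diff above.
var erasureSetStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Namespace: "minio_cluster",
	Subsystem: "health",
	Name:      "erasure_set_status",
	Help:      "Get current health status for this erasure set",
}, []string{"pool", "set"})

func main() {
	prometheus.MustRegister(erasureSetStatus)

	healthy := false // pretend pool 0 / set 0 lost write quorum
	v := 0.0
	if healthy {
		v = 1.0
	}
	erasureSetStatus.WithLabelValues("0", "0").Set(v)

	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":2112", nil) // hypothetical scrape port
}
```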


@@ -51,13 +51,13 @@ groups:
 - name: example
   rules:
   - alert: MinIOClusterTolerance
-    expr: minio_cluster_health_erasure_set_tolerance <= 0
+    expr: minio_cluster_health_erasure_set_status < 1
     for: 5m
     labels:
       severity: critical
     annotations:
-      summary: "Instance {{ $labels.server }} unable to tolerate node failures"
-      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has tolerance <=0 for more than 5 minutes."
+      summary: "Instance {{ $labels.server }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }}"
+      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }} for more than 5 minutes."
 ```

 ## Verify the configuration and alerts
@@ -65,7 +65,7 @@ To verify the above sample alert follow below steps
 1. Start a distributed MinIO instance (4 nodes setup)
 2. Start Prometheus server and AlertManager
-3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_tolerance`
+3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_status`
 4. Wait for 5 mins (as alert is configured to be firing after 5 mins), and verify that you see an entry in webhook for the alert as well as in Prometheus console as shown below

 ```json
@@ -90,7 +90,7 @@ To verify the above sample alert follow below steps
       },
       "startsAt": "2023-11-18T06:20:09.456Z",
       "endsAt": "0001-01-01T00:00:00Z",
-      "generatorURL": "http://fedora-shubhendu:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
+      "generatorURL": "http://fedora-minio:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
       "fingerprint": "2255608b0da28ca3"
     }
   ],
@@ -107,10 +107,10 @@ To verify the above sample alert follow below steps
     "severity": "critical"
   },
   "commonAnnotations": {
-    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has tolerance <=0 for more than 5 minutes.",
-    "summary": "Instance 127.0.0.1:9000 unable to tolerate node failures"
+    "description": "MinIO instance 127.0.0.1:9000 of job minio-job has lost quorum on pool 0 on set 0 for more than 5 minutes.",
+    "summary": "Instance 127.0.0.1:9000 has lost quorum on pool 0 on set 0"
   },
-  "externalURL": "http://fedora-shubhendu:9093",
+  "externalURL": "http://fedora-minio:9093",
   "version": "4",
   "groupKey": "{}:{alertname=\"MinIOClusterTolerance\"}",
   "truncatedAlerts": 0
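The verification steps above use `mc admin prometheus metrics`; the same check can be done by scraping MinIO's cluster metrics endpoint directly. A rough sketch, assuming a local server at `localhost:9000` with metrics auth set to public (`MINIO_PROMETHEUS_AUTH_TYPE=public`); otherwise the request needs a bearer token.

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// MinIO serves cluster-level metrics at this v2 path.
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/cluster")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		// Print only the per-set health gauge; 1 means healthy,
		// 0 means the set has lost write quorum.
		if strings.HasPrefix(sc.Text(), "minio_cluster_health_erasure_set_status") {
			fmt.Println(sc.Text())
		}
	}
}
```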

(Binary image file changed but not shown: 469 KiB before, 213 KiB after.)

(File diff suppressed because it is too large.)


@@ -35,7 +35,7 @@ For deployments behind a load balancer, use the load balancer hostname instead o
 | `minio_cluster_usage_version_total`      | Total number of versions (includes delete marker) in a cluster |
 | `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster |
 | `minio_cluster_usage_total_bytes`        | Total cluster usage in bytes |
-| `minio_cluster_buckets_total`            | Total number of buckets in the cluster |
+| `minio_cluster_bucket_total`             | Total number of buckets in the cluster |

 ## Cluster Drive Metrics
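Note the rename in the table above: dashboards and recording rules written against `minio_cluster_buckets_total` need to switch to `minio_cluster_bucket_total`. A quick way to check which name a Prometheus server currently sees, sketched against the standard Prometheus instant-query HTTP API; the `localhost:9090` address is an assumption.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// Standard Prometheus instant-query endpoint.
	q := url.QueryEscape("minio_cluster_bucket_total")
	resp, err := http.Get("http://localhost:9090/api/v1/query?query=" + q)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // an empty "result" array means the series is absent
}
```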