Mirror of https://github.com/minio/minio.git · synced 2025-01-26 22:23:15 -05:00
remove local disk metrics from cluster metrics (#18886)
Local disk metrics were polluting cluster metrics; remove them instead of adding relevant ones.

- Batch job metrics were incorrectly kept at the bucket metrics endpoint; move them to cluster metrics.
- Add tier metrics to the cluster peer metrics from the node.
- Fix missing set-level cluster health metrics.
This commit is contained in: parent 1d3bd02089 → commit 944f3c1477
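For orientation: MinIO serves v2 Prometheus metrics from separate scrape endpoints, and this commit regroups which endpoint serves which metrics. A sketch of the resulting mapping, inferred from the `init()` changes below (the endpoint paths are the standard v2 metrics paths, not part of this diff):

```
/minio/v2/metrics/cluster   cluster-wide metrics; batch job metrics move here,
                            local disk metrics are no longer aggregated here
/minio/v2/metrics/node      node-local metrics; local storage (disk) metrics stay here
/minio/v2/metrics/bucket    bucket metrics; batch job metrics are removed from here
```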
```diff
@@ -2286,6 +2286,7 @@ type HealthResult struct {
 	ESHealth []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		Healthy       bool
 		HealthyDrives int
 		HealingDrives int
 		ReadQuorum    int
@@ -2409,23 +2410,25 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 			result.ESHealth = append(result.ESHealth, struct {
 				Maintenance                  bool
 				PoolID, SetID                int
+				Healthy                      bool
 				HealthyDrives, HealingDrives int
 				ReadQuorum, WriteQuorum      int
 			}{
 				Maintenance:   opts.Maintenance,
 				SetID:         setIdx,
 				PoolID:        poolIdx,
+				Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
 				HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
 				HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
 				ReadQuorum:    poolReadQuorums[poolIdx],
 				WriteQuorum:   poolWriteQuorums[poolIdx],
 			})
 
-			if erasureSetUpCount[poolIdx][setIdx].online < poolWriteQuorums[poolIdx] {
+			result.Healthy = erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
+			if !result.Healthy {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.Healthy = false
 			}
 		}
 	}
```
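The entries appended above give callers a direct per-set `Healthy` flag instead of making them re-derive quorum. A minimal illustrative sketch (not part of this commit) of how a consumer of `HealthResult` might surface unhealthy sets, assuming it lives in the same `cmd` package where `fmt` is already imported:

```go
// reportUnhealthySets walks the per-set health entries and prints any
// erasure set that has dropped below write quorum. Illustrative only;
// the helper name is hypothetical.
func reportUnhealthySets(result HealthResult) {
	for _, es := range result.ESHealth {
		if es.Healthy {
			continue
		}
		fmt.Printf("pool %d, set %d below write quorum: %d healthy drives, %d healing, need %d\n",
			es.PoolID, es.SetID, es.HealthyDrives, es.HealingDrives, es.WriteQuorum)
	}
}
```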
```diff
@@ -60,13 +60,13 @@ func init() {
 		getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
+		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}
 
 	peerMetricsGroups = []*MetricsGroup{
 		getGoMetrics(),
 		getHTTPMetrics(MetricsGroupOpts{}),
 		getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
-		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getMinioProcMetrics(),
 		getMinioVersionMetrics(),
 		getNetworkMetrics(),
@@ -77,7 +77,8 @@ func init() {
 		getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
 		getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
 		getWebhookMetrics(),
-		getReplicationClusterMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
+		getTierMetrics(),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}
 
 	allMetricsGroups := func() (allMetrics []*MetricsGroup) {
@@ -97,13 +98,13 @@ func init() {
 		getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
 		getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
 		getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
+		getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
 	}
 
 	bucketMetricsGroups := []*MetricsGroup{
 		getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 		getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
 		getBucketTTFBMetric(),
-		getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
 	}
 
 	bucketPeerMetricsGroups = []*MetricsGroup{
```
```diff
@@ -2137,7 +2138,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 }
 
 // replication metrics for each node - published to the cluster endpoint with nodename as label
-func getReplicationClusterMetrics(opts MetricsGroupOpts) *MetricsGroup {
+func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval:    1 * time.Minute,
 		metricsGroupOpts: opts,
@@ -3375,6 +3376,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }
 
+func getClusterErasureSetHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_status",
+		Help:      "Get current health status for this erasure set",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterErasureSetReadQuorumMD() MetricDescription {
 	return MetricDescription{
 		Namespace: clusterMetricNamespace,
@@ -3468,6 +3479,17 @@ func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
 				VariableLabels: labels,
 				Value:          float64(h.HealingDrives),
 			})
+
+			health := 1
+			if !h.Healthy {
+				health = 0
+			}
+
+			metrics = append(metrics, Metric{
+				Description:    getClusterErasureSetHealthStatusMD(),
+				VariableLabels: labels,
+				Value:          float64(health),
+			})
 		}
 
 		return
```
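The new description composes into the fully qualified metric name `minio_cluster_health_erasure_set_status` (namespace `minio_cluster`, subsystem `health`, name `erasure_set_status`): a gauge that is 1 when the set meets write quorum and 0 otherwise. An illustrative scrape excerpt, assuming the `pool`/`set` labels referenced by the alert template below (exact label sets may differ):

```
# HELP minio_cluster_health_erasure_set_status Get current health status for this erasure set
# TYPE minio_cluster_health_erasure_set_status gauge
minio_cluster_health_erasure_set_status{pool="0",set="0"} 1
minio_cluster_health_erasure_set_status{pool="0",set="1"} 0
```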
````diff
@@ -51,13 +51,13 @@ groups:
 - name: example
   rules:
   - alert: MinIOClusterTolerance
-    expr: minio_cluster_health_erasure_set_tolerance <= 0
+    expr: minio_cluster_health_erasure_set_status < 1
     for: 5m
     labels:
       severity: critical
     annotations:
-      summary: "Instance {{ $labels.server }} unable to tolerate node failures"
-      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has tolerance <=0 for more than 5 minutes."
+      summary: "Instance {{ $labels.server }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }}"
+      description: "MinIO instance {{ $labels.server }} of job {{ $labels.job }} has lost quorum on pool {{ $labels.pool }} on set {{ $labels.set }} for more than 5 minutes."
 ```
 
 ## Verify the configuration and alerts
````
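The rewritten alert keys on the new status gauge instead of the tolerance metric. The gauge can also be queried ad hoc; two illustrative PromQL expressions, assuming the `pool` label from the alert template:

```
# erasure sets currently below write quorum (gauge is 0 when unhealthy)
minio_cluster_health_erasure_set_status == 0

# number of unhealthy sets per pool
count by (pool) (minio_cluster_health_erasure_set_status == 0)
```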
````diff
@@ -65,7 +65,7 @@ To verify the above sample alert follow below steps
 
 1. Start a distributed MinIO instance (4 nodes setup)
 2. Start Prometheus server and AlertManager
-3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_tolerance`
+3. Bring down couple of MinIO instances to bring down the Erasure Set tolerance to -1 and verify the same with `mc admin prometheus metrics ALIAS | grep minio_cluster_health_erasure_set_status`
 4. Wait for 5 mins (as alert is configured to be firing after 5 mins), and verify that you see an entry in webhook for the alert as well as in Prometheus console as shown below
 
 ```json
````
```diff
@@ -90,7 +90,7 @@ To verify the above sample alert follow below steps
         },
         "startsAt": "2023-11-18T06:20:09.456Z",
         "endsAt": "0001-01-01T00:00:00Z",
-        "generatorURL": "http://fedora-shubhendu:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
+        "generatorURL": "http://fedora-minio:9090/graph?g0.expr=minio_cluster_health_erasure_set_tolerance+%3C%3D+0&g0.tab=1",
         "fingerprint": "2255608b0da28ca3"
       }
     ],
@@ -107,10 +107,10 @@ To verify the above sample alert follow below steps
       "severity": "critical"
     },
     "commonAnnotations": {
-      "description": "MinIO instance 127.0.0.1:9000 of job minio-job has tolerance <=0 for more than 5 minutes.",
-      "summary": "Instance 127.0.0.1:9000 unable to tolerate node failures"
+      "description": "MinIO instance 127.0.0.1:9000 of job minio-job has lost quorum on pool 0 on set 0 for more than 5 minutes.",
+      "summary": "Instance 127.0.0.1:9000 has lost quorum on pool 0 on set 0"
     },
-    "externalURL": "http://fedora-shubhendu:9093",
+    "externalURL": "http://fedora-minio:9093",
     "version": "4",
     "groupKey": "{}:{alertname=\"MinIOClusterTolerance\"}",
     "truncatedAlerts": 0
```
|
Binary file not shown.
Before Width: | Height: | Size: 469 KiB After Width: | Height: | Size: 213 KiB |
File diff suppressed because it is too large
Load Diff
```diff
@@ -35,7 +35,7 @@ For deployments behind a load balancer, use the load balancer hostname instead of
 | minio_cluster_usage_version_total | Total number of versions (includes delete marker) in a cluster |
 | minio_cluster_usage_deletemarker_total | Total number of delete markers in a cluster |
 | minio_cluster_usage_total_bytes | Total cluster usage in bytes |
-| minio_cluster_buckets_total | Total number of buckets in the cluster |
+| minio_cluster_bucket_total | Total number of buckets in the cluster |
 
 ## Cluster Drive Metrics
 
```