mirror of
https://github.com/minio/minio.git
synced 2025-11-07 04:42:56 -05:00
remove local disk metrics from cluster metrics (#18886)
Local disk metrics were polluting cluster metrics. Please remove them instead of adding relevant ones. - Batch job metrics were incorrectly kept at the bucket metrics endpoint; move them to cluster metrics. - Add tier metrics to cluster peer metrics from the node. - Fix missing set-level cluster health metrics.
This commit is contained in:
@@ -2286,6 +2286,7 @@ type HealthResult struct {
|
||||
ESHealth []struct {
|
||||
Maintenance bool
|
||||
PoolID, SetID int
|
||||
Healthy bool
|
||||
HealthyDrives int
|
||||
HealingDrives int
|
||||
ReadQuorum int
|
||||
@@ -2409,23 +2410,25 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
|
||||
result.ESHealth = append(result.ESHealth, struct {
|
||||
Maintenance bool
|
||||
PoolID, SetID int
|
||||
Healthy bool
|
||||
HealthyDrives, HealingDrives int
|
||||
ReadQuorum, WriteQuorum int
|
||||
}{
|
||||
Maintenance: opts.Maintenance,
|
||||
SetID: setIdx,
|
||||
PoolID: poolIdx,
|
||||
Healthy: erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
|
||||
HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
|
||||
HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
|
||||
ReadQuorum: poolReadQuorums[poolIdx],
|
||||
WriteQuorum: poolWriteQuorums[poolIdx],
|
||||
})
|
||||
|
||||
if erasureSetUpCount[poolIdx][setIdx].online < poolWriteQuorums[poolIdx] {
|
||||
result.Healthy = erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
|
||||
if !result.Healthy {
|
||||
logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]))
|
||||
result.Healthy = false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,13 +60,13 @@ func init() {
|
||||
getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
|
||||
getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
|
||||
getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
}
|
||||
|
||||
peerMetricsGroups = []*MetricsGroup{
|
||||
getGoMetrics(),
|
||||
getHTTPMetrics(MetricsGroupOpts{}),
|
||||
getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
|
||||
getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
getMinioProcMetrics(),
|
||||
getMinioVersionMetrics(),
|
||||
getNetworkMetrics(),
|
||||
@@ -77,7 +77,8 @@ func init() {
|
||||
getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
|
||||
getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
|
||||
getWebhookMetrics(),
|
||||
getReplicationClusterMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
|
||||
getTierMetrics(),
|
||||
getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
|
||||
}
|
||||
|
||||
allMetricsGroups := func() (allMetrics []*MetricsGroup) {
|
||||
@@ -97,13 +98,13 @@ func init() {
|
||||
getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
|
||||
getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
|
||||
getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
|
||||
}
|
||||
|
||||
bucketMetricsGroups := []*MetricsGroup{
|
||||
getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
|
||||
getBucketTTFBMetric(),
|
||||
getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
||||
}
|
||||
|
||||
bucketPeerMetricsGroups = []*MetricsGroup{
|
||||
@@ -2137,7 +2138,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
||||
}
|
||||
|
||||
// replication metrics for each node - published to the cluster endpoint with nodename as label
|
||||
func getReplicationClusterMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
||||
func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
||||
mg := &MetricsGroup{
|
||||
cacheInterval: 1 * time.Minute,
|
||||
metricsGroupOpts: opts,
|
||||
@@ -3375,6 +3376,16 @@ func getClusterHealthStatusMD() MetricDescription {
|
||||
}
|
||||
}
|
||||
|
||||
// getClusterErasureSetHealthStatusMD returns the description of the gauge metric "erasure_set_status" in the "health" subsystem of the cluster metric namespace.
func getClusterErasureSetHealthStatusMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
Subsystem: "health",
|
||||
Name: "erasure_set_status",
|
||||
Help: "Get current health status for this erasure set",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getClusterErasureSetReadQuorumMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
@@ -3468,6 +3479,17 @@ func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
||||
VariableLabels: labels,
|
||||
Value: float64(h.HealingDrives),
|
||||
})
|
||||
|
||||
health := 1
|
||||
if !h.Healthy {
|
||||
health = 0
|
||||
}
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getClusterErasureSetHealthStatusMD(),
|
||||
VariableLabels: labels,
|
||||
Value: float64(health),
|
||||
})
|
||||
}
|
||||
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user