From b534dc69ab5b736fcbaac5443918b6d58b1704d9 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Thu, 9 May 2024 11:04:41 -0700 Subject: [PATCH] deprecate unexpected healing failed counters (#19705) simplify this to avoid verbose metrics, and make room for valid metrics to be reported for alerting etc. --- cmd/admin-heal-ops.go | 41 +++++++++++---------- cmd/background-heal-ops.go | 19 +++++----- cmd/global-heal.go | 2 +- cmd/metrics-v2.go | 74 ++++++++++++++++++-------------------- cmd/metrics.go | 20 +++++------ 5 files changed, 76 insertions(+), 80 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index a7257d34f..fe064840a 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -455,8 +455,8 @@ type healSequence struct { // Number of total items healed against item type healedItemsMap map[madmin.HealItemType]int64 - // Number of total items where healing failed against endpoint and drive state - healFailedItemsMap map[string]int64 + // Number of total items where healing failed against item type + healFailedItemsMap map[madmin.HealItemType]int64 // The time of the last scan/heal activity lastHealActivity time.Time @@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string, ctx: ctx, scannedItemsMap: make(map[madmin.HealItemType]int64), healedItemsMap: make(map[madmin.HealItemType]int64), - healFailedItemsMap: make(map[string]int64), + healFailedItemsMap: make(map[madmin.HealItemType]int64), } } @@ -543,12 +543,12 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 { // getHealFailedItemsMap - returns map of all items where heal failed against // drive endpoint and status -func (h *healSequence) getHealFailedItemsMap() map[string]int64 { +func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 { h.mutex.RLock() defer h.mutex.RUnlock() // Make a copy before returning the value - retMap := make(map[string]int64, len(h.healFailedItemsMap)) + retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap)) for k, v := range h.healFailedItemsMap { retMap[k] = v } @@ -556,29 +556,27 @@ func (h *healSequence) getHealFailedItemsMap() map[string]int64 { return retMap } -func (h *healSequence) countFailed(res madmin.HealResultItem) { +func (h *healSequence) countFailed(healType madmin.HealItemType) { h.mutex.Lock() defer h.mutex.Unlock() - for _, d := range res.After.Drives { - // For failed items we report the endpoint and drive state - // This will help users take corrective actions for drives - h.healFailedItemsMap[d.Endpoint+","+d.State]++ - } - + h.healFailedItemsMap[healType]++ h.lastHealActivity = UTCNow() } -func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) { +func (h *healSequence) countScanned(healType madmin.HealItemType) { h.mutex.Lock() defer h.mutex.Unlock() - if !healed { - h.scannedItemsMap[healType]++ - } else { - h.healedItemsMap[healType]++ - } + h.scannedItemsMap[healType]++ + h.lastHealActivity = UTCNow() +} +func (h *healSequence) countHealed(healType madmin.HealItemType) { + h.mutex.Lock() + defer h.mutex.Unlock() + + h.healedItemsMap[healType]++ h.lastHealActivity = UTCNow() } @@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem task.opts.ScanMode = madmin.HealNormalScan } - h.countHeals(healType, false) + h.countScanned(healType) if source.noWait { select { @@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem // task queued, 
now wait for the response. select { case res := <-task.respCh: + if res.err == nil { + h.countHealed(healType) + } else { + h.countFailed(healType) + } if !h.reportProgress { if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal return nil diff --git a/cmd/background-heal-ops.go b/cmd/background-heal-ops.go index 15aab7762..113939f90 100644 --- a/cmd/background-heal-ops.go +++ b/cmd/background-heal-ops.go @@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq * } } - if bgSeq != nil { - // We increment relevant counter based on the heal result for prometheus reporting. - if err != nil { - bgSeq.countFailed(res) - } else { - bgSeq.countHeals(res.Type, false) - } - } - if task.respCh != nil { task.respCh <- healResult{result: res, err: err} + continue } + // when respCh is not set caller is not waiting but we + // update the relevant metrics for them + if bgSeq != nil { + if err == nil { + bgSeq.countHealed(res.Type) + } else { + bgSeq.countFailed(res.Type) + } + } case <-ctx.Done(): return } diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 019fb6c77..1152628c7 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence { reportProgress: false, scannedItemsMap: make(map[madmin.HealItemType]int64), healedItemsMap: make(map[madmin.HealItemType]int64), - healFailedItemsMap: make(map[string]int64), + healFailedItemsMap: make(map[madmin.HealItemType]int64), } } diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index b03860af3..47a0e87f7 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -536,7 +536,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription { Namespace: nodeMetricNamespace, Subsystem: driveSubsystem, Name: "errors_timeout", - Help: "Total number of drive timeout errors since server start", + Help: "Total number of drive timeout errors since server uptime", Type: counterMetric, } } @@ -546,7 +546,7 @@ func getNodeDriveIOErrorsMD() MetricDescription { Namespace: nodeMetricNamespace, Subsystem: driveSubsystem, Name: "errors_ioerror", - Help: "Total number of drive I/O errors since server start", + Help: "Total number of drive I/O errors since server uptime", Type: counterMetric, } } @@ -556,7 +556,7 @@ func getNodeDriveAvailabilityErrorsMD() MetricDescription { Namespace: nodeMetricNamespace, Subsystem: driveSubsystem, Name: "errors_availability", - Help: "Total number of drive I/O errors, timeouts since server start", + Help: "Total number of drive I/O errors, timeouts since server uptime", Type: counterMetric, } } @@ -686,7 +686,7 @@ func getUsageLastScanActivityMD() MetricDescription { Namespace: minioMetricNamespace, Subsystem: usageSubsystem, Name: lastActivityTime, - Help: "Time elapsed (in nano seconds) since last scan activity.", + Help: "Time elapsed (in nano seconds) since last scan activity", Type: gaugeMetric, } } @@ -856,7 +856,7 @@ func getClusterRepLinkTotalOfflineDurationMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: linkDowntimeTotalDuration, - Help: "Total downtime of replication link in seconds since server start", + Help: "Total downtime of replication link in seconds since server uptime", Type: gaugeMetric, } } @@ -916,7 +916,7 @@ func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription { Namespace: namespace, Subsystem: replicationSubsystem, Name: totalFailedBytes, - Help: "Total number of bytes failed at least once to replicate since server start", + 
Help: "Total number of bytes failed at least once to replicate since server uptime", Type: counterMetric, } } @@ -926,7 +926,7 @@ func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription Namespace: namespace, Subsystem: replicationSubsystem, Name: totalFailedCount, - Help: "Total number of objects which failed replication since server start", + Help: "Total number of objects which failed replication since server uptime", Type: counterMetric, } } @@ -994,7 +994,7 @@ func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescriptio Namespace: namespace, Subsystem: replicationSubsystem, Name: credentialErrors, - Help: "Total number of replication credential errors since server start", + Help: "Total number of replication credential errors since server uptime", Type: counterMetric, } } @@ -1044,7 +1044,7 @@ func getClusterReplMaxActiveWorkersCountMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: maxActiveWorkers, - Help: "Maximum number of active replication workers seen since server start", + Help: "Maximum number of active replication workers seen since server uptime", Type: gaugeMetric, } } @@ -1064,7 +1064,7 @@ func getClusterRepLinkLatencyMaxMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: maxLinkLatency, - Help: "Maximum replication link latency in milliseconds seen since server start", + Help: "Maximum replication link latency in milliseconds seen since server uptime", Type: gaugeMetric, } } @@ -1084,7 +1084,7 @@ func getClusterReplAvgQueuedOperationsMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: avgInQueueCount, - Help: "Average number of objects queued for replication since server start", + Help: "Average number of objects queued for replication since server uptime", Type: gaugeMetric, } } @@ -1094,7 +1094,7 @@ func getClusterReplAvgQueuedBytesMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: avgInQueueBytes, - Help: "Average number of bytes queued for replication since server start", + Help: "Average number of bytes queued for replication since server uptime", Type: gaugeMetric, } } @@ -1104,7 +1104,7 @@ func getClusterReplMaxQueuedOperationsMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: maxInQueueCount, - Help: "Maximum number of objects queued for replication since server start", + Help: "Maximum number of objects queued for replication since server uptime", Type: gaugeMetric, } } @@ -1114,7 +1114,7 @@ func getClusterReplMaxQueuedBytesMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: maxInQueueBytes, - Help: "Maximum number of bytes queued for replication since server start", + Help: "Maximum number of bytes queued for replication since server uptime", Type: gaugeMetric, } } @@ -1134,7 +1134,7 @@ func getClusterReplMaxTransferRateMD() MetricDescription { Namespace: clusterMetricNamespace, Subsystem: replicationSubsystem, Name: maxTransferRate, - Help: "Maximum replication transfer rate in bytes/sec seen since server start", + Help: "Maximum replication transfer rate in bytes/sec seen since server uptime", Type: gaugeMetric, } } @@ -1454,8 +1454,8 @@ func getHealObjectsTotalMD() MetricDescription { Namespace: healMetricNamespace, Subsystem: objectsSubsystem, Name: total, - Help: "Objects scanned in current self healing run", - Type: gaugeMetric, + 
Help: "Objects scanned since server uptime", + Type: counterMetric, } } @@ -1464,8 +1464,8 @@ func getHealObjectsHealTotalMD() MetricDescription { Namespace: healMetricNamespace, Subsystem: objectsSubsystem, Name: healTotal, - Help: "Objects healed in current self healing run", - Type: gaugeMetric, + Help: "Objects healed since server uptime", + Type: counterMetric, } } @@ -1474,8 +1474,8 @@ func getHealObjectsFailTotalMD() MetricDescription { Namespace: healMetricNamespace, Subsystem: objectsSubsystem, Name: errorsTotal, - Help: "Objects for which healing failed in current self healing run", - Type: gaugeMetric, + Help: "Objects with healing failed since server uptime", + Type: counterMetric, } } @@ -1484,7 +1484,7 @@ func getHealLastActivityTimeMD() MetricDescription { Namespace: healMetricNamespace, Subsystem: timeSubsystem, Name: lastActivityTime, - Help: "Time elapsed (in nano seconds) since last self healing activity.", + Help: "Time elapsed (in nano seconds) since last self healing activity", Type: gaugeMetric, } } @@ -2105,7 +2105,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: scannerSubsystem, Name: "objects_scanned", - Help: "Total number of unique objects scanned since server start", + Help: "Total number of unique objects scanned since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)), @@ -2115,7 +2115,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: scannerSubsystem, Name: "versions_scanned", - Help: "Total number of object versions scanned since server start", + Help: "Total number of object versions scanned since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)), @@ -2125,7 +2125,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: scannerSubsystem, Name: "directories_scanned", - Help: "Total number of directories scanned since server start", + Help: "Total number of directories scanned since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)), @@ -2135,7 +2135,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: scannerSubsystem, Name: "bucket_scans_started", - Help: "Total number of bucket scans started since server start", + Help: "Total number of bucket scans started since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())), @@ -2145,7 +2145,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: scannerSubsystem, Name: "bucket_scans_finished", - Help: "Total number of bucket scans finished since server start", + Help: "Total number of bucket scans finished since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)), @@ -2155,7 +2155,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: ilmSubsystem, Name: "versions_scanned", - Help: "Total number of object versions checked for ilm actions since server start", + Help: "Total number of object versions checked for ilm actions since server uptime", Type: counterMetric, }, Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)), @@ -2172,7 +2172,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 { Namespace: 
nodeMetricNamespace, Subsystem: ilmSubsystem, Name: MetricName("action_count_" + toSnake(action.String())), - Help: "Total action outcome of lifecycle checks since server start", + Help: "Total action outcome of lifecycle checks since server uptime", Type: counterMetric, }, Value: float64(v), @@ -2212,7 +2212,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: iamSubsystem, Name: "since_last_sync_millis", - Help: "Time (in milliseconds) since last successful IAM data sync.", + Help: "Time (in milliseconds) since last successful IAM data sync", Type: gaugeMetric, }, Value: float64(sinceLastSyncMillis), @@ -2222,7 +2222,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: iamSubsystem, Name: "sync_successes", - Help: "Number of successful IAM data syncs since server start.", + Help: "Number of successful IAM data syncs since server uptime", Type: counterMetric, }, Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)), @@ -2232,7 +2232,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { Namespace: nodeMetricNamespace, Subsystem: iamSubsystem, Name: "sync_failures", - Help: "Number of failed IAM data syncs since server start.", + Help: "Number of failed IAM data syncs since server uptime", Type: counterMetric, }, Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)), @@ -2667,14 +2667,10 @@ func getFailedItems(seq *healSequence) (m []MetricV2) { items := seq.getHealFailedItemsMap() m = make([]MetricV2, 0, len(items)) for k, v := range items { - s := strings.Split(k, ",") m = append(m, MetricV2{ - Description: getHealObjectsFailTotalMD(), - VariableLabels: map[string]string{ - "mount_path": s[0], - "volume_status": s[1], - }, - Value: float64(v), + Description: getHealObjectsFailTotalMD(), + VariableLabels: map[string]string{"type": string(k)}, + Value: float64(v), }) } return diff --git a/cmd/metrics.go b/cmd/metrics.go index b31688d50..da11af44c 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -19,7 +19,6 @@ package cmd import ( "net/http" - "strings" "time" "github.com/minio/minio/internal/auth" @@ -156,9 +155,9 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"), - "Objects scanned in current self healing run", + "Objects scanned since uptime", []string{"type"}, nil), - prometheus.GaugeValue, + prometheus.CounterValue, float64(v), string(k), ) } @@ -166,23 +165,20 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"), - "Objects healed in current self healing run", + "Objects healed since uptime", []string{"type"}, nil), - prometheus.GaugeValue, + prometheus.CounterValue, float64(v), string(k), ) } for k, v := range bgSeq.getHealFailedItemsMap() { - // healFailedItemsMap stores the endpoint and volume state separated by comma, - // split the fields and pass to channel at correct index - s := strings.Split(k, ",") ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"), - "Objects for which healing failed in current self healing run", - []string{"mount_path", "volume_status"}, nil), - prometheus.GaugeValue, - float64(v), s[0], s[1], + "Objects for which healing failed since uptime", + 
[]string{"type"}, nil), + prometheus.CounterValue, + float64(v), string(k), ) } }