deprecate unexpected healing failed counters (#19705)

simplify these counters to avoid verbose, high-cardinality metrics and make
room for valid metrics that can be reported for alerting and similar use
cases.
Harshavardhana 2024-05-09 11:04:41 -07:00 committed by GitHub
parent 7b7d2ea7d4
commit b534dc69ab
5 changed files with 76 additions and 80 deletions
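
In short, before the per-file hunks: the failed-heal counters were keyed by drive endpoint and drive state, so the number of series grew with the number of drives and their state changes; they are now keyed by heal item type, a small fixed set. A minimal, self-contained sketch of the two keying schemes (healItemType stands in for madmin.HealItemType; the endpoint and state values are made-up examples, not taken from the diff):

// Sketch only: contrasts the old and new counter keys.
package main

import "fmt"

type healItemType string // stand-in for madmin.HealItemType

func main() {
	// Before: one counter per "endpoint,state" pair (built as
	// d.Endpoint + "," + d.State in the removed code), so cardinality grows
	// with cluster size and drive-state churn.
	oldFailed := map[string]int64{
		"http://node1:9000/mnt/drive1,ok":      2,
		"http://node2:9000/mnt/drive3,offline": 1,
	}

	// After: one counter per heal item type, a small fixed set.
	newFailed := map[healItemType]int64{
		"object":   2,
		"metadata": 1,
	}

	fmt.Println(len(oldFailed), len(newFailed))
}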


@ -455,8 +455,8 @@ type healSequence struct {
// Number of total items healed against item type
healedItemsMap map[madmin.HealItemType]int64
// Number of total items where healing failed against endpoint and drive state
healFailedItemsMap map[string]int64
// Number of total items where healing failed against item type
healFailedItemsMap map[madmin.HealItemType]int64
// The time of the last scan/heal activity
lastHealActivity time.Time
@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string,
ctx: ctx,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}
@ -543,12 +543,12 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
// getHealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
// Make a copy before returning the value
retMap := make(map[string]int64, len(h.healFailedItemsMap))
retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap))
for k, v := range h.healFailedItemsMap {
retMap[k] = v
}
@ -556,29 +556,27 @@ func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
return retMap
}
func (h *healSequence) countFailed(res madmin.HealResultItem) {
func (h *healSequence) countFailed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
for _, d := range res.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
h.healFailedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
func (h *healSequence) countScanned(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
if !healed {
h.scannedItemsMap[healType]++
} else {
h.healedItemsMap[healType]++
}
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHealed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
h.healedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}
h.countHeals(healType, false)
h.countScanned(healType)
if source.noWait {
select {
@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
// task queued, now wait for the response.
select {
case res := <-task.respCh:
if res.err == nil {
h.countHealed(healType)
} else {
h.countFailed(healType)
}
if !h.reportProgress {
if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
return nil

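Taken together, the hunks above reduce the counting to one pattern: three maps keyed by heal item type (scanned, healed, failed) behind a single mutex, with copy-on-read getters. A condensed, self-contained sketch of that pattern, using simplified stand-ins for healSequence and madmin.HealItemType rather than the real types:

// Sketch only: the counting pattern the hunks above converge on.
package main

import (
	"fmt"
	"sync"
)

type healItemType string

type healCounters struct {
	mu      sync.RWMutex
	scanned map[healItemType]int64
	healed  map[healItemType]int64
	failed  map[healItemType]int64
}

func newHealCounters() *healCounters {
	return &healCounters{
		scanned: make(map[healItemType]int64),
		healed:  make(map[healItemType]int64),
		failed:  make(map[healItemType]int64),
	}
}

func (h *healCounters) countScanned(t healItemType) { h.mu.Lock(); h.scanned[t]++; h.mu.Unlock() }
func (h *healCounters) countHealed(t healItemType)  { h.mu.Lock(); h.healed[t]++; h.mu.Unlock() }
func (h *healCounters) countFailed(t healItemType)  { h.mu.Lock(); h.failed[t]++; h.mu.Unlock() }

// failedItems returns a copy so callers can iterate without holding the lock,
// mirroring getHealFailedItemsMap in the hunk above.
func (h *healCounters) failedItems() map[healItemType]int64 {
	h.mu.RLock()
	defer h.mu.RUnlock()
	out := make(map[healItemType]int64, len(h.failed))
	for k, v := range h.failed {
		out[k] = v
	}
	return out
}

func main() {
	c := newHealCounters()
	c.countScanned("object")
	c.countFailed("object")
	fmt.Println(c.failedItems()) // map[object:1]
}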

@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *
}
}
if bgSeq != nil {
// We increment relevant counter based on the heal result for prometheus reporting.
if err != nil {
bgSeq.countFailed(res)
} else {
bgSeq.countHeals(res.Type, false)
}
}
if task.respCh != nil {
task.respCh <- healResult{result: res, err: err}
continue
}
// when respCh is not set caller is not waiting but we
// update the relevant metrics for them
if bgSeq != nil {
if err == nil {
bgSeq.countHealed(res.Type)
} else {
bgSeq.countFailed(res.Type)
}
}
case <-ctx.Done():
return
}

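The worker change above also pins down where counting happens: when a caller is waiting on respCh, the result is handed back and the caller (queueHealTask) counts healed or failed; when nobody is waiting, the worker counts. A small sketch of that exactly-once rule, with simplified names and a bare error standing in for the heal result:

// Sketch only: each heal result is counted on exactly one side.
package main

import "fmt"

func reportResult(err error, respCh chan error, counts map[string]int64) {
	if respCh != nil {
		respCh <- err // caller is waiting; it will do the counting
		return
	}
	if err == nil {
		counts["healed"]++
	} else {
		counts["failed"]++
	}
}

func main() {
	counts := map[string]int64{}

	// Fire-and-forget path: no caller waiting, so the worker counts.
	reportResult(nil, nil, counts)

	// Waiting-caller path: the worker only forwards the result.
	respCh := make(chan error, 1)
	reportResult(nil, respCh, counts)
	if err := <-respCh; err == nil {
		counts["healed"]++ // the caller counts, exactly once
	}

	fmt.Println(counts) // map[healed:2]
}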

@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence {
reportProgress: false,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}


@ -536,7 +536,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_timeout",
Help: "Total number of drive timeout errors since server start",
Help: "Total number of drive timeout errors since server uptime",
Type: counterMetric,
}
}
@ -546,7 +546,7 @@ func getNodeDriveIOErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_ioerror",
Help: "Total number of drive I/O errors since server start",
Help: "Total number of drive I/O errors since server uptime",
Type: counterMetric,
}
}
@ -556,7 +556,7 @@ func getNodeDriveAvailabilityErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_availability",
Help: "Total number of drive I/O errors, timeouts since server start",
Help: "Total number of drive I/O errors, timeouts since server uptime",
Type: counterMetric,
}
}
@ -686,7 +686,7 @@ func getUsageLastScanActivityMD() MetricDescription {
Namespace: minioMetricNamespace,
Subsystem: usageSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last scan activity.",
Help: "Time elapsed (in nano seconds) since last scan activity",
Type: gaugeMetric,
}
}
@ -856,7 +856,7 @@ func getClusterRepLinkTotalOfflineDurationMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: linkDowntimeTotalDuration,
Help: "Total downtime of replication link in seconds since server start",
Help: "Total downtime of replication link in seconds since server uptime",
Type: gaugeMetric,
}
}
@ -916,7 +916,7 @@ func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription {
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: totalFailedBytes,
Help: "Total number of bytes failed at least once to replicate since server start",
Help: "Total number of bytes failed at least once to replicate since server uptime",
Type: counterMetric,
}
}
@ -926,7 +926,7 @@ func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: totalFailedCount,
Help: "Total number of objects which failed replication since server start",
Help: "Total number of objects which failed replication since server uptime",
Type: counterMetric,
}
}
@ -994,7 +994,7 @@ func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescriptio
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: credentialErrors,
Help: "Total number of replication credential errors since server start",
Help: "Total number of replication credential errors since server uptime",
Type: counterMetric,
}
}
@ -1044,7 +1044,7 @@ func getClusterReplMaxActiveWorkersCountMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxActiveWorkers,
Help: "Maximum number of active replication workers seen since server start",
Help: "Maximum number of active replication workers seen since server uptime",
Type: gaugeMetric,
}
}
@ -1064,7 +1064,7 @@ func getClusterRepLinkLatencyMaxMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxLinkLatency,
Help: "Maximum replication link latency in milliseconds seen since server start",
Help: "Maximum replication link latency in milliseconds seen since server uptime",
Type: gaugeMetric,
}
}
@ -1084,7 +1084,7 @@ func getClusterReplAvgQueuedOperationsMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: avgInQueueCount,
Help: "Average number of objects queued for replication since server start",
Help: "Average number of objects queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1094,7 +1094,7 @@ func getClusterReplAvgQueuedBytesMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: avgInQueueBytes,
Help: "Average number of bytes queued for replication since server start",
Help: "Average number of bytes queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1104,7 +1104,7 @@ func getClusterReplMaxQueuedOperationsMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxInQueueCount,
Help: "Maximum number of objects queued for replication since server start",
Help: "Maximum number of objects queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1114,7 +1114,7 @@ func getClusterReplMaxQueuedBytesMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxInQueueBytes,
Help: "Maximum number of bytes queued for replication since server start",
Help: "Maximum number of bytes queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1134,7 +1134,7 @@ func getClusterReplMaxTransferRateMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxTransferRate,
Help: "Maximum replication transfer rate in bytes/sec seen since server start",
Help: "Maximum replication transfer rate in bytes/sec seen since server uptime",
Type: gaugeMetric,
}
}
@ -1454,8 +1454,8 @@ func getHealObjectsTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: total,
Help: "Objects scanned in current self healing run",
Type: gaugeMetric,
Help: "Objects scanned since server uptime",
Type: counterMetric,
}
}
@ -1464,8 +1464,8 @@ func getHealObjectsHealTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: healTotal,
Help: "Objects healed in current self healing run",
Type: gaugeMetric,
Help: "Objects healed since server uptime",
Type: counterMetric,
}
}
@ -1474,8 +1474,8 @@ func getHealObjectsFailTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: errorsTotal,
Help: "Objects for which healing failed in current self healing run",
Type: gaugeMetric,
Help: "Objects with healing failed since server uptime",
Type: counterMetric,
}
}
@ -1484,7 +1484,7 @@ func getHealLastActivityTimeMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: timeSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last self healing activity.",
Help: "Time elapsed (in nano seconds) since last self healing activity",
Type: gaugeMetric,
}
}
@ -2105,7 +2105,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "objects_scanned",
Help: "Total number of unique objects scanned since server start",
Help: "Total number of unique objects scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
@ -2115,7 +2115,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions scanned since server start",
Help: "Total number of object versions scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
@ -2125,7 +2125,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "directories_scanned",
Help: "Total number of directories scanned since server start",
Help: "Total number of directories scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
@ -2135,7 +2135,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_started",
Help: "Total number of bucket scans started since server start",
Help: "Total number of bucket scans started since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
@ -2145,7 +2145,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_finished",
Help: "Total number of bucket scans finished since server start",
Help: "Total number of bucket scans finished since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
@ -2155,7 +2155,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions checked for ilm actions since server start",
Help: "Total number of object versions checked for ilm actions since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
@ -2172,7 +2172,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: MetricName("action_count_" + toSnake(action.String())),
Help: "Total action outcome of lifecycle checks since server start",
Help: "Total action outcome of lifecycle checks since server uptime",
Type: counterMetric,
},
Value: float64(v),
@ -2212,7 +2212,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "since_last_sync_millis",
Help: "Time (in milliseconds) since last successful IAM data sync.",
Help: "Time (in milliseconds) since last successful IAM data sync",
Type: gaugeMetric,
},
Value: float64(sinceLastSyncMillis),
@ -2222,7 +2222,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_successes",
Help: "Number of successful IAM data syncs since server start.",
Help: "Number of successful IAM data syncs since server uptime",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
@ -2232,7 +2232,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_failures",
Help: "Number of failed IAM data syncs since server start.",
Help: "Number of failed IAM data syncs since server uptime",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
@ -2667,14 +2667,10 @@ func getFailedItems(seq *healSequence) (m []MetricV2) {
items := seq.getHealFailedItemsMap()
m = make([]MetricV2, 0, len(items))
for k, v := range items {
s := strings.Split(k, ",")
m = append(m, MetricV2{
Description: getHealObjectsFailTotalMD(),
VariableLabels: map[string]string{
"mount_path": s[0],
"volume_status": s[1],
},
Value: float64(v),
Description: getHealObjectsFailTotalMD(),
VariableLabels: map[string]string{"type": string(k)},
Value: float64(v),
})
}
return

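With getFailedItems now emitting a single "type" label and the heal metrics switched from per-run gauges to cumulative counters, the exported data collapses from one series per (mount_path, volume_status) pair to one series per heal item type, which is what makes it practical to alert on. A small sketch of the resulting shape in exposition form; the metric family name is a hypothetical placeholder and the values are invented:

// Sketch only: prints the reworked failed-heal counters in a Prometheus
// exposition-like form. "minio_heal_objects_errors_total" is a placeholder
// family name, not taken verbatim from this diff; only the "type" label
// layout mirrors getFailedItems above.
package main

import "fmt"

func main() {
	failed := map[string]int64{"object": 2, "metadata": 1} // example values

	for typ, n := range failed {
		fmt.Printf("minio_heal_objects_errors_total{type=%q} %d\n", typ, n)
	}

	// The deprecated form produced one series per drive, e.g.
	//   ...{mount_path="/mnt/drive1",volume_status="ok"} 2
	// which grows with cluster size and drive-state churn.
}

Since these are now counters that accumulate for the life of the process, dashboards that read the old per-run gauges directly would typically move to rate- or increase-style queries over these series.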

@ -19,7 +19,6 @@ package cmd
import (
"net/http"
"strings"
"time"
"github.com/minio/minio/internal/auth"
@ -156,9 +155,9 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
"Objects scanned in current self healing run",
"Objects scanned since uptime",
[]string{"type"}, nil),
prometheus.GaugeValue,
prometheus.CounterValue,
float64(v), string(k),
)
}
@ -166,23 +165,20 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
"Objects healed in current self healing run",
"Objects healed since uptime",
[]string{"type"}, nil),
prometheus.GaugeValue,
prometheus.CounterValue,
float64(v), string(k),
)
}
for k, v := range bgSeq.getHealFailedItemsMap() {
// healFailedItemsMap stores the endpoint and volume state separated by comma,
// split the fields and pass to channel at correct index
s := strings.Split(k, ",")
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
"Objects for which healing failed in current self healing run",
[]string{"mount_path", "volume_status"}, nil),
prometheus.GaugeValue,
float64(v), s[0], s[1],
"Objects for which healing failed since uptime",
[]string{"type"}, nil),
prometheus.CounterValue,
float64(v), string(k),
)
}
}