deprecate unexpected healing failed counters (#19705)

simplify these counters to avoid verbose, high-cardinality metrics and make
room for valid metrics that can be reported for alerting and similar use
cases.
Harshavardhana 2024-05-09 11:04:41 -07:00 committed by GitHub
parent 7b7d2ea7d4
commit b534dc69ab
5 changed files with 76 additions and 80 deletions
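
In short, before the per-file hunks: the failed-heal counters were keyed by drive endpoint and drive state, so the number of series grew with the number of drives and their state changes; they are now keyed by heal item type, a small fixed set. A minimal, self-contained sketch of the two keying schemes (healItemType stands in for madmin.HealItemType; the endpoint and state values are made-up examples, not taken from the diff):

// Sketch only: contrasts the old and new counter keys.
package main

import "fmt"

type healItemType string // stand-in for madmin.HealItemType

func main() {
	// Before: one counter per "endpoint,state" pair (built as
	// d.Endpoint + "," + d.State in the removed code), so cardinality grows
	// with cluster size and drive-state churn.
	oldFailed := map[string]int64{
		"http://node1:9000/mnt/drive1,ok":      2,
		"http://node2:9000/mnt/drive3,offline": 1,
	}

	// After: one counter per heal item type, a small fixed set.
	newFailed := map[healItemType]int64{
		"object":   2,
		"metadata": 1,
	}

	fmt.Println(len(oldFailed), len(newFailed))
}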


@ -455,8 +455,8 @@ type healSequence struct {
// Number of total items healed against item type
healedItemsMap map[madmin.HealItemType]int64
// Number of total items where healing failed against endpoint and drive state
healFailedItemsMap map[string]int64
// Number of total items where healing failed against item type
healFailedItemsMap map[madmin.HealItemType]int64
// The time of the last scan/heal activity
lastHealActivity time.Time
@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string,
ctx: ctx,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}
@ -543,12 +543,12 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
// getHealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
// Make a copy before returning the value
retMap := make(map[string]int64, len(h.healFailedItemsMap))
retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap))
for k, v := range h.healFailedItemsMap {
retMap[k] = v
}
@ -556,29 +556,27 @@ func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
return retMap
}
func (h *healSequence) countFailed(res madmin.HealResultItem) {
func (h *healSequence) countFailed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
for _, d := range res.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
h.healFailedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
func (h *healSequence) countScanned(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
if !healed {
h.scannedItemsMap[healType]++
} else {
h.healedItemsMap[healType]++
}
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHealed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()
h.healedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}
@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}
h.countHeals(healType, false)
h.countScanned(healType)
if source.noWait {
select {
@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
// task queued, now wait for the response.
select {
case res := <-task.respCh:
if res.err == nil {
h.countHealed(healType)
} else {
h.countFailed(healType)
}
if !h.reportProgress {
if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
return nil

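Taken together, the hunks above reduce the counting to one pattern: three maps keyed by heal item type (scanned, healed, failed) behind a single mutex, with copy-on-read getters. A condensed, self-contained sketch of that pattern, using simplified stand-ins for healSequence and madmin.HealItemType rather than the real types:

// Sketch only: the counting pattern the hunks above converge on.
package main

import (
	"fmt"
	"sync"
)

type healItemType string

type healCounters struct {
	mu      sync.RWMutex
	scanned map[healItemType]int64
	healed  map[healItemType]int64
	failed  map[healItemType]int64
}

func newHealCounters() *healCounters {
	return &healCounters{
		scanned: make(map[healItemType]int64),
		healed:  make(map[healItemType]int64),
		failed:  make(map[healItemType]int64),
	}
}

func (h *healCounters) countScanned(t healItemType) { h.mu.Lock(); h.scanned[t]++; h.mu.Unlock() }
func (h *healCounters) countHealed(t healItemType)  { h.mu.Lock(); h.healed[t]++; h.mu.Unlock() }
func (h *healCounters) countFailed(t healItemType)  { h.mu.Lock(); h.failed[t]++; h.mu.Unlock() }

// failedItems returns a copy so callers can iterate without holding the lock,
// mirroring getHealFailedItemsMap in the hunk above.
func (h *healCounters) failedItems() map[healItemType]int64 {
	h.mu.RLock()
	defer h.mu.RUnlock()
	out := make(map[healItemType]int64, len(h.failed))
	for k, v := range h.failed {
		out[k] = v
	}
	return out
}

func main() {
	c := newHealCounters()
	c.countScanned("object")
	c.countFailed("object")
	fmt.Println(c.failedItems()) // map[object:1]
}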

@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *
}
}
if bgSeq != nil {
// We increment relevant counter based on the heal result for prometheus reporting.
if err != nil {
bgSeq.countFailed(res)
} else {
bgSeq.countHeals(res.Type, false)
}
}
if task.respCh != nil {
task.respCh <- healResult{result: res, err: err}
continue
}
// when respCh is not set caller is not waiting but we
// update the relevant metrics for them
if bgSeq != nil {
if err == nil {
bgSeq.countHealed(res.Type)
} else {
bgSeq.countFailed(res.Type)
}
}
case <-ctx.Done():
return
}

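The worker change above also pins down where counting happens: when a caller is waiting on respCh, the result is handed back and the caller (queueHealTask) counts healed or failed; when nobody is waiting, the worker counts. A small sketch of that exactly-once rule, with simplified names and a bare error standing in for the heal result:

// Sketch only: each heal result is counted on exactly one side.
package main

import "fmt"

func reportResult(err error, respCh chan error, counts map[string]int64) {
	if respCh != nil {
		respCh <- err // caller is waiting; it will do the counting
		return
	}
	if err == nil {
		counts["healed"]++
	} else {
		counts["failed"]++
	}
}

func main() {
	counts := map[string]int64{}

	// Fire-and-forget path: no caller waiting, so the worker counts.
	reportResult(nil, nil, counts)

	// Waiting-caller path: the worker only forwards the result.
	respCh := make(chan error, 1)
	reportResult(nil, respCh, counts)
	if err := <-respCh; err == nil {
		counts["healed"]++ // the caller counts, exactly once
	}

	fmt.Println(counts) // map[healed:2]
}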

@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence {
reportProgress: false,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}


@ -536,7 +536,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_timeout",
Help: "Total number of drive timeout errors since server start",
Help: "Total number of drive timeout errors since server uptime",
Type: counterMetric,
}
}
@ -546,7 +546,7 @@ func getNodeDriveIOErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_ioerror",
Help: "Total number of drive I/O errors since server start",
Help: "Total number of drive I/O errors since server uptime",
Type: counterMetric,
}
}
@ -556,7 +556,7 @@ func getNodeDriveAvailabilityErrorsMD() MetricDescription {
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "errors_availability",
Help: "Total number of drive I/O errors, timeouts since server start",
Help: "Total number of drive I/O errors, timeouts since server uptime",
Type: counterMetric,
}
}
@ -686,7 +686,7 @@ func getUsageLastScanActivityMD() MetricDescription {
Namespace: minioMetricNamespace,
Subsystem: usageSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last scan activity.",
Help: "Time elapsed (in nano seconds) since last scan activity",
Type: gaugeMetric,
}
}
@ -856,7 +856,7 @@ func getClusterRepLinkTotalOfflineDurationMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: linkDowntimeTotalDuration,
Help: "Total downtime of replication link in seconds since server start",
Help: "Total downtime of replication link in seconds since server uptime",
Type: gaugeMetric,
}
}
@ -916,7 +916,7 @@ func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription {
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: totalFailedBytes,
Help: "Total number of bytes failed at least once to replicate since server start",
Help: "Total number of bytes failed at least once to replicate since server uptime",
Type: counterMetric,
}
}
@ -926,7 +926,7 @@ func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: totalFailedCount,
Help: "Total number of objects which failed replication since server start",
Help: "Total number of objects which failed replication since server uptime",
Type: counterMetric,
}
}
@ -994,7 +994,7 @@ func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescriptio
Namespace: namespace,
Subsystem: replicationSubsystem,
Name: credentialErrors,
Help: "Total number of replication credential errors since server start",
Help: "Total number of replication credential errors since server uptime",
Type: counterMetric,
}
}
@ -1044,7 +1044,7 @@ func getClusterReplMaxActiveWorkersCountMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxActiveWorkers,
Help: "Maximum number of active replication workers seen since server start",
Help: "Maximum number of active replication workers seen since server uptime",
Type: gaugeMetric,
}
}
@ -1064,7 +1064,7 @@ func getClusterRepLinkLatencyMaxMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxLinkLatency,
Help: "Maximum replication link latency in milliseconds seen since server start",
Help: "Maximum replication link latency in milliseconds seen since server uptime",
Type: gaugeMetric,
}
}
@ -1084,7 +1084,7 @@ func getClusterReplAvgQueuedOperationsMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: avgInQueueCount,
Help: "Average number of objects queued for replication since server start",
Help: "Average number of objects queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1094,7 +1094,7 @@ func getClusterReplAvgQueuedBytesMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: avgInQueueBytes,
Help: "Average number of bytes queued for replication since server start",
Help: "Average number of bytes queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1104,7 +1104,7 @@ func getClusterReplMaxQueuedOperationsMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxInQueueCount,
Help: "Maximum number of objects queued for replication since server start",
Help: "Maximum number of objects queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1114,7 +1114,7 @@ func getClusterReplMaxQueuedBytesMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxInQueueBytes,
Help: "Maximum number of bytes queued for replication since server start",
Help: "Maximum number of bytes queued for replication since server uptime",
Type: gaugeMetric,
}
}
@ -1134,7 +1134,7 @@ func getClusterReplMaxTransferRateMD() MetricDescription {
Namespace: clusterMetricNamespace,
Subsystem: replicationSubsystem,
Name: maxTransferRate,
Help: "Maximum replication transfer rate in bytes/sec seen since server start",
Help: "Maximum replication transfer rate in bytes/sec seen since server uptime",
Type: gaugeMetric,
}
}
@ -1454,8 +1454,8 @@ func getHealObjectsTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: total,
Help: "Objects scanned in current self healing run",
Type: gaugeMetric,
Help: "Objects scanned since server uptime",
Type: counterMetric,
}
}
@ -1464,8 +1464,8 @@ func getHealObjectsHealTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: healTotal,
Help: "Objects healed in current self healing run",
Type: gaugeMetric,
Help: "Objects healed since server uptime",
Type: counterMetric,
}
}
@ -1474,8 +1474,8 @@ func getHealObjectsFailTotalMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: errorsTotal,
Help: "Objects for which healing failed in current self healing run",
Type: gaugeMetric,
Help: "Objects with healing failed since server uptime",
Type: counterMetric,
}
}
@ -1484,7 +1484,7 @@ func getHealLastActivityTimeMD() MetricDescription {
Namespace: healMetricNamespace,
Subsystem: timeSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last self healing activity.",
Help: "Time elapsed (in nano seconds) since last self healing activity",
Type: gaugeMetric,
}
}
@ -2105,7 +2105,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "objects_scanned",
Help: "Total number of unique objects scanned since server start",
Help: "Total number of unique objects scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
@ -2115,7 +2115,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions scanned since server start",
Help: "Total number of object versions scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
@ -2125,7 +2125,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "directories_scanned",
Help: "Total number of directories scanned since server start",
Help: "Total number of directories scanned since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
@ -2135,7 +2135,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_started",
Help: "Total number of bucket scans started since server start",
Help: "Total number of bucket scans started since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
@ -2145,7 +2145,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_finished",
Help: "Total number of bucket scans finished since server start",
Help: "Total number of bucket scans finished since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
@ -2155,7 +2155,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions checked for ilm actions since server start",
Help: "Total number of object versions checked for ilm actions since server uptime",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
@ -2172,7 +2172,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: MetricName("action_count_" + toSnake(action.String())),
Help: "Total action outcome of lifecycle checks since server start",
Help: "Total action outcome of lifecycle checks since server uptime",
Type: counterMetric,
},
Value: float64(v),
@ -2212,7 +2212,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "since_last_sync_millis",
Help: "Time (in milliseconds) since last successful IAM data sync.",
Help: "Time (in milliseconds) since last successful IAM data sync",
Type: gaugeMetric,
},
Value: float64(sinceLastSyncMillis),
@ -2222,7 +2222,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_successes",
Help: "Number of successful IAM data syncs since server start.",
Help: "Number of successful IAM data syncs since server uptime",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
@ -2232,7 +2232,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_failures",
Help: "Number of failed IAM data syncs since server start.",
Help: "Number of failed IAM data syncs since server uptime",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
@ -2667,14 +2667,10 @@ func getFailedItems(seq *healSequence) (m []MetricV2) {
items := seq.getHealFailedItemsMap()
m = make([]MetricV2, 0, len(items))
for k, v := range items {
s := strings.Split(k, ",")
m = append(m, MetricV2{
Description: getHealObjectsFailTotalMD(),
VariableLabels: map[string]string{
"mount_path": s[0],
"volume_status": s[1],
},
Value: float64(v),
Description: getHealObjectsFailTotalMD(),
VariableLabels: map[string]string{"type": string(k)},
Value: float64(v),
})
}
return

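With getFailedItems now emitting a single "type" label and the heal metrics switched from per-run gauges to cumulative counters, the exported data collapses from one series per (mount_path, volume_status) pair to one series per heal item type, which is what makes it practical to alert on. A small sketch of the resulting shape in exposition form; the metric family name is a hypothetical placeholder and the values are invented:

// Sketch only: prints the reworked failed-heal counters in a Prometheus
// exposition-like form. "minio_heal_objects_errors_total" is a placeholder
// family name, not taken verbatim from this diff; only the "type" label
// layout mirrors getFailedItems above.
package main

import "fmt"

func main() {
	failed := map[string]int64{"object": 2, "metadata": 1} // example values

	for typ, n := range failed {
		fmt.Printf("minio_heal_objects_errors_total{type=%q} %d\n", typ, n)
	}

	// The deprecated form produced one series per drive, e.g.
	//   ...{mount_path="/mnt/drive1",volume_status="ok"} 2
	// which grows with cluster size and drive-state churn.
}

Since these are now counters that accumulate for the life of the process, dashboards that read the old per-run gauges directly would typically move to rate- or increase-style queries over these series.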

@ -19,7 +19,6 @@ package cmd
import (
"net/http"
"strings"
"time"
"github.com/minio/minio/internal/auth"
@ -156,9 +155,9 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
"Objects scanned in current self healing run",
"Objects scanned since uptime",
[]string{"type"}, nil),
prometheus.GaugeValue,
prometheus.CounterValue,
float64(v), string(k),
)
}
@ -166,23 +165,20 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
"Objects healed in current self healing run",
"Objects healed since uptime",
[]string{"type"}, nil),
prometheus.GaugeValue,
prometheus.CounterValue,
float64(v), string(k),
)
}
for k, v := range bgSeq.getHealFailedItemsMap() {
// healFailedItemsMap stores the endpoint and volume state separated by comma,
// split the fields and pass to channel at correct index
s := strings.Split(k, ",")
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
"Objects for which healing failed in current self healing run",
[]string{"mount_path", "volume_status"}, nil),
prometheus.GaugeValue,
float64(v), s[0], s[1],
"Objects for which healing failed since uptime",
[]string{"type"}, nil),
prometheus.CounterValue,
float64(v), string(k),
)
}
}