deprecate unexpected healing failed counters (#19705)

Simplify this to avoid verbose metrics, and make room for valid metrics to be reported for alerting etc.

parent 7b7d2ea7d4
commit b534dc69ab
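At a high level, the change replaces the failed-heal counter keyed by "endpoint,state" with one keyed by the heal item type, and splits the old countHeals(healType, healed) into three single-purpose helpers (countScanned, countHealed, countFailed), as the hunks below show. The following is a minimal, self-contained sketch of that counting surface; healCounters, bump and the local HealItemType alias are illustrative stand-ins (the real code lives on healSequence and uses madmin.HealItemType), not MinIO's actual implementation:

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // HealItemType stands in for madmin.HealItemType in this sketch.
    type HealItemType string

    // healCounters is an illustrative reduction of the counting fields on healSequence.
    type healCounters struct {
        mu               sync.Mutex
        scanned          map[HealItemType]int64
        healed           map[HealItemType]int64
        failed           map[HealItemType]int64 // previously map[string]int64 keyed by "endpoint,state"
        lastHealActivity time.Time
    }

    func newHealCounters() *healCounters {
        return &healCounters{
            scanned: make(map[HealItemType]int64),
            healed:  make(map[HealItemType]int64),
            failed:  make(map[HealItemType]int64),
        }
    }

    // The old countHeals(healType, healed bool) is split into three single-purpose helpers.
    func (h *healCounters) countScanned(t HealItemType) { h.bump(h.scanned, t) }
    func (h *healCounters) countHealed(t HealItemType)  { h.bump(h.healed, t) }
    func (h *healCounters) countFailed(t HealItemType)  { h.bump(h.failed, t) }

    func (h *healCounters) bump(m map[HealItemType]int64, t HealItemType) {
        h.mu.Lock()
        defer h.mu.Unlock()
        m[t]++
        h.lastHealActivity = time.Now().UTC()
    }

    func main() {
        h := newHealCounters()
        h.countScanned("object")
        h.countFailed("object")
        fmt.Println(h.failed["object"]) // 1, labeled by heal item type rather than drive endpoint/state
    }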
@@ -455,8 +455,8 @@ type healSequence struct {
 	// Number of total items healed against item type
 	healedItemsMap map[madmin.HealItemType]int64

-	// Number of total items where healing failed against endpoint and drive state
-	healFailedItemsMap map[string]int64
+	// Number of total items where healing failed against item type
+	healFailedItemsMap map[madmin.HealItemType]int64

 	// The time of the last scan/heal activity
 	lastHealActivity time.Time
@@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string,
 		ctx:                ctx,
 		scannedItemsMap:    make(map[madmin.HealItemType]int64),
 		healedItemsMap:     make(map[madmin.HealItemType]int64),
-		healFailedItemsMap: make(map[string]int64),
+		healFailedItemsMap: make(map[madmin.HealItemType]int64),
 	}
 }

@@ -543,12 +543,12 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {

 // getHealFailedItemsMap - returns map of all items where heal failed against
 // drive endpoint and status
-func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
+func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 {
 	h.mutex.RLock()
 	defer h.mutex.RUnlock()

 	// Make a copy before returning the value
-	retMap := make(map[string]int64, len(h.healFailedItemsMap))
+	retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap))
 	for k, v := range h.healFailedItemsMap {
 		retMap[k] = v
 	}
@@ -556,29 +556,27 @@ func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
 	return retMap
 }

-func (h *healSequence) countFailed(res madmin.HealResultItem) {
+func (h *healSequence) countFailed(healType madmin.HealItemType) {
 	h.mutex.Lock()
 	defer h.mutex.Unlock()

-	for _, d := range res.After.Drives {
-		// For failed items we report the endpoint and drive state
-		// This will help users take corrective actions for drives
-		h.healFailedItemsMap[d.Endpoint+","+d.State]++
-	}
-
+	h.healFailedItemsMap[healType]++
 	h.lastHealActivity = UTCNow()
 }

-func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
+func (h *healSequence) countScanned(healType madmin.HealItemType) {
 	h.mutex.Lock()
 	defer h.mutex.Unlock()

-	if !healed {
-		h.scannedItemsMap[healType]++
-	} else {
-		h.healedItemsMap[healType]++
-	}
+	h.scannedItemsMap[healType]++
 	h.lastHealActivity = UTCNow()
 }

+func (h *healSequence) countHealed(healType madmin.HealItemType) {
+	h.mutex.Lock()
+	defer h.mutex.Unlock()
+
+	h.healedItemsMap[healType]++
+	h.lastHealActivity = UTCNow()
+}
+
@@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
 		task.opts.ScanMode = madmin.HealNormalScan
 	}

-	h.countHeals(healType, false)
+	h.countScanned(healType)

 	if source.noWait {
 		select {
@@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
 	// task queued, now wait for the response.
 	select {
 	case res := <-task.respCh:
+		if res.err == nil {
+			h.countHealed(healType)
+		} else {
+			h.countFailed(healType)
+		}
 		if !h.reportProgress {
 			if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
 				return nil
@@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *
 				}
 			}

-			if bgSeq != nil {
-				// We increment relevant counter based on the heal result for prometheus reporting.
-				if err != nil {
-					bgSeq.countFailed(res)
-				} else {
-					bgSeq.countHeals(res.Type, false)
-				}
-			}
-
 			if task.respCh != nil {
 				task.respCh <- healResult{result: res, err: err}
 				continue
 			}
+
+			// when respCh is not set caller is not waiting but we
+			// update the relevant metrics for them
+			if bgSeq != nil {
+				if err == nil {
+					bgSeq.countHealed(res.Type)
+				} else {
+					bgSeq.countFailed(res.Type)
+				}
+			}
 		case <-ctx.Done():
 			return
 		}
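Taken together, the two preceding hunks keep each heal task counted exactly once: queueHealTask records the outcome when the caller waits on task.respCh, while the worker in AddWorker records it only when respCh is nil, i.e. when nobody is waiting. A minimal sketch of that hand-off, using simplified stand-in types (counters, result and handleResult are illustrative names, not MinIO's):

    package main

    import "fmt"

    // counters is an illustrative stand-in for the bgSeq heal counters.
    type counters struct{ healed, failed int }

    func (c *counters) countHealed() { c.healed++ }
    func (c *counters) countFailed() { c.failed++ }

    type result struct{ err error }

    // handleResult mirrors the worker-side branch: hand the result to the waiting
    // caller if there is one (it will count the outcome), otherwise count it here.
    func handleResult(c *counters, respCh chan result, err error) {
        if respCh != nil {
            respCh <- result{err: err}
            return
        }
        if err == nil {
            c.countHealed()
        } else {
            c.countFailed()
        }
    }

    func main() {
        c := &counters{}
        handleResult(c, nil, nil)       // no waiter: the worker records one successful heal
        fmt.Println(c.healed, c.failed) // 1 0
    }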
@@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence {
 		reportProgress:     false,
 		scannedItemsMap:    make(map[madmin.HealItemType]int64),
 		healedItemsMap:     make(map[madmin.HealItemType]int64),
-		healFailedItemsMap: make(map[string]int64),
+		healFailedItemsMap: make(map[madmin.HealItemType]int64),
 	}
 }

@@ -536,7 +536,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
 		Namespace: nodeMetricNamespace,
 		Subsystem: driveSubsystem,
 		Name:      "errors_timeout",
-		Help:      "Total number of drive timeout errors since server start",
+		Help:      "Total number of drive timeout errors since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -546,7 +546,7 @@ func getNodeDriveIOErrorsMD() MetricDescription {
 		Namespace: nodeMetricNamespace,
 		Subsystem: driveSubsystem,
 		Name:      "errors_ioerror",
-		Help:      "Total number of drive I/O errors since server start",
+		Help:      "Total number of drive I/O errors since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -556,7 +556,7 @@ func getNodeDriveAvailabilityErrorsMD() MetricDescription {
 		Namespace: nodeMetricNamespace,
 		Subsystem: driveSubsystem,
 		Name:      "errors_availability",
-		Help:      "Total number of drive I/O errors, timeouts since server start",
+		Help:      "Total number of drive I/O errors, timeouts since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -686,7 +686,7 @@ func getUsageLastScanActivityMD() MetricDescription {
 		Namespace: minioMetricNamespace,
 		Subsystem: usageSubsystem,
 		Name:      lastActivityTime,
-		Help:      "Time elapsed (in nano seconds) since last scan activity.",
+		Help:      "Time elapsed (in nano seconds) since last scan activity",
 		Type:      gaugeMetric,
 	}
 }
@@ -856,7 +856,7 @@ func getClusterRepLinkTotalOfflineDurationMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      linkDowntimeTotalDuration,
-		Help:      "Total downtime of replication link in seconds since server start",
+		Help:      "Total downtime of replication link in seconds since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -916,7 +916,7 @@ func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription {
 		Namespace: namespace,
 		Subsystem: replicationSubsystem,
 		Name:      totalFailedBytes,
-		Help:      "Total number of bytes failed at least once to replicate since server start",
+		Help:      "Total number of bytes failed at least once to replicate since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -926,7 +926,7 @@ func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription
 		Namespace: namespace,
 		Subsystem: replicationSubsystem,
 		Name:      totalFailedCount,
-		Help:      "Total number of objects which failed replication since server start",
+		Help:      "Total number of objects which failed replication since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -994,7 +994,7 @@ func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescriptio
 		Namespace: namespace,
 		Subsystem: replicationSubsystem,
 		Name:      credentialErrors,
-		Help:      "Total number of replication credential errors since server start",
+		Help:      "Total number of replication credential errors since server uptime",
 		Type:      counterMetric,
 	}
 }
@@ -1044,7 +1044,7 @@ func getClusterReplMaxActiveWorkersCountMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      maxActiveWorkers,
-		Help:      "Maximum number of active replication workers seen since server start",
+		Help:      "Maximum number of active replication workers seen since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1064,7 +1064,7 @@ func getClusterRepLinkLatencyMaxMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      maxLinkLatency,
-		Help:      "Maximum replication link latency in milliseconds seen since server start",
+		Help:      "Maximum replication link latency in milliseconds seen since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1084,7 +1084,7 @@ func getClusterReplAvgQueuedOperationsMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      avgInQueueCount,
-		Help:      "Average number of objects queued for replication since server start",
+		Help:      "Average number of objects queued for replication since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1094,7 +1094,7 @@ func getClusterReplAvgQueuedBytesMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      avgInQueueBytes,
-		Help:      "Average number of bytes queued for replication since server start",
+		Help:      "Average number of bytes queued for replication since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1104,7 +1104,7 @@ func getClusterReplMaxQueuedOperationsMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      maxInQueueCount,
-		Help:      "Maximum number of objects queued for replication since server start",
+		Help:      "Maximum number of objects queued for replication since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1114,7 +1114,7 @@ func getClusterReplMaxQueuedBytesMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      maxInQueueBytes,
-		Help:      "Maximum number of bytes queued for replication since server start",
+		Help:      "Maximum number of bytes queued for replication since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1134,7 +1134,7 @@ func getClusterReplMaxTransferRateMD() MetricDescription {
 		Namespace: clusterMetricNamespace,
 		Subsystem: replicationSubsystem,
 		Name:      maxTransferRate,
-		Help:      "Maximum replication transfer rate in bytes/sec seen since server start",
+		Help:      "Maximum replication transfer rate in bytes/sec seen since server uptime",
 		Type:      gaugeMetric,
 	}
 }
@@ -1454,8 +1454,8 @@ func getHealObjectsTotalMD() MetricDescription {
 		Namespace: healMetricNamespace,
 		Subsystem: objectsSubsystem,
 		Name:      total,
-		Help:      "Objects scanned in current self healing run",
-		Type:      gaugeMetric,
+		Help:      "Objects scanned since server uptime",
+		Type:      counterMetric,
 	}
 }

@@ -1464,8 +1464,8 @@ func getHealObjectsHealTotalMD() MetricDescription {
 		Namespace: healMetricNamespace,
 		Subsystem: objectsSubsystem,
 		Name:      healTotal,
-		Help:      "Objects healed in current self healing run",
-		Type:      gaugeMetric,
+		Help:      "Objects healed since server uptime",
+		Type:      counterMetric,
 	}
 }

@@ -1474,8 +1474,8 @@ func getHealObjectsFailTotalMD() MetricDescription {
 		Namespace: healMetricNamespace,
 		Subsystem: objectsSubsystem,
 		Name:      errorsTotal,
-		Help:      "Objects for which healing failed in current self healing run",
-		Type:      gaugeMetric,
+		Help:      "Objects with healing failed since server uptime",
+		Type:      counterMetric,
 	}
 }

@@ -1484,7 +1484,7 @@ func getHealLastActivityTimeMD() MetricDescription {
 		Namespace: healMetricNamespace,
 		Subsystem: timeSubsystem,
 		Name:      lastActivityTime,
-		Help:      "Time elapsed (in nano seconds) since last self healing activity.",
+		Help:      "Time elapsed (in nano seconds) since last self healing activity",
 		Type:      gaugeMetric,
 	}
 }
@@ -2105,7 +2105,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: scannerSubsystem,
 				Name:      "objects_scanned",
-				Help:      "Total number of unique objects scanned since server start",
+				Help:      "Total number of unique objects scanned since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
@@ -2115,7 +2115,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: scannerSubsystem,
 				Name:      "versions_scanned",
-				Help:      "Total number of object versions scanned since server start",
+				Help:      "Total number of object versions scanned since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
@@ -2125,7 +2125,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: scannerSubsystem,
 				Name:      "directories_scanned",
-				Help:      "Total number of directories scanned since server start",
+				Help:      "Total number of directories scanned since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
@@ -2135,7 +2135,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: scannerSubsystem,
 				Name:      "bucket_scans_started",
-				Help:      "Total number of bucket scans started since server start",
+				Help:      "Total number of bucket scans started since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
@@ -2145,7 +2145,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: scannerSubsystem,
 				Name:      "bucket_scans_finished",
-				Help:      "Total number of bucket scans finished since server start",
+				Help:      "Total number of bucket scans finished since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
@@ -2155,7 +2155,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: ilmSubsystem,
 				Name:      "versions_scanned",
-				Help:      "Total number of object versions checked for ilm actions since server start",
+				Help:      "Total number of object versions checked for ilm actions since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
@@ -2172,7 +2172,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: ilmSubsystem,
 				Name:      MetricName("action_count_" + toSnake(action.String())),
-				Help:      "Total action outcome of lifecycle checks since server start",
+				Help:      "Total action outcome of lifecycle checks since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(v),
@@ -2212,7 +2212,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: iamSubsystem,
 				Name:      "since_last_sync_millis",
-				Help:      "Time (in milliseconds) since last successful IAM data sync.",
+				Help:      "Time (in milliseconds) since last successful IAM data sync",
 				Type:      gaugeMetric,
 			},
 			Value: float64(sinceLastSyncMillis),
@@ -2222,7 +2222,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: iamSubsystem,
 				Name:      "sync_successes",
-				Help:      "Number of successful IAM data syncs since server start.",
+				Help:      "Number of successful IAM data syncs since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
@@ -2232,7 +2232,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
 				Namespace: nodeMetricNamespace,
 				Subsystem: iamSubsystem,
 				Name:      "sync_failures",
-				Help:      "Number of failed IAM data syncs since server start.",
+				Help:      "Number of failed IAM data syncs since server uptime",
 				Type:      counterMetric,
 			},
 			Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
@@ -2667,14 +2667,10 @@ func getFailedItems(seq *healSequence) (m []MetricV2) {
 	items := seq.getHealFailedItemsMap()
 	m = make([]MetricV2, 0, len(items))
 	for k, v := range items {
-		s := strings.Split(k, ",")
 		m = append(m, MetricV2{
-			Description: getHealObjectsFailTotalMD(),
-			VariableLabels: map[string]string{
-				"mount_path":    s[0],
-				"volume_status": s[1],
-			},
-			Value: float64(v),
+			Description:    getHealObjectsFailTotalMD(),
+			VariableLabels: map[string]string{"type": string(k)},
+			Value:          float64(v),
 		})
 	}
 	return
@@ -19,7 +19,6 @@ package cmd

 import (
 	"net/http"
-	"strings"
 	"time"

 	"github.com/minio/minio/internal/auth"
@@ -156,9 +155,9 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
-				"Objects scanned in current self healing run",
+				"Objects scanned since uptime",
 				[]string{"type"}, nil),
-			prometheus.GaugeValue,
+			prometheus.CounterValue,
 			float64(v), string(k),
 		)
 	}
@@ -166,23 +165,20 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
-				"Objects healed in current self healing run",
+				"Objects healed since uptime",
 				[]string{"type"}, nil),
-			prometheus.GaugeValue,
+			prometheus.CounterValue,
 			float64(v), string(k),
 		)
 	}
 	for k, v := range bgSeq.getHealFailedItemsMap() {
-		// healFailedItemsMap stores the endpoint and volume state separated by comma,
-		// split the fields and pass to channel at correct index
-		s := strings.Split(k, ",")
 		ch <- prometheus.MustNewConstMetric(
 			prometheus.NewDesc(
 				prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
-				"Objects for which healing failed in current self healing run",
-				[]string{"mount_path", "volume_status"}, nil),
-			prometheus.GaugeValue,
-			float64(v), s[0], s[1],
+				"Objects for which healing failed since uptime",
+				[]string{"type"}, nil),
+			prometheus.CounterValue,
+			float64(v), string(k),
 		)
 	}
 }
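For reference, the per-type failure counter is exported as a Prometheus const metric with a single "type" label, as in the hunk above. Below is a minimal sketch using the upstream client_golang API; the fully-qualified metric name is a placeholder, not necessarily what BuildFQName produces in MinIO:

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"
        dto "github.com/prometheus/client_model/go"
    )

    func main() {
        // Describe the metric: a counter with one variable label, "type".
        desc := prometheus.NewDesc(
            "heal_objects_heal_failed", // placeholder fully-qualified name
            "Objects for which healing failed since uptime",
            []string{"type"}, nil)

        // Emit one counter sample per heal item type, e.g. 3 failed "object" heals.
        m := prometheus.MustNewConstMetric(desc, prometheus.CounterValue, 3, "object")

        // Inspect the sample that a scrape would see.
        var out dto.Metric
        if err := m.Write(&out); err != nil {
            panic(err)
        }
        fmt.Println(out.GetLabel()[0].GetValue(), out.GetCounter().GetValue()) // object 3
    }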