mirror of
https://github.com/minio/minio.git
synced 2025-01-23 20:53:18 -05:00
deprecate unexpected healing failed counters (#19705)
Simplify this to avoid verbose metrics and make room for valid metrics to be reported for alerting, etc.
This commit is contained in:
parent
7b7d2ea7d4
commit
b534dc69ab
@ -455,8 +455,8 @@ type healSequence struct {
|
|||||||
// Number of total items healed against item type
|
// Number of total items healed against item type
|
||||||
healedItemsMap map[madmin.HealItemType]int64
|
healedItemsMap map[madmin.HealItemType]int64
|
||||||
|
|
||||||
// Number of total items where healing failed against endpoint and drive state
|
// Number of total items where healing failed against item type
|
||||||
healFailedItemsMap map[string]int64
|
healFailedItemsMap map[madmin.HealItemType]int64
|
||||||
|
|
||||||
// The time of the last scan/heal activity
|
// The time of the last scan/heal activity
|
||||||
lastHealActivity time.Time
|
lastHealActivity time.Time
|
||||||
@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string,
|
|||||||
ctx: ctx,
|
ctx: ctx,
|
||||||
scannedItemsMap: make(map[madmin.HealItemType]int64),
|
scannedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
healedItemsMap: make(map[madmin.HealItemType]int64),
|
healedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
healFailedItemsMap: make(map[string]int64),
|
healFailedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -543,12 +543,12 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
|
|||||||
|
|
||||||
// getHealFailedItemsMap - returns map of all items where heal failed against
|
// getHealFailedItemsMap - returns map of all items where heal failed against
|
||||||
// drive endpoint and status
|
// item type
|
||||||
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
|
func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 {
|
||||||
h.mutex.RLock()
|
h.mutex.RLock()
|
||||||
defer h.mutex.RUnlock()
|
defer h.mutex.RUnlock()
|
||||||
|
|
||||||
// Make a copy before returning the value
|
// Make a copy before returning the value
|
||||||
retMap := make(map[string]int64, len(h.healFailedItemsMap))
|
retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap))
|
||||||
for k, v := range h.healFailedItemsMap {
|
for k, v := range h.healFailedItemsMap {
|
||||||
retMap[k] = v
|
retMap[k] = v
|
||||||
}
|
}
|
||||||
@ -556,29 +556,27 @@ func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
|
|||||||
return retMap
|
return retMap
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *healSequence) countFailed(res madmin.HealResultItem) {
|
func (h *healSequence) countFailed(healType madmin.HealItemType) {
|
||||||
h.mutex.Lock()
|
h.mutex.Lock()
|
||||||
defer h.mutex.Unlock()
|
defer h.mutex.Unlock()
|
||||||
|
|
||||||
for _, d := range res.After.Drives {
|
h.healFailedItemsMap[healType]++
|
||||||
// For failed items we report the endpoint and drive state
|
|
||||||
// This will help users take corrective actions for drives
|
|
||||||
h.healFailedItemsMap[d.Endpoint+","+d.State]++
|
|
||||||
}
|
|
||||||
|
|
||||||
h.lastHealActivity = UTCNow()
|
h.lastHealActivity = UTCNow()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
|
func (h *healSequence) countScanned(healType madmin.HealItemType) {
|
||||||
h.mutex.Lock()
|
h.mutex.Lock()
|
||||||
defer h.mutex.Unlock()
|
defer h.mutex.Unlock()
|
||||||
|
|
||||||
if !healed {
|
|
||||||
h.scannedItemsMap[healType]++
|
h.scannedItemsMap[healType]++
|
||||||
} else {
|
h.lastHealActivity = UTCNow()
|
||||||
h.healedItemsMap[healType]++
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
func (h *healSequence) countHealed(healType madmin.HealItemType) {
|
||||||
|
h.mutex.Lock()
|
||||||
|
defer h.mutex.Unlock()
|
||||||
|
|
||||||
|
h.healedItemsMap[healType]++
|
||||||
h.lastHealActivity = UTCNow()
|
h.lastHealActivity = UTCNow()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||||||
task.opts.ScanMode = madmin.HealNormalScan
|
task.opts.ScanMode = madmin.HealNormalScan
|
||||||
}
|
}
|
||||||
|
|
||||||
h.countHeals(healType, false)
|
h.countScanned(healType)
|
||||||
|
|
||||||
if source.noWait {
|
if source.noWait {
|
||||||
select {
|
select {
|
||||||
@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||||||
// task queued, now wait for the response.
|
// task queued, now wait for the response.
|
||||||
select {
|
select {
|
||||||
case res := <-task.respCh:
|
case res := <-task.respCh:
|
||||||
|
if res.err == nil {
|
||||||
|
h.countHealed(healType)
|
||||||
|
} else {
|
||||||
|
h.countFailed(healType)
|
||||||
|
}
|
||||||
if !h.reportProgress {
|
if !h.reportProgress {
|
||||||
if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
|
if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
|
||||||
return nil
|
return nil
|
||||||
|
@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if bgSeq != nil {
|
|
||||||
// We increment relevant counter based on the heal result for prometheus reporting.
|
|
||||||
if err != nil {
|
|
||||||
bgSeq.countFailed(res)
|
|
||||||
} else {
|
|
||||||
bgSeq.countHeals(res.Type, false)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if task.respCh != nil {
|
if task.respCh != nil {
|
||||||
task.respCh <- healResult{result: res, err: err}
|
task.respCh <- healResult{result: res, err: err}
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// when respCh is not set caller is not waiting but we
|
||||||
|
// update the relevant metrics for them
|
||||||
|
if bgSeq != nil {
|
||||||
|
if err == nil {
|
||||||
|
bgSeq.countHealed(res.Type)
|
||||||
|
} else {
|
||||||
|
bgSeq.countFailed(res.Type)
|
||||||
|
}
|
||||||
|
}
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence {
|
|||||||
reportProgress: false,
|
reportProgress: false,
|
||||||
scannedItemsMap: make(map[madmin.HealItemType]int64),
|
scannedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
healedItemsMap: make(map[madmin.HealItemType]int64),
|
healedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
healFailedItemsMap: make(map[string]int64),
|
healFailedItemsMap: make(map[madmin.HealItemType]int64),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -536,7 +536,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: driveSubsystem,
|
Subsystem: driveSubsystem,
|
||||||
Name: "errors_timeout",
|
Name: "errors_timeout",
|
||||||
Help: "Total number of drive timeout errors since server start",
|
Help: "Total number of drive timeout errors since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -546,7 +546,7 @@ func getNodeDriveIOErrorsMD() MetricDescription {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: driveSubsystem,
|
Subsystem: driveSubsystem,
|
||||||
Name: "errors_ioerror",
|
Name: "errors_ioerror",
|
||||||
Help: "Total number of drive I/O errors since server start",
|
Help: "Total number of drive I/O errors since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -556,7 +556,7 @@ func getNodeDriveAvailabilityErrorsMD() MetricDescription {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: driveSubsystem,
|
Subsystem: driveSubsystem,
|
||||||
Name: "errors_availability",
|
Name: "errors_availability",
|
||||||
Help: "Total number of drive I/O errors, timeouts since server start",
|
Help: "Total number of drive I/O errors, timeouts since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -686,7 +686,7 @@ func getUsageLastScanActivityMD() MetricDescription {
|
|||||||
Namespace: minioMetricNamespace,
|
Namespace: minioMetricNamespace,
|
||||||
Subsystem: usageSubsystem,
|
Subsystem: usageSubsystem,
|
||||||
Name: lastActivityTime,
|
Name: lastActivityTime,
|
||||||
Help: "Time elapsed (in nano seconds) since last scan activity.",
|
Help: "Time elapsed (in nano seconds) since last scan activity",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -856,7 +856,7 @@ func getClusterRepLinkTotalOfflineDurationMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: linkDowntimeTotalDuration,
|
Name: linkDowntimeTotalDuration,
|
||||||
Help: "Total downtime of replication link in seconds since server start",
|
Help: "Total downtime of replication link in seconds since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -916,7 +916,7 @@ func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription {
|
|||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: totalFailedBytes,
|
Name: totalFailedBytes,
|
||||||
Help: "Total number of bytes failed at least once to replicate since server start",
|
Help: "Total number of bytes failed at least once to replicate since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -926,7 +926,7 @@ func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription
|
|||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: totalFailedCount,
|
Name: totalFailedCount,
|
||||||
Help: "Total number of objects which failed replication since server start",
|
Help: "Total number of objects which failed replication since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -994,7 +994,7 @@ func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescriptio
|
|||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: credentialErrors,
|
Name: credentialErrors,
|
||||||
Help: "Total number of replication credential errors since server start",
|
Help: "Total number of replication credential errors since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1044,7 +1044,7 @@ func getClusterReplMaxActiveWorkersCountMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: maxActiveWorkers,
|
Name: maxActiveWorkers,
|
||||||
Help: "Maximum number of active replication workers seen since server start",
|
Help: "Maximum number of active replication workers seen since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1064,7 +1064,7 @@ func getClusterRepLinkLatencyMaxMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: maxLinkLatency,
|
Name: maxLinkLatency,
|
||||||
Help: "Maximum replication link latency in milliseconds seen since server start",
|
Help: "Maximum replication link latency in milliseconds seen since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1084,7 +1084,7 @@ func getClusterReplAvgQueuedOperationsMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: avgInQueueCount,
|
Name: avgInQueueCount,
|
||||||
Help: "Average number of objects queued for replication since server start",
|
Help: "Average number of objects queued for replication since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1094,7 +1094,7 @@ func getClusterReplAvgQueuedBytesMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: avgInQueueBytes,
|
Name: avgInQueueBytes,
|
||||||
Help: "Average number of bytes queued for replication since server start",
|
Help: "Average number of bytes queued for replication since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1104,7 +1104,7 @@ func getClusterReplMaxQueuedOperationsMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: maxInQueueCount,
|
Name: maxInQueueCount,
|
||||||
Help: "Maximum number of objects queued for replication since server start",
|
Help: "Maximum number of objects queued for replication since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1114,7 +1114,7 @@ func getClusterReplMaxQueuedBytesMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: maxInQueueBytes,
|
Name: maxInQueueBytes,
|
||||||
Help: "Maximum number of bytes queued for replication since server start",
|
Help: "Maximum number of bytes queued for replication since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1134,7 +1134,7 @@ func getClusterReplMaxTransferRateMD() MetricDescription {
|
|||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: replicationSubsystem,
|
Subsystem: replicationSubsystem,
|
||||||
Name: maxTransferRate,
|
Name: maxTransferRate,
|
||||||
Help: "Maximum replication transfer rate in bytes/sec seen since server start",
|
Help: "Maximum replication transfer rate in bytes/sec seen since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1454,8 +1454,8 @@ func getHealObjectsTotalMD() MetricDescription {
|
|||||||
Namespace: healMetricNamespace,
|
Namespace: healMetricNamespace,
|
||||||
Subsystem: objectsSubsystem,
|
Subsystem: objectsSubsystem,
|
||||||
Name: total,
|
Name: total,
|
||||||
Help: "Objects scanned in current self healing run",
|
Help: "Objects scanned since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1464,8 +1464,8 @@ func getHealObjectsHealTotalMD() MetricDescription {
|
|||||||
Namespace: healMetricNamespace,
|
Namespace: healMetricNamespace,
|
||||||
Subsystem: objectsSubsystem,
|
Subsystem: objectsSubsystem,
|
||||||
Name: healTotal,
|
Name: healTotal,
|
||||||
Help: "Objects healed in current self healing run",
|
Help: "Objects healed since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1474,8 +1474,8 @@ func getHealObjectsFailTotalMD() MetricDescription {
|
|||||||
Namespace: healMetricNamespace,
|
Namespace: healMetricNamespace,
|
||||||
Subsystem: objectsSubsystem,
|
Subsystem: objectsSubsystem,
|
||||||
Name: errorsTotal,
|
Name: errorsTotal,
|
||||||
Help: "Objects for which healing failed in current self healing run",
|
Help: "Objects with healing failed since server uptime",
|
||||||
Type: gaugeMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1484,7 +1484,7 @@ func getHealLastActivityTimeMD() MetricDescription {
|
|||||||
Namespace: healMetricNamespace,
|
Namespace: healMetricNamespace,
|
||||||
Subsystem: timeSubsystem,
|
Subsystem: timeSubsystem,
|
||||||
Name: lastActivityTime,
|
Name: lastActivityTime,
|
||||||
Help: "Time elapsed (in nano seconds) since last self healing activity.",
|
Help: "Time elapsed (in nano seconds) since last self healing activity",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2105,7 +2105,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: scannerSubsystem,
|
Subsystem: scannerSubsystem,
|
||||||
Name: "objects_scanned",
|
Name: "objects_scanned",
|
||||||
Help: "Total number of unique objects scanned since server start",
|
Help: "Total number of unique objects scanned since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
|
||||||
@ -2115,7 +2115,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: scannerSubsystem,
|
Subsystem: scannerSubsystem,
|
||||||
Name: "versions_scanned",
|
Name: "versions_scanned",
|
||||||
Help: "Total number of object versions scanned since server start",
|
Help: "Total number of object versions scanned since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
|
||||||
@ -2125,7 +2125,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: scannerSubsystem,
|
Subsystem: scannerSubsystem,
|
||||||
Name: "directories_scanned",
|
Name: "directories_scanned",
|
||||||
Help: "Total number of directories scanned since server start",
|
Help: "Total number of directories scanned since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
|
||||||
@ -2135,7 +2135,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: scannerSubsystem,
|
Subsystem: scannerSubsystem,
|
||||||
Name: "bucket_scans_started",
|
Name: "bucket_scans_started",
|
||||||
Help: "Total number of bucket scans started since server start",
|
Help: "Total number of bucket scans started since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
|
||||||
@ -2145,7 +2145,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: scannerSubsystem,
|
Subsystem: scannerSubsystem,
|
||||||
Name: "bucket_scans_finished",
|
Name: "bucket_scans_finished",
|
||||||
Help: "Total number of bucket scans finished since server start",
|
Help: "Total number of bucket scans finished since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
|
||||||
@ -2155,7 +2155,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: ilmSubsystem,
|
Subsystem: ilmSubsystem,
|
||||||
Name: "versions_scanned",
|
Name: "versions_scanned",
|
||||||
Help: "Total number of object versions checked for ilm actions since server start",
|
Help: "Total number of object versions checked for ilm actions since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
|
||||||
@ -2172,7 +2172,7 @@ func getScannerNodeMetrics() *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: ilmSubsystem,
|
Subsystem: ilmSubsystem,
|
||||||
Name: MetricName("action_count_" + toSnake(action.String())),
|
Name: MetricName("action_count_" + toSnake(action.String())),
|
||||||
Help: "Total action outcome of lifecycle checks since server start",
|
Help: "Total action outcome of lifecycle checks since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(v),
|
Value: float64(v),
|
||||||
@ -2212,7 +2212,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: iamSubsystem,
|
Subsystem: iamSubsystem,
|
||||||
Name: "since_last_sync_millis",
|
Name: "since_last_sync_millis",
|
||||||
Help: "Time (in milliseconds) since last successful IAM data sync.",
|
Help: "Time (in milliseconds) since last successful IAM data sync",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
},
|
},
|
||||||
Value: float64(sinceLastSyncMillis),
|
Value: float64(sinceLastSyncMillis),
|
||||||
@ -2222,7 +2222,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: iamSubsystem,
|
Subsystem: iamSubsystem,
|
||||||
Name: "sync_successes",
|
Name: "sync_successes",
|
||||||
Help: "Number of successful IAM data syncs since server start.",
|
Help: "Number of successful IAM data syncs since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
|
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
|
||||||
@ -2232,7 +2232,7 @@ func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: iamSubsystem,
|
Subsystem: iamSubsystem,
|
||||||
Name: "sync_failures",
|
Name: "sync_failures",
|
||||||
Help: "Number of failed IAM data syncs since server start.",
|
Help: "Number of failed IAM data syncs since server uptime",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
},
|
},
|
||||||
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
|
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
|
||||||
@ -2667,13 +2667,9 @@ func getFailedItems(seq *healSequence) (m []MetricV2) {
|
|||||||
items := seq.getHealFailedItemsMap()
|
items := seq.getHealFailedItemsMap()
|
||||||
m = make([]MetricV2, 0, len(items))
|
m = make([]MetricV2, 0, len(items))
|
||||||
for k, v := range items {
|
for k, v := range items {
|
||||||
s := strings.Split(k, ",")
|
|
||||||
m = append(m, MetricV2{
|
m = append(m, MetricV2{
|
||||||
Description: getHealObjectsFailTotalMD(),
|
Description: getHealObjectsFailTotalMD(),
|
||||||
VariableLabels: map[string]string{
|
VariableLabels: map[string]string{"type": string(k)},
|
||||||
"mount_path": s[0],
|
|
||||||
"volume_status": s[1],
|
|
||||||
},
|
|
||||||
Value: float64(v),
|
Value: float64(v),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -19,7 +19,6 @@ package cmd
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/minio/minio/internal/auth"
|
"github.com/minio/minio/internal/auth"
|
||||||
@ -156,9 +155,9 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
|
||||||
"Objects scanned in current self healing run",
|
"Objects scanned since uptime",
|
||||||
[]string{"type"}, nil),
|
[]string{"type"}, nil),
|
||||||
prometheus.GaugeValue,
|
prometheus.CounterValue,
|
||||||
float64(v), string(k),
|
float64(v), string(k),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -166,23 +165,20 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
|
||||||
"Objects healed in current self healing run",
|
"Objects healed since uptime",
|
||||||
[]string{"type"}, nil),
|
[]string{"type"}, nil),
|
||||||
prometheus.GaugeValue,
|
prometheus.CounterValue,
|
||||||
float64(v), string(k),
|
float64(v), string(k),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
for k, v := range bgSeq.getHealFailedItemsMap() {
|
for k, v := range bgSeq.getHealFailedItemsMap() {
|
||||||
// healFailedItemsMap stores the endpoint and volume state separated by comma,
|
|
||||||
// split the fields and pass to channel at correct index
|
|
||||||
s := strings.Split(k, ",")
|
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
|
||||||
"Objects for which healing failed in current self healing run",
|
"Objects for which healing failed since uptime",
|
||||||
[]string{"mount_path", "volume_status"}, nil),
|
[]string{"type"}, nil),
|
||||||
prometheus.GaugeValue,
|
prometheus.CounterValue,
|
||||||
float64(v), s[0], s[1],
|
float64(v), string(k),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user