fix: missing metrics for healed objects (#19392)

Objects that were healed successfully via queueHealTask
in a non-blocking heal weren't being reported
correctly in the metrics; this PR fixes this comprehensively.
This commit is contained in:
Harshavardhana
2024-04-01 23:48:36 -07:00
committed by GitHub
parent ae4fb1b72e
commit 4f660a8eb7
4 changed files with 51 additions and 35 deletions

View File

@@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay
// Add heal state and start sequence
ahs.healSeqMap[hpath] = h
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
clientToken := h.clientToken
if globalIsDistErasure {
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
}
if h.clientToken == bgHealingUUID {
// For background heal do nothing, do not spawn an unnecessary goroutine.
} else {
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
}
b, err := json.Marshal(madmin.HealStartSuccess{
ClientToken: clientToken,
ClientAddress: h.clientAddress,
@@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
return retMap
}
// gethealFailedItemsMap - returns map of all items where heal failed against
// getHealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
@@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
return retMap
}
// countFailed records a failed heal result. For every drive listed in
// res.After.Drives it bumps the failure counter keyed by
// "<endpoint>,<state>", then refreshes the last heal activity timestamp.
// The sequence mutex is held for the whole update.
func (h *healSequence) countFailed(res madmin.HealResultItem) {
	h.mutex.Lock()
	defer h.mutex.Unlock()

	for _, drive := range res.After.Drives {
		// Failures are keyed by endpoint and drive state so that
		// users can take corrective action on the affected drives.
		key := drive.Endpoint + "," + drive.State
		h.healFailedItemsMap[key]++
	}

	h.lastHealActivity = UTCNow()
}
// countHeals bumps a per-type counter for healType and refreshes the last
// heal activity timestamp, all while holding the sequence mutex.
//
// When healed is true the healed-items counter is incremented; otherwise
// the scanned-items counter is incremented.
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
	h.mutex.Lock()
	defer h.mutex.Unlock()

	switch {
	case healed:
		h.healedItemsMap[healType]++
	default:
		h.scannedItemsMap[healType]++
	}

	h.lastHealActivity = UTCNow()
}
// isQuitting - determines if the heal sequence is quitting (due to an
// external signal)
func (h *healSequence) isQuitting() bool {
@@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}
h.mutex.Lock()
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
h.mutex.Unlock()
h.countHeals(healType, false)
if source.noWait {
select {
@@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
return nil
}
h.mutex.Lock()
defer h.mutex.Unlock()
// Progress is not reported in case of background heal processing.
// Instead we increment relevant counter based on the heal result
// for prometheus reporting.
if res.err != nil {
for _, d := range res.result.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
} else {
// Only object type reported for successful healing
h.healedItemsMap[res.result.Type]++
}
// Report caller of any failure
return res.err
}
res.result.Type = healType
if res.err != nil {
// Only report object error
if healType != madmin.HealItemObject {
return res.err
}
res.result.Detail = res.err.Error()
}
return h.pushHealResultItem(res.result)