mirror of https://github.com/minio/minio.git
fix: missing metrics for healed objects (#19392)
Objects healed successfully via queueHealTask in a non-blocking heal weren't being reported correctly; this PR fixes that comprehensively.
This commit is contained in:
parent
ae4fb1b72e
commit
4f660a8eb7
|
@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay
|
|||
// Add heal state and start sequence
|
||||
ahs.healSeqMap[hpath] = h
|
||||
|
||||
// Launch top-level background heal go-routine
|
||||
go h.healSequenceStart(objAPI)
|
||||
|
||||
clientToken := h.clientToken
|
||||
if globalIsDistErasure {
|
||||
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
|
||||
}
|
||||
|
||||
if h.clientToken == bgHealingUUID {
|
||||
// For background heal do nothing, do not spawn an unnecessary goroutine.
|
||||
} else {
|
||||
// Launch top-level background heal go-routine
|
||||
go h.healSequenceStart(objAPI)
|
||||
}
|
||||
|
||||
b, err := json.Marshal(madmin.HealStartSuccess{
|
||||
ClientToken: clientToken,
|
||||
ClientAddress: h.clientAddress,
|
||||
|
@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
|
|||
return retMap
|
||||
}
|
||||
|
||||
// gethealFailedItemsMap - returns map of all items where heal failed against
|
||||
// getHealFailedItemsMap - returns map of all items where heal failed against
|
||||
// drive endpoint and status
|
||||
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
|
||||
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
|
||||
h.mutex.RLock()
|
||||
defer h.mutex.RUnlock()
|
||||
|
||||
|
@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
|
|||
return retMap
|
||||
}
|
||||
|
||||
func (h *healSequence) countFailed(res madmin.HealResultItem) {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
for _, d := range res.After.Drives {
|
||||
// For failed items we report the endpoint and drive state
|
||||
// This will help users take corrective actions for drives
|
||||
h.healFailedItemsMap[d.Endpoint+","+d.State]++
|
||||
}
|
||||
|
||||
h.lastHealActivity = UTCNow()
|
||||
}
|
||||
|
||||
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
if !healed {
|
||||
h.scannedItemsMap[healType]++
|
||||
} else {
|
||||
h.healedItemsMap[healType]++
|
||||
}
|
||||
|
||||
h.lastHealActivity = UTCNow()
|
||||
}
|
||||
|
||||
// isQuitting - determines if the heal sequence is quitting (due to an
|
||||
// external signal)
|
||||
func (h *healSequence) isQuitting() bool {
|
||||
|
@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||
task.opts.ScanMode = madmin.HealNormalScan
|
||||
}
|
||||
|
||||
h.mutex.Lock()
|
||||
h.scannedItemsMap[healType]++
|
||||
h.lastHealActivity = UTCNow()
|
||||
h.mutex.Unlock()
|
||||
h.countHeals(healType, false)
|
||||
|
||||
if source.noWait {
|
||||
select {
|
||||
|
@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||
return nil
|
||||
}
|
||||
|
||||
h.mutex.Lock()
|
||||
defer h.mutex.Unlock()
|
||||
|
||||
// Progress is not reported in case of background heal processing.
|
||||
// Instead we increment relevant counter based on the heal result
|
||||
// for prometheus reporting.
|
||||
if res.err != nil {
|
||||
for _, d := range res.result.After.Drives {
|
||||
// For failed items we report the endpoint and drive state
|
||||
// This will help users take corrective actions for drives
|
||||
h.healFailedItemsMap[d.Endpoint+","+d.State]++
|
||||
}
|
||||
} else {
|
||||
// Only object type reported for successful healing
|
||||
h.healedItemsMap[res.result.Type]++
|
||||
}
|
||||
|
||||
// Report caller of any failure
|
||||
return res.err
|
||||
}
|
||||
res.result.Type = healType
|
||||
if res.err != nil {
|
||||
// Only report object error
|
||||
if healType != madmin.HealItemObject {
|
||||
return res.err
|
||||
}
|
||||
res.result.Detail = res.err.Error()
|
||||
}
|
||||
return h.pushHealResultItem(res.result)
|
||||
|
|
|
@ -101,16 +101,17 @@ func waitForLowHTTPReq() {
|
|||
}
|
||||
|
||||
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
|
||||
bgSeq := newBgHealSequence()
|
||||
// Run the background healer
|
||||
for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
|
||||
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI)
|
||||
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI, bgSeq)
|
||||
}
|
||||
|
||||
globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence(), objAPI)
|
||||
globalBackgroundHealState.LaunchNewHealSequence(bgSeq, objAPI)
|
||||
}
|
||||
|
||||
// Wait for heal requests and process them
|
||||
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
|
||||
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *healSequence) {
|
||||
for {
|
||||
select {
|
||||
case task, ok := <-h.tasks:
|
||||
|
@ -133,6 +134,15 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
|
|||
}
|
||||
}
|
||||
|
||||
if bgSeq != nil {
|
||||
// We increment relevant counter based on the heal result for prometheus reporting.
|
||||
if err != nil {
|
||||
bgSeq.countFailed(res)
|
||||
} else {
|
||||
bgSeq.countHeals(res.Type, false)
|
||||
}
|
||||
}
|
||||
|
||||
if task.respCh != nil {
|
||||
task.respCh <- healResult{result: res, err: err}
|
||||
}
|
||||
|
|
|
@ -2654,7 +2654,7 @@ func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||
}
|
||||
|
||||
func getFailedItems(seq *healSequence) (m []MetricV2) {
|
||||
items := seq.gethealFailedItemsMap()
|
||||
items := seq.getHealFailedItemsMap()
|
||||
m = make([]MetricV2, 0, len(items))
|
||||
for k, v := range items {
|
||||
s := strings.Split(k, ",")
|
||||
|
|
|
@ -172,7 +172,7 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||
float64(v), string(k),
|
||||
)
|
||||
}
|
||||
for k, v := range bgSeq.gethealFailedItemsMap() {
|
||||
for k, v := range bgSeq.getHealFailedItemsMap() {
|
||||
// healFailedItemsMap stores the endpoint and volume state separated by comma,
|
||||
// split the fields and pass to channel at correct index
|
||||
s := strings.Split(k, ",")
|
||||
|
|
Loading…
Reference in New Issue