mirror of
https://github.com/minio/minio.git
synced 2025-03-31 09:43:43 -04:00
fix: missing metrics for healed objects (#19392)
Objects healed successfully via queueHealTask in a non-blocking heal weren't being reported correctly; this PR fixes this comprehensively.
This commit is contained in:
parent
ae4fb1b72e
commit
4f660a8eb7
@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay
|
|||||||
// Add heal state and start sequence
|
// Add heal state and start sequence
|
||||||
ahs.healSeqMap[hpath] = h
|
ahs.healSeqMap[hpath] = h
|
||||||
|
|
||||||
// Launch top-level background heal go-routine
|
|
||||||
go h.healSequenceStart(objAPI)
|
|
||||||
|
|
||||||
clientToken := h.clientToken
|
clientToken := h.clientToken
|
||||||
if globalIsDistErasure {
|
if globalIsDistErasure {
|
||||||
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
|
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if h.clientToken == bgHealingUUID {
|
||||||
|
// For background heal do nothing, do not spawn an unnecessary goroutine.
|
||||||
|
} else {
|
||||||
|
// Launch top-level background heal go-routine
|
||||||
|
go h.healSequenceStart(objAPI)
|
||||||
|
}
|
||||||
|
|
||||||
b, err := json.Marshal(madmin.HealStartSuccess{
|
b, err := json.Marshal(madmin.HealStartSuccess{
|
||||||
ClientToken: clientToken,
|
ClientToken: clientToken,
|
||||||
ClientAddress: h.clientAddress,
|
ClientAddress: h.clientAddress,
|
||||||
@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
|
|||||||
return retMap
|
return retMap
|
||||||
}
|
}
|
||||||
|
|
||||||
// gethealFailedItemsMap - returns map of all items where heal failed against
|
// getHealFailedItemsMap - returns map of all items where heal failed against
|
||||||
// drive endpoint and status
|
// drive endpoint and status
|
||||||
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
|
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
|
||||||
h.mutex.RLock()
|
h.mutex.RLock()
|
||||||
defer h.mutex.RUnlock()
|
defer h.mutex.RUnlock()
|
||||||
|
|
||||||
@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
|
|||||||
return retMap
|
return retMap
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *healSequence) countFailed(res madmin.HealResultItem) {
|
||||||
|
h.mutex.Lock()
|
||||||
|
defer h.mutex.Unlock()
|
||||||
|
|
||||||
|
for _, d := range res.After.Drives {
|
||||||
|
// For failed items we report the endpoint and drive state
|
||||||
|
// This will help users take corrective actions for drives
|
||||||
|
h.healFailedItemsMap[d.Endpoint+","+d.State]++
|
||||||
|
}
|
||||||
|
|
||||||
|
h.lastHealActivity = UTCNow()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
|
||||||
|
h.mutex.Lock()
|
||||||
|
defer h.mutex.Unlock()
|
||||||
|
|
||||||
|
if !healed {
|
||||||
|
h.scannedItemsMap[healType]++
|
||||||
|
} else {
|
||||||
|
h.healedItemsMap[healType]++
|
||||||
|
}
|
||||||
|
|
||||||
|
h.lastHealActivity = UTCNow()
|
||||||
|
}
|
||||||
|
|
||||||
// isQuitting - determines if the heal sequence is quitting (due to an
|
// isQuitting - determines if the heal sequence is quitting (due to an
|
||||||
// external signal)
|
// external signal)
|
||||||
func (h *healSequence) isQuitting() bool {
|
func (h *healSequence) isQuitting() bool {
|
||||||
@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||||||
task.opts.ScanMode = madmin.HealNormalScan
|
task.opts.ScanMode = madmin.HealNormalScan
|
||||||
}
|
}
|
||||||
|
|
||||||
h.mutex.Lock()
|
h.countHeals(healType, false)
|
||||||
h.scannedItemsMap[healType]++
|
|
||||||
h.lastHealActivity = UTCNow()
|
|
||||||
h.mutex.Unlock()
|
|
||||||
|
|
||||||
if source.noWait {
|
if source.noWait {
|
||||||
select {
|
select {
|
||||||
@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
h.mutex.Lock()
|
|
||||||
defer h.mutex.Unlock()
|
|
||||||
|
|
||||||
// Progress is not reported in case of background heal processing.
|
|
||||||
// Instead we increment relevant counter based on the heal result
|
|
||||||
// for prometheus reporting.
|
|
||||||
if res.err != nil {
|
|
||||||
for _, d := range res.result.After.Drives {
|
|
||||||
// For failed items we report the endpoint and drive state
|
|
||||||
// This will help users take corrective actions for drives
|
|
||||||
h.healFailedItemsMap[d.Endpoint+","+d.State]++
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Only object type reported for successful healing
|
|
||||||
h.healedItemsMap[res.result.Type]++
|
|
||||||
}
|
|
||||||
|
|
||||||
// Report caller of any failure
|
// Report caller of any failure
|
||||||
return res.err
|
return res.err
|
||||||
}
|
}
|
||||||
res.result.Type = healType
|
res.result.Type = healType
|
||||||
if res.err != nil {
|
if res.err != nil {
|
||||||
// Only report object error
|
|
||||||
if healType != madmin.HealItemObject {
|
|
||||||
return res.err
|
|
||||||
}
|
|
||||||
res.result.Detail = res.err.Error()
|
res.result.Detail = res.err.Error()
|
||||||
}
|
}
|
||||||
return h.pushHealResultItem(res.result)
|
return h.pushHealResultItem(res.result)
|
||||||
|
@ -101,16 +101,17 @@ func waitForLowHTTPReq() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
|
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
|
||||||
|
bgSeq := newBgHealSequence()
|
||||||
// Run the background healer
|
// Run the background healer
|
||||||
for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
|
for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
|
||||||
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI)
|
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI, bgSeq)
|
||||||
}
|
}
|
||||||
|
|
||||||
globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence(), objAPI)
|
globalBackgroundHealState.LaunchNewHealSequence(bgSeq, objAPI)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for heal requests and process them
|
// Wait for heal requests and process them
|
||||||
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
|
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *healSequence) {
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case task, ok := <-h.tasks:
|
case task, ok := <-h.tasks:
|
||||||
@ -133,6 +134,15 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if bgSeq != nil {
|
||||||
|
// We increment relevant counter based on the heal result for prometheus reporting.
|
||||||
|
if err != nil {
|
||||||
|
bgSeq.countFailed(res)
|
||||||
|
} else {
|
||||||
|
bgSeq.countHeals(res.Type, false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if task.respCh != nil {
|
if task.respCh != nil {
|
||||||
task.respCh <- healResult{result: res, err: err}
|
task.respCh <- healResult{result: res, err: err}
|
||||||
}
|
}
|
||||||
|
@ -2654,7 +2654,7 @@ func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func getFailedItems(seq *healSequence) (m []MetricV2) {
|
func getFailedItems(seq *healSequence) (m []MetricV2) {
|
||||||
items := seq.gethealFailedItemsMap()
|
items := seq.getHealFailedItemsMap()
|
||||||
m = make([]MetricV2, 0, len(items))
|
m = make([]MetricV2, 0, len(items))
|
||||||
for k, v := range items {
|
for k, v := range items {
|
||||||
s := strings.Split(k, ",")
|
s := strings.Split(k, ",")
|
||||||
|
@ -172,7 +172,7 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
float64(v), string(k),
|
float64(v), string(k),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
for k, v := range bgSeq.gethealFailedItemsMap() {
|
for k, v := range bgSeq.getHealFailedItemsMap() {
|
||||||
// healFailedItemsMap stores the endpoint and volume state separated by comma,
|
// healFailedItemsMap stores the endpoint and volume state separated by comma,
|
||||||
// split the fields and pass to channel at correct index
|
// split the fields and pass to channel at correct index
|
||||||
s := strings.Split(k, ",")
|
s := strings.Split(k, ",")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user