count metrics properly for any failures during drive heal (#20193)

This also covers heals triggered via `mc admin heal --set 1 --pool 1`.
This commit is contained in:
Harshavardhana
2024-07-30 22:46:26 -07:00
committed by GitHub
parent 01a8c09920
commit a9dc061d84
2 changed files with 26 additions and 9 deletions

View File

@@ -761,6 +761,15 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
return nil
}
// countOKDrives reports how many entries of drives have a State equal to
// madmin.DriveStateOk. A nil or empty slice yields zero.
countOKDrives := func(drives []madmin.HealDriveInfo) int {
	healthy := 0
	for i := range drives {
		// Skip every drive that is not reported as OK.
		if drives[i].State != madmin.DriveStateOk {
			continue
		}
		healthy++
	}
	return healthy
}
// task queued, now wait for the response.
select {
case res := <-task.respCh:
@@ -781,6 +790,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
if res.err != nil {
res.result.Detail = res.err.Error()
}
if res.result.ParityBlocks > 0 && res.result.DataBlocks > 0 && res.result.DataBlocks > res.result.ParityBlocks {
if got := countOKDrives(res.result.After.Drives); got < res.result.ParityBlocks {
res.result.Detail = fmt.Sprintf("quorum loss - expected %d minimum, got drive states in OK %d", res.result.ParityBlocks, got)
}
}
return h.pushHealResultItem(res.result)
case <-h.ctx.Done():
return nil