count metrics properly for any failures during drive heal (#20193)

or via `mc admin heal --set 1 --pool 1`
2025-11-07 12:52:58 -05:00 · 2024-07-30 22:46:26 -07:00
parent 01a8c09920
commit a9dc061d84
2 changed files with 26 additions and 9 deletions
--- a/cmd/admin-heal-ops.go
+++ b/cmd/admin-heal-ops.go
@@ -761,6 +761,15 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
 		return nil
 	}

+	countOKDrives := func(drives []madmin.HealDriveInfo) (count int) { // tallies the entries of drives that are in the OK state
+		for _, drive := range drives {
+			if drive.State == madmin.DriveStateOk { // only an exact match with DriveStateOk is counted; every other state is skipped
+				count++
+			}
+		}
+		return count // stays 0 when drives is nil/empty or no entry is OK
+	}
+
 	// task queued, now wait for the response.
 	select {
 	case res := <-task.respCh:
@@ -781,6 +790,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
 		if res.err != nil {
 			res.result.Detail = res.err.Error()
 		}
+		if res.result.ParityBlocks > 0 && res.result.DataBlocks > 0 && res.result.DataBlocks > res.result.ParityBlocks {
+			if got := countOKDrives(res.result.After.Drives); got < res.result.ParityBlocks {
+				res.result.Detail = fmt.Sprintf("quorum loss - expected %d minimum, got drive states in OK %d", res.result.ParityBlocks, got)
+			}
+		}
 		return h.pushHealResultItem(res.result)
 	case <-h.ctx.Done():
 		return nil