From a9dc061d847277e30c5c6918d6de6f0606f9d285 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 30 Jul 2024 22:46:26 -0700 Subject: [PATCH] count metrics properly for any failures during drive heal (#20193) or via `mc admin heal --set 1 --pool 1` --- cmd/admin-heal-ops.go | 14 ++++++++++++++ cmd/global-heal.go | 21 ++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index fe064840a..4f90f03b8 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -761,6 +761,15 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem return nil } + countOKDrives := func(drives []madmin.HealDriveInfo) (count int) { + for _, drive := range drives { + if drive.State == madmin.DriveStateOk { + count++ + } + } + return count + } + // task queued, now wait for the response. select { case res := <-task.respCh: @@ -781,6 +790,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem if res.err != nil { res.result.Detail = res.err.Error() } + if res.result.ParityBlocks > 0 && res.result.DataBlocks > 0 && res.result.DataBlocks > res.result.ParityBlocks { + if got := countOKDrives(res.result.After.Drives); got < res.result.ParityBlocks { + res.result.Detail = fmt.Sprintf("quorum loss - expected %d minimum, got drive states in OK %d", res.result.ParityBlocks, got) + } + } return h.pushHealResultItem(res.result) case <-h.ctx.Done(): return nil diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 352f4a9af..0a38d98c6 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -150,6 +150,11 @@ type healEntryResult struct { // healErasureSet lists and heals all objects in a specific erasure set func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error { + bgSeq, found := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) + if !found { + return errors.New("no local healing sequence initialized, unable to heal the drive") + } + scanMode := madmin.HealNormalScan // Make sure to copy since `buckets slice` @@ -163,11 +168,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, } for _, bucket := range healBuckets { - _, err := objAPI.HealBucket(ctx, bucket, madmin.HealOpts{ - Recreate: true, - ScanMode: scanMode, - }) - if err != nil { + if err := bgSeq.healBucket(objAPI, bucket, true); err != nil { // Log bucket healing error if any, we shall retry again. healingLogIf(ctx, err) } @@ -264,10 +265,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker.setBucket(bucket) // Heal current bucket again in case if it is failed // in the beginning of erasure set healing - if _, err := objAPI.HealBucket(ctx, bucket, madmin.HealOpts{ - Recreate: true, - ScanMode: scanMode, - }); err != nil { + if err := bgSeq.healBucket(objAPI, bucket, true); err != nil { // Set this such that when we return this function // we let the caller retry this disk again for the // buckets that failed healing. @@ -366,6 +364,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, } return false case results <- result: + bgSeq.countScanned(madmin.HealItemObject) return true } } @@ -416,8 +415,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, return } result = healEntryFailure(0) + bgSeq.countFailed(madmin.HealItemObject) healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w", bucket, entry.name, err)) } else { + bgSeq.countHealed(madmin.HealItemObject) result = healEntrySuccess(uint64(res.ObjectSize)) } @@ -463,8 +464,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, } if versionHealed { + bgSeq.countHealed(madmin.HealItemObject) result = healEntrySuccess(uint64(version.Size)) } else { + bgSeq.countFailed(madmin.HealItemObject) result = healEntryFailure(uint64(version.Size)) if version.VersionID != "" { healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s-v(%s): %w", bucket, version.Name, version.VersionID, err))