From 17fd71164cf86eaadeb40d0cfd8d93c4365fa93d Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 19 Nov 2021 08:46:47 -0800 Subject: [PATCH] retry disk replacement healing if listing fails (#13689) Listing can fail, and it is allowed to be retried. Instead of returning right away, record the error and return it at the end, after healing the rest of the buckets and objects. When retrying, skip the buckets that are already marked done by using the tracked buckets. fixes #12972 --- cmd/background-newdisks-heal-ops.go | 12 +++---- cmd/global-heal.go | 53 +++++++++++++++++------------ 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index c45d77b5f..4d128c518 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -18,12 +18,12 @@ package cmd import ( - "bytes" "context" "encoding/json" "errors" "fmt" "io" + "os" "sort" "strings" "sync" @@ -412,7 +412,8 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq // So someone changed the drives underneath, healing tracker missing. 
tracker, err := loadHealingTracker(ctx, disk) if err != nil { - logger.Info("Healing tracker missing on '%s', disk was swapped again on %s pool", disk, humanize.Ordinal(i+1)) + logger.Info("Healing tracker missing on '%s', disk was swapped again on %s pool", + disk, humanize.Ordinal(i+1)) tracker = newHealingTracker(disk) } @@ -434,16 +435,15 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq return } - err = z.serverPools[i].sets[setIndex].healErasureSet(ctx, buckets, tracker) + err = z.serverPools[i].sets[setIndex].healErasureSet(ctx, tracker.QueuedBuckets, tracker) if err != nil { logger.LogIf(ctx, err) continue } logger.Info("Healing disk '%s' on %s pool complete", disk, humanize.Ordinal(i+1)) - var buf bytes.Buffer - tracker.printTo(&buf) - logger.Info("Summary:\n%s", buf.String()) + logger.Info("Summary:\n") + tracker.printTo(os.Stdout) logger.LogIf(ctx, tracker.delete(ctx)) // Only upon success pop the healed disk. diff --git a/cmd/global-heal.go b/cmd/global-heal.go index c7826e7b1..ade64e855 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -23,6 +23,7 @@ import ( "sort" "time" + "github.com/dustin/go-humanize" "github.com/minio/madmin-go" "github.com/minio/minio/internal/color" "github.com/minio/minio/internal/config/storageclass" @@ -163,23 +164,20 @@ func mustGetHealSequence(ctx context.Context) *healSequence { } // healErasureSet lists and heals all objects in a specific erasure set -func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketInfo, tracker *healingTracker) error { +func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error { bgSeq := mustGetHealSequence(ctx) - buckets = append(buckets, BucketInfo{ - Name: pathJoin(minioMetaBucket, minioConfigPrefix), - }) - scanMode := globalHealConfig.ScanMode() + var retErr error // Heal all buckets with all objects for _, bucket := range buckets { - if tracker.isHealed(bucket.Name) { + if 
tracker.isHealed(bucket) { continue } var forwardTo string // If we resume to the same bucket, forward to last known item. if tracker.Bucket != "" { - if tracker.Bucket == bucket.Name { + if tracker.Bucket == bucket { forwardTo = tracker.Object } else { // Reset to where last bucket ended if resuming. @@ -187,16 +185,18 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn } } tracker.Object = "" - tracker.Bucket = bucket.Name + tracker.Bucket = bucket // Heal current bucket - if _, err := er.HealBucket(ctx, bucket.Name, madmin.HealOpts{ + if _, err := er.HealBucket(ctx, bucket, madmin.HealOpts{ ScanMode: scanMode, }); err != nil { logger.LogIf(ctx, err) + continue } if serverDebugLog { - console.Debugf(color.Green("healDisk:")+" healing bucket %s content on erasure set %d\n", bucket.Name, tracker.SetIndex+1) + console.Debugf(color.Green("healDisk:")+" healing bucket %s content on %s erasure set\n", + bucket, humanize.Ordinal(tracker.SetIndex+1)) } disks, _ := er.getOnlineDisksWithHealing() @@ -204,7 +204,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn // all disks are healing in this set, this is allowed // so we simply proceed to next bucket, marking the bucket // as done as there are no objects to heal. 
- tracker.bucketDone(bucket.Name) + tracker.bucketDone(bucket) logger.LogIf(ctx, tracker.update(ctx)) continue } @@ -221,7 +221,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn // We might land at .metacache, .trash, .multipart // no need to heal them skip, only when bucket // is '.minio.sys' - if bucket.Name == minioMetaBucket { + if bucket == minioMetaBucket { if wildcard.Match("buckets/*/.metacache/*", entry.name) { return } @@ -233,19 +233,25 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn } } - fivs, err := entry.fileInfoVersions(bucket.Name) + fivs, err := entry.fileInfoVersions(bucket) if err != nil { err := bgSeq.queueHealTask(healSource{ - bucket: bucket.Name, + bucket: bucket, object: entry.name, versionID: "", }, madmin.HealItemObject) - logger.LogIf(ctx, err) + if err != nil { + tracker.ItemsFailed++ + logger.LogIf(ctx, err) + } else { + tracker.ItemsHealed++ + } + bgSeq.logHeal(madmin.HealItemObject) return } for _, version := range fivs.Versions { - if _, err := er.HealObject(ctx, bucket.Name, version.Name, + if _, err := er.HealObject(ctx, bucket, version.Name, version.VersionID, madmin.HealOpts{ ScanMode: scanMode, Remove: healDeleteDangling, @@ -273,12 +279,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn resolver := metadataResolutionParams{ dirQuorum: 1, objQuorum: 1, - bucket: bucket.Name, + bucket: bucket, } err := listPathRaw(ctx, listPathRawOptions{ disks: disks, - bucket: bucket.Name, + bucket: bucket, recursive: true, forwardTo: forwardTo, minDisks: 1, @@ -297,8 +303,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn }) if err != nil { + // Set this such that when we return this function + // we let the caller retry this disk again for the + // buckets it failed to list. 
+ retErr = err logger.LogIf(ctx, err) - return err + continue } select { @@ -306,15 +316,14 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn case <-ctx.Done(): return ctx.Err() default: - logger.LogIf(ctx, err) - tracker.bucketDone(bucket.Name) + tracker.bucketDone(bucket) logger.LogIf(ctx, tracker.update(ctx)) } } tracker.Object = "" tracker.Bucket = "" - return nil + return retErr } // healObject heals given object path in deep to fix bitrot.