From a8f143298ff2f5437180a8f9d1448f6d5d020176 Mon Sep 17 00:00:00 2001 From: Anis Eleuch Date: Thu, 22 Aug 2024 13:35:43 +0100 Subject: [PATCH] heal: Reset healing params when a retry is decided (#20285) Currently, retrying the healing of a new drive does not reset HealedBuckets, which means that the next healing retry will skip those buckets. This commit fixes that behavior. Also, the skipped objects counter will now include objects that are uploaded after the healing has started. --- cmd/background-newdisks-heal-ops.go | 28 ++++++++++++++++++++++++++-- cmd/global-heal.go | 28 +++++++++++++++++----------- go.mod | 2 +- go.sum | 4 ++-- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index 56a95e588..c24b41622 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -148,6 +148,26 @@ func initHealingTracker(disk StorageAPI, healID string) *healingTracker { return h } +func (h *healingTracker) resetHealing() { + h.mu.Lock() + defer h.mu.Unlock() + + h.ItemsHealed = 0 + h.ItemsFailed = 0 + h.BytesDone = 0 + h.BytesFailed = 0 + h.ResumeItemsHealed = 0 + h.ResumeItemsFailed = 0 + h.ResumeBytesDone = 0 + h.ResumeBytesFailed = 0 + h.ItemsSkipped = 0 + h.BytesSkipped = 0 + + h.HealedBuckets = nil + h.Object = "" + h.Bucket = "" +} + func (h *healingTracker) getLastUpdate() time.Time { h.mu.RLock() defer h.mu.RUnlock() @@ -349,6 +369,7 @@ func (h *healingTracker) toHealingDisk() madmin.HealingDisk { Object: h.Object, QueuedBuckets: h.QueuedBuckets, HealedBuckets: h.HealedBuckets, + RetryAttempts: h.RetryAttempts, ObjectsHealed: h.ItemsHealed, // Deprecated July 2021 ObjectsFailed: h.ItemsFailed, // Deprecated July 2021 @@ -482,16 +503,19 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint // if objects have failed healing, we attempt a retry to heal the drive upto 3 times before giving up. 
if tracker.ItemsFailed > 0 && tracker.RetryAttempts < 4 { tracker.RetryAttempts++ - bugLogIf(ctx, tracker.update(ctx)) healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retrying %s time (healed: %d, skipped: %d, failed: %d).", disk, humanize.Ordinal(int(tracker.RetryAttempts)), tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) + + tracker.resetHealing() + bugLogIf(ctx, tracker.update(ctx)) + return errRetryHealing } if tracker.ItemsFailed > 0 { healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retried %d times (healed: %d, skipped: %d, failed: %d).", disk, - tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) + tracker.RetryAttempts, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed) } else { if tracker.RetryAttempts > 0 { healingLogEvent(ctx, "Healing of drive '%s' is complete, retried %d times (healed: %d, skipped: %d).", disk, diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 49449e190..9ae58a323 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -167,6 +167,19 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, return errServerNotInitialized } + started := tracker.Started + if started.IsZero() || started.Equal(timeSentinel) { + healingLogIf(ctx, fmt.Errorf("unexpected tracker healing start time found: %v", started)) + started = time.Time{} + } + + // Final tracker update before quitting + defer func() { + tracker.setObject("") + tracker.setBucket("") + healingLogIf(ctx, tracker.update(ctx)) + }() + for _, bucket := range healBuckets { if err := bgSeq.healBucket(objAPI, bucket, true); err != nil { // Log bucket healing error if any, we shall retry again. @@ -435,13 +448,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, var versionNotFound int for _, version := range fivs.Versions { - // Ignore a version with a modtime newer than healing start time. 
- if version.ModTime.After(tracker.Started) { - continue - } - - // Apply lifecycle rules on the objects that are expired. - if filterLifecycle(bucket, version.Name, version) { + // Ignore healing a version if: + // - It is uploaded after the drive healing is started + // - An object that is already expired by ILM rule. + if !started.IsZero() && version.ModTime.After(started) || filterLifecycle(bucket, version.Name, version) { versionNotFound++ if !send(healEntrySkipped(uint64(version.Size))) { return @@ -556,10 +566,6 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, healingLogIf(ctx, tracker.update(ctx)) } } - - tracker.setObject("") - tracker.setBucket("") - if retErr != nil { return retErr } diff --git a/go.mod b/go.mod index 5461bda23..9018318b3 100644 --- a/go.mod +++ b/go.mod @@ -51,7 +51,7 @@ require ( github.com/minio/highwayhash v1.0.3 github.com/minio/kms-go/kes v0.3.0 github.com/minio/kms-go/kms v0.4.0 - github.com/minio/madmin-go/v3 v3.0.63 + github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d github.com/minio/minio-go/v7 v7.0.75 github.com/minio/mux v1.9.0 github.com/minio/pkg/v3 v3.0.11 diff --git a/go.sum b/go.sum index c72aa291c..edb1a365c 100644 --- a/go.sum +++ b/go.sum @@ -426,8 +426,8 @@ github.com/minio/kms-go/kes v0.3.0 h1:SU8VGVM/Hk9w1OiSby3OatkcojooUqIdDHl6dtM6Nk github.com/minio/kms-go/kes v0.3.0/go.mod h1:w6DeVT878qEOU3nUrYVy1WOT5H1Ig9hbDIh698NYJKY= github.com/minio/kms-go/kms v0.4.0 h1:cLPZceEp+05xHotVBaeFJrgL7JcXM4lBy6PU0idkE7I= github.com/minio/kms-go/kms v0.4.0/go.mod h1:q12CehiIy2qgBnDKq6Q7wmPi2PHSyRVug5DKp0HAVeE= -github.com/minio/madmin-go/v3 v3.0.63 h1:ERJRxEI/FFRh8MDi4Z+3DKe4sONkQ0g+OkNzRpk7qxk= -github.com/minio/madmin-go/v3 v3.0.63/go.mod h1:IFAwr0XMrdsLovxAdCcuq/eoL4nRuMVQQv0iubJANQw= +github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d h1:ma9PAmbEs+TP9BdsbQLO3gUa2nHSzeuQobOCT8BWUpg= +github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d/go.mod 
h1:IFAwr0XMrdsLovxAdCcuq/eoL4nRuMVQQv0iubJANQw= github.com/minio/mc v0.0.0-20240815155011-479171e7be9c h1:0tzuJ1nV6oZstqKQ/CwK1dzxNJ/cE38ym4SPi2HsWoY= github.com/minio/mc v0.0.0-20240815155011-479171e7be9c/go.mod h1:Cr4x7eiMJfOTWwg40Rk3EaOI7i+DUyOAtqLO7x+heiA= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=