heal: Reset healing params when a retry is decided (#20285)

Currently, when the healing of a new drive is retried, HealedBuckets is
not reset, which means that the next healing retry will skip those
buckets. This commit fixes that behavior.

Also, the skipped objects counter will now include objects that
are uploaded after the healing is started.
This commit is contained in:
Anis Eleuch 2024-08-22 13:35:43 +01:00 committed by GitHub
parent 2d44c161c7
commit a8f143298f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 46 additions and 16 deletions

View File

@ -148,6 +148,26 @@ func initHealingTracker(disk StorageAPI, healID string) *healingTracker {
return h
}
// resetHealing wipes the progress recorded on the tracker: every healed,
// failed and skipped item/byte counter (including the Resume* counters), the
// list of healed buckets, and the current bucket/object position.
//
// The write lock is held for the whole reset. No other field of the tracker
// is modified here, and nothing is written out by this method.
func (h *healingTracker) resetHealing() {
	h.mu.Lock()
	defer h.mu.Unlock()

	// Drop the recorded position and the healed-bucket list.
	h.Bucket, h.Object = "", ""
	h.HealedBuckets = nil

	// Zero the counters of the current run.
	h.ItemsHealed, h.ItemsFailed, h.ItemsSkipped = 0, 0, 0
	h.BytesDone, h.BytesFailed, h.BytesSkipped = 0, 0, 0

	// Zero the resume counters as well.
	h.ResumeItemsHealed, h.ResumeItemsFailed = 0, 0
	h.ResumeBytesDone, h.ResumeBytesFailed = 0, 0
}
func (h *healingTracker) getLastUpdate() time.Time {
h.mu.RLock()
defer h.mu.RUnlock()
@ -349,6 +369,7 @@ func (h *healingTracker) toHealingDisk() madmin.HealingDisk {
Object: h.Object,
QueuedBuckets: h.QueuedBuckets,
HealedBuckets: h.HealedBuckets,
RetryAttempts: h.RetryAttempts,
ObjectsHealed: h.ItemsHealed, // Deprecated July 2021
ObjectsFailed: h.ItemsFailed, // Deprecated July 2021
@ -482,16 +503,19 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
// if objects have failed healing, we attempt a retry to heal the drive up to 3 times before giving up.
if tracker.ItemsFailed > 0 && tracker.RetryAttempts < 4 {
tracker.RetryAttempts++
bugLogIf(ctx, tracker.update(ctx))
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retrying %s time (healed: %d, skipped: %d, failed: %d).", disk,
humanize.Ordinal(int(tracker.RetryAttempts)), tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
tracker.resetHealing()
bugLogIf(ctx, tracker.update(ctx))
return errRetryHealing
}
if tracker.ItemsFailed > 0 {
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retried %d times (healed: %d, skipped: %d, failed: %d).", disk,
tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
tracker.RetryAttempts, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
} else {
if tracker.RetryAttempts > 0 {
healingLogEvent(ctx, "Healing of drive '%s' is complete, retried %d times (healed: %d, skipped: %d).", disk,

View File

@ -167,6 +167,19 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
return errServerNotInitialized
}
started := tracker.Started
if started.IsZero() || started.Equal(timeSentinel) {
healingLogIf(ctx, fmt.Errorf("unexpected tracker healing start time found: %v", started))
started = time.Time{}
}
// Final tracker update before quitting
defer func() {
tracker.setObject("")
tracker.setBucket("")
healingLogIf(ctx, tracker.update(ctx))
}()
for _, bucket := range healBuckets {
if err := bgSeq.healBucket(objAPI, bucket, true); err != nil {
// Log bucket healing error if any, we shall retry again.
@ -435,13 +448,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
var versionNotFound int
for _, version := range fivs.Versions {
// Ignore a version with a modtime newer than healing start time.
if version.ModTime.After(tracker.Started) {
continue
}
// Apply lifecycle rules on the objects that are expired.
if filterLifecycle(bucket, version.Name, version) {
// Ignore healing a version if:
// - It is uploaded after the drive healing is started
// - An object that is already expired by ILM rule.
if !started.IsZero() && version.ModTime.After(started) || filterLifecycle(bucket, version.Name, version) {
versionNotFound++
if !send(healEntrySkipped(uint64(version.Size))) {
return
@ -556,10 +566,6 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
healingLogIf(ctx, tracker.update(ctx))
}
}
tracker.setObject("")
tracker.setBucket("")
if retErr != nil {
return retErr
}

2
go.mod
View File

@ -51,7 +51,7 @@ require (
github.com/minio/highwayhash v1.0.3
github.com/minio/kms-go/kes v0.3.0
github.com/minio/kms-go/kms v0.4.0
github.com/minio/madmin-go/v3 v3.0.63
github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d
github.com/minio/minio-go/v7 v7.0.75
github.com/minio/mux v1.9.0
github.com/minio/pkg/v3 v3.0.11

4
go.sum
View File

@ -426,8 +426,8 @@ github.com/minio/kms-go/kes v0.3.0 h1:SU8VGVM/Hk9w1OiSby3OatkcojooUqIdDHl6dtM6Nk
github.com/minio/kms-go/kes v0.3.0/go.mod h1:w6DeVT878qEOU3nUrYVy1WOT5H1Ig9hbDIh698NYJKY=
github.com/minio/kms-go/kms v0.4.0 h1:cLPZceEp+05xHotVBaeFJrgL7JcXM4lBy6PU0idkE7I=
github.com/minio/kms-go/kms v0.4.0/go.mod h1:q12CehiIy2qgBnDKq6Q7wmPi2PHSyRVug5DKp0HAVeE=
github.com/minio/madmin-go/v3 v3.0.63 h1:ERJRxEI/FFRh8MDi4Z+3DKe4sONkQ0g+OkNzRpk7qxk=
github.com/minio/madmin-go/v3 v3.0.63/go.mod h1:IFAwr0XMrdsLovxAdCcuq/eoL4nRuMVQQv0iubJANQw=
github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d h1:ma9PAmbEs+TP9BdsbQLO3gUa2nHSzeuQobOCT8BWUpg=
github.com/minio/madmin-go/v3 v3.0.64-0.20240822003756-fe52a32e526d/go.mod h1:IFAwr0XMrdsLovxAdCcuq/eoL4nRuMVQQv0iubJANQw=
github.com/minio/mc v0.0.0-20240815155011-479171e7be9c h1:0tzuJ1nV6oZstqKQ/CwK1dzxNJ/cE38ym4SPi2HsWoY=
github.com/minio/mc v0.0.0-20240815155011-479171e7be9c/go.mod h1:Cr4x7eiMJfOTWwg40Rk3EaOI7i+DUyOAtqLO7x+heiA=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=