retry disk replacement healing if listing fails (#13689)

Listing can fail and is allowed to be retried. Instead of returning right away,
heal the rest of the buckets and objects and return any error at the end.
When retrying, skip the buckets the tracker has already marked done.

fixes #12972
Harshavardhana 2021-11-19 08:46:47 -08:00 committed by GitHub
parent 81d19156e9
commit 17fd71164c
2 changed files with 37 additions and 28 deletions
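
The shape of the change, in isolation: heal what you can, remember what failed, and let the next pass skip what already succeeded. The sketch below is a minimal, self-contained illustration of that pattern and does not use MinIO's actual types; tracker, healBuckets, list, and the bucket names are hypothetical stand-ins.

package main

import (
	"errors"
	"fmt"
)

// tracker is a hypothetical stand-in for MinIO's healing tracker: it
// remembers which buckets finished healing so a retry can skip them.
type tracker struct {
	healed map[string]bool
}

func (t *tracker) isHealed(b string) bool { return t.healed[b] }
func (t *tracker) bucketDone(b string)    { t.healed[b] = true }

// healBuckets heals every bucket it can. A listing failure no longer
// aborts the pass; it is remembered and returned at the end so the
// caller can retry only the buckets that were not marked done.
func healBuckets(buckets []string, t *tracker, list func(string) ([]string, error)) error {
	var retErr error
	for _, bucket := range buckets {
		if t.isHealed(bucket) {
			continue // already done in a previous attempt
		}
		objects, err := list(bucket)
		if err != nil {
			retErr = err // keep going, report at the end
			continue
		}
		for _, obj := range objects {
			fmt.Println("healing", bucket+"/"+obj)
		}
		t.bucketDone(bucket)
	}
	return retErr
}

func main() {
	t := &tracker{healed: map[string]bool{}}
	flaky := true
	list := func(bucket string) ([]string, error) {
		if bucket == "bucket-b" && flaky {
			flaky = false // fail the first listing of bucket-b only
			return nil, errors.New("listing failed")
		}
		return []string{"obj1", "obj2"}, nil
	}
	buckets := []string{"bucket-a", "bucket-b", "bucket-c"}
	for healBuckets(buckets, t, list) != nil {
		fmt.Println("retrying buckets that are not yet marked done")
	}
}

On the first pass the failing bucket is skipped but remembered; the retry loop only revisits buckets the tracker has not marked done.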

@@ -18,12 +18,12 @@
 package cmd
 
 import (
-	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"sort"
 	"strings"
 	"sync"
@@ -412,7 +412,8 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq
 		// So someone changed the drives underneath, healing tracker missing.
 		tracker, err := loadHealingTracker(ctx, disk)
 		if err != nil {
-			logger.Info("Healing tracker missing on '%s', disk was swapped again on %s pool", disk, humanize.Ordinal(i+1))
+			logger.Info("Healing tracker missing on '%s', disk was swapped again on %s pool",
+				disk, humanize.Ordinal(i+1))
 			tracker = newHealingTracker(disk)
 		}
@@ -434,16 +435,15 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq
 			return
 		}
 
-		err = z.serverPools[i].sets[setIndex].healErasureSet(ctx, buckets, tracker)
+		err = z.serverPools[i].sets[setIndex].healErasureSet(ctx, tracker.QueuedBuckets, tracker)
 		if err != nil {
 			logger.LogIf(ctx, err)
 			continue
 		}
 
 		logger.Info("Healing disk '%s' on %s pool complete", disk, humanize.Ordinal(i+1))
-		var buf bytes.Buffer
-		tracker.printTo(&buf)
-		logger.Info("Summary:\n%s", buf.String())
+		logger.Info("Summary:\n")
+		tracker.printTo(os.Stdout)
 
 		logger.LogIf(ctx, tracker.delete(ctx))
 
 		// Only upon success pop the healed disk.
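
A side note on the bytes-to-os import swap in this file: printTo accepts any io.Writer (it is called above with a *bytes.Buffer before and os.Stdout after), so the summary no longer needs to be staged in a buffer and re-logged. A minimal sketch of that io.Writer pattern, with a hypothetical summary type standing in for the real healing tracker:

package main

import (
	"fmt"
	"io"
	"os"
	"strings"
)

// summary is a hypothetical stand-in for the healing tracker's state.
type summary struct {
	healed, failed int
}

// printTo writes to any io.Writer, so the caller picks the sink:
// os.Stdout for the console summary, or a buffer in tests.
func (s summary) printTo(w io.Writer) {
	fmt.Fprintf(w, "objects healed: %d, failed: %d\n", s.healed, s.failed)
}

func main() {
	s := summary{healed: 42, failed: 1}
	s.printTo(os.Stdout) // direct, no intermediate bytes.Buffer round-trip

	var sb strings.Builder // any other writer works just as well
	s.printTo(&sb)
	fmt.Print(sb.String())
}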

@@ -23,6 +23,7 @@ import (
 	"sort"
 	"time"
 
+	"github.com/dustin/go-humanize"
 	"github.com/minio/madmin-go"
 	"github.com/minio/minio/internal/color"
 	"github.com/minio/minio/internal/config/storageclass"
@@ -163,23 +164,20 @@ func mustGetHealSequence(ctx context.Context) *healSequence {
 }
 
 // healErasureSet lists and heals all objects in a specific erasure set
-func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketInfo, tracker *healingTracker) error {
+func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error {
 	bgSeq := mustGetHealSequence(ctx)
-	buckets = append(buckets, BucketInfo{
-		Name: pathJoin(minioMetaBucket, minioConfigPrefix),
-	})
 	scanMode := globalHealConfig.ScanMode()
 
+	var retErr error
 	// Heal all buckets with all objects
 	for _, bucket := range buckets {
-		if tracker.isHealed(bucket.Name) {
+		if tracker.isHealed(bucket) {
			continue
 		}
 		var forwardTo string
 		// If we resume to the same bucket, forward to last known item.
 		if tracker.Bucket != "" {
-			if tracker.Bucket == bucket.Name {
+			if tracker.Bucket == bucket {
 				forwardTo = tracker.Object
 			} else {
 				// Reset to where last bucket ended if resuming.
@@ -187,16 +185,18 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 			}
 		}
 		tracker.Object = ""
-		tracker.Bucket = bucket.Name
+		tracker.Bucket = bucket
 		// Heal current bucket
-		if _, err := er.HealBucket(ctx, bucket.Name, madmin.HealOpts{
+		if _, err := er.HealBucket(ctx, bucket, madmin.HealOpts{
 			ScanMode: scanMode,
 		}); err != nil {
 			logger.LogIf(ctx, err)
+			continue
 		}
 
 		if serverDebugLog {
-			console.Debugf(color.Green("healDisk:")+" healing bucket %s content on erasure set %d\n", bucket.Name, tracker.SetIndex+1)
+			console.Debugf(color.Green("healDisk:")+" healing bucket %s content on %s erasure set\n",
+				bucket, humanize.Ordinal(tracker.SetIndex+1))
 		}
 
 		disks, _ := er.getOnlineDisksWithHealing()
@@ -204,7 +204,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 			// all disks are healing in this set, this is allowed
 			// so we simply proceed to next bucket, marking the bucket
 			// as done as there are no objects to heal.
-			tracker.bucketDone(bucket.Name)
+			tracker.bucketDone(bucket)
 			logger.LogIf(ctx, tracker.update(ctx))
 			continue
 		}
@@ -221,7 +221,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 			// We might land at .metacache, .trash, .multipart
 			// no need to heal them skip, only when bucket
 			// is '.minio.sys'
-			if bucket.Name == minioMetaBucket {
+			if bucket == minioMetaBucket {
 				if wildcard.Match("buckets/*/.metacache/*", entry.name) {
 					return
 				}
@@ -233,19 +233,25 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 				}
 			}
 
-			fivs, err := entry.fileInfoVersions(bucket.Name)
+			fivs, err := entry.fileInfoVersions(bucket)
 			if err != nil {
 				err := bgSeq.queueHealTask(healSource{
-					bucket:    bucket.Name,
+					bucket:    bucket,
 					object:    entry.name,
 					versionID: "",
 				}, madmin.HealItemObject)
-				logger.LogIf(ctx, err)
+				if err != nil {
+					tracker.ItemsFailed++
+					logger.LogIf(ctx, err)
+				} else {
+					tracker.ItemsHealed++
+				}
+				bgSeq.logHeal(madmin.HealItemObject)
 				return
 			}
 
 			for _, version := range fivs.Versions {
-				if _, err := er.HealObject(ctx, bucket.Name, version.Name,
+				if _, err := er.HealObject(ctx, bucket, version.Name,
 					version.VersionID, madmin.HealOpts{
 						ScanMode: scanMode,
 						Remove:   healDeleteDangling,
@@ -273,12 +279,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 		resolver := metadataResolutionParams{
 			dirQuorum: 1,
 			objQuorum: 1,
-			bucket:    bucket.Name,
+			bucket:    bucket,
 		}
 
 		err := listPathRaw(ctx, listPathRawOptions{
 			disks:     disks,
-			bucket:    bucket.Name,
+			bucket:    bucket,
 			recursive: true,
 			forwardTo: forwardTo,
 			minDisks:  1,
@@ -297,8 +303,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 		})
 		if err != nil {
+			// Set this such that when we return this function
+			// we let the caller retry this disk again for the
+			// buckets it failed to list.
+			retErr = err
 			logger.LogIf(ctx, err)
-			return err
+			continue
 		}
 
 		select {
@@ -306,15 +316,14 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []BucketIn
 		case <-ctx.Done():
 			return ctx.Err()
 		default:
-			logger.LogIf(ctx, err)
-			tracker.bucketDone(bucket.Name)
+			tracker.bucketDone(bucket)
 			logger.LogIf(ctx, tracker.update(ctx))
 		}
 	}
 
 	tracker.Object = ""
 	tracker.Bucket = ""
-	return nil
+	return retErr
 }
 
 // healObject heals given object path in deep to fix bitrot.
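
Finally, the loop's exit path above checks for cancellation once per bucket with a non-blocking select: a cancelled context ends healing immediately, otherwise the default branch marks the bucket done in the tracker. A standalone sketch of that pattern, assuming a simplified processBuckets in place of healErasureSet:

package main

import (
	"context"
	"fmt"
)

// processBuckets is an illustrative loop: after each bucket it checks the
// context without blocking; if cancelled it stops, otherwise it records
// progress (the analogue of tracker.bucketDone in the real code).
func processBuckets(ctx context.Context, buckets []string) error {
	for _, bucket := range buckets {
		// ... heal the bucket's objects here ...
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			fmt.Println("bucket done:", bucket)
		}
	}
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // simulate a shutdown: the loop stops at its next check
	if err := processBuckets(ctx, []string{"a", "b", "c"}); err != nil {
		fmt.Println("stopped:", err)
	}
}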