From 03334121482c2263c0511f1c3ab95e463cec1d6d Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Fri, 6 Jan 2023 05:41:19 +0100 Subject: [PATCH] fix: heal only once per disk per set among multiple disks (#16358) --- cmd/background-newdisks-heal-ops.go | 52 ++++++++++++++++++------- cmd/background-newdisks-heal-ops_gen.go | 34 ++++++++++++++-- cmd/erasure-sets.go | 4 +- cmd/format-erasure.go | 12 +++--- go.mod | 2 +- go.sum | 4 +- 6 files changed, 81 insertions(+), 27 deletions(-) diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go index 9e4b10ee5..2ccbf5afb 100644 --- a/cmd/background-newdisks-heal-ops.go +++ b/cmd/background-newdisks-heal-ops.go @@ -79,6 +79,10 @@ type healingTracker struct { // Filled during heal. HealedBuckets []string + + // ID of the current healing operation + HealID string + // Add future tracking capabilities // Be sure that they are included in toHealingDisk } @@ -112,11 +116,12 @@ func loadHealingTracker(ctx context.Context, disk StorageAPI) (*healingTracker, } // newHealingTracker will create a new healing tracker for the disk. -func newHealingTracker(disk StorageAPI) *healingTracker { +func newHealingTracker(disk StorageAPI, healID string) *healingTracker { diskID, _ := disk.GetDiskID() h := healingTracker{ disk: disk, ID: diskID, + HealID: healID, Path: disk.String(), Endpoint: disk.Endpoint().String(), Started: time.Now().UTC(), @@ -227,6 +232,7 @@ func (h *healingTracker) printTo(writer io.Writer) { func (h *healingTracker) toHealingDisk() madmin.HealingDisk { return madmin.HealingDisk{ ID: h.ID, + HealID: h.HealID, Endpoint: h.Endpoint, PoolIndex: h.PoolIndex, SetIndex: h.SetIndex, @@ -286,8 +292,6 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) { var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second) func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error { - logger.Info(fmt.Sprintf("Proceeding to heal '%s' - 'mc admin heal alias/ --verbose' to check the status.", endpoint)) - disk, format, err := connectEndpoint(endpoint) if err != nil { return fmt.Errorf("Error: %w, %s", err, endpoint) @@ -318,6 +322,20 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint ctx = lkctx.Context() defer locker.Unlock(lkctx) + // Load healing tracker in this disk + tracker, err := loadHealingTracker(ctx, disk) + if err != nil { + // A healing track can be not found when another disk in the same + // erasure set and same healing-id successfully finished healing. + if err == errFileNotFound { + return nil + } + logger.LogIf(ctx, fmt.Errorf("Unable to load a healing tracker on '%s': %w", disk, err)) + tracker = newHealingTracker(disk, mustGetUUID()) + } + + logger.Info(fmt.Sprintf("Proceeding to heal '%s' - 'mc admin heal alias/ --verbose' to check the status.", endpoint)) + buckets, _ := z.ListBuckets(ctx, BucketOptions{}) // Buckets data are dispersed in multiple zones/sets, make // sure to heal all bucket metadata configuration. @@ -340,15 +358,6 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint logger.Info("Healing drive '%v' on %s pool", disk, humanize.Ordinal(poolIdx+1)) } - // Load healing tracker in this disk - tracker, err := loadHealingTracker(ctx, disk) - if err != nil { - // So someone changed the drives underneath, healing tracker missing. - logger.LogIf(ctx, fmt.Errorf("Healing tracker missing on '%s', drive was swapped again on %s pool: %w", - disk, humanize.Ordinal(poolIdx+1), err)) - tracker = newHealingTracker(disk) - } - // Load bucket totals cache := dataUsageCache{} if err := cache.load(ctx, z.serverPools[poolIdx].sets[setIdx], dataUsageCacheName); err == nil { @@ -379,7 +388,24 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint logger.Info("\n") } - logger.LogIf(ctx, tracker.delete(ctx)) + if tracker.HealID == "" { // HealID is empty only before Feb 2023 + logger.LogIf(ctx, tracker.delete(ctx)) + return nil + } + + // Remove .healing.bin from all disks with similar heal-id + for _, disk := range z.serverPools[poolIdx].sets[setIdx].getDisks() { + t, err := loadHealingTracker(ctx, disk) + if err != nil { + if err != errFileNotFound { + logger.LogIf(ctx, err) + } + continue + } + if t.HealID == tracker.HealID { + t.delete(ctx) + } + } return nil } diff --git a/cmd/background-newdisks-heal-ops_gen.go b/cmd/background-newdisks-heal-ops_gen.go index 0b34df90c..5aad8c9f1 100644 --- a/cmd/background-newdisks-heal-ops_gen.go +++ b/cmd/background-newdisks-heal-ops_gen.go @@ -182,6 +182,12 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) { return } } + case "HealID": + z.HealID, err = dc.ReadString() + if err != nil { + err = msgp.WrapError(err, "HealID") + return + } default: err = dc.Skip() if err != nil { @@ -195,9 +201,9 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) { // EncodeMsg implements msgp.Encodable func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) { - // map header, size 22 + // map header, size 23 // write "ID" - err = en.Append(0xde, 0x0, 0x16, 0xa2, 0x49, 0x44) + err = en.Append(0xde, 0x0, 0x17, 0xa2, 0x49, 0x44) if err != nil { return } @@ -430,15 +436,25 @@ func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) { return } } + // write "HealID" + err = en.Append(0xa6, 0x48, 0x65, 0x61, 0x6c, 0x49, 0x44) + if err != nil { + return + } + err = en.WriteString(z.HealID) + if err != nil { + err = msgp.WrapError(err, "HealID") + return + } return } // MarshalMsg implements msgp.Marshaler func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.Require(b, z.Msgsize()) - // map header, size 22 + // map header, size 23 // string "ID" - o = append(o, 0xde, 0x0, 0x16, 0xa2, 0x49, 0x44) + o = append(o, 0xde, 0x0, 0x17, 0xa2, 0x49, 0x44) o = msgp.AppendString(o, z.ID) // string "PoolIndex" o = append(o, 0xa9, 0x50, 0x6f, 0x6f, 0x6c, 0x49, 0x6e, 0x64, 0x65, 0x78) @@ -509,6 +525,9 @@ func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) { for za0002 := range z.HealedBuckets { o = msgp.AppendString(o, z.HealedBuckets[za0002]) } + // string "HealID" + o = append(o, 0xa6, 0x48, 0x65, 0x61, 0x6c, 0x49, 0x44) + o = msgp.AppendString(o, z.HealID) return } @@ -688,6 +707,12 @@ func (z *healingTracker) UnmarshalMsg(bts []byte) (o []byte, err error) { return } } + case "HealID": + z.HealID, bts, err = msgp.ReadStringBytes(bts) + if err != nil { + err = msgp.WrapError(err, "HealID") + return + } default: bts, err = msgp.Skip(bts) if err != nil { @@ -710,5 +735,6 @@ func (z *healingTracker) Msgsize() (s int) { for za0002 := range z.HealedBuckets { s += msgp.StringPrefixSize + len(z.HealedBuckets[za0002]) } + s += 7 + msgp.StringPrefixSize + len(z.HealID) return } diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go index 10d808171..17df60854 100644 --- a/cmd/erasure-sets.go +++ b/cmd/erasure-sets.go @@ -1149,6 +1149,8 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H return res, errNoHealRequired } + formatOpID := mustGetUUID() + // Initialize a new set of set formats which will be written to disk. newFormatSets := newHealFormatSets(refFormat, s.setCount, s.setDriveCount, formats, sErrs) @@ -1170,7 +1172,7 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H if storageDisks[index] == nil || format == nil { continue } - if err := saveFormatErasure(storageDisks[index], format, true); err != nil { + if err := saveFormatErasure(storageDisks[index], format, formatOpID); err != nil { logger.LogIf(ctx, fmt.Errorf("Drive %s failed to write updated 'format.json': %v", storageDisks[index], err)) tmpNewFormats[index] = nil // this disk failed to write new format } diff --git a/cmd/format-erasure.go b/cmd/format-erasure.go index 340565535..ed7d1dd7b 100644 --- a/cmd/format-erasure.go +++ b/cmd/format-erasure.go @@ -347,7 +347,7 @@ func loadFormatErasureAll(storageDisks []StorageAPI, heal bool) ([]*formatErasur return formats, g.Wait() } -func saveFormatErasure(disk StorageAPI, format *formatErasureV3, heal bool) error { +func saveFormatErasure(disk StorageAPI, format *formatErasureV3, healID string) error { if disk == nil || format == nil { return errDiskNotFound } @@ -383,9 +383,9 @@ func saveFormatErasure(disk StorageAPI, format *formatErasureV3, heal bool) erro } disk.SetDiskID(diskID) - if heal { + if healID != "" { ctx := context.Background() - ht := newHealingTracker(disk) + ht := newHealingTracker(disk, healID) return ht.save(ctx) } return nil @@ -541,7 +541,7 @@ func formatErasureFixLocalDeploymentID(endpoints Endpoints, storageDisks []Stora } format.ID = refFormat.ID // Heal the drive if we fixed its deployment ID. - if err := saveFormatErasure(storageDisks[index], format, true); err != nil { + if err := saveFormatErasure(storageDisks[index], format, mustGetUUID()); err != nil { logger.LogIf(GlobalContext, err) return fmt.Errorf("Unable to save format.json, %w", err) } @@ -642,7 +642,7 @@ func saveFormatErasureAll(ctx context.Context, storageDisks []StorageAPI, format if formats[index] == nil { return errDiskNotFound } - return saveFormatErasure(storageDisks[index], formats[index], false) + return saveFormatErasure(storageDisks[index], formats[index], "") }, index) } @@ -722,7 +722,7 @@ func fixFormatErasureV3(storageDisks []StorageAPI, endpoints Endpoints, formats if formats[i].Erasure.This == "" { formats[i].Erasure.This = formats[i].Erasure.Sets[0][i] // Heal the drive if drive has .This empty. - if err := saveFormatErasure(storageDisks[i], formats[i], true); err != nil { + if err := saveFormatErasure(storageDisks[i], formats[i], mustGetUUID()); err != nil { return err } } diff --git a/go.mod b/go.mod index a0f35c950..424806177 100644 --- a/go.mod +++ b/go.mod @@ -49,7 +49,7 @@ require ( github.com/minio/dperf v0.4.2 github.com/minio/highwayhash v1.0.2 github.com/minio/kes v0.22.2 - github.com/minio/madmin-go/v2 v2.0.3 + github.com/minio/madmin-go/v2 v2.0.5 github.com/minio/minio-go/v7 v7.0.45 github.com/minio/pkg v1.5.8 github.com/minio/selfupdate v0.5.0 diff --git a/go.sum b/go.sum index ce2c4c49f..fe4e158d4 100644 --- a/go.sum +++ b/go.sum @@ -770,8 +770,8 @@ github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLT github.com/minio/kes v0.22.2 h1:9NdgTx+TFJco0Pqdrq8WZbrTZVv0ichg+sbPRQiJ2HU= github.com/minio/kes v0.22.2/go.mod h1:J9sD6Pe8obPt7+JXFcznkWaYaj9pBWCfN9U9j//NsNw= github.com/minio/madmin-go v1.6.6/go.mod h1:ATvkBOLiP3av4D++2v1UEHC/QzsGtgXD5kYvvRYzdKs= -github.com/minio/madmin-go/v2 v2.0.3 h1:Q8qco+JrbRIim25tGrs0enVRJGoIMUHfULa5nJoSiqM= -github.com/minio/madmin-go/v2 v2.0.3/go.mod h1:5aFi/VLWBHC2DEFfGIlUmAeJhaF4ZAjuYpEWZFU14Zw= +github.com/minio/madmin-go/v2 v2.0.5 h1:W0dY4enDYdIegTcIQSkdtzvvyQpZtEn6bft5JMb/wYA= +github.com/minio/madmin-go/v2 v2.0.5/go.mod h1:5aFi/VLWBHC2DEFfGIlUmAeJhaF4ZAjuYpEWZFU14Zw= github.com/minio/mc v0.0.0-20221224152138-176072dee43d h1:etzZIWQ3NFrxzwnvjczETWMcgoja9ZKLFLIfQzvpqP8= github.com/minio/mc v0.0.0-20221224152138-176072dee43d/go.mod h1:af4hDQUHwu8az+6TyEKXa2Yd+lvMDVgnc9/kstHPZY8= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=