fix: the race in healing tracker code (#17048)

This commit is contained in:
Anis Eleuch 2023-04-18 22:49:56 +01:00 committed by GitHub
parent 0db34e4b85
commit 224d9a752f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 104 additions and 32 deletions

View File

@ -26,6 +26,7 @@ import (
"os" "os"
"sort" "sort"
"strings" "strings"
"sync"
"time" "time"
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
@ -44,6 +45,7 @@ const (
// healingTracker is used to persist healing information during a heal. // healingTracker is used to persist healing information during a heal.
type healingTracker struct { type healingTracker struct {
disk StorageAPI `msg:"-"` disk StorageAPI `msg:"-"`
mu *sync.RWMutex `msg:"-"`
ID string ID string
PoolIndex int PoolIndex int
@ -112,22 +114,76 @@ func loadHealingTracker(ctx context.Context, disk StorageAPI) (*healingTracker,
} }
h.disk = disk h.disk = disk
h.ID = diskID h.ID = diskID
h.mu = &sync.RWMutex{}
return &h, nil return &h, nil
} }
// newHealingTracker will create a new healing tracker for the disk. // newHealingTracker will create a new healing tracker for the disk.
func newHealingTracker(disk StorageAPI, healID string) *healingTracker { func newHealingTracker() *healingTracker {
diskID, _ := disk.GetDiskID() return &healingTracker{
h := healingTracker{ mu: &sync.RWMutex{},
disk: disk,
ID: diskID,
HealID: healID,
Path: disk.String(),
Endpoint: disk.Endpoint().String(),
Started: time.Now().UTC(),
} }
}
func initHealingTracker(disk StorageAPI, healID string) *healingTracker {
h := newHealingTracker()
diskID, _ := disk.GetDiskID()
h.disk = disk
h.ID = diskID
h.HealID = healID
h.Path = disk.String()
h.Endpoint = disk.Endpoint().String()
h.Started = time.Now().UTC()
h.PoolIndex, h.SetIndex, h.DiskIndex = disk.GetDiskLoc() h.PoolIndex, h.SetIndex, h.DiskIndex = disk.GetDiskLoc()
return &h return h
}
func (h healingTracker) getLastUpdate() time.Time {
h.mu.RLock()
defer h.mu.RUnlock()
return h.LastUpdate
}
func (h healingTracker) getBucket() string {
h.mu.RLock()
defer h.mu.RUnlock()
return h.Bucket
}
func (h *healingTracker) setBucket(bucket string) {
h.mu.Lock()
defer h.mu.Unlock()
h.Bucket = bucket
}
func (h healingTracker) getObject() string {
h.mu.RLock()
defer h.mu.RUnlock()
return h.Object
}
func (h *healingTracker) setObject(object string) {
h.mu.Lock()
defer h.mu.Unlock()
h.Object = object
}
func (h *healingTracker) updateProgress(success bool, bytes uint64) {
h.mu.Lock()
defer h.mu.Unlock()
if success {
h.ItemsHealed++
h.BytesDone += bytes
} else {
h.ItemsFailed++
h.BytesFailed += bytes
}
} }
// update will update the tracker on the disk. // update will update the tracker on the disk.
@ -136,15 +192,18 @@ func (h *healingTracker) update(ctx context.Context) error {
if h.disk.Healing() == nil { if h.disk.Healing() == nil {
return fmt.Errorf("healingTracker: drive %q is not marked as healing", h.ID) return fmt.Errorf("healingTracker: drive %q is not marked as healing", h.ID)
} }
h.mu.Lock()
if h.ID == "" || h.PoolIndex < 0 || h.SetIndex < 0 || h.DiskIndex < 0 { if h.ID == "" || h.PoolIndex < 0 || h.SetIndex < 0 || h.DiskIndex < 0 {
h.ID, _ = h.disk.GetDiskID() h.ID, _ = h.disk.GetDiskID()
h.PoolIndex, h.SetIndex, h.DiskIndex = h.disk.GetDiskLoc() h.PoolIndex, h.SetIndex, h.DiskIndex = h.disk.GetDiskLoc()
} }
h.mu.Unlock()
return h.save(ctx) return h.save(ctx)
} }
// save will unconditionally save the tracker and will be created if not existing. // save will unconditionally save the tracker and will be created if not existing.
func (h *healingTracker) save(ctx context.Context) error { func (h *healingTracker) save(ctx context.Context) error {
h.mu.Lock()
if h.PoolIndex < 0 || h.SetIndex < 0 || h.DiskIndex < 0 { if h.PoolIndex < 0 || h.SetIndex < 0 || h.DiskIndex < 0 {
// Attempt to get location. // Attempt to get location.
if api := newObjectLayerFn(); api != nil { if api := newObjectLayerFn(); api != nil {
@ -155,6 +214,7 @@ func (h *healingTracker) save(ctx context.Context) error {
} }
h.LastUpdate = time.Now().UTC() h.LastUpdate = time.Now().UTC()
htrackerBytes, err := h.MarshalMsg(nil) htrackerBytes, err := h.MarshalMsg(nil)
h.mu.Unlock()
if err != nil { if err != nil {
return err return err
} }
@ -176,6 +236,8 @@ func (h *healingTracker) delete(ctx context.Context) error {
} }
func (h *healingTracker) isHealed(bucket string) bool { func (h *healingTracker) isHealed(bucket string) bool {
h.mu.RLock()
defer h.mu.RUnlock()
for _, v := range h.HealedBuckets { for _, v := range h.HealedBuckets {
if v == bucket { if v == bucket {
return true return true
@ -186,6 +248,9 @@ func (h *healingTracker) isHealed(bucket string) bool {
// resume will reset progress to the numbers at the start of the bucket. // resume will reset progress to the numbers at the start of the bucket.
func (h *healingTracker) resume() { func (h *healingTracker) resume() {
h.mu.Lock()
defer h.mu.Unlock()
h.ItemsHealed = h.ResumeItemsHealed h.ItemsHealed = h.ResumeItemsHealed
h.ItemsFailed = h.ResumeItemsFailed h.ItemsFailed = h.ResumeItemsFailed
h.BytesDone = h.ResumeBytesDone h.BytesDone = h.ResumeBytesDone
@ -195,6 +260,9 @@ func (h *healingTracker) resume() {
// bucketDone should be called when a bucket is done healing. // bucketDone should be called when a bucket is done healing.
// Adds the bucket to the list of healed buckets and updates resume numbers. // Adds the bucket to the list of healed buckets and updates resume numbers.
func (h *healingTracker) bucketDone(bucket string) { func (h *healingTracker) bucketDone(bucket string) {
h.mu.Lock()
defer h.mu.Unlock()
h.ResumeItemsHealed = h.ItemsHealed h.ResumeItemsHealed = h.ItemsHealed
h.ResumeItemsFailed = h.ItemsFailed h.ResumeItemsFailed = h.ItemsFailed
h.ResumeBytesDone = h.BytesDone h.ResumeBytesDone = h.BytesDone
@ -211,6 +279,9 @@ func (h *healingTracker) bucketDone(bucket string) {
// setQueuedBuckets will add buckets, but exclude any that is already in h.HealedBuckets. // setQueuedBuckets will add buckets, but exclude any that is already in h.HealedBuckets.
// Order is preserved. // Order is preserved.
func (h *healingTracker) setQueuedBuckets(buckets []BucketInfo) { func (h *healingTracker) setQueuedBuckets(buckets []BucketInfo) {
h.mu.Lock()
defer h.mu.Unlock()
s := set.CreateStringSet(h.HealedBuckets...) s := set.CreateStringSet(h.HealedBuckets...)
h.QueuedBuckets = make([]string, 0, len(buckets)) h.QueuedBuckets = make([]string, 0, len(buckets))
for _, b := range buckets { for _, b := range buckets {
@ -221,6 +292,9 @@ func (h *healingTracker) setQueuedBuckets(buckets []BucketInfo) {
} }
func (h *healingTracker) printTo(writer io.Writer) { func (h *healingTracker) printTo(writer io.Writer) {
h.mu.RLock()
defer h.mu.RUnlock()
b, err := json.MarshalIndent(h, "", " ") b, err := json.MarshalIndent(h, "", " ")
if err != nil { if err != nil {
writer.Write([]byte(err.Error())) writer.Write([]byte(err.Error()))
@ -230,6 +304,9 @@ func (h *healingTracker) printTo(writer io.Writer) {
// toHealingDisk converts the information to madmin.HealingDisk // toHealingDisk converts the information to madmin.HealingDisk
func (h *healingTracker) toHealingDisk() madmin.HealingDisk { func (h *healingTracker) toHealingDisk() madmin.HealingDisk {
h.mu.RLock()
defer h.mu.RUnlock()
return madmin.HealingDisk{ return madmin.HealingDisk{
ID: h.ID, ID: h.ID,
HealID: h.HealID, HealID: h.HealID,
@ -336,7 +413,7 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
return nil return nil
} }
logger.LogIf(ctx, fmt.Errorf("Unable to load healing tracker on '%s': %w, re-initializing..", disk, err)) logger.LogIf(ctx, fmt.Errorf("Unable to load healing tracker on '%s': %w, re-initializing..", disk, err))
tracker = newHealingTracker(disk, mustGetUUID()) tracker = initHealingTracker(disk, mustGetUUID())
} }
logger.Info(fmt.Sprintf("Healing drive '%s' - 'mc admin heal alias/ --verbose' to check the current status.", endpoint)) logger.Info(fmt.Sprintf("Healing drive '%s' - 'mc admin heal alias/ --verbose' to check the current status.", endpoint))

View File

@ -380,7 +380,7 @@ func saveFormatErasure(disk StorageAPI, format *formatErasureV3, healID string)
disk.SetDiskID(diskID) disk.SetDiskID(diskID)
if healID != "" { if healID != "" {
ctx := context.Background() ctx := context.Background()
ht := newHealingTracker(disk, healID) ht := initHealingTracker(disk, healID)
return ht.save(ctx) return ht.save(ctx)
} }
return nil return nil

View File

@ -187,16 +187,16 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
} }
var forwardTo string var forwardTo string
// If we resume to the same bucket, forward to last known item. // If we resume to the same bucket, forward to last known item.
if tracker.Bucket != "" { if b := tracker.getBucket(); b != "" {
if tracker.Bucket == bucket { if b == bucket {
forwardTo = tracker.Object forwardTo = tracker.getObject()
} else { } else {
// Reset to where last bucket ended if resuming. // Reset to where last bucket ended if resuming.
tracker.resume() tracker.resume()
} }
} }
tracker.Object = "" tracker.setObject("")
tracker.Bucket = bucket tracker.setBucket(bucket)
// Heal current bucket again in case if it is failed // Heal current bucket again in case if it is failed
// in the being of erasure set healing // in the being of erasure set healing
if _, err := er.HealBucket(ctx, bucket, madmin.HealOpts{ if _, err := er.HealBucket(ctx, bucket, madmin.HealOpts{
@ -208,7 +208,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
if serverDebugLog { if serverDebugLog {
console.Debugf(color.Green("healDrive:")+" healing bucket %s content on %s erasure set\n", console.Debugf(color.Green("healDrive:")+" healing bucket %s content on %s erasure set\n",
bucket, humanize.Ordinal(tracker.SetIndex+1)) bucket, humanize.Ordinal(er.setIndex+1))
} }
disks, _ := er.getOnlineDisksWithHealing() disks, _ := er.getOnlineDisksWithHealing()
@ -251,20 +251,14 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
go func() { go func() {
for res := range results { for res := range results {
if res.entryDone { if res.entryDone {
tracker.Object = res.name tracker.setObject(res.name)
if time.Since(tracker.LastUpdate) > time.Minute { if time.Since(tracker.getLastUpdate()) > time.Minute {
logger.LogIf(ctx, tracker.update(ctx)) logger.LogIf(ctx, tracker.update(ctx))
} }
continue continue
} }
if res.success { tracker.updateProgress(res.success, res.bytes)
tracker.ItemsHealed++
tracker.BytesDone += res.bytes
} else {
tracker.ItemsFailed++
tracker.BytesFailed += res.bytes
}
} }
}() }()
@ -420,8 +414,9 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
logger.LogIf(ctx, tracker.update(ctx)) logger.LogIf(ctx, tracker.update(ctx))
} }
} }
tracker.Object = ""
tracker.Bucket = "" tracker.setObject("")
tracker.setBucket("")
return retErr return retErr
} }

View File

@ -365,10 +365,10 @@ func (s *xlStorage) Healing() *healingTracker {
if err != nil { if err != nil {
return nil return nil
} }
var h healingTracker h := newHealingTracker()
_, err = h.UnmarshalMsg(b) _, err = h.UnmarshalMsg(b)
logger.LogIf(GlobalContext, err) logger.LogIf(GlobalContext, err)
return &h return h
} }
// checkODirectDiskSupport asks the disk to write some data // checkODirectDiskSupport asks the disk to write some data