Parallelize new disks healing of different erasure sets (#15112)

- Always reformat all disks when a new disk is detected; this ensures
  that new uploads are written to the fresh disks
- Always heal all buckets first when healing of an erasure set starts
- Use a lock to prevent two disks that belong to different nodes but to
  the same erasure set from being healed in parallel
- Heal different erasure sets in parallel (see the sketch after this list)

Bonus:
- Avoid logging errUnformattedDisk when a fresh disk is inserted but not
  yet detected by the healing mechanism (10-second lag)
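A minimal, self-contained Go sketch of the new concurrency pattern (hypothetical types and helper names, not MinIO's actual API): each fresh disk is healed in its own goroutine, and a lock shared per (pool, set) pair keeps two disks of the same erasure set from healing at once while different sets proceed in parallel. The real change takes a cluster-wide namespace lock keyed by pool and set index, so the guarantee also holds across nodes.

package main

import (
	"fmt"
	"sync"
	"time"
)

// diskToHeal is a hypothetical stand-in for a local endpoint waiting to be healed.
type diskToHeal struct {
	name    string
	poolIdx int
	setIdx  int
}

// setLocker hands out one mutex per (pool, set) pair.
type setLocker struct {
	mu    sync.Mutex
	locks map[string]*sync.Mutex
}

func (s *setLocker) lockFor(pool, set int) *sync.Mutex {
	s.mu.Lock()
	defer s.mu.Unlock()
	key := fmt.Sprintf("%d/%d", pool, set)
	if s.locks[key] == nil {
		s.locks[key] = &sync.Mutex{}
	}
	return s.locks[key]
}

func main() {
	disks := []diskToHeal{
		{name: "disk-1", poolIdx: 0, setIdx: 0},
		{name: "disk-2", poolIdx: 0, setIdx: 0}, // same set as disk-1: waits for its lock
		{name: "disk-3", poolIdx: 0, setIdx: 1}, // different set: heals in parallel
	}

	locker := &setLocker{locks: map[string]*sync.Mutex{}}

	var wg sync.WaitGroup
	for _, d := range disks {
		wg.Add(1)
		go func(d diskToHeal) {
			defer wg.Done()

			// Serialize healing within a single erasure set.
			l := locker.lockFor(d.poolIdx, d.setIdx)
			l.Lock()
			defer l.Unlock()

			fmt.Printf("healing %s (pool %d, set %d)\n", d.name, d.poolIdx, d.setIdx)
			time.Sleep(100 * time.Millisecond) // stand-in for healErasureSet
		}(d)
	}
	wg.Wait()
}

In the diff below, the same idea shows up as one goroutine per endpoint in monitorLocalDisksAndHeal plus a "new-disk-healing/..." namespace lock in healFreshDisk, with healLocalDisks becoming a map[Endpoint]bool so only disks without a running heal routine are dispatched.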
Anis Elleuch 2022-06-21 15:53:55 +01:00 committed by GitHub
parent 95b51c48be
commit b3eda248a3
4 changed files with 154 additions and 171 deletions

View File

@@ -92,7 +92,10 @@ type allHealState struct {
 	// map of heal path to heal sequence
 	healSeqMap map[string]*healSequence // Indexed by endpoint
-	healLocalDisks map[Endpoint]struct{}
+	// keep track of the healing status of disks in the memory
+	//   false: the disk needs to be healed but no healing routine is started
+	//   true: the disk is currently healing
+	healLocalDisks map[Endpoint]bool
 	healStatus map[string]healingTracker // Indexed by disk ID
 }
@@ -100,7 +103,7 @@ type allHealState struct {
 func newHealState(cleanup bool) *allHealState {
 	hstate := &allHealState{
 		healSeqMap:     make(map[string]*healSequence),
-		healLocalDisks: map[Endpoint]struct{}{},
+		healLocalDisks: make(map[Endpoint]bool),
 		healStatus:     make(map[string]healingTracker),
 	}
 	if cleanup {
@@ -109,13 +112,6 @@ func newHealState(cleanup bool) *allHealState {
 	return hstate
 }
 
-func (ahs *allHealState) healDriveCount() int {
-	ahs.RLock()
-	defer ahs.RUnlock()
-	return len(ahs.healLocalDisks)
-}
-
 func (ahs *allHealState) popHealLocalDisks(healLocalDisks ...Endpoint) {
 	ahs.Lock()
 	defer ahs.Unlock()
@@ -165,23 +161,34 @@ func (ahs *allHealState) getLocalHealingDisks() map[string]madmin.HealingDisk {
 	return dst
 }
 
+// getHealLocalDiskEndpoints() returns the list of disks that need
+// to be healed but there is no healing routine in progress on them.
 func (ahs *allHealState) getHealLocalDiskEndpoints() Endpoints {
 	ahs.RLock()
 	defer ahs.RUnlock()
 
 	var endpoints Endpoints
-	for ep := range ahs.healLocalDisks {
-		endpoints = append(endpoints, ep)
+	for ep, healing := range ahs.healLocalDisks {
+		if !healing {
+			endpoints = append(endpoints, ep)
+		}
 	}
 	return endpoints
 }
 
+func (ahs *allHealState) markDiskForHealing(ep Endpoint) {
+	ahs.Lock()
+	defer ahs.Unlock()
+
+	ahs.healLocalDisks[ep] = true
+}
+
 func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) {
 	ahs.Lock()
 	defer ahs.Unlock()
 	for _, ep := range healLocalDisks {
-		ahs.healLocalDisks[ep] = struct{}{}
+		ahs.healLocalDisks[ep] = false
 	}
 }
@@ -804,16 +811,6 @@ func (h *healSequence) healMinioSysMeta(objAPI ObjectLayer, metaPrefix string) f
 	}
 }
 
-// healDiskFormat - heals format.json, return value indicates if a
-// failure error occurred.
-func (h *healSequence) healDiskFormat() error {
-	if h.isQuitting() {
-		return errHealStopSignalled
-	}
-
-	return h.queueHealTask(healSource{bucket: SlashSeparator}, madmin.HealItemMetadata)
-}
-
 // healBuckets - check for all buckets heal or just particular bucket.
 func (h *healSequence) healBuckets(objAPI ObjectLayer, bucketsOnly bool) error {
 	if h.isQuitting() {

View File

@@ -26,15 +26,12 @@ import (
 	"os"
 	"sort"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/dustin/go-humanize"
 	"github.com/minio/madmin-go"
 	"github.com/minio/minio-go/v7/pkg/set"
-	"github.com/minio/minio/internal/color"
 	"github.com/minio/minio/internal/logger"
-	"github.com/minio/pkg/console"
 )
 
 const (
@@ -258,26 +255,9 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
 	initBackgroundHealing(ctx, objAPI) // start quick background healing
 
-	bgSeq := mustGetHealSequence(ctx)
 	globalBackgroundHealState.pushHealLocalDisks(getLocalDisksToHeal()...)
 
-	if drivesToHeal := globalBackgroundHealState.healDriveCount(); drivesToHeal > 0 {
-		logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content - use 'mc admin heal alias/ --verbose' to check the status",
-			drivesToHeal, defaultMonitorNewDiskInterval))
-
-		// Heal any disk format and metadata early, if possible.
-		// Start with format healing
-		if err := bgSeq.healDiskFormat(); err != nil {
-			if newObjectLayerFn() != nil {
-				// log only in situations, when object layer
-				// has fully initialized.
-				logger.LogIf(bgSeq.ctx, err)
-			}
-		}
-	}
-
-	go monitorLocalDisksAndHeal(ctx, z, bgSeq)
+	go monitorLocalDisksAndHeal(ctx, z)
 }
 
 func getLocalDisksToHeal() (disksToHeal Endpoints) {
@@ -299,67 +279,40 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) {
 	return disksToHeal
 }
 
-// monitorLocalDisksAndHeal - ensures that detected new disks are healed
-//  1. Only the concerned erasure set will be listed and healed
-//  2. Only the node hosting the disk is responsible to perform the heal
-func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq *healSequence) {
-	// Perform automatic disk healing when a disk is replaced locally.
-	diskCheckTimer := time.NewTimer(defaultMonitorNewDiskInterval)
-	defer diskCheckTimer.Stop()
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-diskCheckTimer.C:
-			var erasureSetInPoolDisksToHeal []map[int][]StorageAPI
-
-			healDisks := globalBackgroundHealState.getHealLocalDiskEndpoints()
-			if len(healDisks) > 0 {
-				// Reformat disks
-				bgSeq.queueHealTask(healSource{bucket: SlashSeparator}, madmin.HealItemMetadata)
-
-				// Ensure that reformatting disks is finished
-				bgSeq.queueHealTask(healSource{bucket: nopHeal}, madmin.HealItemMetadata)
-
-				logger.Info(fmt.Sprintf("Found drives to heal %d, proceeding to heal - 'mc admin heal alias/ --verbose' to check the status.",
-					len(healDisks)))
-
-				erasureSetInPoolDisksToHeal = make([]map[int][]StorageAPI, len(z.serverPools))
-				for i := range z.serverPools {
-					erasureSetInPoolDisksToHeal[i] = map[int][]StorageAPI{}
-				}
-			}
-
-			if serverDebugLog && len(healDisks) > 0 {
-				console.Debugf(color.Green("healDisk:")+" disk check timer fired, attempting to heal %d drives\n", len(healDisks))
-			}
-
-			// heal only if new disks found.
-			for _, endpoint := range healDisks {
-				disk, format, err := connectEndpoint(endpoint)
-				if err != nil {
-					printEndpointError(endpoint, err, true)
-					continue
-				}
-
-				poolIdx := globalEndpoints.GetLocalPoolIdx(disk.Endpoint())
-				if poolIdx < 0 {
-					continue
-				}
-
-				// Calculate the set index where the current endpoint belongs
-				z.serverPools[poolIdx].erasureDisksMu.RLock()
-				// Protect reading reference format.
-				setIndex, _, err := findDiskIndex(z.serverPools[poolIdx].format, format)
-				z.serverPools[poolIdx].erasureDisksMu.RUnlock()
-				if err != nil {
-					printEndpointError(endpoint, err, false)
-					continue
-				}
-
-				erasureSetInPoolDisksToHeal[poolIdx][setIndex] = append(erasureSetInPoolDisksToHeal[poolIdx][setIndex], disk)
-			}
-
-			buckets, _ := z.ListBuckets(ctx)
+var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second)
+
+func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error {
+	logger.Info(fmt.Sprintf("Proceeding to heal '%s' - 'mc admin heal alias/ --verbose' to check the status.", endpoint))
+
+	disk, format, err := connectEndpoint(endpoint)
+	if err != nil {
+		return fmt.Errorf("Error: %w, %s", err, endpoint)
+	}
+
+	poolIdx := globalEndpoints.GetLocalPoolIdx(disk.Endpoint())
+	if poolIdx < 0 {
+		return fmt.Errorf("unexpected pool index (%d) found in %s", poolIdx, disk.Endpoint())
+	}
+
+	// Calculate the set index where the current endpoint belongs
+	z.serverPools[poolIdx].erasureDisksMu.RLock()
+	setIdx, _, err := findDiskIndex(z.serverPools[poolIdx].format, format)
+	z.serverPools[poolIdx].erasureDisksMu.RUnlock()
+	if err != nil {
+		return err
+	}
+	if setIdx < 0 {
+		return fmt.Errorf("unexpected set index (%d) found in %s", setIdx, disk.Endpoint())
+	}
+
+	// Prevent parallel erasure set healing
+	locker := z.NewNSLock(minioMetaBucket, fmt.Sprintf("new-disk-healing/%s/%d/%d", endpoint, poolIdx, setIdx))
+	lkctx, err := locker.GetLock(ctx, newDiskHealingTimeout)
+	if err != nil {
+		return err
+	}
+	ctx = lkctx.Context()
+	defer locker.Unlock(lkctx.Cancel)
+
+	buckets, _ := z.ListBuckets(ctx)
@@ -380,34 +333,22 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq
 		return buckets[i].Created.After(buckets[j].Created)
 	})
 
-			// TODO(klauspost): This will block until all heals are done,
-			// in the future this should be able to start healing other sets at once.
-			var wg sync.WaitGroup
-			for i, setMap := range erasureSetInPoolDisksToHeal {
-				i := i
-				for setIndex, disks := range setMap {
-					if len(disks) == 0 {
-						continue
-					}
-					wg.Add(1)
-					go func(setIndex int, disks []StorageAPI) {
-						defer wg.Done()
-						for _, disk := range disks {
 	if serverDebugLog {
-		logger.Info("Healing disk '%v' on %s pool", disk, humanize.Ordinal(i+1))
+		logger.Info("Healing disk '%v' on %s pool", disk, humanize.Ordinal(poolIdx+1))
 	}
 
-	// So someone changed the drives underneath, healing tracker missing.
+	// Load healing tracker in this disk
 	tracker, err := loadHealingTracker(ctx, disk)
 	if err != nil {
+		// So someone changed the drives underneath, healing tracker missing.
 		logger.LogIf(ctx, fmt.Errorf("Healing tracker missing on '%s', disk was swapped again on %s pool: %w",
-			disk, humanize.Ordinal(i+1), err))
+			disk, humanize.Ordinal(poolIdx+1), err))
 		tracker = newHealingTracker(disk)
 	}
 
 	// Load bucket totals
 	cache := dataUsageCache{}
-	if err := cache.load(ctx, z.serverPools[i].sets[setIndex], dataUsageCacheName); err == nil {
+	if err := cache.load(ctx, z.serverPools[poolIdx].sets[setIdx], dataUsageCacheName); err == nil {
 		dataUsageInfo := cache.dui(dataUsageRoot, nil)
 		tracker.ObjectsTotalCount = dataUsageInfo.ObjectsTotalCount
 		tracker.ObjectsTotalSize = dataUsageInfo.ObjectsTotalSize
@@ -416,35 +357,68 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq
 	tracker.PoolIndex, tracker.SetIndex, tracker.DiskIndex = disk.GetDiskLoc()
 	tracker.setQueuedBuckets(buckets)
 	if err := tracker.save(ctx); err != nil {
-		logger.LogIf(ctx, err)
-		// Unable to write healing tracker, permission denied or some
-		// other unexpected error occurred. Proceed to look for new
-		// disks to be healed again, we cannot proceed further.
-		return
+		return err
 	}
 
-	err = z.serverPools[i].sets[setIndex].healErasureSet(ctx, tracker.QueuedBuckets, tracker)
+	// Start or resume healing of this erasure set
+	err = z.serverPools[poolIdx].sets[setIdx].healErasureSet(ctx, tracker.QueuedBuckets, tracker)
 	if err != nil {
-		logger.LogIf(ctx, err)
-		continue
+		return err
 	}
 
+	logger.Info("Healing disk '%s' is complete (healed: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsFailed)
+
 	if serverDebugLog {
-		logger.Info("Healing disk '%s' on %s pool, %s set complete", disk,
-			humanize.Ordinal(i+1), humanize.Ordinal(setIndex+1))
-		logger.Info("Summary:\n")
 		tracker.printTo(os.Stdout)
 		logger.Info("\n")
 	}
 
 	logger.LogIf(ctx, tracker.delete(ctx))
-
-							// Only upon success pop the healed disk.
-							globalBackgroundHealState.popHealLocalDisks(disk.Endpoint())
-						}
-					}(setIndex, disks)
-				}
-			}
-			wg.Wait()
+
+	return nil
+}
+
+// monitorLocalDisksAndHeal - ensures that detected new disks are healed
+//  1. Only the concerned erasure set will be listed and healed
+//  2. Only the node hosting the disk is responsible to perform the heal
+func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools) {
+	// Perform automatic disk healing when a disk is replaced locally.
+	diskCheckTimer := time.NewTimer(defaultMonitorNewDiskInterval)
+	defer diskCheckTimer.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-diskCheckTimer.C:
+			healDisks := globalBackgroundHealState.getHealLocalDiskEndpoints()
+			if len(healDisks) == 0 {
+				// Reset for next interval.
+				diskCheckTimer.Reset(defaultMonitorNewDiskInterval)
+				break
+			}
+
+			// Reformat disks immediately
+			_, err := z.HealFormat(context.Background(), false)
+			if err != nil && !errors.Is(err, errNoHealRequired) {
+				logger.LogIf(ctx, err)
+				// Reset for next interval.
+				diskCheckTimer.Reset(defaultMonitorNewDiskInterval)
+				break
+			}
+
+			for _, disk := range healDisks {
+				go func(disk Endpoint) {
+					globalBackgroundHealState.markDiskForHealing(disk)
+					err := healFreshDisk(ctx, z, disk)
+					if err != nil {
+						printEndpointError(disk, err, false)
+						return
+					}
+					// Only upon success pop the healed disk.
+					globalBackgroundHealState.popHealLocalDisks(disk)
+				}(disk)
+			}
 
 			// Reset for next interval.
 			diskCheckTimer.Reset(defaultMonitorNewDiskInterval)

View File

@@ -151,6 +151,7 @@ func readAllFileInfo(ctx context.Context, disks []StorageAPI, bucket, object, ve
 			errVolumeNotFound,
 			errFileVersionNotFound,
 			errDiskNotFound,
+			errUnformattedDisk,
 		}...) {
 			logger.LogOnceIf(ctx, fmt.Errorf("Drive %s, path (%s/%s) returned an error (%w)",
 				disks[index], bucket, object, err),

View File

@@ -171,6 +171,16 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 	healBuckets := make([]string, len(buckets))
 	copy(healBuckets, buckets)
 
+	// Heal all buckets first in this erasure set - this is useful
+	// for new objects upload in different buckets to be successful
+	for _, bucket := range healBuckets {
+		_, err := er.HealBucket(ctx, bucket, madmin.HealOpts{ScanMode: scanMode})
+		if err != nil {
+			// Log bucket healing error if any, we shall retry again.
+			logger.LogIf(ctx, err)
+		}
+	}
+
 	var retErr error
 	// Heal all buckets with all objects
 	for _, bucket := range healBuckets {
@@ -189,7 +199,8 @@
 		}
 		tracker.Object = ""
 		tracker.Bucket = bucket
-		// Heal current bucket
+		// Heal current bucket again in case it failed
+		// at the beginning of erasure set healing
 		if _, err := er.HealBucket(ctx, bucket, madmin.HealOpts{
 			ScanMode: scanMode,
 		}); err != nil {