Add parallel bucket healing during startup (#11457)

Replaces #11449 Does concurrent healing but limits concurrency to 50 buckets. Aborts on first error. `errgroup.Group` is extended to facilitate this in a generic way.
2025-07-09 09:02:17 -04:00 · 2021-02-05 13:04:26 -08:00 · 2021-02-05 13:04:26 -08:00 · b4ac05523b
commit b4ac05523b
parent c7eacba41c
2 changed files with 99 additions and 13 deletions
--- a/cmd/server-main.go
+++ b/cmd/server-main.go
@ -41,6 +41,7 @@ import (
 	"github.com/minio/minio/pkg/color"
 	"github.com/minio/minio/pkg/env"
 	"github.com/minio/minio/pkg/madmin"
+	"github.com/minio/minio/pkg/sync/errgroup"
 )

 // ServerFlags - server command specific flags
@ -346,10 +347,22 @@ func initAllSubsystems(ctx context.Context, newObject ObjectLayer) (err error) {
 				logger.Info(fmt.Sprintf("Verifying if %d buckets are consistent across drives...", len(buckets)))
 			}
 		}
-		for _, bucket := range buckets {
-			if _, err = newObject.HealBucket(ctx, bucket.Name, madmin.HealOpts{Recreate: true}); err != nil {
-				return fmt.Errorf("Unable to list buckets to heal: %w", err)
+
+		// Limit to no more than 50 concurrent buckets.
+		g := errgroup.WithNErrs(len(buckets)).WithConcurrency(50)
+		ctx, cancel := g.WithCancelOnError(ctx)
+		defer cancel()
+		for index := range buckets {
+			index := index
+			g.Go(func() error {
+				if _, berr := newObject.HealBucket(ctx, buckets[index].Name, madmin.HealOpts{Recreate: true}); berr != nil {
+					return fmt.Errorf("Unable to list buckets to heal: %w", berr)
 				}
+				return nil
+			}, index)
+		}
+		if err := g.WaitErr(); err != nil {
+			return err
 		}
 	}

--- a/pkg/sync/errgroup/errgroup.go
+++ b/pkg/sync/errgroup/errgroup.go
@ -17,43 +17,116 @@
 package errgroup

 import (
+	"context"
 	"sync"
+	"sync/atomic"
 )

 // A Group is a collection of goroutines working on subtasks that are part of
 // the same overall task.
 //
-// A zero Group is valid and does not cancel on error.
+// A zero Group can be used if errors should not be tracked.
 type Group struct {
 	wg        sync.WaitGroup
+	bucket    chan struct{}
 	errs      []error
+	firstErr  int64
+	cancel    context.CancelFunc
+	ctxCancel <-chan struct{} // nil if no context.
+	ctxErr    func() error
 }

 // WithNErrs returns a new Group with length of errs slice upto nerrs,
 // upon Wait() errors are returned collected from all tasks.
 func WithNErrs(nerrs int) *Group {
-	return &Group{errs: make([]error, nerrs)}
+	return &Group{errs: make([]error, nerrs), firstErr: -1}
 }

 // Wait blocks until all function calls from the Go method have returned, then
 // returns the slice of errors from all function calls.
 func (g *Group) Wait() []error {
 	g.wg.Wait()
+	if g.cancel != nil {
+		g.cancel()
+	}
 	return g.errs
 }

+// WaitErr blocks until all function calls from the Go method have returned, then
+// returns the first error returned.
+func (g *Group) WaitErr() error {
+	g.wg.Wait()
+	if g.cancel != nil {
+		g.cancel()
+	}
+	if g.firstErr >= 0 && len(g.errs) > int(g.firstErr) {
+		// len(g.errs) > int(g.firstErr) is for then used uninitialized.
+		return g.errs[g.firstErr]
+	}
+	return nil
+}
+
+// WithConcurrency allows to limit the concurrency of the group.
+// This must be called before starting any async processes.
+// There is no order to which functions are allowed to run.
+// If n <= 0 no concurrency limits are enforced.
+// g is modified and returned as well.
+func (g *Group) WithConcurrency(n int) *Group {
+	if n <= 0 {
+		g.bucket = nil
+		return g
+	}
+
+	// Fill bucket with tokens
+	g.bucket = make(chan struct{}, n)
+	for i := 0; i < n; i++ {
+		g.bucket <- struct{}{}
+	}
+	return g
+}
+
+// WithCancelOnError will return a context that is canceled
+// as soon as an error occurs.
+// The returned CancelFunc must always be called similar to context.WithCancel.
+// If the supplied context is canceled any goroutines waiting for execution are also canceled.
+func (g *Group) WithCancelOnError(ctx context.Context) (context.Context, context.CancelFunc) {
+	ctx, g.cancel = context.WithCancel(ctx)
+	g.ctxCancel = ctx.Done()
+	g.ctxErr = ctx.Err
+	return ctx, g.cancel
+}
+
 // Go calls the given function in a new goroutine.
 //
-// The first call to return a non-nil error will be
-// collected in errs slice and returned by Wait().
+// The errors will be collected in errs slice and returned by Wait().
 func (g *Group) Go(f func() error, index int) {
 	g.wg.Add(1)
-
 	go func() {
 		defer g.wg.Done()
-
+		if g.bucket != nil {
+			// Wait for token
+			select {
+			case <-g.bucket:
+				defer func() {
+					// Put back token..
+					g.bucket <- struct{}{}
+				}()
+			case <-g.ctxCancel:
+				if len(g.errs) > index {
+					atomic.CompareAndSwapInt64(&g.firstErr, -1, int64(index))
+					g.errs[index] = g.ctxErr()
+				}
+				return
+			}
+		}
 		if err := f(); err != nil {
+			if len(g.errs) > index {
+				atomic.CompareAndSwapInt64(&g.firstErr, -1, int64(index))
 				g.errs[index] = err
 			}
+			if g.cancel != nil {
+				g.cancel()
+			}
+		}
 	}()
 }