fix: make decommission restart non-blocking (#14591)

Currently, an ongoing decommission can block the startup sequence for a relatively long time during a server restart. Instead, start the decommission lazily in the background.
2025-05-21 17:43:48 -04:00 · 2022-03-20 14:46:43 -07:00 · 2022-03-20 14:46:43 -07:00 · bd6f7b6d83
commit bd6f7b6d83
parent b0a4beb66a
2 changed files with 13 additions and 8 deletions
--- a/cmd/erasure-server-pool-decom.go
+++ b/cmd/erasure-server-pool-decom.go
@ -472,13 +472,16 @@ func (z *erasureServerPools) Init(ctx context.Context) error {
 		// '-1' as argument to decommission multiple pools at a time
 		// but this is not a priority at the moment.
 		for _, pool := range meta.returnResumablePools(1) {
-			err := z.Decommission(ctx, pool.ID)
-			switch err {
+			go func(pool PoolStatus) {
+				switch err := z.Decommission(ctx, pool.ID); err {
 				case errDecommissionAlreadyRunning:
 					fallthrough
 				case nil:
-				go z.doDecommissionInRoutine(ctx, pool.ID)
+					z.doDecommissionInRoutine(ctx, pool.ID)
+				default:
+					logger.LogIf(ctx, fmt.Errorf("Unable to resume decommission of pool %v: %w", pool, err))
 				}
+			}(pool)
 		}
 		z.poolMeta = meta

--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@ -128,7 +128,9 @@ func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServ
 			if !configRetriableErrors(err) {
 				logger.Fatal(err, "Unable to initialize backend")
 			}
-			time.Sleep(time.Duration(r.Float64() * float64(5*time.Second)))
+			retry := time.Duration(r.Float64() * float64(5*time.Second))
+			logger.LogIf(ctx, fmt.Errorf("Unable to initialize backend: %w, retrying in %s", err, retry))
+			time.Sleep(retry)
 			continue
 		}
 		break