diff --git a/cmd/erasure-server-pool-decom.go b/cmd/erasure-server-pool-decom.go
index 3dba71ed5..dbda64cf2 100644
--- a/cmd/erasure-server-pool-decom.go
+++ b/cmd/erasure-server-pool-decom.go
@@ -696,7 +696,9 @@ func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool
 	// Check if bucket is object locked.
 	lr, _ := globalBucketObjectLockSys.Get(bi.Name)
 
-	for _, set := range pool.sets {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+
+	for setIdx, set := range pool.sets {
 		set := set
 
 		filterLifecycle := func(bucket, object string, fi FileInfo) bool {
@@ -895,16 +897,26 @@ func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool
 		}
 
 		wk.Take()
-		go func() {
+		go func(setIdx int) {
 			defer wk.Give()
-			err := set.listObjectsToDecommission(ctx, bi,
-				func(entry metaCacheEntry) {
-					wk.Take()
-					go decommissionEntry(entry)
-				},
-			)
-			logger.LogIf(ctx, err)
-		}()
+			// We will perpetually retry listing if it fails, since we cannot
+			// possibly give up in this matter
+			for {
+				err := set.listObjectsToDecommission(ctx, bi,
+					func(entry metaCacheEntry) {
+						wk.Take()
+						go decommissionEntry(entry)
+					},
+				)
+				if err == nil {
+					break
+				}
+				setN := humanize.Ordinal(setIdx + 1)
+				retryDur := time.Duration(r.Float64() * float64(5*time.Second))
+				logger.LogOnceIf(ctx, fmt.Errorf("listing objects from %s set failed with %v, retrying in %v", setN, err, retryDur), "decom-listing-failed"+setN)
+				time.Sleep(retryDur)
+			}
+		}(setIdx)
 	}
 	wk.Wait()
 	return nil
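
For reference, below is a minimal standalone sketch of the retry-with-random-jitter pattern the second hunk introduces, assuming a hypothetical listOnce stand-in for set.listObjectsToDecommission and a 5-second jitter cap; none of these names are MinIO APIs.

// Sketch of the jittered retry loop added by this diff; listOnce and the
// attempt counter are hypothetical stand-ins used only for illustration.
package main

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// listOnce is a hypothetical stand-in for set.listObjectsToDecommission:
// it fails for the first few attempts, then succeeds.
func listOnce(ctx context.Context, attempt int) error {
	if attempt < 3 {
		return errors.New("transient listing failure")
	}
	return nil
}

func main() {
	ctx := context.Background()
	r := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Retry indefinitely, sleeping a random duration of up to 5s between
	// attempts, mirroring the decommission listing loop in the diff above.
	for attempt := 0; ; attempt++ {
		err := listOnce(ctx, attempt)
		if err == nil {
			break
		}
		retryDur := time.Duration(r.Float64() * float64(5*time.Second))
		fmt.Printf("listing failed with %v, retrying in %v\n", err, retryDur)
		time.Sleep(retryDur)
	}
	fmt.Println("listing succeeded")
}

The random sleep spreads out retries from the per-set goroutines so that a shared failure (for example, a temporarily unreachable disk) does not cause all sets to hammer the listing path in lockstep.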