Wait three minutes after startup to restart decommissioning (#19645)

Typically not all drives are connected, so we delay 3 minutes before resuming. This greatly reduces the risk of starting to list unconnected drives, or drives that risk being disconnected soon. This delay is not applied when decommissioning is started with an admin call.
2025-11-20 09:56:07 -05:00 · 2024-05-01 08:18:21 -07:00
parent 08ff702434
commit dbfb5e797b
2 changed files with 11 additions and 11 deletions
--- a/cmd/erasure-server-pool-decom.go
+++ b/cmd/erasure-server-pool-decom.go
@@ -535,6 +535,10 @@ func (z *erasureServerPools) Init(ctx context.Context) error {

 	if len(poolIndices) > 0 && globalEndpoints[poolIndices[0]].Endpoints[0].IsLocal {
 		go func() {
+			// Resume decommissioning of pools, but wait 3 minutes for cluster to stabilize.
+			if err := sleepContext(ctx, 3*time.Minute); err != nil {
+				return
+			}
 			r := rand.New(rand.NewSource(time.Now().UnixNano()))
 			for {
 				if err := z.Decommission(ctx, poolIndices...); err != nil {
--- a/cmd/utils.go
+++ b/cmd/utils.go
@@ -1127,16 +1127,12 @@ func ptr[T any](a T) *T {
 	return &a
 }

-func max(a, b int) int {
-	if a > b {
-		return a
+// sleepContext sleeps for d duration or until ctx is done.
+func sleepContext(ctx context.Context, d time.Duration) error {
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-time.After(d):
 	}
-	return b
-}
-
-func min(a, b int) int {
-	if a < b {
-		return a
-	}
-	return b
+	return nil
 }