Faster startup of large distributed systems with latency (#16259)

Klaus Post authored on 2022-12-15 17:31:21 +01:00, committed by GitHub
commit 988a2e8fed (parent 2433698372)
2 changed files with 27 additions and 5 deletions

Changed file 1 of 2:

@@ -358,6 +358,12 @@ func initServer(ctx context.Context, newObject ObjectLayer) error {
     r := rand.New(rand.NewSource(time.Now().UnixNano()))

     lockTimeout := newDynamicTimeout(5*time.Second, 3*time.Second)
+    // Do not retry, to avoid high contention on startup.
+    lockTimeout.retryInterval = -1
+
+    // Do an initial random sleep to avoid a stampeding herd of initial
+    // lock requests. This will spread the lock requests over 1 second.
+    time.Sleep(time.Duration(r.Float64() * float64(time.Second)))

     for {
         select {
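The hunk above does two things: it sets a sentinel retryInterval of -1 so the startup lock is attempted only once per loop iteration, and it adds a uniform random sleep of up to one second before the first attempt so a large cluster does not hit the lock endpoint at the same instant. Below is a minimal, self-contained sketch of that jitter idea; the node count and the printing are invented for illustration and are not part of the change.

```go
package main

import (
	"fmt"
	"math/rand"
	"sync"
	"time"
)

func main() {
	const nodes = 8 // hypothetical cluster size, for illustration only

	var wg sync.WaitGroup
	start := time.Now()
	for i := 0; i < nodes; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			r := rand.New(rand.NewSource(time.Now().UnixNano() + int64(id)))
			// Spread the first attempt uniformly over one second,
			// mirroring the jitter added in initServer above.
			time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
			fmt.Printf("node %d attempts its first lock at +%v\n",
				id, time.Since(start).Round(time.Millisecond))
		}(i)
	}
	wg.Wait()
}
```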
@@ -380,7 +386,8 @@ func initServer(ctx context.Context, newObject ObjectLayer) error {
         if err != nil {
             logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
-            time.Sleep(time.Duration(r.Float64() * float64(5*time.Second)))
+            // Sleep 0 -> 2 seconds to average 1 second retry interval.
+            time.Sleep(time.Duration(r.Float64() * 2 * float64(time.Second)))
             continue
         }
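The retry sleep in the startup loop shrinks from a uniform draw over [0, 5s), which averages 2.5 seconds, to a uniform draw over [0, 2s), which averages 1 second. A quick standalone check of that average, assuming nothing beyond the sleep expression itself:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))

	const samples = 100000
	var total time.Duration
	for i := 0; i < samples; i++ {
		// Same expression as the new retry sleep: uniform in [0, 2s).
		total += time.Duration(r.Float64() * 2 * float64(time.Second))
	}
	// The expected value of a uniform [0, 2s) draw is 1s.
	fmt.Println("average retry sleep:", (total / samples).Round(time.Millisecond))
}
```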

Changed file 2 of 2:

@@ -23,6 +23,7 @@ import (
     "math/rand"
     "os"
     "sort"
+    "strconv"
     "sync"
     "time"
@@ -32,9 +33,23 @@ import (
 // Indicator if logging is enabled.
 var dsyncLog bool

+// maximum time to sleep before retrying a failed blocking lock()
+var lockRetryInterval time.Duration
+
 func init() {
     // Check for MINIO_DSYNC_TRACE env variable, if set logging will be enabled for failed REST operations.
     dsyncLog = os.Getenv("MINIO_DSYNC_TRACE") == "1"
+
+    // lockRetryInterval specifies the maximum time between retries for failed locks.
+    // Average retry time will be value / 2.
+    lockRetryInterval = 100 * time.Millisecond
+    if lri := os.Getenv("_MINIO_LOCK_RETRY_INTERVAL"); lri != "" {
+        v, err := strconv.Atoi(lri)
+        if err != nil {
+            panic(err)
+        }
+        lockRetryInterval = time.Duration(v) * time.Millisecond
+    }
 }

 func log(format string, data ...interface{}) {
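The new init() keeps a 100ms default but lets _MINIO_LOCK_RETRY_INTERVAL override it with a whole number of milliseconds, panicking at startup on anything that does not parse. The sketch below isolates that env-override pattern; the _EXAMPLE_RETRY_MS name and the helper function are hypothetical, and only the parsing logic mirrors the diff.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"time"
)

// retryIntervalFromEnv returns the default unless the named environment
// variable holds an integer number of milliseconds; a malformed value
// panics, matching the fail-fast behaviour of the init() above.
func retryIntervalFromEnv(name string, def time.Duration) time.Duration {
	v := os.Getenv(name)
	if v == "" {
		return def
	}
	ms, err := strconv.Atoi(v)
	if err != nil {
		panic(err)
	}
	return time.Duration(ms) * time.Millisecond
}

func main() {
	// _EXAMPLE_RETRY_MS is a made-up variable name used only in this sketch.
	interval := retryIntervalFromEnv("_EXAMPLE_RETRY_MS", 100*time.Millisecond)
	fmt.Println("retry interval:", interval)
}
```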
@@ -59,9 +74,6 @@
     // dRWMutexRefreshInterval - default the interval between two refresh calls
     drwMutexRefreshInterval = 10 * time.Second

-    // maximum time to sleep before retrying a failed blocking lock()
-    lockRetryInterval = 50 * time.Millisecond
-
     drwMutexInfinite = 1<<63 - 1
 )
@@ -239,9 +251,12 @@ func (dm *DRWMutex) lockBlocking(ctx context.Context, lockLossCallback func(), i
             }

             lockRetryInterval := dm.lockRetryInterval
-            if opts.RetryInterval > 0 {
+            if opts.RetryInterval != 0 {
                 lockRetryInterval = opts.RetryInterval
             }
+            if lockRetryInterval < 0 {
+                return false
+            }
             time.Sleep(time.Duration(dm.rng.Float64() * float64(lockRetryInterval)))
         }
     }
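Relaxing the check from opts.RetryInterval > 0 to != 0 is what lets a caller pass a negative interval as a "do not retry" sentinel, which initServer now does via lockTimeout.retryInterval = -1. The sketch below isolates that control flow with simplified, made-up types (options and acquire are not the real dsync API); only the zero/positive/negative semantics are taken from the diff.

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// options mirrors only the RetryInterval knob from the diff; everything
// else about the real dsync Options type is omitted here.
type options struct {
	RetryInterval time.Duration
}

// acquire is a simplified stand-in for the retry tail of lockBlocking:
// a zero RetryInterval keeps the default, a positive one overrides it,
// and a negative one gives up instead of sleeping and retrying.
func acquire(try func() bool, defaultInterval time.Duration, opts options) bool {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	for {
		if try() {
			return true
		}
		interval := defaultInterval
		if opts.RetryInterval != 0 {
			interval = opts.RetryInterval
		}
		if interval < 0 {
			// Caller asked for a single attempt, e.g. startup lock acquisition.
			return false
		}
		// Jittered back-off, as in the diff: sleep a random fraction of the interval.
		time.Sleep(time.Duration(rng.Float64() * float64(interval)))
	}
}

func main() {
	attempts := 0
	try := func() bool { attempts++; return attempts >= 3 }

	// Default behaviour: keep retrying with jittered sleeps until it succeeds.
	fmt.Println("with retries:", acquire(try, 100*time.Millisecond, options{}))

	// Startup-style behaviour: fail immediately on contention.
	attempts = 0
	fmt.Println("single attempt:", acquire(try, 100*time.Millisecond, options{RetryInterval: -1}))
}
```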