Faster startup of large distributed systems with latency (#16259)

Klaus Post authored on 2022-12-15 17:31:21 +01:00, committed by GitHub
commit 988a2e8fed (parent 2433698372)
2 changed files with 27 additions and 5 deletions

Changed file 1 of 2:

@@ -358,6 +358,12 @@ func initServer(ctx context.Context, newObject ObjectLayer) error {
     r := rand.New(rand.NewSource(time.Now().UnixNano()))

     lockTimeout := newDynamicTimeout(5*time.Second, 3*time.Second)
+    // Do not retry, to avoid high contention on startup.
+    lockTimeout.retryInterval = -1
+
+    // Do an initial random sleep to avoid a stampeding herd of initial
+    // lock requests. This will spread the lock requests over 1 second.
+    time.Sleep(time.Duration(r.Float64() * float64(time.Second)))

     for {
         select {
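The hunk above does two things: it sets a sentinel retryInterval of -1 so the startup lock is attempted only once per loop iteration, and it adds a uniform random sleep of up to one second before the first attempt so a large cluster does not hit the lock endpoint at the same instant. Below is a minimal, self-contained sketch of that jitter idea; the node count and the printing are invented for illustration and are not part of the change.

```go
package main

import (
	"fmt"
	"math/rand"
	"sync"
	"time"
)

func main() {
	const nodes = 8 // hypothetical cluster size, for illustration only

	var wg sync.WaitGroup
	start := time.Now()
	for i := 0; i < nodes; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			r := rand.New(rand.NewSource(time.Now().UnixNano() + int64(id)))
			// Spread the first attempt uniformly over one second,
			// mirroring the jitter added in initServer above.
			time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
			fmt.Printf("node %d attempts its first lock at +%v\n",
				id, time.Since(start).Round(time.Millisecond))
		}(i)
	}
	wg.Wait()
}
```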
@@ -380,7 +386,8 @@ func initServer(ctx context.Context, newObject ObjectLayer) error {
         if err != nil {
             logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
-            time.Sleep(time.Duration(r.Float64() * float64(5*time.Second)))
+            // Sleep 0 -> 2 seconds to average 1 second retry interval.
+            time.Sleep(time.Duration(r.Float64() * 2 * float64(time.Second)))
             continue
         }
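The retry sleep in the startup loop shrinks from a uniform draw over [0, 5s), which averages 2.5 seconds, to a uniform draw over [0, 2s), which averages 1 second. A quick standalone check of that average, assuming nothing beyond the sleep expression itself:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))

	const samples = 100000
	var total time.Duration
	for i := 0; i < samples; i++ {
		// Same expression as the new retry sleep: uniform in [0, 2s).
		total += time.Duration(r.Float64() * 2 * float64(time.Second))
	}
	// The expected value of a uniform [0, 2s) draw is 1s.
	fmt.Println("average retry sleep:", (total / samples).Round(time.Millisecond))
}
```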

Changed file 2 of 2:

@@ -23,6 +23,7 @@ import (
     "math/rand"
     "os"
     "sort"
+    "strconv"
     "sync"
     "time"
@@ -32,9 +33,23 @@ import (
 // Indicator if logging is enabled.
 var dsyncLog bool

+// maximum time to sleep before retrying a failed blocking lock()
+var lockRetryInterval time.Duration
+
 func init() {
     // Check for MINIO_DSYNC_TRACE env variable, if set logging will be enabled for failed REST operations.
     dsyncLog = os.Getenv("MINIO_DSYNC_TRACE") == "1"
+
+    // lockRetryInterval specifies the maximum time between retries for failed locks.
+    // Average retry time will be value / 2.
+    lockRetryInterval = 100 * time.Millisecond
+    if lri := os.Getenv("_MINIO_LOCK_RETRY_INTERVAL"); lri != "" {
+        v, err := strconv.Atoi(lri)
+        if err != nil {
+            panic(err)
+        }
+        lockRetryInterval = time.Duration(v) * time.Millisecond
+    }
 }

 func log(format string, data ...interface{}) {
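The new init() keeps a 100ms default but lets _MINIO_LOCK_RETRY_INTERVAL override it with a whole number of milliseconds, panicking at startup on anything that does not parse. The sketch below isolates that env-override pattern; the _EXAMPLE_RETRY_MS name and the helper function are hypothetical, and only the parsing logic mirrors the diff.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"time"
)

// retryIntervalFromEnv returns the default unless the named environment
// variable holds an integer number of milliseconds; a malformed value
// panics, matching the fail-fast behaviour of the init() above.
func retryIntervalFromEnv(name string, def time.Duration) time.Duration {
	v := os.Getenv(name)
	if v == "" {
		return def
	}
	ms, err := strconv.Atoi(v)
	if err != nil {
		panic(err)
	}
	return time.Duration(ms) * time.Millisecond
}

func main() {
	// _EXAMPLE_RETRY_MS is a made-up variable name used only in this sketch.
	interval := retryIntervalFromEnv("_EXAMPLE_RETRY_MS", 100*time.Millisecond)
	fmt.Println("retry interval:", interval)
}
```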
@@ -59,9 +74,6 @@
     // dRWMutexRefreshInterval - default the interval between two refresh calls
     drwMutexRefreshInterval = 10 * time.Second

-    // maximum time to sleep before retrying a failed blocking lock()
-    lockRetryInterval = 50 * time.Millisecond
-
     drwMutexInfinite = 1<<63 - 1
 )
@@ -239,9 +251,12 @@ func (dm *DRWMutex) lockBlocking(ctx context.Context, lockLossCallback func(), i
             }

             lockRetryInterval := dm.lockRetryInterval
-            if opts.RetryInterval > 0 {
+            if opts.RetryInterval != 0 {
                 lockRetryInterval = opts.RetryInterval
             }
+            if lockRetryInterval < 0 {
+                return false
+            }
             time.Sleep(time.Duration(dm.rng.Float64() * float64(lockRetryInterval)))
         }
     }
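Relaxing the check from opts.RetryInterval > 0 to != 0 is what lets a caller pass a negative interval as a "do not retry" sentinel, which initServer now does via lockTimeout.retryInterval = -1. The sketch below isolates that control flow with simplified, made-up types (options and acquire are not the real dsync API); only the zero/positive/negative semantics are taken from the diff.

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// options mirrors only the RetryInterval knob from the diff; everything
// else about the real dsync Options type is omitted here.
type options struct {
	RetryInterval time.Duration
}

// acquire is a simplified stand-in for the retry tail of lockBlocking:
// a zero RetryInterval keeps the default, a positive one overrides it,
// and a negative one gives up instead of sleeping and retrying.
func acquire(try func() bool, defaultInterval time.Duration, opts options) bool {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	for {
		if try() {
			return true
		}
		interval := defaultInterval
		if opts.RetryInterval != 0 {
			interval = opts.RetryInterval
		}
		if interval < 0 {
			// Caller asked for a single attempt, e.g. startup lock acquisition.
			return false
		}
		// Jittered back-off, as in the diff: sleep a random fraction of the interval.
		time.Sleep(time.Duration(rng.Float64() * float64(interval)))
	}
}

func main() {
	attempts := 0
	try := func() bool { attempts++; return attempts >= 3 }

	// Default behaviour: keep retrying with jittered sleeps until it succeeds.
	fmt.Println("with retries:", acquire(try, 100*time.Millisecond, options{}))

	// Startup-style behaviour: fail immediately on contention.
	attempts = 0
	fmt.Println("single attempt:", acquire(try, 100*time.Millisecond, options{RetryInterval: -1}))
}
```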