mirror of
https://github.com/minio/minio.git
synced 2025-11-10 14:09:48 -05:00
fix: avoid timer leaks in dsync/lsync (#9781)
At a customer setup with lots of concurrent calls
it can be observed that in newRetryTimer there
were lots of tiny alloations which are not
relinquished upon retries, in this codepath
we were only interested in re-using the timer
and use it wisely for each locker.
```
(pprof) top
Showing nodes accounting for 8.68TB, 97.02% of 8.95TB total
Dropped 1198 nodes (cum <= 0.04TB)
Showing top 10 nodes out of 79
flat flat% sum% cum cum%
5.95TB 66.50% 66.50% 5.95TB 66.50% time.NewTimer
1.16TB 13.02% 79.51% 1.16TB 13.02% github.com/ncw/directio.AlignedBlock
0.67TB 7.53% 87.04% 0.70TB 7.78% github.com/minio/minio/cmd.xlObjects.putObject
0.21TB 2.36% 89.40% 0.21TB 2.36% github.com/minio/minio/cmd.(*posix).Walk
0.19TB 2.08% 91.49% 0.27TB 2.99% os.statNolog
0.14TB 1.59% 93.08% 0.14TB 1.60% os.(*File).readdirnames
0.10TB 1.09% 94.17% 0.11TB 1.25% github.com/minio/minio/cmd.readDirN
0.10TB 1.07% 95.23% 0.10TB 1.07% syscall.ByteSliceFromString
0.09TB 1.03% 96.27% 0.09TB 1.03% strings.(*Builder).grow
0.07TB 0.75% 97.02% 0.07TB 0.75% path.(*lazybuf).append
```
This commit is contained in:
@@ -1138,14 +1138,14 @@ var errorCodes = errorCodeMap{
|
||||
HTTPStatusCode: http.StatusBadRequest,
|
||||
},
|
||||
ErrOperationTimedOut: {
|
||||
Code: "XMinioServerTimedOut",
|
||||
Description: "A timeout occurred while trying to lock a resource",
|
||||
HTTPStatusCode: http.StatusRequestTimeout,
|
||||
Code: "RequestTimeout",
|
||||
Description: "A timeout occurred while trying to lock a resource, please reduce your request rate",
|
||||
HTTPStatusCode: http.StatusServiceUnavailable,
|
||||
},
|
||||
ErrOperationMaxedOut: {
|
||||
Code: "XMinioServerTimedOut",
|
||||
Description: "A timeout exceeded while waiting to proceed with the request",
|
||||
HTTPStatusCode: http.StatusRequestTimeout,
|
||||
Code: "SlowDown",
|
||||
Description: "A timeout exceeded while waiting to proceed with the request, please reduce your request rate",
|
||||
HTTPStatusCode: http.StatusServiceUnavailable,
|
||||
},
|
||||
ErrUnsupportedMetadata: {
|
||||
Code: "InvalidArgument",
|
||||
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
"github.com/minio/minio/cmd/config"
|
||||
"github.com/minio/minio/cmd/logger"
|
||||
"github.com/minio/minio/pkg/lock"
|
||||
"github.com/minio/minio/pkg/retry"
|
||||
)
|
||||
|
||||
// FS format version strings.
|
||||
@@ -344,7 +345,7 @@ func formatFSFixDeploymentID(ctx context.Context, fsFormatPath string) error {
|
||||
defer cancel()
|
||||
|
||||
var wlk *lock.LockedFile
|
||||
retryCh := newRetryTimerSimple(retryCtx)
|
||||
retryCh := retry.NewTimerWithJitter(retryCtx, time.Second, 30*time.Second, retry.MaxJitter)
|
||||
var stop bool
|
||||
for !stop {
|
||||
select {
|
||||
|
||||
@@ -613,13 +613,11 @@ func (fs *FSObjects) GetObjectNInfo(ctx context.Context, bucket, object string,
|
||||
switch lockType {
|
||||
case writeLock:
|
||||
if err = lock.GetLock(globalObjectTimeout); err != nil {
|
||||
logger.LogIf(ctx, err)
|
||||
return nil, err
|
||||
}
|
||||
nsUnlocker = lock.Unlock
|
||||
case readLock:
|
||||
if err = lock.GetRLock(globalObjectTimeout); err != nil {
|
||||
logger.LogIf(ctx, err)
|
||||
return nil, err
|
||||
}
|
||||
nsUnlocker = lock.RUnlock
|
||||
|
||||
@@ -76,7 +76,7 @@ func (client *lockRESTClient) call(method string, values url.Values, body io.Rea
|
||||
}
|
||||
|
||||
if isNetworkError(err) {
|
||||
time.AfterFunc(defaultRetryUnit, func() {
|
||||
time.AfterFunc(time.Second, func() {
|
||||
// After 1 seconds, take this lock client online for a retry.
|
||||
atomic.StoreInt32(&client.connected, 1)
|
||||
})
|
||||
|
||||
@@ -28,9 +28,9 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/minio/lsync"
|
||||
"github.com/minio/minio/cmd/logger"
|
||||
"github.com/minio/minio/pkg/dsync"
|
||||
"github.com/minio/minio/pkg/lsync"
|
||||
)
|
||||
|
||||
// local lock servers
|
||||
|
||||
136
cmd/retry.go
136
cmd/retry.go
@@ -1,136 +0,0 @@
|
||||
/*
|
||||
* MinIO Cloud Storage, (C) 2016, 2017 MinIO, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math/rand"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// lockedRandSource provides protected rand source, implements rand.Source interface.
|
||||
type lockedRandSource struct {
|
||||
lk sync.Mutex
|
||||
src rand.Source
|
||||
}
|
||||
|
||||
// Int63 returns a non-negative pseudo-random 63-bit integer as an
|
||||
// int64.
|
||||
func (r *lockedRandSource) Int63() (n int64) {
|
||||
r.lk.Lock()
|
||||
n = r.src.Int63()
|
||||
r.lk.Unlock()
|
||||
return
|
||||
}
|
||||
|
||||
// Seed uses the provided seed value to initialize the generator to a
|
||||
// deterministic state.
|
||||
func (r *lockedRandSource) Seed(seed int64) {
|
||||
r.lk.Lock()
|
||||
r.src.Seed(seed)
|
||||
r.lk.Unlock()
|
||||
}
|
||||
|
||||
// MaxJitter will randomize over the full exponential backoff time
|
||||
const MaxJitter = 1.0
|
||||
|
||||
// NoJitter disables the use of jitter for randomizing the
|
||||
// exponential backoff time
|
||||
const NoJitter = 0.0
|
||||
|
||||
// Global random source for fetching random values.
|
||||
var globalRandomSource = rand.New(&lockedRandSource{
|
||||
src: rand.NewSource(UTCNow().UnixNano()),
|
||||
})
|
||||
|
||||
// newRetryTimerJitter creates a timer with exponentially increasing delays
|
||||
// until the maximum retry attempts are reached. - this function is a fully
|
||||
// configurable version, meant for only advanced use cases. For the most part
|
||||
// one should use newRetryTimerSimple and newRetryTimer.
|
||||
func newRetryTimerWithJitter(ctx context.Context, unit time.Duration, cap time.Duration, jitter float64) <-chan int {
|
||||
attemptCh := make(chan int)
|
||||
|
||||
// normalize jitter to the range [0, 1.0]
|
||||
if jitter < NoJitter {
|
||||
jitter = NoJitter
|
||||
}
|
||||
if jitter > MaxJitter {
|
||||
jitter = MaxJitter
|
||||
}
|
||||
|
||||
// computes the exponential backoff duration according to
|
||||
// https://www.awsarchitectureblog.com/2015/03/backoff.html
|
||||
exponentialBackoffWait := func(attempt int) time.Duration {
|
||||
// 1<<uint(attempt) below could overflow, so limit the value of attempt
|
||||
maxAttempt := 30
|
||||
if attempt > maxAttempt {
|
||||
attempt = maxAttempt
|
||||
}
|
||||
//sleep = random_between(0, min(cap, base * 2 ** attempt))
|
||||
sleep := unit * time.Duration(1<<uint(attempt))
|
||||
if sleep > cap {
|
||||
sleep = cap
|
||||
}
|
||||
if jitter != NoJitter {
|
||||
sleep -= time.Duration(globalRandomSource.Float64() * float64(sleep) * jitter)
|
||||
}
|
||||
return sleep
|
||||
}
|
||||
|
||||
go func() {
|
||||
defer close(attemptCh)
|
||||
nextBackoff := 0
|
||||
// Channel used to signal after the expiry of backoff wait seconds.
|
||||
var timer *time.Timer
|
||||
for {
|
||||
select { // Attempts starts.
|
||||
case attemptCh <- nextBackoff:
|
||||
nextBackoff++
|
||||
case <-ctx.Done():
|
||||
// Stop the routine.
|
||||
return
|
||||
}
|
||||
timer = time.NewTimer(exponentialBackoffWait(nextBackoff))
|
||||
// wait till next backoff time or till doneCh gets a message.
|
||||
select {
|
||||
case <-timer.C:
|
||||
case <-ctx.Done():
|
||||
// stop the timer and return.
|
||||
timer.Stop()
|
||||
return
|
||||
}
|
||||
|
||||
}
|
||||
}()
|
||||
|
||||
// Start reading..
|
||||
return attemptCh
|
||||
}
|
||||
|
||||
// Default retry constants.
|
||||
const (
|
||||
defaultRetryUnit = time.Second // 1 second.
|
||||
defaultRetryCap = 30 * time.Second // 30 seconds.
|
||||
)
|
||||
|
||||
// newRetryTimerSimple creates a timer with exponentially increasing delays
|
||||
// until the maximum retry attempts are reached. - this function is a
|
||||
// simpler version with all default values.
|
||||
func newRetryTimerSimple(ctx context.Context) <-chan int {
|
||||
return newRetryTimerWithJitter(ctx, defaultRetryUnit, defaultRetryCap, MaxJitter)
|
||||
}
|
||||
@@ -1,87 +0,0 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016-2020 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Tests for retry timer.
|
||||
func TestRetryTimerSimple(t *testing.T) {
|
||||
rctx, cancel := context.WithCancel(context.Background())
|
||||
attemptCh := newRetryTimerSimple(rctx)
|
||||
i := <-attemptCh
|
||||
if i != 0 {
|
||||
cancel()
|
||||
t.Fatalf("Invalid attempt counter returned should be 0, found %d instead", i)
|
||||
}
|
||||
i = <-attemptCh
|
||||
if i <= 0 {
|
||||
cancel()
|
||||
t.Fatalf("Invalid attempt counter returned should be greater than 0, found %d instead", i)
|
||||
}
|
||||
cancel()
|
||||
_, ok := <-attemptCh
|
||||
if ok {
|
||||
t.Fatal("Attempt counter should be closed")
|
||||
}
|
||||
}
|
||||
|
||||
// Test retry time with no jitter.
|
||||
func TestRetryTimerWithNoJitter(t *testing.T) {
|
||||
rctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// No jitter
|
||||
attemptCh := newRetryTimerWithJitter(rctx, time.Millisecond, 5*time.Millisecond, NoJitter)
|
||||
i := <-attemptCh
|
||||
if i != 0 {
|
||||
cancel()
|
||||
t.Fatalf("Invalid attempt counter returned should be 0, found %d instead", i)
|
||||
}
|
||||
// Loop through the maximum possible attempt.
|
||||
for i = range attemptCh {
|
||||
if i == 30 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
cancel()
|
||||
_, ok := <-attemptCh
|
||||
if ok {
|
||||
t.Fatal("Attempt counter should be closed")
|
||||
}
|
||||
}
|
||||
|
||||
// Test retry time with Jitter greater than MaxJitter.
|
||||
func TestRetryTimerWithJitter(t *testing.T) {
|
||||
rctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
// Jitter will be set back to 1.0
|
||||
attemptCh := newRetryTimerWithJitter(rctx, time.Second, 30*time.Second, 2.0)
|
||||
i := <-attemptCh
|
||||
if i != 0 {
|
||||
cancel()
|
||||
t.Fatalf("Invalid attempt counter returned should be 0, found %d instead", i)
|
||||
}
|
||||
cancel()
|
||||
_, ok := <-attemptCh
|
||||
if ok {
|
||||
t.Fatal("Attempt counter should be closed")
|
||||
}
|
||||
}
|
||||
@@ -38,6 +38,7 @@ import (
|
||||
"github.com/minio/minio/pkg/certs"
|
||||
"github.com/minio/minio/pkg/color"
|
||||
"github.com/minio/minio/pkg/env"
|
||||
"github.com/minio/minio/pkg/retry"
|
||||
)
|
||||
|
||||
// ServerFlags - server command specific flags
|
||||
@@ -216,10 +217,10 @@ func initSafeMode() (err error) {
|
||||
// version is needed, migration is needed etc.
|
||||
rquorum := InsufficientReadQuorum{}
|
||||
wquorum := InsufficientWriteQuorum{}
|
||||
for range newRetryTimerSimple(retryCtx) {
|
||||
for range retry.NewTimer(retryCtx) {
|
||||
// let one of the server acquire the lock, if not let them timeout.
|
||||
// which shall be retried again by this loop.
|
||||
if err = txnLk.GetLock(leaderLockTimeout); err != nil {
|
||||
if err = txnLk.GetLock(newDynamicTimeout(5*time.Second, 30*time.Second)); err != nil {
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
|
||||
continue
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user