mirror of
https://github.com/minio/minio.git
synced 2024-12-26 07:05:55 -05:00
Use exponential backoff algo for internode reconnections (#17052)
This commit is contained in:
parent
1704abaf6b
commit
4640b13c66
@ -75,9 +75,9 @@ type Client struct {
|
|||||||
// is online or offline.
|
// is online or offline.
|
||||||
HealthCheckFn func() bool
|
HealthCheckFn func() bool
|
||||||
|
|
||||||
// HealthCheckInterval will be the duration between re-connection attempts
|
// HealthCheckReconnectUnit will be used to calculate the exponential
|
||||||
// when a call has failed with a network error.
|
// backoff when trying to reconnect to an offline node
|
||||||
HealthCheckInterval time.Duration
|
HealthCheckReconnectUnit time.Duration
|
||||||
|
|
||||||
// HealthCheckTimeout determines timeout for each call.
|
// HealthCheckTimeout determines timeout for each call.
|
||||||
HealthCheckTimeout time.Duration
|
HealthCheckTimeout time.Duration
|
||||||
@ -315,7 +315,7 @@ func NewClient(url *url.URL, tr http.RoundTripper, newAuthToken func(aud string)
|
|||||||
connected: online,
|
connected: online,
|
||||||
lastConn: time.Now().UnixNano(),
|
lastConn: time.Now().UnixNano(),
|
||||||
MaxErrResponseSize: 4096,
|
MaxErrResponseSize: 4096,
|
||||||
HealthCheckInterval: 200 * time.Millisecond,
|
HealthCheckReconnectUnit: 200 * time.Millisecond,
|
||||||
HealthCheckTimeout: time.Second,
|
HealthCheckTimeout: time.Second,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -337,6 +337,28 @@ func (c *Client) LastError() error {
|
|||||||
return c.lastErr
|
return c.lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// exponentialBackoffWait returns a function that maps a retry attempt number
// to a randomized wait duration, following the jittered exponential backoff
// scheme described in
// https://www.awsarchitectureblog.com/2015/03/backoff.html
//
// For a given attempt the returned duration is picked from
//
//	random_between(unit, min(cap, unit * 2 ** attempt))
//
// using r as the source of randomness. Attempts above 16 are treated as 16.
// When the upper bound does not exceed unit (attempt 0, or cap <= unit) there
// is nothing to randomize: the bound itself is returned, so the result never
// exceeds cap, and r is not consulted.
//
// It panics if unit is greater than one hour.
func exponentialBackoffWait(r *rand.Rand, unit, cap time.Duration) func(uint) time.Duration {
	if unit > time.Hour {
		// Protect against integer overflow: with unit <= 1h and the shift
		// limited to 16 bits the product below always fits in an int64.
		panic("unit cannot exceed one hour")
	}
	return func(attempt uint) time.Duration {
		if attempt > 16 {
			// Protect against integer overflow
			attempt = 16
		}
		sleep := unit * time.Duration(1<<attempt)
		if sleep > cap {
			sleep = cap
		}
		// A non-positive spread means the jitter range is empty or inverted
		// (cap < unit). Subtracting a negative jitter would push the wait
		// above cap, so return the clamped bound unchanged.
		spread := sleep - unit
		if spread <= 0 {
			return sleep
		}
		return sleep - time.Duration(r.Float64()*float64(spread))
	}
}
|
||||||
|
|
||||||
// MarkOffline - will mark a client as being offline and spawns
|
// MarkOffline - will mark a client as being offline and spawns
|
||||||
// a goroutine that will attempt to reconnect if HealthCheckFn is set.
|
// a goroutine that will attempt to reconnect if HealthCheckFn is set.
|
||||||
// returns true if the node changed state from online to offline
|
// returns true if the node changed state from online to offline
|
||||||
@ -347,8 +369,14 @@ func (c *Client) MarkOffline(err error) bool {
|
|||||||
// Start goroutine that will attempt to reconnect.
|
// Start goroutine that will attempt to reconnect.
|
||||||
// If server is already trying to reconnect this will have no effect.
|
// If server is already trying to reconnect this will have no effect.
|
||||||
if c.HealthCheckFn != nil && atomic.CompareAndSwapInt32(&c.connected, online, offline) {
|
if c.HealthCheckFn != nil && atomic.CompareAndSwapInt32(&c.connected, online, offline) {
|
||||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
||||||
go func() {
|
go func() {
|
||||||
|
backOff := exponentialBackoffWait(
|
||||||
|
rand.New(rand.NewSource(time.Now().UnixNano())),
|
||||||
|
200*time.Millisecond,
|
||||||
|
30*time.Second,
|
||||||
|
)
|
||||||
|
|
||||||
|
attempt := uint(0)
|
||||||
for {
|
for {
|
||||||
if atomic.LoadInt32(&c.connected) == closed {
|
if atomic.LoadInt32(&c.connected) == closed {
|
||||||
return
|
return
|
||||||
@ -362,7 +390,8 @@ func (c *Client) MarkOffline(err error) bool {
|
|||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
time.Sleep(time.Duration(r.Float64() * float64(c.HealthCheckInterval)))
|
attempt++
|
||||||
|
time.Sleep(backOff(attempt))
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
return true
|
return true
|
||||||
|
Loading…
Reference in New Issue
Block a user