mirror of https://github.com/minio/minio.git
Add more delays on distributed startup for slow network (#5240)
Refer #5237
This commit is contained in:
parent ffdf115bf2
commit 819d1e80c6
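The change threads a per-wrapper retry interval and retry threshold through initRetryableStorageDisks, so disks wrapped during initialization use the more patient globalStorageInit* values while disks wrapped after startup keep the shorter globalStorage* values. Below is a minimal Go sketch (not part of the commit) of the time-gated reconnect check that the new retryInterval field enables; the reconnect callback is a hypothetical stand-in for retryStorage.reInit.

// Sketch only: mirrors the offline gating shown in the diff below, assuming
// the retryStorage fields it introduces. reconnect stands in for reInit.
package main

import (
	"fmt"
	"time"
)

type retryStorage struct {
	offline          bool          // mark whether the node is offline
	offlineTimestamp time.Time     // last time the node was checked
	retryInterval    time.Duration // e.g. globalStorageHealthCheckInterval
}

// isOffline re-checks an offline node at most once per retryInterval,
// so a slow or flapping network does not trigger a reconnect storm.
func (f *retryStorage) isOffline(reconnect func() error) bool {
	if f.offline && time.Now().UTC().Sub(f.offlineTimestamp) >= f.retryInterval {
		f.offlineTimestamp = time.Now().UTC() // reset the check window
		if reconnect() == nil {
			f.offline = false // node is reachable again
		}
	}
	return f.offline
}

func main() {
	rs := &retryStorage{
		offline:          true,
		offlineTimestamp: time.Now().UTC().Add(-time.Hour),
		retryInterval:    5 * time.Minute,
	}
	// Enough time has passed, so a successful reconnect brings the node back.
	fmt.Println(rs.isOffline(func() error { return nil })) // prints: false
}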
@@ -842,7 +842,7 @@ func (adminAPI adminAPIHandlers) HealFormatHandler(w http.ResponseWriter, r *http.Request)
     // Wrap into retrying disks
     retryingDisks := initRetryableStorageDisks(bootstrapDisks,
-        time.Millisecond, time.Millisecond*5)
+        time.Millisecond, time.Millisecond*5, globalStorageHealthCheckInterval, globalStorageRetryThreshold)
 
     // Heal format.json on available storage.
     err = healFormatXL(retryingDisks)
@@ -318,13 +318,14 @@ func initStorageDisks(endpoints EndpointList) ([]StorageAPI, error) {
 }
 
 // Wrap disks into retryable disks.
-func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap time.Duration) (outDisks []StorageAPI) {
+func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap, retryInterval time.Duration, retryThreshold int) (outDisks []StorageAPI) {
     // Initialize the disk into a retryable-disks wrapper.
     outDisks = make([]StorageAPI, len(disks))
     for i, disk := range disks {
         outDisks[i] = &retryStorage{
             remoteStorage:    disk,
-            maxRetryAttempts: globalStorageRetryThreshold,
+            retryInterval:    retryInterval,
+            maxRetryAttempts: retryThreshold,
             retryUnit:        retryUnit,
             retryCap:         retryCap,
             offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline
@@ -346,19 +347,20 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks []StorageAPI)
     // retry window (30 seconds, with once-per-second retries) so
     // that we wait enough amount of time before the disks come
     // online.
-    retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30)
+    retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30,
+        globalStorageInitHealthCheckInterval, globalStorageInitRetryThreshold)
 
     // Start retry loop retrying until disks are formatted
     // properly, until we have reached a conditional quorum of
     // formatted disks.
-    err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks)
-    if err != nil {
+    if err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks); err != nil {
         return nil, err
     }
 
     // Initialize the disk into a formatted disks wrapper. This
     // uses a shorter retry window (5ms with once-per-ms retries)
-    formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5)
+    formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5,
+        globalStorageHealthCheckInterval, globalStorageRetryThreshold)
 
     // Success.
     return formattedDisks, nil
@@ -23,12 +23,24 @@ import (
 )
 
 const (
+    // NOTE: Values indicated here are based on manual testing and
+    // for best case scenarios under wide array of setups. If you
+    // encounter changes in future feel free to change these values.
+
+    // Attempt to retry only this many number of times before
+    // giving up on the remote disk entirely during initialization.
+    globalStorageInitRetryThreshold = 2
+
     // Attempt to retry only this many number of times before
-    // giving up on the remote disk entirely.
+    // giving up on the remote disk entirely after initialization.
     globalStorageRetryThreshold = 1
 
     // Interval to check health status of a node whether it has
-    // come back up online
+    // come back up online during initialization.
+    globalStorageInitHealthCheckInterval = 15 * time.Minute
+
+    // Interval to check health status of a node whether it has
+    // come back up online.
     globalStorageHealthCheckInterval = 5 * time.Minute
 )
@@ -52,6 +64,7 @@ func retryToStorageErr(err error) error {
 type retryStorage struct {
     remoteStorage    StorageAPI
     maxRetryAttempts int
+    retryInterval    time.Duration
     retryUnit        time.Duration
     retryCap         time.Duration
     offline          bool // Mark whether node is offline
@@ -78,7 +91,7 @@ func (f *retryStorage) Close() (err error) {
 // restore the connection
 func (f *retryStorage) IsOffline() bool {
     // Check if offline and whether enough time has lapsed since most recent check
-    if f.offline && UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
+    if f.offline && UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval {
         f.offlineTimestamp = UTCNow() // reset timestamp
 
         if e := f.reInit(nil); e == nil {
@@ -260,15 +273,13 @@ func (f *retryStorage) reInitUponDiskNotFound(err error) bool {
     return false
 }
 
-// Connect and attempt to load the format from a disconnected node,
-// attempts three times before giving up.
+// Connect and attempt to load the format from a disconnected node.
+// Additionally upon failure, we retry maxRetryAttempts times before
+// giving up. Essentially as a whole it would mean we are infact
+// performing 1 + maxRetryAttempts times reInit.
 func (f *retryStorage) reInit(e error) (err error) {
 
-    // Only after initialization and minimum of one interval
-    // has passed (to prevent marking a node as offline right
-    // after initialization), check whether node has gone offline
-    if f.maxRetryAttempts == globalStorageRetryThreshold &&
-        UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
+    // Check whether node has gone offline.
+    if UTCNow().Sub(f.offlineTimestamp) >= f.retryInterval {
         if e == errDiskNotFoundFromNetError { // Make node offline due to network error
             f.offline = true // Marking node offline
             f.offlineTimestamp = UTCNow()
@@ -299,8 +310,7 @@ func (f *retryStorage) reInit(e error) (err error) {
 
         // Attempt to load format to see if the disk is really
         // a formatted disk and part of the cluster.
-        _, err = loadFormat(f.remoteStorage)
-        if err != nil {
+        if _, err = loadFormat(f.remoteStorage); err != nil {
             // No need to return error until the retry count
             // threshold has reached.
             if i < f.maxRetryAttempts {
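The reworked reInit comment in the diff above notes that the wrapper ends up performing 1 + maxRetryAttempts reInit passes in the worst case. A minimal sketch (not part of the commit) of that accounting follows; attempt is a hypothetical stand-in for a single reInit pass.

// Sketch only: shows why the loop runs 1 + maxRetryAttempts times.
package main

import "fmt"

// tryWithRetries runs attempt once, then retries up to maxRetryAttempts
// more times, returning the last error if every pass fails.
func tryWithRetries(maxRetryAttempts int, attempt func(i int) error) error {
	var err error
	// i == 0 is the initial attempt; i == 1..maxRetryAttempts are retries.
	for i := 0; i <= maxRetryAttempts; i++ {
		if err = attempt(i); err == nil {
			return nil
		}
	}
	return err
}

func main() {
	calls := 0
	_ = tryWithRetries(2, func(i int) error {
		calls++
		return fmt.Errorf("attempt %d failed", i)
	})
	fmt.Println(calls) // prints: 3, i.e. 1 + maxRetryAttempts
}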