mirror of
https://github.com/minio/minio.git
synced 2025-11-20 09:56:07 -05:00
heal buckets during init and make sure to wait on quorum (#9526)
heal buckets properly during expansion, and make sure to wait for the quorum properly such that healing can be retried.
This commit is contained in:
@@ -208,20 +208,24 @@ func initSafeMode() (err error) {
|
||||
// version is needed, migration is needed etc.
|
||||
rquorum := InsufficientReadQuorum{}
|
||||
wquorum := InsufficientWriteQuorum{}
|
||||
optimeout := OperationTimedOut{}
|
||||
for n := range newRetryTimerSimple(retryCtx) {
|
||||
for range newRetryTimerSimple(retryCtx) {
|
||||
// let one of the server acquire the lock, if not let them timeout.
|
||||
// which shall be retried again by this loop.
|
||||
if err = txnLk.GetLock(leaderLockTimeout); err == nil {
|
||||
// Migrate all backend configs to encrypted backend configs, optionally
|
||||
// handles rotating keys for encryption, if there is any retriable failure
|
||||
// that shall be retried if there is an error.
|
||||
if err = handleEncryptedConfigBackend(newObject, true); err == nil {
|
||||
// Upon success migrating the config, initialize all sub-systems
|
||||
// if all sub-systems initialized successfully return right away
|
||||
if err = initAllSubsystems(newObject); err == nil {
|
||||
return nil
|
||||
}
|
||||
if err = txnLk.GetLock(leaderLockTimeout); err != nil {
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
|
||||
continue
|
||||
}
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized.. lock acquired")
|
||||
|
||||
// Migrate all backend configs to encrypted backend configs, optionally
|
||||
// handles rotating keys for encryption, if there is any retriable failure
|
||||
// that shall be retried if there is an error.
|
||||
if err = handleEncryptedConfigBackend(newObject, true); err == nil {
|
||||
// Upon success migrating the config, initialize all sub-systems
|
||||
// if all sub-systems initialized successfully return right away
|
||||
if err = initAllSubsystems(newObject); err == nil {
|
||||
// All successful return.
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -230,15 +234,11 @@ func initSafeMode() (err error) {
|
||||
errors.Is(err, errConfigNotFound) ||
|
||||
errors.Is(err, context.Canceled) ||
|
||||
errors.Is(err, context.DeadlineExceeded) ||
|
||||
errors.As(err, &optimeout) ||
|
||||
errors.As(err, &rquorum) ||
|
||||
errors.As(err, &wquorum) ||
|
||||
isErrBucketNotFound(err) {
|
||||
if n < 5 {
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized..")
|
||||
} else {
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized.. possible cause (%v)", err)
|
||||
}
|
||||
logger.Info("Waiting for all MinIO sub-systems to be initialized.. possible cause (%v)", err)
|
||||
txnLk.Unlock() // Unlock the transaction lock and allow other nodes to acquire the lock if possible.
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -256,10 +256,41 @@ func initSafeMode() (err error) {
|
||||
}
|
||||
|
||||
func initAllSubsystems(newObject ObjectLayer) (err error) {
|
||||
// List buckets to be re-used for loading configs.
|
||||
buckets, err := newObject.ListBuckets(GlobalContext)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Unable to list buckets: %w", err)
|
||||
// %w is used by all error returns here to make sure
|
||||
// we wrap the underlying error, make sure when you
|
||||
// are modifying this code that you do so, if and when
|
||||
// you want to add extra context to your error. This
|
||||
// ensures top level retry works accordingly.
|
||||
var buckets []BucketInfo
|
||||
if globalIsDistXL || globalIsXL {
|
||||
// List buckets to heal, and be re-used for loading configs.
|
||||
buckets, err = newObject.ListBucketsHeal(GlobalContext)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Unable to list buckets to heal: %w", err)
|
||||
}
|
||||
// Attempt a heal if possible and re-use the bucket names
|
||||
// to reload their config.
|
||||
wquorum := &InsufficientWriteQuorum{}
|
||||
rquorum := &InsufficientReadQuorum{}
|
||||
for _, bucket := range buckets {
|
||||
if err = newObject.MakeBucketWithLocation(GlobalContext, bucket.Name, ""); err != nil {
|
||||
if errors.As(err, &wquorum) || errors.As(err, &rquorum) {
|
||||
// Retrun the error upwards for the caller to retry.
|
||||
return fmt.Errorf("Unable to heal bucket: %w", err)
|
||||
}
|
||||
if _, ok := err.(BucketExists); !ok {
|
||||
// ignore any other error and log for investigation.
|
||||
logger.LogIf(GlobalContext, err)
|
||||
continue
|
||||
}
|
||||
// Bucket already exists, nothing that needs to be done.
|
||||
}
|
||||
}
|
||||
} else {
|
||||
buckets, err = newObject.ListBuckets(GlobalContext)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Unable to list buckets: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize config system.
|
||||
|
||||
Reference in New Issue
Block a user