fix: allow server not initialized error to be retried (#18300)

Since relaxing the quorum error across pools
for ListBuckets() and GetBucketInfo(), we hit a
situation where loading IAM could potentially
return an error from the second pool saying that
the server is not initialized.

We need to handle this by letting the pool come online
and retrying transparently - this PR fixes that.
This commit is contained in:
Harshavardhana 2023-10-23 12:30:20 -07:00 committed by GitHub
parent bbfea29c2b
commit fd37418da2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 29 additions and 20 deletions

View File

@ -112,15 +112,10 @@ func saveIAMFormat(ctx context.Context, store IAMStorageAPI) error {
bootstrapTraceMsg("Load IAM format file") bootstrapTraceMsg("Load IAM format file")
var iamFmt iamFormat var iamFmt iamFormat
path := getIAMFormatFilePath() path := getIAMFormatFilePath()
if err := store.loadIAMConfig(ctx, &iamFmt, path); err != nil { if err := store.loadIAMConfig(ctx, &iamFmt, path); err != nil && !errors.Is(err, errConfigNotFound) {
switch err {
case errConfigNotFound:
// Need to migrate to V1.
default:
// if IAM format // if IAM format
return err return err
} }
}
if iamFmt.Version >= iamFormatVersion1 { if iamFmt.Version >= iamFormatVersion1 {
// Nothing to do. // Nothing to do.
@ -129,12 +124,7 @@ func saveIAMFormat(ctx context.Context, store IAMStorageAPI) error {
bootstrapTraceMsg("Write IAM format file") bootstrapTraceMsg("Write IAM format file")
// Save iam format to version 1. // Save iam format to version 1.
if err := store.saveIAMConfig(ctx, newIAMFormatVersion1(), path); err != nil { return store.saveIAMConfig(ctx, newIAMFormatVersion1(), path)
logger.LogIf(ctx, err)
return err
}
return nil
} }
func getGroupInfoPath(group string) string { func getGroupInfoPath(group string) string {

View File

@ -293,6 +293,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer, etcdClient *etc
if err := saveIAMFormat(retryCtx, sys.store); err != nil { if err := saveIAMFormat(retryCtx, sys.store); err != nil {
if configRetriableErrors(err) { if configRetriableErrors(err) {
logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err) logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err)
time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
continue continue
} }
logger.LogIf(ctx, fmt.Errorf("IAM sub-system is partially initialized, unable to write the IAM format: %w", err)) logger.LogIf(ctx, fmt.Errorf("IAM sub-system is partially initialized, unable to write the IAM format: %w", err))
@ -307,7 +308,7 @@ func (sys *IAMSys) Init(ctx context.Context, objAPI ObjectLayer, etcdClient *etc
if err := sys.Load(retryCtx, true); err != nil { if err := sys.Load(retryCtx, true); err != nil {
if configRetriableErrors(err) { if configRetriableErrors(err) {
logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err) logger.Info("Waiting for all MinIO IAM sub-system to be initialized.. possible cause (%v)", err)
time.Sleep(time.Duration(r.Float64() * float64(5*time.Second))) time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
continue continue
} }
if err != nil { if err != nil {

View File

@ -372,6 +372,12 @@ func initAllSubsystems(ctx context.Context) {
} }
func configRetriableErrors(err error) bool { func configRetriableErrors(err error) bool {
if err == nil {
return false
}
notInitialized := err.Error() == "Server not initialized, please try again"
// Initializing sub-systems needs a retry mechanism for // Initializing sub-systems needs a retry mechanism for
// the following reasons: // the following reasons:
// - Read quorum is lost just after the initialization // - Read quorum is lost just after the initialization
@ -392,7 +398,8 @@ func configRetriableErrors(err error) bool {
errors.As(err, &wquorum) || errors.As(err, &wquorum) ||
isErrObjectNotFound(err) || isErrObjectNotFound(err) ||
isErrBucketNotFound(err) || isErrBucketNotFound(err) ||
errors.Is(err, os.ErrDeadlineExceeded) errors.Is(err, os.ErrDeadlineExceeded) ||
notInitialized
} }
func bootstrapTraceMsg(msg string) { func bootstrapTraceMsg(msg string) {
@ -813,10 +820,12 @@ func serverMain(ctx *cli.Context) {
}() }()
go func() { go func() {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
if !globalDisableFreezeOnBoot { if !globalDisableFreezeOnBoot {
defer bootstrapTrace("unfreezeServices", unfreezeServices) defer bootstrapTrace("unfreezeServices", unfreezeServices)
t := time.AfterFunc(5*time.Minute, func() { t := time.AfterFunc(5*time.Minute, func() {
logger.Info(color.Yellow("WARNING: Taking more time to initialize the config subsystem. Please set '_MINIO_DISABLE_API_FREEZE_ON_BOOT=true' to not freeze the APIs")) logger.Info(color.Yellow("WARNING: Initializing the config subsystem is taking longer than 5 minutes. Please set '_MINIO_DISABLE_API_FREEZE_ON_BOOT=true' to not freeze the APIs"))
}) })
defer t.Stop() defer t.Stop()
} }
@ -864,10 +873,19 @@ func serverMain(ctx *cli.Context) {
var buckets []BucketInfo var buckets []BucketInfo
// List buckets to initialize bucket metadata sub-sys. // List buckets to initialize bucket metadata sub-sys.
bootstrapTrace("listBuckets", func() { bootstrapTrace("listBuckets", func() {
for {
buckets, err = newObject.ListBuckets(GlobalContext, BucketOptions{}) buckets, err = newObject.ListBuckets(GlobalContext, BucketOptions{})
if err != nil { if err != nil {
if configRetriableErrors(err) {
logger.Info("Waiting for list buckets to succeed to initialize buckets.. possible cause (%v)", err)
time.Sleep(time.Duration(r.Float64() * float64(time.Second)))
continue
}
logger.LogIf(GlobalContext, fmt.Errorf("Unable to list buckets to initialize bucket metadata sub-system: %w", err)) logger.LogIf(GlobalContext, fmt.Errorf("Unable to list buckets to initialize bucket metadata sub-system: %w", err))
} }
break
}
}) })
// Initialize bucket metadata sub-system. // Initialize bucket metadata sub-system.