Mirror of https://github.com/minio/minio.git
Handle read/quorum errors when initializing all subsystems (#6585)
- Only require len(disks)/2 disks to initialize the cluster
- Fix checking of read/write quorum in subsystems init
- Add retry mechanism in policy and notification subsystems to avoid aborting in case of read/write quorum errors
This commit is contained in:
parent d8a2975a68
commit cbc5d78a09
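All three fixes share one pattern: block on a retry-timer channel and keep retrying while the failure looks like a transient loss of read/write quorum. A condensed sketch of that loop, written against the identifiers that appear in the diff below; `initWithRetry` and its `initSubsystem` parameter are illustrative helpers, not names from the codebase:

// Sketch of the retry loop that ConfigSys.Init, NotificationSys.Init and
// PolicySys.Init now share. initSubsystem stands in for initConfig or
// sys.refresh; newRetryTimerSimple and the quorum error types are the
// ones referenced by the diff.
func initWithRetry(objAPI ObjectLayer, initSubsystem func(ObjectLayer) error) error {
    doneCh := make(chan struct{})
    defer close(doneCh)

    retryTimerCh := newRetryTimerSimple(doneCh)
    for {
        select {
        case <-retryTimerCh:
            err := initSubsystem(objAPI)
            if err == nil {
                return nil
            }
            // Quorum errors are transient while other disks come up: retry.
            if strings.Contains(err.Error(), InsufficientReadQuorum{}.Error()) ||
                strings.Contains(err.Error(), InsufficientWriteQuorum{}.Error()) {
                logger.Info("Waiting for subsystem to be initialized..")
                continue
            }
            return err
        }
    }
}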
cmd/config.go
@@ -26,6 +26,7 @@ import (
     "os"
     "path"
     "runtime"
+    "strings"
     "time"
 
     "github.com/minio/minio/cmd/logger"
@@ -234,7 +235,8 @@ func (sys *ConfigSys) Init(objAPI ObjectLayer) error {
         case _ = <-retryTimerCh:
             err := initConfig(objAPI)
             if err != nil {
-                if isInsufficientReadQuorum(err) || isInsufficientWriteQuorum(err) {
+                if strings.Contains(err.Error(), InsufficientReadQuorum{}.Error()) ||
+                    strings.Contains(err.Error(), InsufficientWriteQuorum{}.Error()) {
                     logger.Info("Waiting for configuration to be initialized..")
                     continue
                 }
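The swap from the typed isInsufficientReadQuorum(err) check to strings.Contains matters in distributed mode: when an error crosses an RPC hop, the receiving node rebuilds it from its message text, so the concrete Go type is lost and a type-based check silently stops matching. A self-contained sketch of that failure mode (the error message text here is made up for the demo):

package main

import (
    "errors"
    "fmt"
    "strings"
)

// InsufficientReadQuorum mimics the minio error type: only its Error()
// text survives an RPC hop; the concrete type does not.
type InsufficientReadQuorum struct{}

func (e InsufficientReadQuorum) Error() string {
    return "insufficient read quorum"
}

func main() {
    // Locally the concrete type is intact, so a typed check works.
    var local error = InsufficientReadQuorum{}
    _, ok := local.(InsufficientReadQuorum)
    fmt.Println("typed check on local error:", ok) // true

    // A peer reconstructs the error from its message: the type is gone.
    remote := errors.New(InsufficientReadQuorum{}.Error())
    _, ok = remote.(InsufficientReadQuorum)
    fmt.Println("typed check on remote error:", ok) // false

    // Matching on the message still works, which is what the diff does.
    fmt.Println("string check on remote error:",
        strings.Contains(remote.Error(), InsufficientReadQuorum{}.Error())) // true
}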
cmd/notification.go
@@ -241,10 +241,10 @@ func (sys *NotificationSys) initListeners(ctx context.Context, objAPI ObjectLayer
     // and configFile, take a transaction lock to avoid data race between readConfig()
     // and saveConfig().
     objLock := globalNSMutex.NewNSLock(minioMetaBucket, transactionConfigFile)
-    if err := objLock.GetLock(globalOperationTimeout); err != nil {
+    if err := objLock.GetRLock(globalOperationTimeout); err != nil {
         return err
     }
-    defer objLock.Unlock()
+    defer objLock.RUnlock()
 
     reader, e := readConfig(ctx, objAPI, configFile)
     if e != nil && !IsErrIgnored(e, errDiskNotFound, errConfigNotFound) {
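initListeners only reads listener.json on this path, so the exclusive GetLock/Unlock pair was stronger than needed; GetRLock/RUnlock lets concurrent readers through and reserves exclusion for writers. A toy illustration of the same semantics using sync.RWMutex as a stand-in for the NSLock (timings are illustrative):

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    var mu sync.RWMutex // stand-in for the NSLock on transactionConfigFile
    var wg sync.WaitGroup

    readConfig := func(id int) {
        defer wg.Done()
        mu.RLock() // shared: concurrent readers do not serialize
        defer mu.RUnlock()
        time.Sleep(50 * time.Millisecond) // pretend to read listener.json
        fmt.Println("reader", id, "done")
    }

    start := time.Now()
    for i := 0; i < 4; i++ {
        wg.Add(1)
        go readConfig(i)
    }
    wg.Wait()
    // With RLock this is ~50ms; with an exclusive Lock it would be ~200ms.
    fmt.Println("elapsed:", time.Since(start).Round(10*time.Millisecond))
}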
@@ -265,7 +265,6 @@ func (sys *NotificationSys) initListeners(ctx context.Context, objAPI ObjectLayer
         return nil
     }
-
     activeListenerList := []ListenBucketNotificationArgs{}
     for _, args := range listenerList {
         found, err := isLocalHost(args.Addr.Name)
         if err != nil {
@@ -301,16 +300,31 @@ func (sys *NotificationSys) initListeners(ctx context.Context, objAPI ObjectLayer
             logger.LogIf(ctx, err)
             return err
         }
         activeListenerList = append(activeListenerList, args)
     }
 
-    data, err := json.Marshal(activeListenerList)
-    if err != nil {
-        logger.LogIf(ctx, err)
-        return err
-    }
-
-    return saveConfig(objAPI, configFile, data)
+    return nil
+}
+
+func (sys *NotificationSys) refresh(objAPI ObjectLayer) error {
+    buckets, err := objAPI.ListBuckets(context.Background())
+    if err != nil {
+        return err
+    }
+
+    for _, bucket := range buckets {
+        ctx := logger.SetReqInfo(context.Background(), &logger.ReqInfo{BucketName: bucket.Name})
+        config, err := readNotificationConfig(ctx, objAPI, bucket.Name)
+        if err != nil && err != errNoSuchNotifications {
+            return err
+        }
+        if err == errNoSuchNotifications {
+            continue
+        }
+        sys.AddRulesMap(bucket.Name, config.ToRulesMap())
+        if err = sys.initListeners(ctx, objAPI, bucket.Name); err != nil {
+            return err
+        }
+    }
+    return nil
 }
 
 // Init - initializes notification system from notification.xml and listener.json of all buckets.
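The rewritten Init below blocks on retryTimerCh. For orientation, here is a hypothetical sketch of what a newRetryTimerSimple-style helper can look like: a channel that yields attempt numbers with growing backoff until doneCh closes. This is not minio's actual implementation, which may differ in delays, jitter and caps:

package main

import (
    "fmt"
    "time"
)

// Hypothetical sketch: a retry-timer channel that yields attempt numbers,
// doubling the delay between fires (capped), stopping when doneCh closes.
func newRetryTimerSketch(doneCh chan struct{}) <-chan int {
    attemptCh := make(chan int)
    go func() {
        defer close(attemptCh)
        delay := 50 * time.Millisecond
        for attempt := 0; ; attempt++ {
            select {
            case attemptCh <- attempt: // first fire is immediate
            case <-doneCh:
                return
            }
            select {
            case <-time.After(delay):
            case <-doneCh:
                return
            }
            if delay < 2*time.Second {
                delay *= 2 // exponential backoff, capped
            }
        }
    }()
    return attemptCh
}

func main() {
    doneCh := make(chan struct{})
    defer close(doneCh)

    retryTimerCh := newRetryTimerSketch(doneCh)
    for attempt := range retryTimerCh {
        fmt.Println("attempt", attempt)
        if attempt == 3 { // pretend init succeeded on the fourth try
            return
        }
    }
}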
@@ -319,28 +333,29 @@ func (sys *NotificationSys) Init(objAPI ObjectLayer) error {
         return errInvalidArgument
     }
 
-    buckets, err := objAPI.ListBuckets(context.Background())
-    if err != nil {
-        return err
-    }
+    doneCh := make(chan struct{})
+    defer close(doneCh)
 
-    for _, bucket := range buckets {
-        ctx := logger.SetReqInfo(context.Background(), &logger.ReqInfo{BucketName: bucket.Name})
-        config, err := readNotificationConfig(ctx, objAPI, bucket.Name)
-        if err != nil {
-            if !IsErrIgnored(err, errDiskNotFound, errNoSuchNotifications) {
-                return err
-            }
-        } else {
-            sys.AddRulesMap(bucket.Name, config.ToRulesMap())
-        }
-
-        if err = sys.initListeners(ctx, objAPI, bucket.Name); err != nil {
-            return err
-        }
-    }
-
-    return nil
+    // Initializing notification needs a retry mechanism for
+    // the following reasons:
+    //  - Read quorum is lost just after the initialization
+    //    of the object layer.
+    retryTimerCh := newRetryTimerSimple(doneCh)
+    for {
+        select {
+        case _ = <-retryTimerCh:
+            if err := sys.refresh(objAPI); err != nil {
+                if err == errDiskNotFound ||
+                    strings.Contains(err.Error(), InsufficientReadQuorum{}.Error()) ||
+                    strings.Contains(err.Error(), InsufficientWriteQuorum{}.Error()) {
+                    logger.Info("Waiting for notification subsystem to be initialized..")
+                    continue
+                }
+                return err
+            }
+            return nil
+        }
+    }
 }
 
 // AddRulesMap - adds rules map for bucket name.
cmd/policy.go
@@ -21,6 +21,7 @@ import (
     "encoding/json"
     "net/http"
     "path"
+    "strings"
     "sync"
     "time"
 
@@ -131,25 +132,46 @@ func (sys *PolicySys) Init(objAPI ObjectLayer) error {
         return errInvalidArgument
     }
 
-    // Load PolicySys once during boot.
-    if err := sys.refresh(objAPI); err != nil {
-        return err
-    }
-
-    // Refresh PolicySys in background.
-    go func() {
-        ticker := time.NewTicker(globalRefreshBucketPolicyInterval)
-        defer ticker.Stop()
-        for {
-            select {
-            case <-globalServiceDoneCh:
-                return
-            case <-ticker.C:
-                sys.refresh(objAPI)
+    defer func() {
+        // Refresh PolicySys in background.
+        go func() {
+            ticker := time.NewTicker(globalRefreshBucketPolicyInterval)
+            defer ticker.Stop()
+            for {
+                select {
+                case <-globalServiceDoneCh:
+                    return
+                case <-ticker.C:
+                    sys.refresh(objAPI)
+                }
             }
-        }
-    }()
-    return nil
+        }()
+    }()
+
+    doneCh := make(chan struct{})
+    defer close(doneCh)
+
+    // Initializing policy needs a retry mechanism for
+    // the following reasons:
+    //  - Read quorum is lost just after the initialization
+    //    of the object layer.
+    retryTimerCh := newRetryTimerSimple(doneCh)
+    for {
+        select {
+        case _ = <-retryTimerCh:
+            // Load PolicySys once during boot.
+            if err := sys.refresh(objAPI); err != nil {
+                if err == errDiskNotFound ||
+                    strings.Contains(err.Error(), InsufficientReadQuorum{}.Error()) ||
+                    strings.Contains(err.Error(), InsufficientWriteQuorum{}.Error()) {
+                    logger.Info("Waiting for policy subsystem to be initialized..")
+                    continue
+                }
+                return err
+            }
+            return nil
+        }
+    }
 }
 
 // NewPolicySys - creates new policy system.
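Note the new defer func() { go func() {...}() }() wrapper in PolicySys.Init: the background refresher is now started only when Init returns, i.e. after the retry loop has completed the first successful load (or bailed out with a hard error), rather than unconditionally before the first load has settled. A minimal demonstration of that ordering, with illustrative names:

package main

import (
    "fmt"
    "time"
)

func initLike() error {
    defer func() {
        // Runs only as initLike returns: the refresher cannot start
        // before the initial (retried) load has finished.
        go func() { fmt.Println("background refresher started") }()
    }()

    fmt.Println("initial load (retry loop) finished")
    return nil
}

func main() {
    _ = initLike()
    time.Sleep(100 * time.Millisecond) // let the goroutine print
}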
cmd/xl-sets.go
@@ -174,7 +174,7 @@ func (s *xlSets) reInitDisks(refFormat *formatXLV3, storageDisks []StorageAPI, f
 // any given sets.
 func (s *xlSets) connectDisksWithQuorum() {
     var onlineDisks int
-    for onlineDisks < (len(s.endpoints)/2)+1 {
+    for onlineDisks < len(s.endpoints)/2 {
         for _, endpoint := range s.endpoints {
             if s.isConnected(endpoint) {
                 continue
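The relaxed loop condition implements the first bullet of the commit message: connectDisksWithQuorum now proceeds once half of the endpoints are connected instead of waiting for a strict majority. A quick check of the thresholds:

package main

import "fmt"

func main() {
    for _, n := range []int{4, 8, 16} {
        oldNeed := n/2 + 1 // old loop exits once onlineDisks reaches a majority
        newNeed := n / 2   // new loop exits at half
        fmt.Printf("%2d endpoints: old waited for %2d disks, new waits for %2d\n",
            n, oldNeed, newNeed)
    }
}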