heal buckets during init and make sure to wait on quorum (#9526)

heal buckets properly during expansion, and make sure
to wait for the quorum properly such that healing can
be retried.
This commit is contained in:
Harshavardhana 2020-05-06 14:25:05 -07:00 committed by GitHub
parent a2ccba69e5
commit 4c9de098b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 69 additions and 81 deletions

View File

@ -154,34 +154,6 @@ function run_test_erasure_sets() {
return "$rv"
}
function run_test_dist_erasure_sets_ipv6()
{
minio_pids=( $(start_minio_dist_erasure_sets_ipv6) )
export SERVER_ENDPOINT="[::1]:9000"
(cd "$WORK_DIR" && "$FUNCTIONAL_TESTS")
rv=$?
for pid in "${minio_pids[@]}"; do
kill "$pid"
done
sleep 3
if [ "$rv" -ne 0 ]; then
for i in $(seq 0 9); do
echo "server$i log:"
cat "$WORK_DIR/dist-minio-v6-900$i.log"
done
fi
for i in $(seq 0 9); do
rm -f "$WORK_DIR/dist-minio-v6-900$i.log"
done
return "$rv"
}
function run_test_zone_erasure_sets()
{
minio_pids=( $(start_minio_zone_erasure_sets) )

View File

@ -208,20 +208,24 @@ func initSafeMode() (err error) {
// version is needed, migration is needed etc.
rquorum := InsufficientReadQuorum{}
wquorum := InsufficientWriteQuorum{}
optimeout := OperationTimedOut{}
for n := range newRetryTimerSimple(retryCtx) {
for range newRetryTimerSimple(retryCtx) {
// let one of the server acquire the lock, if not let them timeout.
// which shall be retried again by this loop.
if err = txnLk.GetLock(leaderLockTimeout); err == nil {
// Migrate all backend configs to encrypted backend configs, optionally
// handles rotating keys for encryption, if there is any retriable failure
// that shall be retried if there is an error.
if err = handleEncryptedConfigBackend(newObject, true); err == nil {
// Upon success migrating the config, initialize all sub-systems
// if all sub-systems initialized successfully return right away
if err = initAllSubsystems(newObject); err == nil {
return nil
}
if err = txnLk.GetLock(leaderLockTimeout); err != nil {
logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
continue
}
logger.Info("Waiting for all MinIO sub-systems to be initialized.. lock acquired")
// Migrate all backend configs to encrypted backend configs, optionally
// handles rotating keys for encryption, if there is any retriable failure
// that shall be retried if there is an error.
if err = handleEncryptedConfigBackend(newObject, true); err == nil {
// Upon success migrating the config, initialize all sub-systems
// if all sub-systems initialized successfully return right away
if err = initAllSubsystems(newObject); err == nil {
// All successful return.
return nil
}
}
@ -230,15 +234,11 @@ func initSafeMode() (err error) {
errors.Is(err, errConfigNotFound) ||
errors.Is(err, context.Canceled) ||
errors.Is(err, context.DeadlineExceeded) ||
errors.As(err, &optimeout) ||
errors.As(err, &rquorum) ||
errors.As(err, &wquorum) ||
isErrBucketNotFound(err) {
if n < 5 {
logger.Info("Waiting for all MinIO sub-systems to be initialized..")
} else {
logger.Info("Waiting for all MinIO sub-systems to be initialized.. possible cause (%v)", err)
}
logger.Info("Waiting for all MinIO sub-systems to be initialized.. possible cause (%v)", err)
txnLk.Unlock() // Unlock the transaction lock and allow other nodes to acquire the lock if possible.
continue
}
@ -256,10 +256,41 @@ func initSafeMode() (err error) {
}
func initAllSubsystems(newObject ObjectLayer) (err error) {
// List buckets to be re-used for loading configs.
buckets, err := newObject.ListBuckets(GlobalContext)
if err != nil {
return fmt.Errorf("Unable to list buckets: %w", err)
// %w is used by all error returns here to make sure
// we wrap the underlying error, make sure when you
// are modifying this code that you do so, if and when
// you want to add extra context to your error. This
// ensures top level retry works accordingly.
var buckets []BucketInfo
if globalIsDistXL || globalIsXL {
// List buckets to heal, and be re-used for loading configs.
buckets, err = newObject.ListBucketsHeal(GlobalContext)
if err != nil {
return fmt.Errorf("Unable to list buckets to heal: %w", err)
}
// Attempt a heal if possible and re-use the bucket names
// to reload their config.
wquorum := &InsufficientWriteQuorum{}
rquorum := &InsufficientReadQuorum{}
for _, bucket := range buckets {
if err = newObject.MakeBucketWithLocation(GlobalContext, bucket.Name, ""); err != nil {
if errors.As(err, &wquorum) || errors.As(err, &rquorum) {
// Retrun the error upwards for the caller to retry.
return fmt.Errorf("Unable to heal bucket: %w", err)
}
if _, ok := err.(BucketExists); !ok {
// ignore any other error and log for investigation.
logger.LogIf(GlobalContext, err)
continue
}
// Bucket already exists, nothing that needs to be done.
}
}
} else {
buckets, err = newObject.ListBuckets(GlobalContext)
if err != nil {
return fmt.Errorf("Unable to list buckets: %w", err)
}
}
// Initialize config system.

View File

@ -22,6 +22,7 @@ import (
"hash/crc32"
"io"
"net/http"
"sort"
"strings"
"sync"
"time"
@ -1689,20 +1690,18 @@ func (s *xlSets) HealObject(ctx context.Context, bucket, object string, opts mad
// Lists all buckets which need healing.
func (s *xlSets) ListBucketsHeal(ctx context.Context) ([]BucketInfo, error) {
listBuckets := []BucketInfo{}
var healBuckets = map[string]BucketInfo{}
var listBuckets []BucketInfo
var healBuckets = make(map[string]VolInfo)
for _, set := range s.sets {
buckets, _, err := listAllBuckets(set.getDisks())
if err != nil {
// lists all unique buckets across drives.
if err := listAllBuckets(set.getDisks(), healBuckets); err != nil {
return nil, err
}
for _, currBucket := range buckets {
healBuckets[currBucket.Name] = BucketInfo(currBucket)
}
}
for _, bucketInfo := range healBuckets {
listBuckets = append(listBuckets, bucketInfo)
for _, v := range healBuckets {
listBuckets = append(listBuckets, BucketInfo(v))
}
sort.Sort(byBucketName(listBuckets))
return listBuckets, nil
}

View File

@ -173,11 +173,7 @@ func healBucket(ctx context.Context, storageDisks []StorageAPI, bucket string, w
// listAllBuckets lists all buckets from all disks. It also
// returns the occurrence of each buckets in all disks
func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
bucketsOcc map[string]int, err error) {
buckets = make(map[string]VolInfo)
bucketsOcc = make(map[string]int)
func listAllBuckets(storageDisks []StorageAPI, healBuckets map[string]VolInfo) (err error) {
for _, disk := range storageDisks {
if disk == nil {
continue
@ -188,7 +184,7 @@ func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
if IsErrIgnored(err, bucketMetadataOpIgnoredErrs...) {
continue
}
return nil, nil, err
return err
}
for _, volInfo := range volsInfo {
// StorageAPI can send volume names which are
@ -197,13 +193,14 @@ func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
if isReservedOrInvalidBucket(volInfo.Name, false) {
continue
}
// Increase counter per bucket name
bucketsOcc[volInfo.Name]++
// Save volume info under bucket name
buckets[volInfo.Name] = volInfo
// always save unique buckets across drives.
if _, ok := healBuckets[volInfo.Name]; !ok {
healBuckets[volInfo.Name] = volInfo
}
}
}
return buckets, bucketsOcc, nil
return nil
}
// Only heal on disks where we are sure that healing is needed. We can expand

View File

@ -44,16 +44,6 @@ func (z *xlZones) SingleZone() bool {
return len(z.zones) == 1
}
func (z *xlZones) quickHealBuckets(ctx context.Context) {
bucketsInfo, err := z.ListBucketsHeal(ctx)
if err != nil {
return
}
for _, bucket := range bucketsInfo {
z.MakeBucketWithLocation(ctx, bucket.Name, "")
}
}
// Initialize new zone of erasure sets.
func newXLZones(ctx context.Context, endpointZones EndpointZones) (ObjectLayer, error) {
var (
@ -88,7 +78,6 @@ func newXLZones(ctx context.Context, endpointZones EndpointZones) (ObjectLayer,
}
}
z.quickHealBuckets(ctx)
go intDataUpdateTracker.start(GlobalContext, localDrives...)
return z, nil
}