
heal buckets during init and make sure to wait on quorum ()

Heal buckets properly during expansion, and make sure to wait
on quorum so that healing can be retried if it fails.
Harshavardhana 2020-05-06 14:25:05 -07:00 committed by GitHub
parent a2ccba69e5
commit 4c9de098b0
5 changed files with 69 additions and 81 deletions

@@ -154,34 +154,6 @@ function run_test_erasure_sets() {
return "$rv"
}
function run_test_dist_erasure_sets_ipv6()
{
minio_pids=( $(start_minio_dist_erasure_sets_ipv6) )
export SERVER_ENDPOINT="[::1]:9000"
(cd "$WORK_DIR" && "$FUNCTIONAL_TESTS")
rv=$?
for pid in "${minio_pids[@]}"; do
kill "$pid"
done
sleep 3
if [ "$rv" -ne 0 ]; then
for i in $(seq 0 9); do
echo "server$i log:"
cat "$WORK_DIR/dist-minio-v6-900$i.log"
done
fi
for i in $(seq 0 9); do
rm -f "$WORK_DIR/dist-minio-v6-900$i.log"
done
return "$rv"
}
function run_test_zone_erasure_sets()
{
minio_pids=( $(start_minio_zone_erasure_sets) )

@@ -208,11 +208,15 @@ func initSafeMode() (err error) {
// version is needed, migration is needed etc.
rquorum := InsufficientReadQuorum{}
wquorum := InsufficientWriteQuorum{}
optimeout := OperationTimedOut{}
for n := range newRetryTimerSimple(retryCtx) {
for range newRetryTimerSimple(retryCtx) {
// let one of the servers acquire the lock; if not, let them time out,
// which shall then be retried again by this loop.
if err = txnLk.GetLock(leaderLockTimeout); err == nil {
if err = txnLk.GetLock(leaderLockTimeout); err != nil {
logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
continue
}
logger.Info("Waiting for all MinIO sub-systems to be initialized.. lock acquired")
// Migrate all backend configs to encrypted backend configs, optionally
// handling key rotation for encryption; any retriable failure
// encountered here shall be retried by this loop.
@@ -220,25 +224,21 @@ func initSafeMode() (err error) {
// Upon success migrating the config, initialize all sub-systems
// if all sub-systems initialized successfully return right away
if err = initAllSubsystems(newObject); err == nil {
// All successful return.
return nil
}
}
}
// One of these retriable errors shall be retried.
if errors.Is(err, errDiskNotFound) ||
errors.Is(err, errConfigNotFound) ||
errors.Is(err, context.Canceled) ||
errors.Is(err, context.DeadlineExceeded) ||
errors.As(err, &optimeout) ||
errors.As(err, &rquorum) ||
errors.As(err, &wquorum) ||
isErrBucketNotFound(err) {
if n < 5 {
logger.Info("Waiting for all MinIO sub-systems to be initialized..")
} else {
logger.Info("Waiting for all MinIO sub-systems to be initialized.. possible cause (%v)", err)
}
txnLk.Unlock() // Unlock the transaction lock and allow other nodes to acquire the lock if possible.
continue
}
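For readers following the control flow: the two hunks above turn initSafeMode into a loop that either acquires the leader lock and initializes, or classifies the failure and retries. Below is a minimal, self-contained sketch of that shape; retryTicks, tryLock and initAll are hypothetical stand-ins for MinIO's newRetryTimerSimple, txnLk.GetLock and initAllSubsystems, and errNoQuorum stands in for the quorum error types.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errNoQuorum stands in for InsufficientReadQuorum / InsufficientWriteQuorum.
var errNoQuorum = errors.New("insufficient quorum")

// retryTicks emits one tick per attempt until ctx is done, loosely
// mimicking newRetryTimerSimple.
func retryTicks(ctx context.Context) <-chan struct{} {
	ch := make(chan struct{})
	go func() {
		defer close(ch)
		for {
			select {
			case <-ctx.Done():
				return
			case ch <- struct{}{}:
				time.Sleep(100 * time.Millisecond) // fixed backoff for the sketch
			}
		}
	}()
	return ch
}

// initSafeModeSketch mirrors the loop above: take the lock or retry,
// initialize or retry, but only when the failure looks transient.
func initSafeModeSketch(ctx context.Context, tryLock, initAll func() error) error {
	var err error
	for range retryTicks(ctx) {
		// Let one of the servers acquire the lock; the rest retry.
		if err = tryLock(); err != nil {
			fmt.Println("waiting.. trying to acquire lock")
			continue
		}
		if err = initAll(); err == nil {
			return nil // all sub-systems initialized
		}
		// Only errors known to be transient re-enter the loop.
		if errors.Is(err, errNoQuorum) {
			fmt.Printf("waiting.. possible cause (%v)\n", err)
			continue
		}
		return err // non-retriable: give up
	}
	return err // ctx expired while retrying
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	attempts := 0
	err := initSafeModeSketch(ctx,
		func() error { return nil }, // the lock always succeeds in this demo
		func() error { // fail twice with a quorum error, then succeed
			attempts++
			if attempts < 3 {
				return fmt.Errorf("init: %w", errNoQuorum)
			}
			return nil
		})
	fmt.Println("init returned:", err) // init returned: <nil>
}

The important property is that only failures known to be transient re-enter the loop; anything else aborts startup.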
@@ -256,11 +256,42 @@ func initSafeMode() (err error) {
}
func initAllSubsystems(newObject ObjectLayer) (err error) {
// List buckets to be re-used for loading configs.
buckets, err := newObject.ListBuckets(GlobalContext)
// %w is used by all error returns here to make sure
// we wrap the underlying error; when you modify this
// code, keep wrapping with %w if and when you add
// extra context to your error. This ensures the
// top-level retry works accordingly.
var buckets []BucketInfo
if globalIsDistXL || globalIsXL {
// List buckets to heal, and be re-used for loading configs.
buckets, err = newObject.ListBucketsHeal(GlobalContext)
if err != nil {
return fmt.Errorf("Unable to list buckets to heal: %w", err)
}
// Attempt a heal if possible and re-use the bucket names
// to reload their config.
wquorum := &InsufficientWriteQuorum{}
rquorum := &InsufficientReadQuorum{}
for _, bucket := range buckets {
if err = newObject.MakeBucketWithLocation(GlobalContext, bucket.Name, ""); err != nil {
if errors.As(err, &wquorum) || errors.As(err, &rquorum) {
// Return the error upwards for the caller to retry.
return fmt.Errorf("Unable to heal bucket: %w", err)
}
if _, ok := err.(BucketExists); !ok {
// ignore any other error and log for investigation.
logger.LogIf(GlobalContext, err)
continue
}
// Bucket already exists, nothing that needs to be done.
}
}
} else {
buckets, err = newObject.ListBuckets(GlobalContext)
if err != nil {
return fmt.Errorf("Unable to list buckets: %w", err)
}
}
// Initialize config system.
if err = globalConfigSys.Init(newObject); err != nil {

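The block comment added to initAllSubsystems insists on %w for every error return; that is what keeps the errors.As checks back in initSafeMode working once extra context has been layered on. A small runnable demo of that property, using a stand-in quorum type rather than MinIO's own:

package main

import (
	"errors"
	"fmt"
)

// InsufficientWriteQuorum is a stand-in for MinIO's error type of the same name.
type InsufficientWriteQuorum struct{}

func (InsufficientWriteQuorum) Error() string { return "insufficient write quorum" }

// healBucket pretends the heal failed for lack of write quorum.
func healBucket(name string) error {
	return InsufficientWriteQuorum{}
}

// initAllSubsystemsSketch wraps with %w, exactly as the comment above
// demands, so the concrete type survives the added context.
func initAllSubsystemsSketch() error {
	if err := healBucket("mybucket"); err != nil {
		return fmt.Errorf("Unable to heal bucket: %w", err)
	}
	return nil
}

func main() {
	err := initAllSubsystemsSketch()
	wquorum := InsufficientWriteQuorum{}
	// The retry loop can still recognize the quorum failure and retry:
	fmt.Println(errors.As(err, &wquorum)) // true
	// Had the wrap used %v instead of %w, this would print false.
}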
@@ -22,6 +22,7 @@ import (
"hash/crc32"
"io"
"net/http"
"sort"
"strings"
"sync"
"time"
@@ -1689,20 +1690,18 @@ func (s *xlSets) HealObject(ctx context.Context, bucket, object string, opts madmin.HealOpts) (madmin.HealResultItem, error) {
// Lists all buckets which need healing.
func (s *xlSets) ListBucketsHeal(ctx context.Context) ([]BucketInfo, error) {
listBuckets := []BucketInfo{}
var healBuckets = map[string]BucketInfo{}
var listBuckets []BucketInfo
var healBuckets = make(map[string]VolInfo)
for _, set := range s.sets {
buckets, _, err := listAllBuckets(set.getDisks())
if err != nil {
// lists all unique buckets across drives.
if err := listAllBuckets(set.getDisks(), healBuckets); err != nil {
return nil, err
}
for _, currBucket := range buckets {
healBuckets[currBucket.Name] = BucketInfo(currBucket)
}
for _, v := range healBuckets {
listBuckets = append(listBuckets, BucketInfo(v))
}
for _, bucketInfo := range healBuckets {
listBuckets = append(listBuckets, bucketInfo)
}
sort.Sort(byBucketName(listBuckets))
return listBuckets, nil
}
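byBucketName is not shown in this diff (only the new "sort" import above hints at it); it is presumably a conventional sort.Interface over []BucketInfo, along the lines of this sketch with BucketInfo trimmed to the one field the comparison needs:

package main

import (
	"fmt"
	"sort"
)

// BucketInfo is trimmed to the one field the comparison needs.
type BucketInfo struct {
	Name string
}

// byBucketName sorts a BucketInfo slice lexically by bucket name.
type byBucketName []BucketInfo

func (d byBucketName) Len() int           { return len(d) }
func (d byBucketName) Less(i, j int) bool { return d[i].Name < d[j].Name }
func (d byBucketName) Swap(i, j int)      { d[i], d[j] = d[j], d[i] }

func main() {
	listBuckets := []BucketInfo{{"zeta"}, {"alpha"}, {"mint"}}
	sort.Sort(byBucketName(listBuckets))
	fmt.Println(listBuckets) // [{alpha} {mint} {zeta}]
}

Since Go 1.8, sort.Slice would achieve the same result without the named type; a named sort.Interface simply keeps the ordering reusable.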

@@ -173,11 +173,7 @@ func healBucket(ctx context.Context, storageDisks []StorageAPI, bucket string, w
// listAllBuckets lists all buckets from all disks. It also
// returns the occurrence of each bucket in all disks
func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
bucketsOcc map[string]int, err error) {
buckets = make(map[string]VolInfo)
bucketsOcc = make(map[string]int)
func listAllBuckets(storageDisks []StorageAPI, healBuckets map[string]VolInfo) (err error) {
for _, disk := range storageDisks {
if disk == nil {
continue
@@ -188,7 +184,7 @@ func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
if IsErrIgnored(err, bucketMetadataOpIgnoredErrs...) {
continue
}
return nil, nil, err
return err
}
for _, volInfo := range volsInfo {
// StorageAPI can send volume names which are
@@ -197,13 +193,14 @@ func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
if isReservedOrInvalidBucket(volInfo.Name, false) {
continue
}
// Increase counter per bucket name
bucketsOcc[volInfo.Name]++
// Save volume info under bucket name
buckets[volInfo.Name] = volInfo
// always save unique buckets across drives.
if _, ok := healBuckets[volInfo.Name]; !ok {
healBuckets[volInfo.Name] = volInfo
}
}
}
return buckets, bucketsOcc, nil
return nil
}
// Only heal on disks where we are sure that healing is needed. We can expand
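The signature change above is the heart of the dedupe: instead of each call allocating and returning its own maps, the caller owns a single healBuckets map that every listing pass fills, so a bucket present on many drives (or in many sets) is recorded once. A reduced, runnable sketch of that accumulator pattern, with VolInfo trimmed to a single field and slices standing in for StorageAPI drives:

package main

import "fmt"

// VolInfo is trimmed to the single field the sketch needs.
type VolInfo struct {
	Name string
}

// listAllBucketsSketch records every volume name it has not seen before
// into healBuckets, keeping the first drive's VolInfo per bucket; the
// caller owns the map, so repeated calls keep accumulating into it.
func listAllBucketsSketch(volsPerDisk [][]VolInfo, healBuckets map[string]VolInfo) {
	for _, volsInfo := range volsPerDisk {
		for _, volInfo := range volsInfo {
			// always save unique buckets across drives.
			if _, ok := healBuckets[volInfo.Name]; !ok {
				healBuckets[volInfo.Name] = volInfo
			}
		}
	}
}

func main() {
	healBuckets := make(map[string]VolInfo)
	// Two "drives" that both carry the bucket "photos".
	disk1 := []VolInfo{{Name: "photos"}, {Name: "docs"}}
	disk2 := []VolInfo{{Name: "photos"}, {Name: "logs"}}
	listAllBucketsSketch([][]VolInfo{disk1, disk2}, healBuckets)
	fmt.Println(len(healBuckets)) // 3 unique buckets: docs, logs, photos
}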

@@ -44,16 +44,6 @@ func (z *xlZones) SingleZone() bool {
return len(z.zones) == 1
}
func (z *xlZones) quickHealBuckets(ctx context.Context) {
bucketsInfo, err := z.ListBucketsHeal(ctx)
if err != nil {
return
}
for _, bucket := range bucketsInfo {
z.MakeBucketWithLocation(ctx, bucket.Name, "")
}
}
// Initialize new zone of erasure sets.
func newXLZones(ctx context.Context, endpointZones EndpointZones) (ObjectLayer, error) {
var (
@@ -88,7 +78,6 @@ func newXLZones(ctx context.Context, endpointZones EndpointZones) (ObjectLayer, error) {
}
}
z.quickHealBuckets(ctx)
go intDataUpdateTracker.start(GlobalContext, localDrives...)
return z, nil
}
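The deleted quickHealBuckets illustrates why the healing moved: it returned silently when ListBucketsHeal failed and discarded the result of every MakeBucketWithLocation call, so a node starting without quorum would simply skip healing with no retry. In the new initAllSubsystems path the wrapped error reaches initSafeMode, which can retry. A sketch of that difference in error handling; the function names here are illustrative, not MinIO's:

package main

import (
	"errors"
	"fmt"
)

var errNoQuorum = errors.New("insufficient write quorum")

// bestEffortHeal mirrors the deleted quickHealBuckets: the failure is
// silently dropped, so the caller never learns healing did not happen.
func bestEffortHeal(heal func() error) {
	_ = heal() // error discarded
}

// propagatingHeal mirrors the new initAllSubsystems path: the wrapped
// error reaches the caller, which can decide to retry.
func propagatingHeal(heal func() error) error {
	if err := heal(); err != nil {
		return fmt.Errorf("Unable to heal bucket: %w", err)
	}
	return nil
}

func main() {
	failing := func() error { return errNoQuorum }

	bestEffortHeal(failing) // nothing for the caller to act on

	err := propagatingHeal(failing)
	fmt.Println(errors.Is(err, errNoQuorum)) // true: a retry is possible
}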