Mirror of https://github.com/minio/minio.git (synced 2025-11-09 13:39:46 -05:00)
Add large bucket support for erasure coded backend (#5160)
This PR implements an object layer which combines input erasure sets of XL layers into a unified namespace. It extends the existing erasure coded implementation. The design assumes that providing more than 16 disks is a static configuration as well, i.e. if you start a setup with 32 disks as 4 sets of 8 disks each, you must always provide those same 4 sets.

Some design details and restrictions:
- Objects are distributed to a unique erasure coded set using consistent ordering (see the sketch below).
- Each set has its own dsync, so locks are synchronized properly at the set (erasure layer) level.
- Each set still has a maximum of 16 disks; you can start with multiple such sets statically.
- The disks in a set are static and cannot be changed; there is no elastic expansion allowed.
- The disks in a set are static and cannot be changed; there is no elastic removal allowed.
- ListObjects() across sets can be noticeably slower, since listing happens on all servers and the results are merged at the sets layer.

Fixes #5465 Fixes #5464 Fixes #5461 Fixes #5460 Fixes #5459 Fixes #5458 Fixes #5488 Fixes #5489 Fixes #5497 Fixes #5496
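To make the "consistent ordering" restriction concrete, here is a minimal, hypothetical sketch of how an object name could be mapped to one erasure set. It is not the code from this PR: the pickSet helper and the CRC-based hash are assumptions used purely for illustration of why the set count must stay fixed.

// Hypothetical illustration only (pickSet and the CRC hash are assumptions,
// not this PR's actual code): an object name deterministically selects one
// erasure set, so the same name always lands on the same set.
package main

import (
	"fmt"
	"hash/crc32"
)

// pickSet returns the index of the erasure set responsible for an object.
// setCount is fixed at startup and must never change for an existing setup.
func pickSet(object string, setCount int) int {
	return int(crc32.ChecksumIEEE([]byte(object))) % setCount
}

func main() {
	// Example: 32 disks started as 4 sets of 8 disks each.
	const setCount = 4
	for _, obj := range []string{"photos/2017/a.jpg", "logs/app.log"} {
		fmt.Printf("%s -> set %d\n", obj, pickSet(obj, setCount))
	}
}

Because such a mapping depends only on the object name and the fixed set count, adding or removing sets would remap existing objects, which is why the configuration must stay static.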
committed by kannappanr
parent dd80256151
commit fb96779a8a
@@ -18,222 +18,60 @@ package cmd
import (
"fmt"
"os"
"time"

"github.com/minio/mc/pkg/console"
"github.com/minio/minio/pkg/errors"
)

var printEndpointError = func() func(Endpoint, error) {
printOnce := make(map[Endpoint]map[string]bool)

return func(endpoint Endpoint, err error) {
m, ok := printOnce[endpoint]
if !ok {
m = make(map[string]bool)
m[err.Error()] = true
printOnce[endpoint] = m
errorIf(err, "%s: %s", endpoint, err)
return
}
if m[err.Error()] {
return
}
m[err.Error()] = true
errorIf(err, "%s: %s", endpoint, err)
}
}()

/*
Following table lists different possible states the backend could be in.

* In a single-node, multi-disk setup, "Online" would refer to disks' status.

* In a multi-node setup, it could refer to disks' or network connectivity
between the nodes, or both.

+----------+--------------------------+-----------------------+
| Online   | Format status            | Course of action      |
|          |                          |                       |
+----------+--------------------------+-----------------------+
| All      | All Formatted            |                       |
+----------+--------------------------+  initObjectLayer      |
| Quorum   | Quorum Formatted         |                       |
+----------+--------------------------+-----------------------+
| All      | Quorum                   | Print message saying  |
|          | Formatted,               | "Heal via control"    |
|          | some unformatted         | and initObjectLayer   |
+----------+--------------------------+-----------------------+
| All      | None Formatted           | FormatDisks           |
|          |                          | and initObjectLayer   |
|          |                          |                       |
+----------+--------------------------+-----------------------+
| No       |                          | Wait till enough      |
| Quorum   |            _             | nodes are online and  |
|          |                          | one of the above      |
|          |                          | sections apply        |
+----------+--------------------------+-----------------------+
|          |                          |                       |
| Quorum   | Quorum UnFormatted       | Abort                 |
+----------+--------------------------+-----------------------+

A disk can be in one of the following states.
- Unformatted
- Formatted
- Corrupted
- Offline

*/
// InitActions - a type synonym for enumerating initialization activities.
type InitActions int

const (
// FormatDisks - see above table for disk states where it is applicable.
FormatDisks InitActions = iota

// SuggestToHeal - Prints heal message and initialize object layer.
SuggestToHeal

// WaitForQuorum - Wait for quorum number of disks to be online.
WaitForQuorum

// WaitForAll - Wait for all disks to be online.
WaitForAll

// WaitForFormatting - Wait for formatting to be triggered
// from the '1st' server in the cluster.
WaitForFormatting

// WaitForConfig - Wait for all servers to have the same config
// including (credentials, version and time).
WaitForConfig

// InitObjectLayer - Initialize object layer.
InitObjectLayer

// Abort initialization of object layer since there aren't enough good
// copies of format.json to recover.
Abort
)

// configErrs contains the list of configuration errors.
var configErrs = []error{
errInvalidAccessKeyID,
errAuthentication,
errRPCAPIVersionUnsupported,
errServerTimeMismatch,
func formatXLMigrateLocalEndpoints(endpoints EndpointList) error {
for _, endpoint := range endpoints {
if !endpoint.IsLocal {
continue
}
formatPath := pathJoin(endpoint.Path, minioMetaBucket, formatConfigFile)
if _, err := os.Stat(formatPath); err != nil {
if os.IsNotExist(err) {
continue
}
return err
}
if err := formatXLMigrate(endpoint.Path); err != nil {
return err
}
}
return nil
}

// Config errs to actions converts looking for specific config errors
// which need to be returned quickly and server should wait instead.
func configErrsToActions(errMap map[error]int) InitActions {
var action InitActions
for _, configErr := range configErrs {
if errMap[configErr] > 0 {
action = WaitForConfig
break
}
}
return action
}

// reduceInitXLErrs reduces errors found in distributed XL initialization
func reduceInitXLErrs(storageDisks []StorageAPI, sErrs []error) error {
var foundErrs int
for i := range sErrs {
if sErrs[i] != nil {
foundErrs++
}
}
if foundErrs == 0 {
return nil
}
// Early quit if there is a config error
for i := range sErrs {
if contains(configErrs, sErrs[i]) {
return fmt.Errorf("%s: %s", storageDisks[i], sErrs[i])
}
}
// Combine all disk errors otherwise for user inspection
return fmt.Errorf("%s", combineDiskErrs(storageDisks, sErrs))
}

// Preparatory initialization stage for XL validates known errors.
// Converts them into specific actions. These actions have special purpose
// which caller decides on what needs to be done.

// Logic used in this function is as shown below.
//
// ---- Possible states and handled conditions -----
//
// - Formatted setup
// - InitObjectLayer when `disksFormatted >= readQuorum`
// - Wait for quorum when `disksFormatted < readQuorum && disksFormatted + disksOffline >= readQuorum`
// (we don't know yet if there are unformatted disks)
// - Wait for heal when `disksFormatted >= readQuorum && disksUnformatted > 0`
// (here we know there is at least one unformatted disk which requires healing)
//
// - Unformatted setup
// - Format/Wait for format when `disksUnformatted == diskCount`
//
// - Wait for all when `disksUnformatted + disksFormatted + diskOffline == diskCount`
//
// Under all other conditions should lead to server initialization aborted.
func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
// Count errors by error value.
errMap := make(map[error]int)
for _, err := range sErrs {
errMap[errors.Cause(err)]++
// Format disks before initialization of object layer.
func waitForFormatXL(firstDisk bool, endpoints EndpointList, setCount, disksPerSet int) (format *formatXLV2, err error) {
if len(endpoints) == 0 || setCount == 0 || disksPerSet == 0 {
return nil, errInvalidArgument
}

// Validates and converts specific config errors into WaitForConfig.
if configErrsToActions(errMap) == WaitForConfig {
return WaitForConfig
}

readQuorum := diskCount / 2
disksOffline := errMap[errDiskNotFound]
disksFormatted := errMap[nil]
disksUnformatted := errMap[errUnformattedDisk]

// No Quorum lots of offline disks, wait for quorum.
if disksOffline > readQuorum {
return WaitForQuorum
}

// All disks are unformatted, proceed to formatting disks.
if disksUnformatted == diskCount {
// Only the first server formats an uninitialized setup, others wait for notification.
if firstDisk { // First node always initializes.
return FormatDisks
}
return WaitForFormatting
}

// Already formatted and in quorum, proceed to initialization of object layer.
if disksFormatted >= readQuorum {
if disksFormatted+disksOffline == diskCount {
return InitObjectLayer
}

// Some of the formatted disks are possibly corrupted or unformatted,
// let user know to heal them.
return SuggestToHeal
}

// Some unformatted, some disks formatted and some disks are offline but we don't
// quorum to decide. This is an undecisive state - wait for all of offline disks
// to be online to figure out the course of action.
if disksUnformatted+disksFormatted+disksOffline == diskCount {
return WaitForAll
}

// Exhausted all our checks, un-handled situations such as some disks corrupted we Abort.
return Abort
}

// Prints retry message upon a specific retry count.
func printRetryMsg(sErrs []error, storageDisks []StorageAPI) {
for i, sErr := range sErrs {
switch sErr {
case errDiskNotFound, errFaultyDisk, errFaultyRemoteDisk:
errorIf(sErr, "Disk %s is still unreachable", storageDisks[i])
}
}
}

// Maximum retry attempts.
const maxRetryAttempts = 5

// Implements a jitter backoff loop for formatting all disks during
// initialization of the server.
func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks []StorageAPI) error {
if len(endpoints) == 0 {
return errInvalidArgument
}
if storageDisks == nil {
return errInvalidArgument
if err = formatXLMigrateLocalEndpoints(endpoints); err != nil {
return nil, err
}

// Done channel is used to close any lingering retry routine, as soon
@@ -254,138 +92,54 @@ func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks
retryTimerCh := newRetryTimerSimple(doneCh)
for {
select {
case retryCount := <-retryTimerCh:
case _ = <-retryTimerCh:
// Attempt to load all `format.json` from all disks.
formatConfigs, sErrs := loadAllFormats(storageDisks)
if retryCount > maxRetryAttempts {
// After max retry attempts we start printing
// actual errors for disks not being available.
printRetryMsg(sErrs, storageDisks)
}
formatConfigs, sErrs := loadFormatXLAll(endpoints)

// Pre-emptively check if one of the formatted disks
// is invalid. This function returns success for the
// most part unless one of the formats is not consistent
// with expected XL format. For example if a user is
// trying to pool FS backend into an XL set.
if index, err := checkFormatXLValues(formatConfigs); err != nil {
// We will perhaps print and retry for the first 5 attempts
// because in XL initialization first server is the one which
// initializes the erasure set. This check ensures that the
// rest of the other servers do get a chance to see that the
// first server has a wrong format and exit gracefully.
// refer - https://github.com/minio/minio/issues/4140
if retryCount > maxRetryAttempts {
errorIf(err, "%s : Detected disk in unexpected format",
storageDisks[index])
if err = checkFormatXLValues(formatConfigs); err != nil {
return nil, err
}

for i, sErr := range sErrs {
if _, ok := formatCriticalErrors[errors.Cause(sErr)]; ok {
return nil, fmt.Errorf("Disk %s: %s", endpoints[i], sErr)
}
}

if shouldInitXLDisks(sErrs) {
if !firstDisk {
console.Println("Waiting for the first server to format the disks.")
continue
}
return err
return initFormatXL(endpoints, setCount, disksPerSet)
}

// Check if this is a XL or distributed XL, anything > 1 is considered XL backend.
switch prepForInitXL(firstDisk, sErrs, len(storageDisks)) {
case Abort:
return reduceInitXLErrs(storageDisks, sErrs)
case FormatDisks:
printFormatMsg(endpoints, storageDisks, printOnceFn())
return initFormatXL(storageDisks)
case InitObjectLayer:
// Validate formats loaded before proceeding forward.
err := genericFormatCheckXL(formatConfigs, sErrs)
if err == nil {
printRegularMsg(endpoints, storageDisks, printOnceFn())
format, err = getFormatXLInQuorum(formatConfigs)
if err == nil {
for i := range formatConfigs {
if formatConfigs[i] == nil {
continue
}
if err = formatXLV2Check(format, formatConfigs[i]); err != nil {
return nil, fmt.Errorf("%s format error: %s", endpoints[i], err)
}
}
return err
case SuggestToHeal:
// Validate formats loaded before proceeding forward.
err := genericFormatCheckXL(formatConfigs, sErrs)
if err == nil {
printHealMsg(endpoints, storageDisks, printOnceFn())
if len(format.XL.Sets) != globalXLSetCount {
return nil, fmt.Errorf("Current backend format is inconsistent with input args (%s), Expected set count %d, got %d", endpoints, len(format.XL.Sets), globalXLSetCount)
}
return err
case WaitForQuorum:
log.Printf(
"Initializing data volume. Waiting for minimum %d servers to come online. (elapsed %s)\n",
len(storageDisks)/2+1, getElapsedTime(),
)
case WaitForConfig:
// Print configuration errors.
log.Printf(
"Initializing data volume. Waiting for configuration issues to be fixed (%s). (elapsed %s)\n",
reduceInitXLErrs(storageDisks, sErrs), getElapsedTime())
case WaitForAll:
log.Printf("Initializing data volume for first time. Waiting for other servers to come online (elapsed %s)\n", getElapsedTime())
case WaitForFormatting:
log.Printf("Initializing data volume for first time. Waiting for first server to come online (elapsed %s)\n", getElapsedTime())
if len(format.XL.Sets[0]) != globalXLSetDriveCount {
return nil, fmt.Errorf("Current backend format is inconsistent with input args (%s), Expected drive count per set %d, got %d", endpoints, len(format.XL.Sets[0]), globalXLSetDriveCount)
}
return format, nil
}
console.Printf("Waiting for a minimum of %d disks to come online (elapsed %s)\n", len(endpoints)/2, getElapsedTime())
case <-globalOSSignalCh:
return fmt.Errorf("Initializing data volumes gracefully stopped")
return nil, fmt.Errorf("Initializing data volumes gracefully stopped")
}
}
}

// Initialize storage disks based on input arguments.
func initStorageDisks(endpoints EndpointList) ([]StorageAPI, error) {
// Bootstrap disks.
storageDisks := make([]StorageAPI, len(endpoints))
for index, endpoint := range endpoints {
// Intentionally ignore disk not found errors. XL is designed
// to handle these errors internally.
storage, err := newStorageAPI(endpoint)
if err != nil && err != errDiskNotFound {
return nil, err
}
storageDisks[index] = storage
}
return storageDisks, nil
}

// Wrap disks into retryable disks.
func initRetryableStorageDisks(disks []StorageAPI, retryUnit, retryCap, retryInterval time.Duration, retryThreshold int) (outDisks []StorageAPI) {
// Initialize the disk into a retryable-disks wrapper.
outDisks = make([]StorageAPI, len(disks))
for i, disk := range disks {
outDisks[i] = &retryStorage{
remoteStorage: disk,
retryInterval: retryInterval,
maxRetryAttempts: retryThreshold,
retryUnit: retryUnit,
retryCap: retryCap,
offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline
}
}
return
}

// Format disks before initialization of object layer.
func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks []StorageAPI) (formattedDisks []StorageAPI, err error) {
if len(endpoints) == 0 {
return nil, errInvalidArgument
}
if storageDisks == nil {
return nil, errInvalidArgument
}

// Retryable disks before formatting, we need to have a larger
// retry window (30 seconds, with once-per-second retries) so
// that we wait enough amount of time before the disks come
// online.
retryDisks := initRetryableStorageDisks(storageDisks, time.Second, time.Second*30,
globalStorageInitHealthCheckInterval, globalStorageInitRetryThreshold)

// Start retry loop retrying until disks are formatted
// properly, until we have reached a conditional quorum of
// formatted disks.
if err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks); err != nil {
return nil, err
}

// Initialize the disk into a formatted disks wrapper. This
// uses a shorter retry window (5ms with once-per-ms retries)
formattedDisks = initRetryableStorageDisks(storageDisks, time.Millisecond, time.Millisecond*5,
globalStorageHealthCheckInterval, globalStorageRetryThreshold)

// Success.
return formattedDisks, nil
}