minio/cmd/prepare-storage.go
Aditya Manthramurthy 604417baf4 Allow cluster to start when only n/2 servers are up (#4066)
Fixes #3234.

Relaxes the quorum requirement to start the object layer, and skips
quick-healing at start-up (as no write quorum is present).
2017-04-09 00:28:27 -07:00

346 lines
11 KiB
Go

/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"errors"
"net/url"
"time"
"github.com/minio/mc/pkg/console"
)
/*
Following table lists different possible states the backend could be in.
* In a single-node, multi-disk setup, "Online" would refer to disks' status.
* In a multi-node setup, it could refer to disks' or network connectivity
between the nodes, or both.
+----------+--------------------------+-----------------------+
| Online | Format status | Course of action |
| | | |
-----------+--------------------------+-----------------------+
| All | All Formatted | |
+----------+--------------------------+ initObjectLayer |
| Quorum | Quorum Formatted | |
+----------+--------------------------+-----------------------+
| All | Quorum | Print message saying |
| | Formatted, | "Heal via control" |
| | some unformatted | and initObjectLayer |
+----------+--------------------------+-----------------------+
| All | None Formatted | FormatDisks |
| | | and initObjectLayer |
| | | |
+----------+--------------------------+-----------------------+
| No | | Wait till enough |
| Quorum | _ | nodes are online and |
| | | one of the above |
| | | sections apply |
+----------+--------------------------+-----------------------+
| | | |
| Quorum | Quorum UnFormatted | Abort |
+----------+--------------------------+-----------------------+
A disk can be in one of the following states.
- Unformatted
- Formatted
- Corrupted
- Offline
*/
// InitActions - a type synonym for enumerating initialization activities.
type InitActions int
const (
// FormatDisks - see above table for disk states where it
// is applicable.
FormatDisks InitActions = iota
// WaitForHeal - Wait for disks to heal.
WaitForHeal
// WaitForQuorum - Wait for quorum number of disks to be online.
WaitForQuorum
// WaitForAll - Wait for all disks to be online.
WaitForAll
// WaitForFormatting - Wait for formatting to be triggered
// from the '1st' server in the cluster.
WaitForFormatting
// WaitForConfig - Wait for all servers to have the same config
// including (credentials, version and time).
WaitForConfig
// InitObjectLayer - Initialize object layer.
InitObjectLayer
// Abort initialization of object layer since there aren't enough good
// copies of format.json to recover.
Abort
)
// Quick error to actions converts looking for specific errors
// which need to be returned quickly and server should wait instead.
func quickErrToActions(errMap map[error]int) InitActions {
var action InitActions
switch {
case errMap[errInvalidAccessKeyID] > 0:
fallthrough
case errMap[errAuthentication] > 0:
fallthrough
case errMap[errServerVersionMismatch] > 0:
fallthrough
case errMap[errServerTimeMismatch] > 0:
action = WaitForConfig
}
return action
}
// Preparatory initialization stage for XL validates known errors.
// Converts them into specific actions. These actions have special purpose
// which caller decides on what needs to be done.
func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
// Count errors by error value.
errMap := make(map[error]int)
for _, err := range sErrs {
errMap[err]++
}
// Validates and converts specific config errors into WaitForConfig.
if quickErrToActions(errMap) == WaitForConfig {
return WaitForConfig
}
quorum := diskCount/2 + 1
readQuorum := diskCount / 2
disksOffline := errMap[errDiskNotFound]
disksFormatted := errMap[nil]
disksUnformatted := errMap[errUnformattedDisk]
disksCorrupted := errMap[errCorruptedFormat]
// No Quorum lots of offline disks, wait for quorum.
if disksOffline > readQuorum {
return WaitForQuorum
}
// There is quorum or more corrupted disks, there is not enough good
// disks to reconstruct format.json.
if disksCorrupted >= quorum {
return Abort
}
// All disks are unformatted, proceed to formatting disks.
if disksUnformatted == diskCount {
// Only the first server formats an uninitialized setup, others wait for notification.
if firstDisk { // First node always initializes.
return FormatDisks
}
return WaitForFormatting
}
// Total disks unformatted are in quorum verify if we have some offline disks.
if disksUnformatted >= quorum {
// Some disks offline and some disks unformatted, wait for all of them to come online.
if disksUnformatted+disksFormatted+disksOffline == diskCount {
return WaitForAll
}
// Some disks possibly corrupted and too many unformatted disks.
return Abort
}
// Already formatted and in quorum, proceed to initialization of object layer.
if disksFormatted >= readQuorum {
if disksFormatted+disksOffline == diskCount {
return InitObjectLayer
}
// Some of the formatted disks are possibly corrupted or unformatted, heal them.
return WaitForHeal
} // Exhausted all our checks, un-handled errors perhaps we Abort.
return WaitForQuorum
}
// Prints retry message upon a specific retry count.
func printRetryMsg(sErrs []error, storageDisks []StorageAPI) {
for i, sErr := range sErrs {
switch sErr {
case errDiskNotFound, errFaultyDisk, errFaultyRemoteDisk:
errorIf(sErr, "Disk %s is still unreachable", storageDisks[i])
}
}
}
// Implements a jitter backoff loop for formatting all disks during
// initialization of the server.
func retryFormattingXLDisks(firstDisk bool, endpoints []*url.URL, storageDisks []StorageAPI) error {
if len(endpoints) == 0 {
return errInvalidArgument
}
if storageDisks == nil {
return errInvalidArgument
}
// Done channel is used to close any lingering retry routine, as soon
// as this function returns.
doneCh := make(chan struct{})
// Indicate to our retry routine to exit cleanly, upon this function return.
defer close(doneCh)
// prepare getElapsedTime() to calculate elapsed time since we started trying formatting disks.
// All times are rounded to avoid showing milli, micro and nano seconds
formatStartTime := time.Now().Round(time.Second)
getElapsedTime := func() string {
return time.Now().Round(time.Second).Sub(formatStartTime).String()
}
// Wait on the jitter retry loop.
retryTimerCh := newRetryTimerSimple(doneCh)
for {
select {
case retryCount := <-retryTimerCh:
// Attempt to load all `format.json` from all disks.
formatConfigs, sErrs := loadAllFormats(storageDisks)
if retryCount > 5 {
// After 5 retry attempts we start printing actual errors
// for disks not being available.
printRetryMsg(sErrs, storageDisks)
}
// Pre-emptively check if one of the formatted disks
// is invalid. This function returns success for the
// most part unless one of the formats is not consistent
// with expected XL format. For example if a user is trying
// to pool FS backend.
if err := checkFormatXLValues(formatConfigs); err != nil {
return err
}
// Check if this is a XL or distributed XL, anything > 1 is considered XL backend.
switch prepForInitXL(firstDisk, sErrs, len(storageDisks)) {
case Abort:
return errCorruptedFormat
case FormatDisks:
console.Eraseline()
printFormatMsg(endpoints, storageDisks, printOnceFn())
return initFormatXL(storageDisks)
case InitObjectLayer:
console.Eraseline()
// Validate formats loaded before proceeding forward.
err := genericFormatCheckXL(formatConfigs, sErrs)
if err == nil {
printRegularMsg(endpoints, storageDisks, printOnceFn())
}
return err
case WaitForHeal:
// Validate formats loaded before proceeding forward.
err := genericFormatCheckXL(formatConfigs, sErrs)
if err == nil {
printHealMsg(endpoints, storageDisks, printOnceFn())
}
return err
case WaitForQuorum:
console.Printf(
"Initializing data volume. Waiting for minimum %d servers to come online. (elapsed %s)\n",
len(storageDisks)/2+1, getElapsedTime(),
)
case WaitForConfig:
// Print configuration errors.
printConfigErrMsg(storageDisks, sErrs, printOnceFn())
case WaitForAll:
console.Printf("Initializing data volume for first time. Waiting for other servers to come online (elapsed %s)\n", getElapsedTime())
case WaitForFormatting:
console.Printf("Initializing data volume for first time. Waiting for first server to come online (elapsed %s)\n", getElapsedTime())
}
case <-globalServiceDoneCh:
return errors.New("Initializing data volumes gracefully stopped")
}
}
}
// Initialize storage disks based on input arguments.
func initStorageDisks(endpoints []*url.URL) ([]StorageAPI, error) {
// Bootstrap disks.
storageDisks := make([]StorageAPI, len(endpoints))
for index, ep := range endpoints {
if ep == nil {
return nil, errInvalidArgument
}
// Intentionally ignore disk not found errors. XL is designed
// to handle these errors internally.
storage, err := newStorageAPI(ep)
if err != nil && err != errDiskNotFound {
return nil, err
}
storageDisks[index] = storage
}
return storageDisks, nil
}
// Format disks before initialization of object layer.
func waitForFormatXLDisks(firstDisk bool, endpoints []*url.URL, storageDisks []StorageAPI) (formattedDisks []StorageAPI, err error) {
if len(endpoints) == 0 {
return nil, errInvalidArgument
}
firstEndpoint := endpoints[0]
if firstEndpoint == nil {
return nil, errInvalidArgument
}
if storageDisks == nil {
return nil, errInvalidArgument
}
// Retryable disks before formatting, we need to have a larger
// retry window so that we wait enough amount of time before
// the disks come online.
retryDisks := make([]StorageAPI, len(storageDisks))
for i, storage := range storageDisks {
retryDisks[i] = &retryStorage{
remoteStorage: storage,
maxRetryAttempts: globalStorageInitRetryThreshold,
retryUnit: time.Second,
retryCap: time.Second * 30, // 30 seconds.
}
}
// Start retry loop retrying until disks are formatted properly, until we have reached
// a conditional quorum of formatted disks.
err = retryFormattingXLDisks(firstDisk, endpoints, retryDisks)
if err != nil {
return nil, err
}
// Initialize the disk into a formatted disks wrapper.
formattedDisks = make([]StorageAPI, len(storageDisks))
for i, storage := range storageDisks {
// After formatting is done we need a smaller time
// window and lower retry value before formatting.
formattedDisks[i] = &retryStorage{
remoteStorage: storage,
maxRetryAttempts: globalStorageRetryThreshold,
retryUnit: time.Millisecond,
retryCap: time.Millisecond * 5, // 5 milliseconds.
}
}
// Success.
return formattedDisks, nil
}