xl: prepare storage should Abort properly. (#4542)

The current state machine didn't handle a situation
which can arise when there is a combination of

 - formatted
 - unformatted
 - corrupted

disks - this combination invariably goes into a
mode where all servers wait perpetually,
thinking we will get quorum in the future.

At this point there is only a distant possibility of
ever getting a quorum, since we don't even have
a quorum number of disks offline.

We should exit and print a proper message per disk
to indicate what went wrong and what was detected
by the server.

Refer #4477
This commit is contained in:
Harshavardhana 2017-06-17 11:20:12 -07:00 committed by GitHub
parent 58833711e0
commit e99244be02
2 changed files with 37 additions and 11 deletions

View File

@ -133,6 +133,14 @@ func printConfigErrMsg(storageDisks []StorageAPI, sErrs []error, fn printOnceFun
// Generate a formatted message when cluster is misconfigured. // Generate a formatted message when cluster is misconfigured.
func getConfigErrMsg(storageDisks []StorageAPI, sErrs []error) string { func getConfigErrMsg(storageDisks []StorageAPI, sErrs []error) string {
msg := colorBlue("\nDetected configuration inconsistencies in the cluster. Please fix following servers.") msg := colorBlue("\nDetected configuration inconsistencies in the cluster. Please fix following servers.")
return msg + combineDiskErrs(storageDisks, sErrs)
}
// Combines each disk's error into a newline formatted string.
// This is a helper function for printing messages across
// all disks.
func combineDiskErrs(storageDisks []StorageAPI, sErrs []error) string {
var msg string
for i, disk := range storageDisks { for i, disk := range storageDisks {
if disk == nil { if disk == nil {
continue continue

View File

@ -18,6 +18,7 @@ package cmd
import ( import (
"errors" "errors"
"fmt"
"time" "time"
"github.com/minio/mc/pkg/console" "github.com/minio/mc/pkg/console"
@ -118,6 +119,24 @@ func quickErrToActions(errMap map[error]int) InitActions {
// Preparatory initialization stage for XL validates known errors. // Preparatory initialization stage for XL validates known errors.
// Converts them into specific actions. These actions have special purpose // Converts them into specific actions. These actions have special purpose
// which caller decides on what needs to be done. // which caller decides on what needs to be done.
// Logic used in this function is as shown below.
//
// ---- Possible states and handled conditions -----
//
// - Formatted setup
// - InitObjectLayer when `disksFormatted >= readQuorum`
// - Wait for quorum when `disksFormatted < readQuorum && disksFormatted + disksOffline >= readQuorum`
// (we don't know yet if there are unformatted disks)
// - Wait for heal when `disksFormatted >= readQuorum && disksUnformatted > 0`
// (here we know there is at least one unformatted disk which requires healing)
//
// - Unformatted setup
// - Format/Wait for format when `disksUnformatted == diskCount`
//
// - Wait for all when `disksUnformatted + disksOffline == diskCount`
//
// All other conditions should lead to server initialization being aborted.
func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions { func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
// Count errors by error value. // Count errors by error value.
errMap := make(map[error]int) errMap := make(map[error]int)
@ -135,19 +154,12 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
disksOffline := errMap[errDiskNotFound] disksOffline := errMap[errDiskNotFound]
disksFormatted := errMap[nil] disksFormatted := errMap[nil]
disksUnformatted := errMap[errUnformattedDisk] disksUnformatted := errMap[errUnformattedDisk]
disksCorrupted := errMap[errCorruptedFormat]
// No Quorum lots of offline disks, wait for quorum. // No Quorum lots of offline disks, wait for quorum.
if disksOffline > readQuorum { if disksOffline > readQuorum {
return WaitForQuorum return WaitForQuorum
} }
// There is quorum or more corrupted disks, there is not enough good
// disks to reconstruct format.json.
if disksCorrupted >= quorum {
return Abort
}
// All disks are unformatted, proceed to formatting disks. // All disks are unformatted, proceed to formatting disks.
if disksUnformatted == diskCount { if disksUnformatted == diskCount {
// Only the first server formats an uninitialized setup, others wait for notification. // Only the first server formats an uninitialized setup, others wait for notification.
@ -163,6 +175,7 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
if disksUnformatted+disksFormatted+disksOffline == diskCount { if disksUnformatted+disksFormatted+disksOffline == diskCount {
return WaitForAll return WaitForAll
} }
// Some disks possibly corrupted and too many unformatted disks. // Some disks possibly corrupted and too many unformatted disks.
return Abort return Abort
} }
@ -172,10 +185,13 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
if disksFormatted+disksOffline == diskCount { if disksFormatted+disksOffline == diskCount {
return InitObjectLayer return InitObjectLayer
} }
// Some of the formatted disks are possibly corrupted or unformatted, heal them. // Some of the formatted disks are possibly corrupted or unformatted, heal them.
return WaitForHeal return WaitForHeal
} // Exhausted all our checks, un-handled errors perhaps we Abort. }
return WaitForQuorum
// Exhausted all our checks, un-handled errors perhaps we Abort.
return Abort
} }
// Prints retry message upon a specific retry count. // Prints retry message upon a specific retry count.
@ -227,6 +243,7 @@ func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks
// actual errors for disks not being available. // actual errors for disks not being available.
printRetryMsg(sErrs, storageDisks) printRetryMsg(sErrs, storageDisks)
} }
// Pre-emptively check if one of the formatted disks // Pre-emptively check if one of the formatted disks
// is invalid. This function returns success for the // is invalid. This function returns success for the
// most part unless one of the formats is not consistent // most part unless one of the formats is not consistent
@ -240,16 +257,17 @@ func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks
// first server has a wrong format and exit gracefully. // first server has a wrong format and exit gracefully.
// refer - https://github.com/minio/minio/issues/4140 // refer - https://github.com/minio/minio/issues/4140
if retryCount > maxRetryAttempts { if retryCount > maxRetryAttempts {
errorIf(err, "Detected disk (%s) in unexpected format", errorIf(err, "%s : Detected disk in unexpected format",
storageDisks[index]) storageDisks[index])
continue continue
} }
return err return err
} }
// Check if this is a XL or distributed XL, anything > 1 is considered XL backend. // Check if this is a XL or distributed XL, anything > 1 is considered XL backend.
switch prepForInitXL(firstDisk, sErrs, len(storageDisks)) { switch prepForInitXL(firstDisk, sErrs, len(storageDisks)) {
case Abort: case Abort:
return errCorruptedFormat return fmt.Errorf("%s", combineDiskErrs(storageDisks, sErrs))
case FormatDisks: case FormatDisks:
console.Eraseline() console.Eraseline()
printFormatMsg(endpoints, storageDisks, printOnceFn()) printFormatMsg(endpoints, storageDisks, printOnceFn())