Fix preInit logic when mixed disk situations exist. (#4904)

When servers are started simultaneously across multiple
nodes or simulating a local setup, it can happen such
that one of the servers in setup reaches a following
situation where it observes

 - Some servers are formatted
 - Some servers are unformatted
 - Some servers are offline

Current state machine doesn't handle this correctly, to fix
this situation where we have unformatted, formatted and
disks offline we do not decisively know the course of
action. So we wait for the offline disks to change their state.

Once the offline disks change their state to either one of these
states we can decisively move forward.

  - nil (formatted disk)
  - errUnformattedDisk
  - Or any other error such as errCorruptedDisk.

Fixes #4903
This commit is contained in:
Harshavardhana 2017-09-12 12:17:44 -07:00 committed by Dee Koder
parent f66239e82f
commit b9fc4150f6
3 changed files with 42 additions and 28 deletions

View File

@ -13,10 +13,11 @@ clone_folder: c:\gopath\src\github.com\minio\minio
environment:
GOVERSION: 1.8.3
GOPATH: c:\gopath
GOROOT: c:\go18
# scripts that run after cloning repository
install:
- set PATH=%GOPATH%\bin;c:\go\bin;%PATH%
- set PATH=%GOPATH%\bin;c:\go18\bin;%PATH%
- go version
- go env
- python --version

View File

@ -70,12 +70,11 @@ import (
type InitActions int
const (
// FormatDisks - see above table for disk states where it
// is applicable.
// FormatDisks - see above table for disk states where it is applicable.
FormatDisks InitActions = iota
// WaitForHeal - Wait for disks to heal.
WaitForHeal
// SuggestToHeal - Prints heal message and initialize object layer.
SuggestToHeal
// WaitForQuorum - Wait for quorum number of disks to be online.
WaitForQuorum
@ -134,14 +133,14 @@ func quickErrToActions(errMap map[error]int) InitActions {
// - Unformatted setup
// - Format/Wait for format when `disksUnformatted == diskCount`
//
// - Wait for all when `disksUnformatted + disksOffline == disksCount`
// - Wait for all when `disksUnformatted + disksFormatted + diskOffline == diskCount`
//
// Under all other conditions should lead to server initialization aborted.
func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
// Count errors by error value.
errMap := make(map[error]int)
for _, err := range sErrs {
errMap[err]++
errMap[errorCause(err)]++
}
// Validates and converts specific config errors into WaitForConfig.
@ -149,7 +148,6 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
return WaitForConfig
}
quorum := diskCount/2 + 1
readQuorum := diskCount / 2
disksOffline := errMap[errDiskNotFound]
disksFormatted := errMap[nil]
@ -169,28 +167,25 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions {
return WaitForFormatting
}
// Total disks unformatted are in quorum verify if we have some offline disks.
if disksUnformatted >= quorum {
// Some disks offline and some disks unformatted, wait for all of them to come online.
if disksUnformatted+disksFormatted+disksOffline == diskCount {
return WaitForAll
}
// Some disks possibly corrupted and too many unformatted disks.
return Abort
}
// Already formatted and in quorum, proceed to initialization of object layer.
if disksFormatted >= readQuorum {
if disksFormatted+disksOffline == diskCount {
return InitObjectLayer
}
// Some of the formatted disks are possibly corrupted or unformatted, heal them.
return WaitForHeal
// Some of the formatted disks are possibly corrupted or unformatted,
// let user know to heal them.
return SuggestToHeal
}
// Exhausted all our checks, un-handled errors perhaps we Abort.
// Some unformatted, some disks formatted and some disks are offline but we don't
// quorum to decide. This is an undecisive state - wait for all of offline disks
// to be online to figure out the course of action.
if disksUnformatted+disksFormatted+disksOffline == diskCount {
return WaitForAll
}
// Exhausted all our checks, un-handled situations such as some disks corrupted we Abort.
return Abort
}
@ -280,7 +275,7 @@ func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks
printRegularMsg(endpoints, storageDisks, printOnceFn())
}
return err
case WaitForHeal:
case SuggestToHeal:
// Validate formats loaded before proceeding forward.
err := genericFormatCheckXL(formatConfigs, sErrs)
if err == nil {

View File

@ -26,8 +26,8 @@ func (action InitActions) String() string {
return "FormatDisks"
case WaitForFormatting:
return "WaitForFormatting"
case WaitForHeal:
return "WaitForHeal"
case SuggestToHeal:
return "SuggestToHeal"
case WaitForAll:
return "WaitForAll"
case WaitForQuorum:
@ -76,6 +76,7 @@ func TestPrepForInitXL(t *testing.T) {
errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errUnformattedDisk,
errUnformattedDisk, errCorruptedFormat, errCorruptedFormat, errDiskNotFound,
}
// Quorum number of disks not online yet.
noQuourm := []error{
errDiskNotFound, errDiskNotFound, errDiskNotFound, errDiskNotFound,
@ -101,6 +102,20 @@ func TestPrepForInitXL(t *testing.T) {
nil, nil, nil, nil,
errServerTimeMismatch, nil, nil, nil,
}
// Suggest to heal under formatted disks in quorum.
formattedDisksInQuorum := []error{
nil, nil, nil, nil,
errUnformattedDisk, errUnformattedDisk, errDiskNotFound, errDiskNotFound,
}
// Wait for all under undecisive state.
undecisiveErrs1 := []error{
errDiskNotFound, nil, nil, nil,
errUnformattedDisk, errUnformattedDisk, errDiskNotFound, errDiskNotFound,
}
undecisiveErrs2 := []error{
errDiskNotFound, errDiskNotFound, errDiskNotFound, errDiskNotFound,
errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errUnformattedDisk,
}
testCases := []struct {
// Params for prepForInit().
@ -116,7 +131,7 @@ func TestPrepForInitXL(t *testing.T) {
{true, quorumUnformatted, 8, WaitForAll},
{true, quorumUnformattedSomeCorrupted, 8, Abort},
{true, noQuourm, 8, WaitForQuorum},
{true, minorityCorrupted, 8, WaitForHeal},
{true, minorityCorrupted, 8, SuggestToHeal},
{true, majorityCorrupted, 8, Abort},
// Remote disks.
{false, allFormatted, 8, InitObjectLayer},
@ -125,8 +140,11 @@ func TestPrepForInitXL(t *testing.T) {
{false, quorumUnformatted, 8, WaitForAll},
{false, quorumUnformattedSomeCorrupted, 8, Abort},
{false, noQuourm, 8, WaitForQuorum},
{false, minorityCorrupted, 8, WaitForHeal},
{false, minorityCorrupted, 8, SuggestToHeal},
{false, formattedDisksInQuorum, 8, SuggestToHeal},
{false, majorityCorrupted, 8, Abort},
{false, undecisiveErrs1, 8, WaitForAll},
{false, undecisiveErrs2, 8, WaitForAll},
// Config mistakes.
{true, accessKeyIDErr, 8, WaitForConfig},
{true, authenticationErr, 8, WaitForConfig},
@ -136,7 +154,7 @@ func TestPrepForInitXL(t *testing.T) {
for i, test := range testCases {
actual := prepForInitXL(test.firstDisk, test.errs, test.diskCount)
if actual != test.action {
t.Errorf("Test %d expected %s but receieved %s\n", i+1, test.action, actual)
t.Errorf("Test %d expected %s but received %s\n", i+1, test.action, actual)
}
}
}