From b9fc4150f616924489ff266aa55fc66c99f9a4d6 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 12 Sep 2017 12:17:44 -0700 Subject: [PATCH] Fix preInit logic when mixed disk situations exist. (#4904) When servers are started simultaneously across multiple nodes or simulating a local setup, it can happen such that one of the servers in setup reaches a following situation where it observes - Some servers are formatted - Some servers are unformatted - Some servers are offline Current state machine doesn't handle this correctly, to fix this situation where we have unformatted, formatted and disks offline we do not decisively know the course of action. So we wait for the offline disks to change their state. Once the offline disks change their state to either one of these states we can decisively move forward. - nil (formatted disk) - errUnformattedDisk - Or any other error such as errCorruptedDisk. Fixes #4903 --- appveyor.yml | 3 ++- cmd/prepare-storage.go | 39 ++++++++++++++++--------------------- cmd/prepare-storage_test.go | 28 +++++++++++++++++++++----- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 78d3026d0..3dd13af1d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -13,10 +13,11 @@ clone_folder: c:\gopath\src\github.com\minio\minio environment: GOVERSION: 1.8.3 GOPATH: c:\gopath + GOROOT: c:\go18 # scripts that run after cloning repository install: - - set PATH=%GOPATH%\bin;c:\go\bin;%PATH% + - set PATH=%GOPATH%\bin;c:\go18\bin;%PATH% - go version - go env - python --version diff --git a/cmd/prepare-storage.go b/cmd/prepare-storage.go index c97aa2091..91b750f46 100644 --- a/cmd/prepare-storage.go +++ b/cmd/prepare-storage.go @@ -70,12 +70,11 @@ import ( type InitActions int const ( - // FormatDisks - see above table for disk states where it - // is applicable. + // FormatDisks - see above table for disk states where it is applicable. FormatDisks InitActions = iota - // WaitForHeal - Wait for disks to heal. - WaitForHeal + // SuggestToHeal - Prints heal message and initialize object layer. + SuggestToHeal // WaitForQuorum - Wait for quorum number of disks to be online. WaitForQuorum @@ -134,14 +133,14 @@ func quickErrToActions(errMap map[error]int) InitActions { // - Unformatted setup // - Format/Wait for format when `disksUnformatted == diskCount` // -// - Wait for all when `disksUnformatted + disksOffline == disksCount` +// - Wait for all when `disksUnformatted + disksFormatted + diskOffline == diskCount` // // Under all other conditions should lead to server initialization aborted. func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions { // Count errors by error value. errMap := make(map[error]int) for _, err := range sErrs { - errMap[err]++ + errMap[errorCause(err)]++ } // Validates and converts specific config errors into WaitForConfig. @@ -149,7 +148,6 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions { return WaitForConfig } - quorum := diskCount/2 + 1 readQuorum := diskCount / 2 disksOffline := errMap[errDiskNotFound] disksFormatted := errMap[nil] @@ -169,28 +167,25 @@ func prepForInitXL(firstDisk bool, sErrs []error, diskCount int) InitActions { return WaitForFormatting } - // Total disks unformatted are in quorum verify if we have some offline disks. - if disksUnformatted >= quorum { - // Some disks offline and some disks unformatted, wait for all of them to come online. - if disksUnformatted+disksFormatted+disksOffline == diskCount { - return WaitForAll - } - - // Some disks possibly corrupted and too many unformatted disks. - return Abort - } - // Already formatted and in quorum, proceed to initialization of object layer. if disksFormatted >= readQuorum { if disksFormatted+disksOffline == diskCount { return InitObjectLayer } - // Some of the formatted disks are possibly corrupted or unformatted, heal them. - return WaitForHeal + // Some of the formatted disks are possibly corrupted or unformatted, + // let user know to heal them. + return SuggestToHeal } - // Exhausted all our checks, un-handled errors perhaps we Abort. + // Some unformatted, some disks formatted and some disks are offline but we don't + // quorum to decide. This is an undecisive state - wait for all of offline disks + // to be online to figure out the course of action. + if disksUnformatted+disksFormatted+disksOffline == diskCount { + return WaitForAll + } + + // Exhausted all our checks, un-handled situations such as some disks corrupted we Abort. return Abort } @@ -280,7 +275,7 @@ func retryFormattingXLDisks(firstDisk bool, endpoints EndpointList, storageDisks printRegularMsg(endpoints, storageDisks, printOnceFn()) } return err - case WaitForHeal: + case SuggestToHeal: // Validate formats loaded before proceeding forward. err := genericFormatCheckXL(formatConfigs, sErrs) if err == nil { diff --git a/cmd/prepare-storage_test.go b/cmd/prepare-storage_test.go index e4646149d..8c384cb5a 100644 --- a/cmd/prepare-storage_test.go +++ b/cmd/prepare-storage_test.go @@ -26,8 +26,8 @@ func (action InitActions) String() string { return "FormatDisks" case WaitForFormatting: return "WaitForFormatting" - case WaitForHeal: - return "WaitForHeal" + case SuggestToHeal: + return "SuggestToHeal" case WaitForAll: return "WaitForAll" case WaitForQuorum: @@ -76,6 +76,7 @@ func TestPrepForInitXL(t *testing.T) { errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errCorruptedFormat, errCorruptedFormat, errDiskNotFound, } + // Quorum number of disks not online yet. noQuourm := []error{ errDiskNotFound, errDiskNotFound, errDiskNotFound, errDiskNotFound, @@ -101,6 +102,20 @@ func TestPrepForInitXL(t *testing.T) { nil, nil, nil, nil, errServerTimeMismatch, nil, nil, nil, } + // Suggest to heal under formatted disks in quorum. + formattedDisksInQuorum := []error{ + nil, nil, nil, nil, + errUnformattedDisk, errUnformattedDisk, errDiskNotFound, errDiskNotFound, + } + // Wait for all under undecisive state. + undecisiveErrs1 := []error{ + errDiskNotFound, nil, nil, nil, + errUnformattedDisk, errUnformattedDisk, errDiskNotFound, errDiskNotFound, + } + undecisiveErrs2 := []error{ + errDiskNotFound, errDiskNotFound, errDiskNotFound, errDiskNotFound, + errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, errUnformattedDisk, + } testCases := []struct { // Params for prepForInit(). @@ -116,7 +131,7 @@ func TestPrepForInitXL(t *testing.T) { {true, quorumUnformatted, 8, WaitForAll}, {true, quorumUnformattedSomeCorrupted, 8, Abort}, {true, noQuourm, 8, WaitForQuorum}, - {true, minorityCorrupted, 8, WaitForHeal}, + {true, minorityCorrupted, 8, SuggestToHeal}, {true, majorityCorrupted, 8, Abort}, // Remote disks. {false, allFormatted, 8, InitObjectLayer}, @@ -125,8 +140,11 @@ func TestPrepForInitXL(t *testing.T) { {false, quorumUnformatted, 8, WaitForAll}, {false, quorumUnformattedSomeCorrupted, 8, Abort}, {false, noQuourm, 8, WaitForQuorum}, - {false, minorityCorrupted, 8, WaitForHeal}, + {false, minorityCorrupted, 8, SuggestToHeal}, + {false, formattedDisksInQuorum, 8, SuggestToHeal}, {false, majorityCorrupted, 8, Abort}, + {false, undecisiveErrs1, 8, WaitForAll}, + {false, undecisiveErrs2, 8, WaitForAll}, // Config mistakes. {true, accessKeyIDErr, 8, WaitForConfig}, {true, authenticationErr, 8, WaitForConfig}, @@ -136,7 +154,7 @@ func TestPrepForInitXL(t *testing.T) { for i, test := range testCases { actual := prepForInitXL(test.firstDisk, test.errs, test.diskCount) if actual != test.action { - t.Errorf("Test %d expected %s but receieved %s\n", i+1, test.action, actual) + t.Errorf("Test %d expected %s but received %s\n", i+1, test.action, actual) } } }