From e3fd4c0dd67562e931ac613303e70229831863ae Mon Sep 17 00:00:00 2001 From: Krishnan Parthasarathi Date: Sun, 5 Mar 2017 04:23:28 +0530 Subject: [PATCH] XL: Make listOnlineDisks and outDatedDisks consistent w/ each other. (#3808) --- cmd/xl-v1-healing-common.go | 117 +++++++++++---- cmd/xl-v1-healing-common_test.go | 249 +++++++++++++++++++++++++++++++ cmd/xl-v1-healing.go | 68 ++++++--- cmd/xl-v1-object_test.go | 6 + 4 files changed, 393 insertions(+), 47 deletions(-) diff --git a/cmd/xl-v1-healing-common.go b/cmd/xl-v1-healing-common.go index d652bc462..38f79ba6a 100644 --- a/cmd/xl-v1-healing-common.go +++ b/cmd/xl-v1-healing-common.go @@ -16,7 +16,11 @@ package cmd -import "time" +import ( + "encoding/hex" + "path/filepath" + "time" +) // commonTime returns a maximally occurring time from a list of time. func commonTime(modTimes []time.Time) (modTime time.Time, count int) { @@ -59,29 +63,43 @@ func bootModtimes(diskCount int) []time.Time { } // Extracts list of times from xlMetaV1 slice and returns, skips -// slice elements which have errors. As a special error -// errFileNotFound is treated as a initial good condition. +// slice elements which have errors. func listObjectModtimes(partsMetadata []xlMetaV1, errs []error) (modTimes []time.Time) { modTimes = bootModtimes(len(partsMetadata)) - // Set a new time value, specifically set when - // error == errFileNotFound (this is needed when this is a - // fresh PutObject). - timeNow := time.Now().UTC() for index, metadata := range partsMetadata { - if errs[index] == nil { - // Once the file is found, save the uuid saved on disk. - modTimes[index] = metadata.Stat.ModTime - } else if errs[index] == errFileNotFound { - // Once the file is not found then the epoch is current time. - modTimes[index] = timeNow + if errs[index] != nil { + continue } + // Once the file is found, save the uuid saved on disk. + modTimes[index] = metadata.Stat.ModTime } return modTimes } -// Returns slice of online disks needed. -// - slice returing readable disks. -// - modTime of the Object +// Notes: +// There are 5 possible states a disk could be in, +// 1. __online__ - has the latest copy of xl.json - returned by listOnlineDisks +// +// 2. __offline__ - err == errDiskNotFound +// +// 3. __availableWithParts__ - has the latest copy of xl.json and has all +// parts with checksums matching; returned by disksWithAllParts +// +// 4. __outdated__ - returned by outDatedDisk, provided []StorageAPI +// returned by diskWithAllParts is passed for latestDisks. +// - has an old copy of xl.json +// - doesn't have xl.json (errFileNotFound) +// - has the latest xl.json but one or more parts are corrupt +// +// 5. __missingParts__ - has the latest copy of xl.json but has some parts +// missing. This is identified separately since this may need manual +// inspection to understand the root cause. E.g, this could be due to +// backend filesystem corruption. + +// listOnlineDisks - returns +// - a slice of disks where disk having 'older' xl.json (or nothing) +// are set to nil. +// - latest (in time) of the maximally occurring modTime(s). func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) (onlineDisks []StorageAPI, modTime time.Time) { onlineDisks = make([]StorageAPI, len(disks)) @@ -102,22 +120,23 @@ func listOnlineDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) return onlineDisks, modTime } -// Return disks with the outdated or missing object. -func outDatedDisks(disks []StorageAPI, partsMetadata []xlMetaV1, errs []error) (outDatedDisks []StorageAPI) { +// outDatedDisks - return disks which don't have the latest object (i.e xl.json). +// disks that are offline are not 'marked' outdated. +func outDatedDisks(disks, latestDisks []StorageAPI, errs []error, partsMetadata []xlMetaV1, + bucket, object string) (outDatedDisks []StorageAPI) { + outDatedDisks = make([]StorageAPI, len(disks)) - latestDisks, _ := listOnlineDisks(disks, partsMetadata, errs) - for index, disk := range latestDisks { - if errorCause(errs[index]) == errFileNotFound { - outDatedDisks[index] = disks[index] + for index, latestDisk := range latestDisks { + if latestDisk != nil { continue } - if errs[index] != nil { - continue - } - if disk == nil { + // disk either has an older xl.json or doesn't have one. + switch errorCause(errs[index]) { + case nil, errFileNotFound: outDatedDisks[index] = disks[index] } } + return outDatedDisks } @@ -189,3 +208,49 @@ func xlHealStat(xl xlObjects, partsMetadata []xlMetaV1, errs []error) HealObject MissingPartityCount: missingParityCount, } } + +// disksWithAllParts - This function needs to be called with +// []StorageAPI returned by listOnlineDisks. Returns, +// - disks which have all parts specified in the latest xl.json. +// - errs updated to have errFileNotFound in place of disks that had +// missing parts. +// - non-nil error if any of the online disks failed during +// calculating blake2b checksum. +func disksWithAllParts(onlineDisks []StorageAPI, partsMetadata []xlMetaV1, errs []error, bucket, object string) ([]StorageAPI, []error, error) { + availableDisks := make([]StorageAPI, len(onlineDisks)) + for index, onlineDisk := range onlineDisks { + if onlineDisk == nil { + continue + } + // disk has a valid xl.json but may not have all the + // parts. This is considered an outdated disk, since + // it needs healing too. + for pIndex, part := range partsMetadata[index].Parts { + // compute blake2b sum of part. + partPath := filepath.Join(object, part.Name) + hash := newHash(blake2bAlgo) + blakeBytes, hErr := hashSum(onlineDisk, bucket, partPath, hash) + if hErr == errFileNotFound { + errs[index] = errFileNotFound + continue + } + + if hErr != nil && hErr != errFileNotFound { + return nil, nil, traceError(hErr) + } + + partChecksum := partsMetadata[index].Erasure.Checksum[pIndex].Hash + blakeSum := hex.EncodeToString(blakeBytes) + // if blake2b sum doesn't match for a part + // then this disk is outdated and needs + // healing. + if blakeSum != partChecksum { + errs[index] = errFileNotFound + break + } + availableDisks[index] = onlineDisk + } + } + + return availableDisks, errs, nil +} diff --git a/cmd/xl-v1-healing-common_test.go b/cmd/xl-v1-healing-common_test.go index be7f200b9..7a3c63db7 100644 --- a/cmd/xl-v1-healing-common_test.go +++ b/cmd/xl-v1-healing-common_test.go @@ -17,6 +17,8 @@ package cmd import ( + "bytes" + "path/filepath" "testing" "time" ) @@ -81,3 +83,250 @@ func TestCommonTime(t *testing.T) { } } } + +// partsMetaFromModTimes - returns slice of modTimes given metadata of +// an object part. +func partsMetaFromModTimes(modTimes []time.Time, checksums []checkSumInfo) []xlMetaV1 { + var partsMetadata []xlMetaV1 + for _, modTime := range modTimes { + partsMetadata = append(partsMetadata, xlMetaV1{ + Erasure: erasureInfo{ + Checksum: checksums, + }, + Stat: statInfo{ + ModTime: modTime, + }, + Parts: []objectPartInfo{ + { + Name: "part.1", + }, + }, + }) + } + return partsMetadata +} + +// toPosix - fetches *posix object from StorageAPI. +func toPosix(disk StorageAPI) *posix { + retryDisk, ok := disk.(*retryStorage) + if !ok { + return nil + } + pDisk, ok := retryDisk.remoteStorage.(*posix) + if !ok { + return nil + } + return pDisk + +} + +// TestListOnlineDisks - checks if listOnlineDisks and outDatedDisks +// are consistent with each other. +func TestListOnlineDisks(t *testing.T) { + rootPath, err := newTestConfig(globalMinioDefaultRegion) + if err != nil { + t.Fatalf("Failed to initialize config - %v", err) + } + defer removeAll(rootPath) + + obj, disks, err := prepareXL() + if err != nil { + t.Fatalf("Prepare XL backend failed - %v", err) + } + defer removeRoots(disks) + + type tamperKind int + const ( + noTamper tamperKind = iota + deletePart tamperKind = iota + corruptPart tamperKind = iota + ) + threeNanoSecs := time.Unix(0, 3).UTC() + fourNanoSecs := time.Unix(0, 4).UTC() + modTimesThreeNone := []time.Time{ + threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs, + threeNanoSecs, threeNanoSecs, threeNanoSecs, + timeSentinel, timeSentinel, timeSentinel, timeSentinel, + timeSentinel, timeSentinel, timeSentinel, timeSentinel, + timeSentinel, + } + modTimesThreeFour := []time.Time{ + threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs, + threeNanoSecs, threeNanoSecs, threeNanoSecs, threeNanoSecs, + fourNanoSecs, fourNanoSecs, fourNanoSecs, fourNanoSecs, + fourNanoSecs, fourNanoSecs, fourNanoSecs, fourNanoSecs, + } + testCases := []struct { + modTimes []time.Time + expectedTime time.Time + errs []error + _tamperBackend tamperKind + }{ + { + modTimes: modTimesThreeFour, + expectedTime: fourNanoSecs, + errs: []error{ + nil, nil, nil, nil, nil, nil, nil, nil, nil, + nil, nil, nil, nil, nil, nil, nil, + }, + _tamperBackend: noTamper, + }, + { + modTimes: modTimesThreeNone, + expectedTime: threeNanoSecs, + errs: []error{ + // Disks that have a valid xl.json. + nil, nil, nil, nil, nil, nil, nil, + // Majority of disks don't have xl.json. + errFileNotFound, errFileNotFound, + errFileNotFound, errFileNotFound, + errFileNotFound, errDiskAccessDenied, + errDiskNotFound, errFileNotFound, + errFileNotFound, + }, + _tamperBackend: deletePart, + }, + { + modTimes: modTimesThreeNone, + expectedTime: threeNanoSecs, + errs: []error{ + // Disks that have a valid xl.json. + nil, nil, nil, nil, nil, nil, nil, + // Majority of disks don't have xl.json. + errFileNotFound, errFileNotFound, + errFileNotFound, errFileNotFound, + errFileNotFound, errDiskAccessDenied, + errDiskNotFound, errFileNotFound, + errFileNotFound, + }, + _tamperBackend: corruptPart, + }, + } + + bucket := "bucket" + object := "object" + data := bytes.Repeat([]byte("a"), 1024) + xlDisks := obj.(*xlObjects).storageDisks + for i, test := range testCases { + // Prepare bucket/object backend for the tests below. + + // Cleanup from previous test. + obj.DeleteObject(bucket, object) + obj.DeleteBucket(bucket) + + err = obj.MakeBucket("bucket") + if err != nil { + t.Fatalf("Failed to make a bucket %v", err) + } + + _, err = obj.PutObject(bucket, object, int64(len(data)), bytes.NewReader(data), nil, "") + if err != nil { + t.Fatalf("Failed to putObject %v", err) + } + + // Fetch xl.json from first disk to construct partsMetadata for the tests. + xlMeta, err := readXLMeta(xlDisks[0], bucket, object) + if err != nil { + t.Fatalf("Test %d: Failed to read xl.json %v", i+1, err) + } + + tamperedIndex := -1 + switch test._tamperBackend { + case deletePart: + for index, err := range test.errs { + if err != nil { + continue + } + // Remove a part from a disk + // which has a valid xl.json, + // and check if that disk + // appears in outDatedDisks. + tamperedIndex = index + dErr := xlDisks[index].DeleteFile(bucket, filepath.Join(object, "part.1")) + if dErr != nil { + t.Fatalf("Test %d: Failed to delete %s - %v", i+1, + filepath.Join(object, "part.1"), dErr) + } + break + } + case corruptPart: + for index, err := range test.errs { + if err != nil { + continue + } + // Corrupt a part from a disk + // which has a valid xl.json, + // and check if that disk + // appears in outDatedDisks. + tamperedIndex = index + dErr := xlDisks[index].AppendFile(bucket, filepath.Join(object, "part.1"), []byte("corruption")) + if dErr != nil { + t.Fatalf("Test %d: Failed to append corrupting data at the end of file %s - %v", + i+1, filepath.Join(object, "part.1"), dErr) + } + break + } + + } + + partsMetadata := partsMetaFromModTimes(test.modTimes, xlMeta.Erasure.Checksum) + + onlineDisks, modTime := listOnlineDisks(xlDisks, partsMetadata, test.errs) + availableDisks, newErrs, err := disksWithAllParts(onlineDisks, partsMetadata, test.errs, bucket, object) + test.errs = newErrs + outdatedDisks := outDatedDisks(xlDisks, availableDisks, test.errs, partsMetadata, bucket, object) + if modTime.Equal(timeSentinel) { + t.Fatalf("Test %d: modTime should never be equal to timeSentinel, but found equal", + i+1) + } + + if test._tamperBackend != noTamper { + if tamperedIndex != -1 && outdatedDisks[tamperedIndex] == nil { + t.Fatalf("Test %d: disk (%v) with part.1 missing is an outdated disk, but wasn't listed by outDatedDisks", + i+1, xlDisks[tamperedIndex]) + } + + } + + if !modTime.Equal(test.expectedTime) { + t.Fatalf("Test %d: Expected modTime to be equal to %v but was found to be %v", + i+1, test.expectedTime, modTime) + } + + // Check if a disk is considered both online and outdated, + // which is a contradiction, except if parts are missing. + overlappingDisks := make(map[string]*posix) + for _, availableDisk := range availableDisks { + if availableDisk == nil { + continue + } + pDisk := toPosix(availableDisk) + overlappingDisks[pDisk.diskPath] = pDisk + } + + for index, outdatedDisk := range outdatedDisks { + // ignore the intentionally tampered disk, + // this is expected to appear as outdated + // disk, since it doesn't have all the parts. + if index == tamperedIndex { + continue + } + + if outdatedDisk == nil { + continue + } + + pDisk := toPosix(outdatedDisk) + if _, ok := overlappingDisks[pDisk.diskPath]; ok { + t.Errorf("Test %d: Outdated disk %v was also detected as an online disk - %v %v", + i+1, pDisk, availableDisks, outdatedDisks) + } + + // errors other than errFileNotFound doesn't imply that the disk is outdated. + if test.errs[index] != nil && test.errs[index] != errFileNotFound && outdatedDisk != nil { + t.Errorf("Test %d: error (%v) other than errFileNotFound doesn't imply that the disk (%v) could be outdated", + i+1, test.errs[index], pDisk) + } + } + } +} diff --git a/cmd/xl-v1-healing.go b/cmd/xl-v1-healing.go index db8ba2fdd..c815349d7 100644 --- a/cmd/xl-v1-healing.go +++ b/cmd/xl-v1-healing.go @@ -311,6 +311,9 @@ func quickHeal(storageDisks []StorageAPI, writeQuorum int, readQuorum int) error // Heals an object only the corrupted/missing erasure blocks. func healObject(storageDisks []StorageAPI, bucket string, object string, quorum int) error { partsMetadata, errs := readAllXLMetadata(storageDisks, bucket, object) + // readQuorum suffices for xl.json since we use monotonic + // system time to break the tie when a split-brain situation + // arises. if reducedErr := reduceReadQuorumErrs(errs, nil, quorum); reducedErr != nil { return toObjectErr(reducedErr, bucket, object) } @@ -322,12 +325,35 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum // List of disks having latest version of the object. latestDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs) + + // List of disks having all parts as per latest xl.json. + availableDisks, errs, aErr := disksWithAllParts(latestDisks, partsMetadata, errs, bucket, object) + if aErr != nil { + return toObjectErr(aErr, bucket, object) + } + + numAvailableDisks := 0 + for _, disk := range availableDisks { + if disk != nil { + numAvailableDisks++ + } + } + + // If less than read quorum number of disks have all the parts + // of the data, we can't reconstruct the erasure-coded data. + if numAvailableDisks < quorum { + return toObjectErr(errXLReadQuorum, bucket, object) + } + // List of disks having outdated version of the object or missing object. - outDatedDisks := outDatedDisks(storageDisks, partsMetadata, errs) - // Latest xlMetaV1 for reference. If a valid metadata is not present, it is as good as object not found. + outDatedDisks := outDatedDisks(storageDisks, availableDisks, errs, partsMetadata, + bucket, object) + + // Latest xlMetaV1 for reference. If a valid metadata is not + // present, it is as good as object not found. latestMeta, pErr := pickValidXLMeta(partsMetadata, modTime) if pErr != nil { - return pErr + return toObjectErr(pErr, bucket, object) } for index, disk := range outDatedDisks { @@ -357,16 +383,16 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum // Delete all the parts. Ignore if parts are not found. for _, part := range outDatedMeta.Parts { - err := disk.DeleteFile(bucket, pathJoin(object, part.Name)) - if err != nil && !isErr(err, errFileNotFound) { - return traceError(err) + dErr := disk.DeleteFile(bucket, pathJoin(object, part.Name)) + if dErr != nil && !isErr(dErr, errFileNotFound) { + return toObjectErr(traceError(dErr), bucket, object) } } // Delete xl.json file. Ignore if xl.json not found. - err := disk.DeleteFile(bucket, pathJoin(object, xlMetaJSONFile)) - if err != nil && !isErr(err, errFileNotFound) { - return traceError(err) + dErr := disk.DeleteFile(bucket, pathJoin(object, xlMetaJSONFile)) + if dErr != nil && !isErr(dErr, errFileNotFound) { + return toObjectErr(traceError(dErr), bucket, object) } } @@ -390,12 +416,12 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum erasure := latestMeta.Erasure sumInfo := latestMeta.Erasure.GetCheckSumInfo(partName) // Heal the part file. - checkSums, err := erasureHealFile(latestDisks, outDatedDisks, + checkSums, hErr := erasureHealFile(latestDisks, outDatedDisks, bucket, pathJoin(object, partName), minioMetaTmpBucket, pathJoin(tmpID, partName), partSize, erasure.BlockSize, erasure.DataBlocks, erasure.ParityBlocks, sumInfo.Algorithm) - if err != nil { - return err + if hErr != nil { + return toObjectErr(hErr, bucket, object) } for index, sum := range checkSums { if outDatedDisks[index] != nil { @@ -418,9 +444,9 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum } // Generate and write `xl.json` generated from other disks. - err := writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID, partsMetadata, diskCount(outDatedDisks)) - if err != nil { - return toObjectErr(err, bucket, object) + aErr = writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID, partsMetadata, diskCount(outDatedDisks)) + if aErr != nil { + return toObjectErr(aErr, bucket, object) } // Rename from tmp location to the actual location. @@ -429,14 +455,14 @@ func healObject(storageDisks []StorageAPI, bucket string, object string, quorum continue } // Remove any lingering partial data from current namespace. - err = disk.DeleteFile(bucket, retainSlash(object)) - if err != nil && err != errFileNotFound { - return traceError(err) + aErr = disk.DeleteFile(bucket, retainSlash(object)) + if aErr != nil && aErr != errFileNotFound { + return toObjectErr(traceError(aErr), bucket, object) } // Attempt a rename now from healed data to final location. - err = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket, retainSlash(object)) - if err != nil { - return traceError(err) + aErr = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket, retainSlash(object)) + if aErr != nil { + return toObjectErr(traceError(aErr), bucket, object) } } return nil diff --git a/cmd/xl-v1-object_test.go b/cmd/xl-v1-object_test.go index 1349ec985..0c30cd355 100644 --- a/cmd/xl-v1-object_test.go +++ b/cmd/xl-v1-object_test.go @@ -265,6 +265,12 @@ func TestPutObjectNoQuorum(t *testing.T) { // Tests both object and bucket healing. func TestHealing(t *testing.T) { + rootPath, err := newTestConfig(globalMinioDefaultRegion) + if err != nil { + t.Fatalf("Failed to initialize test config %v", err) + } + defer removeAll(rootPath) + obj, fsDirs, err := prepareXL() if err != nil { t.Fatal(err)