Fix bug in ErasureStorage.HealFile (#4913)

This commit is contained in:
Aditya Manthramurthy 2017-09-20 22:20:27 +05:30 committed by Dee Koder
parent f7ae3be586
commit 3c0d3f7510
2 changed files with 169 additions and 100 deletions

View File

@ -16,72 +16,140 @@
package cmd
import "hash"
import (
"hash"
)
// HealFile tries to reconstruct an erasure-coded file spread over all
// available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back
// to `staleDisks`.
//
// `staleDisks` is a slice of disks where each non-nil entry has stale
// or no data, and so will be healed.
//
// It is required that `s.disks` have a (read-quorum) majority of
// disks with valid data for healing to work.
//
// In addition, `staleDisks` and `s.disks` must have the same ordering
// of disks w.r.t. erasure coding of the object.
//
// The function reads the valid parts of the file under the given
// volume and path, and tries to reconstruct the file under the
// given healVolume and healPath (on staleDisks). The given
// algorithm is used to verify the valid parts and to protect the
// reconstructed file.
//
// It returns bitrot checksums for the non-nil staleDisks.
func (s ErasureStorage) HealFile(staleDisks []StorageAPI, volume, path string,
blocksize int64, healVolume, healPath string, size int64,
algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo,
err error) {
// HealFile tries to reconstruct a bitrot encoded file spread over all available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back to the disks.
// It will try to read the valid parts from the file under the given volume and path and tries to reconstruct the file under the given
// healVolume and healPath. The given algorithm will be used to verify the valid parts and to protect the reconstructed file.
func (s ErasureStorage) HealFile(offlineDisks []StorageAPI, volume, path string, blocksize int64, healVolume, healPath string, size int64, algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo, err error) {
if !algorithm.Available() {
return f, traceError(errBitrotHashAlgoInvalid)
}
// Initialization
f.Checksums = make([][]byte, len(s.disks))
hashers, verifiers := make([]hash.Hash, len(s.disks)), make([]*BitrotVerifier, len(s.disks))
hashers := make([]hash.Hash, len(s.disks))
verifiers := make([]*BitrotVerifier, len(s.disks))
for i, disk := range s.disks {
if disk == OfflineDisk {
switch {
case staleDisks[i] != nil:
hashers[i] = algorithm.New()
} else {
case disk == nil:
// disregard unavailable disk
continue
default:
verifiers[i] = NewBitrotVerifier(algorithm, checksums[i])
f.Checksums[i] = checksums[i]
}
}
blocks := make([][]byte, len(s.disks))
// Scan part files on disk, block-by-block reconstruct it and
// write to stale disks.
chunksize := getChunkSize(blocksize, s.dataBlocks)
for offset := int64(0); offset < size; offset += blocksize {
if size < blocksize {
blocksize = size
var chunkOffset, blockOffset int64
for ; blockOffset < size; blockOffset += blocksize {
// last iteration may have less than blocksize data
// left, so chunksize needs to be recomputed.
if size < blockOffset+blocksize {
blocksize = size - blockOffset
chunksize = getChunkSize(blocksize, s.dataBlocks)
}
// read a chunk from each disk, until we have
// `s.dataBlocks` number of chunks set to non-nil in
// `blocks`
blocks := make([][]byte, len(s.disks))
var buffer []byte
numReads := 0
for i, disk := range s.disks {
if disk != OfflineDisk {
if blocks[i] == nil {
blocks[i] = make([]byte, chunksize)
}
blocks[i] = blocks[i][:chunksize]
if !verifiers[i].IsVerified() {
_, err = disk.ReadFileWithVerify(volume, path, offset, blocks[i], verifiers[i])
} else {
_, err = disk.ReadFile(volume, path, offset, blocks[i])
}
if err != nil {
blocks[i] = nil
} else {
numReads++
}
if numReads == s.dataBlocks { // we have enough data to reconstruct
break
}
// skip reading from unavailable or stale disks
if disk == nil || staleDisks[i] != nil {
continue
}
// allocate buffer only when needed - when
// reads fail, the buffer can be reused
if int64(len(buffer)) != chunksize {
buffer = make([]byte, chunksize)
}
if !verifiers[i].IsVerified() {
_, err = disk.ReadFileWithVerify(volume, path,
chunkOffset, buffer, verifiers[i])
} else {
_, err = disk.ReadFile(volume, path,
chunkOffset, buffer)
}
if err != nil {
// LOG FIXME: add a conditional log
// for read failures, once per-disk
// per-function-invocation.
continue
}
// read was successful, so set the buffer as
// blocks[i], and reset buffer to nil to force
// allocation on next iteration
blocks[i], buffer = buffer, nil
numReads++
if numReads == s.dataBlocks {
// we have enough data to reconstruct
break
}
}
// advance the chunk offset to prepare for next loop
// iteration
chunkOffset += chunksize
// reconstruct data - this computes all data and parity shards
if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil {
return f, err
}
for i, disk := range s.disks {
if disk != OfflineDisk {
// write computed shards as chunks on file in each
// stale disk
for i, disk := range staleDisks {
if disk == nil {
continue
}
if err = offlineDisks[i].AppendFile(healVolume, healPath, blocks[i]); err != nil {
err = disk.AppendFile(healVolume, healPath, blocks[i])
if err != nil {
return f, traceError(err)
}
hashers[i].Write(blocks[i])
}
}
// copy computed file hashes into output variable
f.Size = size
f.Algorithm = algorithm
for i, disk := range s.disks {
if disk != OfflineDisk {
for i, disk := range staleDisks {
if disk == nil {
continue
}
f.Checksums[i] = hashers[i].Sum(nil)

View File

@ -25,38 +25,51 @@ import (
)
var erasureHealFileTests = []struct {
dataBlocks int
disks, offDisks, badDisks, badOffDisks int
blocksize, size int64
algorithm BitrotAlgorithm
shouldFail bool
shouldFailQuorum bool
dataBlocks, disks int
// number of offline disks is also number of staleDisks for
// erasure reconstruction in this test
offDisks int
// bad disks are online disks which return errors
badDisks, badStaleDisks int
blocksize, size int64
algorithm BitrotAlgorithm
shouldFail bool
}{
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 0
{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 1
{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 2
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 3
{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 4
{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 5
{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 6
{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: true, shouldFailQuorum: false}, // 7
{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 8
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badOffDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true, shouldFailQuorum: false}, // 9
{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 10
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 11
{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 12
{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 13
{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 14
{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 15
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 16
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 17
{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 18
{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 19
{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 20
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 0
{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 1
{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 2
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 3
{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 4
{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 5
{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 6
{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 7
{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 8
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badStaleDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true}, // 9
{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 10
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 11
{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 12
{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 13
{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 14
{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 15
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 16
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true}, // 17
{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 18
{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 19
{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true}, // 20
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte * 64, algorithm: SHA256, shouldFail: false}, // 21
}
func TestErasureHealFile(t *testing.T) {
for i, test := range erasureHealFileTests {
if test.offDisks < test.badStaleDisks {
// test case sanity check
t.Fatalf("Test %d: Bad test case - number of stale disks cannot be less than number of badstale disks", i)
}
// create some test data
setup, err := newErasureTestSetup(test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
if err != nil {
t.Fatalf("Test %d: failed to setup XL environment: %v", i, err)
@ -66,15 +79,11 @@ func TestErasureHealFile(t *testing.T) {
setup.Remove()
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
}
offline := make([]StorageAPI, len(storage.disks))
copy(offline, storage.disks)
data := make([]byte, test.size)
if _, err = io.ReadFull(rand.Reader, data); err != nil {
setup.Remove()
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
}
algorithm := test.algorithm
if !algorithm.Available() {
algorithm = DefaultBitrotAlgorithm
@ -86,7 +95,25 @@ func TestErasureHealFile(t *testing.T) {
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
}
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
// setup stale disks for the test case
staleDisks := make([]StorageAPI, len(storage.disks))
copy(staleDisks, storage.disks)
for j := 0; j < len(storage.disks); j++ {
if j < test.offDisks {
storage.disks[j] = OfflineDisk
} else {
staleDisks[j] = nil
}
}
for j := 0; j < test.badDisks; j++ {
storage.disks[test.offDisks+j] = badDisk{nil}
}
for j := 0; j < test.badStaleDisks; j++ {
staleDisks[j] = badDisk{nil}
}
// test case setup is complete - now call HealFile()
info, err := storage.HealFile(staleDisks, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFail {
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
}
@ -100,39 +127,13 @@ func TestErasureHealFile(t *testing.T) {
if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
}
if !reflect.DeepEqual(info.Checksums, file.Checksums) {
t.Errorf("Test %d: heal returned different bitrot keys", i)
}
}
if err == nil && !test.shouldFail {
for j := 0; j < len(storage.disks); j++ {
if j < test.offDisks {
storage.disks[j] = OfflineDisk
} else {
offline[j] = OfflineDisk
// Verify that checksums of staleDisks
// match expected values
for i, disk := range staleDisks {
if disk == nil {
continue
}
}
for j := 0; j < test.badDisks; j++ {
storage.disks[test.offDisks+j] = badDisk{nil}
}
for j := 0; j < test.badOffDisks; j++ {
offline[j] = badDisk{nil}
}
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFailQuorum {
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
}
if err == nil && test.shouldFailQuorum {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if info.Size != test.size {
t.Errorf("Test %d: healed wrong number of bytes: got: #%d want: #%d", i, info.Size, test.size)
}
if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
}
if !reflect.DeepEqual(info.Checksums, file.Checksums) {
if !reflect.DeepEqual(info.Checksums[i], file.Checksums[i]) {
t.Errorf("Test %d: heal returned different bitrot checksums", i)
}
}