Fix bug in ErasureStorage.HealFile (#4913)

This commit is contained in:
Aditya Manthramurthy 2017-09-20 22:20:27 +05:30 committed by Dee Koder
parent f7ae3be586
commit 3c0d3f7510
2 changed files with 169 additions and 100 deletions

View File

@ -16,72 +16,140 @@
package cmd package cmd
import "hash" import (
"hash"
)
// HealFile tries to reconstruct an erasure-coded file spread over all
// available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back
// to `staleDisks`.
//
// `staleDisks` is a slice of disks where each non-nil entry has stale
// or no data, and so will be healed.
//
// It is required that `s.disks` have a (read-quorum) majority of
// disks with valid data for healing to work.
//
// In addition, `staleDisks` and `s.disks` must have the same ordering
// of disks w.r.t. erasure coding of the object.
//
// The function tries to read the valid parts of the file under the
// given volume and path, and to reconstruct the file under the given
// healVolume and healPath (on staleDisks). The given algorithm is
// used to verify the valid parts and to protect the reconstructed
// file.
//
// It returns the bitrot checksums computed for the non-nil
// staleDisks; for disks that are read from, the given checksums are
// returned unchanged.
func (s ErasureStorage) HealFile(staleDisks []StorageAPI, volume, path string,
blocksize int64, healVolume, healPath string, size int64,
algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo,
err error) {
// HealFile tries to reconstruct a bitrot encoded file spread over all available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back to the disks.
// It will try to read the valid parts from the file under the given volume and path and tries to reconstruct the file under the given
// healVolume and healPath. The given algorithm will be used to verify the valid parts and to protect the reconstructed file.
func (s ErasureStorage) HealFile(offlineDisks []StorageAPI, volume, path string, blocksize int64, healVolume, healPath string, size int64, algorithm BitrotAlgorithm, checksums [][]byte) (f ErasureFileInfo, err error) {
if !algorithm.Available() { if !algorithm.Available() {
return f, traceError(errBitrotHashAlgoInvalid) return f, traceError(errBitrotHashAlgoInvalid)
} }
// Initialization
f.Checksums = make([][]byte, len(s.disks)) f.Checksums = make([][]byte, len(s.disks))
hashers, verifiers := make([]hash.Hash, len(s.disks)), make([]*BitrotVerifier, len(s.disks)) hashers := make([]hash.Hash, len(s.disks))
verifiers := make([]*BitrotVerifier, len(s.disks))
for i, disk := range s.disks { for i, disk := range s.disks {
if disk == OfflineDisk { switch {
case staleDisks[i] != nil:
hashers[i] = algorithm.New() hashers[i] = algorithm.New()
} else { case disk == nil:
// disregard unavailable disk
continue
default:
verifiers[i] = NewBitrotVerifier(algorithm, checksums[i]) verifiers[i] = NewBitrotVerifier(algorithm, checksums[i])
f.Checksums[i] = checksums[i] f.Checksums[i] = checksums[i]
} }
} }
blocks := make([][]byte, len(s.disks))
// Scan part files on disk, block-by-block reconstruct it and
// write to stale disks.
chunksize := getChunkSize(blocksize, s.dataBlocks) chunksize := getChunkSize(blocksize, s.dataBlocks)
for offset := int64(0); offset < size; offset += blocksize { var chunkOffset, blockOffset int64
if size < blocksize { for ; blockOffset < size; blockOffset += blocksize {
blocksize = size // last iteration may have less than blocksize data
// left, so chunksize needs to be recomputed.
if size < blockOffset+blocksize {
blocksize = size - blockOffset
chunksize = getChunkSize(blocksize, s.dataBlocks) chunksize = getChunkSize(blocksize, s.dataBlocks)
} }
// read a chunk from each disk, until we have
// `s.dataBlocks` number of chunks set to non-nil in
// `blocks`
blocks := make([][]byte, len(s.disks))
var buffer []byte
numReads := 0 numReads := 0
for i, disk := range s.disks { for i, disk := range s.disks {
if disk != OfflineDisk { // skip reading from unavailable or stale disks
if blocks[i] == nil { if disk == nil || staleDisks[i] != nil {
blocks[i] = make([]byte, chunksize) continue
}
// allocate buffer only when needed - when
// reads fail, the buffer can be reused
if int64(len(buffer)) != chunksize {
buffer = make([]byte, chunksize)
} }
blocks[i] = blocks[i][:chunksize]
if !verifiers[i].IsVerified() { if !verifiers[i].IsVerified() {
_, err = disk.ReadFileWithVerify(volume, path, offset, blocks[i], verifiers[i]) _, err = disk.ReadFileWithVerify(volume, path,
chunkOffset, buffer, verifiers[i])
} else { } else {
_, err = disk.ReadFile(volume, path, offset, blocks[i]) _, err = disk.ReadFile(volume, path,
chunkOffset, buffer)
} }
if err != nil { if err != nil {
blocks[i] = nil // LOG FIXME: add a conditional log
} else { // for read failures, once per-disk
numReads++ // per-function-invocation.
continue
} }
if numReads == s.dataBlocks { // we have enough data to reconstruct
// read was successful, so set the buffer as
// blocks[i], and reset buffer to nil to force
// allocation on next iteration
blocks[i], buffer = buffer, nil
numReads++
if numReads == s.dataBlocks {
// we have enough data to reconstruct
break break
} }
} }
}
// advance the chunk offset to prepare for next loop
// iteration
chunkOffset += chunksize
// reconstruct data - this computes all data and parity shards
if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil { if err = s.ErasureDecodeDataAndParityBlocks(blocks); err != nil {
return f, err return f, err
} }
for i, disk := range s.disks {
if disk != OfflineDisk { // write computed shards as chunks on file in each
// stale disk
for i, disk := range staleDisks {
if disk == nil {
continue continue
} }
if err = offlineDisks[i].AppendFile(healVolume, healPath, blocks[i]); err != nil {
err = disk.AppendFile(healVolume, healPath, blocks[i])
if err != nil {
return f, traceError(err) return f, traceError(err)
} }
hashers[i].Write(blocks[i]) hashers[i].Write(blocks[i])
} }
} }
// copy computed file hashes into output variable
f.Size = size f.Size = size
f.Algorithm = algorithm f.Algorithm = algorithm
for i, disk := range s.disks { for i, disk := range staleDisks {
if disk != OfflineDisk { if disk == nil {
continue continue
} }
f.Checksums[i] = hashers[i].Sum(nil) f.Checksums[i] = hashers[i].Sum(nil)

View File

@ -25,38 +25,51 @@ import (
) )
var erasureHealFileTests = []struct { var erasureHealFileTests = []struct {
dataBlocks int dataBlocks, disks int
disks, offDisks, badDisks, badOffDisks int
// number of offline disks is also number of staleDisks for
// erasure reconstruction in this test
offDisks int
// bad disks are online disks which return errors
badDisks, badStaleDisks int
blocksize, size int64 blocksize, size int64
algorithm BitrotAlgorithm algorithm BitrotAlgorithm
shouldFail bool shouldFail bool
shouldFailQuorum bool
}{ }{
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 0 {dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 0
{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 1 {dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 1
{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 2 {dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 2
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 3 {dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 3
{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false, shouldFailQuorum: false}, // 4 {dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 4
{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 5 {dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 5
{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 6 {dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 6
{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badOffDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: true, shouldFailQuorum: false}, // 7 {dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 7
{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true, shouldFailQuorum: false}, // 8 {dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 8
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badOffDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true, shouldFailQuorum: false}, // 9 {dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badStaleDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true}, // 9
{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 10 {dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 10
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badOffDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 11 {dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 11
{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 12 {dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 12
{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 13 {dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 13
{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 14 {dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 14
{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: true}, // 15 {dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 15
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 16 {dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 16
{dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 17 {dataBlocks: 2, disks: 4, offDisks: 0, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true}, // 17
{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false, shouldFailQuorum: false}, // 18 {dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 18
{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false, shouldFailQuorum: false}, // 19 {dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 19
{dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badOffDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true, shouldFailQuorum: false}, // 20 {dataBlocks: 7, disks: 10, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte, algorithm: 0, shouldFail: true}, // 20
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV1), size: oneMiByte * 64, algorithm: SHA256, shouldFail: false}, // 21
} }
func TestErasureHealFile(t *testing.T) { func TestErasureHealFile(t *testing.T) {
for i, test := range erasureHealFileTests { for i, test := range erasureHealFileTests {
if test.offDisks < test.badStaleDisks {
// test case sanity check
t.Fatalf("Test %d: Bad test case - number of stale disks cannot be less than number of badstale disks", i)
}
// create some test data
setup, err := newErasureTestSetup(test.dataBlocks, test.disks-test.dataBlocks, test.blocksize) setup, err := newErasureTestSetup(test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
if err != nil { if err != nil {
t.Fatalf("Test %d: failed to setup XL environment: %v", i, err) t.Fatalf("Test %d: failed to setup XL environment: %v", i, err)
@ -66,15 +79,11 @@ func TestErasureHealFile(t *testing.T) {
setup.Remove() setup.Remove()
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err) t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
} }
offline := make([]StorageAPI, len(storage.disks))
copy(offline, storage.disks)
data := make([]byte, test.size) data := make([]byte, test.size)
if _, err = io.ReadFull(rand.Reader, data); err != nil { if _, err = io.ReadFull(rand.Reader, data); err != nil {
setup.Remove() setup.Remove()
t.Fatalf("Test %d: failed to create random test data: %v", i, err) t.Fatalf("Test %d: failed to create random test data: %v", i, err)
} }
algorithm := test.algorithm algorithm := test.algorithm
if !algorithm.Available() { if !algorithm.Available() {
algorithm = DefaultBitrotAlgorithm algorithm = DefaultBitrotAlgorithm
@ -86,7 +95,25 @@ func TestErasureHealFile(t *testing.T) {
t.Fatalf("Test %d: failed to create random test data: %v", i, err) t.Fatalf("Test %d: failed to create random test data: %v", i, err)
} }
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums) // setup stale disks for the test case
staleDisks := make([]StorageAPI, len(storage.disks))
copy(staleDisks, storage.disks)
for j := 0; j < len(storage.disks); j++ {
if j < test.offDisks {
storage.disks[j] = OfflineDisk
} else {
staleDisks[j] = nil
}
}
for j := 0; j < test.badDisks; j++ {
storage.disks[test.offDisks+j] = badDisk{nil}
}
for j := 0; j < test.badStaleDisks; j++ {
staleDisks[j] = badDisk{nil}
}
// test case setup is complete - now call HealFile()
info, err := storage.HealFile(staleDisks, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFail { if err != nil && !test.shouldFail {
t.Errorf("Test %d: should pass but it failed with: %v", i, err) t.Errorf("Test %d: should pass but it failed with: %v", i, err)
} }
@ -100,39 +127,13 @@ func TestErasureHealFile(t *testing.T) {
if info.Algorithm != test.algorithm { if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm) t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
} }
if !reflect.DeepEqual(info.Checksums, file.Checksums) { // Verify that checksums of staleDisks
t.Errorf("Test %d: heal returned different bitrot keys", i) // match expected values
for i, disk := range staleDisks {
if disk == nil {
continue
} }
} if !reflect.DeepEqual(info.Checksums[i], file.Checksums[i]) {
if err == nil && !test.shouldFail {
for j := 0; j < len(storage.disks); j++ {
if j < test.offDisks {
storage.disks[j] = OfflineDisk
} else {
offline[j] = OfflineDisk
}
}
for j := 0; j < test.badDisks; j++ {
storage.disks[test.offDisks+j] = badDisk{nil}
}
for j := 0; j < test.badOffDisks; j++ {
offline[j] = badDisk{nil}
}
info, err := storage.HealFile(offline, "testbucket", "testobject", test.blocksize, "testbucket", "healedobject", test.size, test.algorithm, file.Checksums)
if err != nil && !test.shouldFailQuorum {
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
}
if err == nil && test.shouldFailQuorum {
t.Errorf("Test %d: should fail but it passed", i)
}
if err == nil {
if info.Size != test.size {
t.Errorf("Test %d: healed wrong number of bytes: got: #%d want: #%d", i, info.Size, test.size)
}
if info.Algorithm != test.algorithm {
t.Errorf("Test %d: healed with wrong algorithm: got: %v want: %v", i, info.Algorithm, test.algorithm)
}
if !reflect.DeepEqual(info.Checksums, file.Checksums) {
t.Errorf("Test %d: heal returned different bitrot checksums", i) t.Errorf("Test %d: heal returned different bitrot checksums", i)
} }
} }