Simplify erasure code by separating bitrot from erasure code (#5959)

This commit is contained in:
Krishna Srinivas
2018-08-06 15:14:08 -07:00
committed by kannappanr
parent 37de2dbd3b
commit ce02ab613d
23 changed files with 874 additions and 751 deletions

View File

@@ -18,171 +18,31 @@ package cmd
import (
"context"
"fmt"
"hash"
"strings"
"io"
"github.com/minio/minio/cmd/logger"
)
// HealFile tries to reconstruct an erasure-coded file spread over all
// available disks. HealFile will read the valid parts of the file,
// reconstruct the missing data and write the reconstructed parts back
// to `staleDisks` at the destination `dstVol/dstPath/`. Parts are
// verified against the given BitrotAlgorithm and checksums.
//
// `staleDisks` is a slice of disks where each non-nil entry has stale
// or no data, and so will be healed.
//
// It is required that `s.disks` have a (read-quorum) majority of
// disks with valid data for healing to work.
//
// In addition, `staleDisks` and `s.disks` must have the same ordering
// of disks w.r.t. erasure coding of the object.
//
// Errors when writing to `staleDisks` are not propagated as long as
// writes succeed for at least one disk. This allows partial healing
// despite stale disks being faulty.
//
// It returns bitrot checksums for the non-nil staleDisks on which
// healing succeeded.
func (s ErasureStorage) HealFile(ctx context.Context, staleDisks []StorageAPI, volume, path string, blocksize int64,
dstVol, dstPath string, size int64, alg BitrotAlgorithm, checksums [][]byte) (
f ErasureFileInfo, err error) {
if !alg.Available() {
logger.LogIf(ctx, errBitrotHashAlgoInvalid)
return f, errBitrotHashAlgoInvalid
}
// Initialization
f.Checksums = make([][]byte, len(s.disks))
hashers := make([]hash.Hash, len(s.disks))
verifiers := make([]*BitrotVerifier, len(s.disks))
for i, disk := range s.disks {
switch {
case staleDisks[i] != nil:
hashers[i] = alg.New()
case disk == nil:
// disregard unavailable disk
continue
default:
verifiers[i] = NewBitrotVerifier(alg, checksums[i])
// HealFile heals the shard files on non-nil writers. Note that the quorum passed is 1
// as healing should continue even if it has been successful healing only one shard file.
func (s ErasureStorage) HealFile(ctx context.Context, readers []*bitrotReader, writers []*bitrotWriter, size int64) error {
r, w := io.Pipe()
go func() {
if err := s.ReadFile(ctx, w, readers, 0, size, size); err != nil {
w.CloseWithError(err)
return
}
}
writeErrors := make([]error, len(s.disks))
// Read part file data on each disk
chunksize := ceilFrac(blocksize, int64(s.dataBlocks))
numBlocks := ceilFrac(size, blocksize)
readLen := chunksize * (numBlocks - 1)
lastChunkSize := chunksize
hasSmallerLastBlock := size%blocksize != 0
if hasSmallerLastBlock {
lastBlockLen := size % blocksize
lastChunkSize = ceilFrac(lastBlockLen, int64(s.dataBlocks))
}
readLen += lastChunkSize
var buffers [][]byte
buffers, _, err = s.readConcurrent(ctx, volume, path, 0, readLen, verifiers)
w.Close()
}()
buf := make([]byte, s.blockSize)
// quorum is 1 because CreateFile should continue writing as long as we are writing to even 1 disk.
n, err := s.CreateFile(ctx, r, writers, buf, 1)
if err != nil {
return f, err
return err
}
// Scan part files on disk, block-by-block reconstruct it and
// write to stale disks.
blocks := make([][]byte, len(s.disks))
if numBlocks > 1 {
// Allocate once for all the equal length blocks. The
// last block may have a different length - allocation
// for this happens inside the for loop below.
for i := range blocks {
if len(buffers[i]) == 0 {
blocks[i] = make([]byte, chunksize)
}
}
if n != size {
logger.LogIf(ctx, errLessData)
return errLessData
}
var buffOffset int64
for blockNumber := int64(0); blockNumber < numBlocks; blockNumber++ {
if blockNumber == numBlocks-1 && lastChunkSize != chunksize {
for i := range blocks {
if len(buffers[i]) == 0 {
blocks[i] = make([]byte, lastChunkSize)
}
}
}
for i := range blocks {
if len(buffers[i]) == 0 {
blocks[i] = blocks[i][0:0]
}
}
csize := chunksize
if blockNumber == numBlocks-1 {
csize = lastChunkSize
}
for i := range blocks {
if len(buffers[i]) != 0 {
blocks[i] = buffers[i][buffOffset : buffOffset+csize]
}
}
buffOffset += csize
if err = s.ErasureDecodeDataAndParityBlocks(ctx, blocks); err != nil {
return f, err
}
// write computed shards as chunks on file in each
// stale disk
writeSucceeded := false
for i, disk := range staleDisks {
// skip nil disk or disk that had error on
// previous write
if disk == nil || writeErrors[i] != nil {
continue
}
writeErrors[i] = disk.AppendFile(dstVol, dstPath, blocks[i])
if writeErrors[i] == nil {
hashers[i].Write(blocks[i])
writeSucceeded = true
}
}
// If all disks had write errors we quit.
if !writeSucceeded {
// build error from all write errors
err := joinWriteErrors(writeErrors)
logger.LogIf(ctx, err)
return f, err
}
}
// copy computed file hashes into output variable
f.Size = size
f.Algorithm = alg
for i, disk := range staleDisks {
if disk == nil || writeErrors[i] != nil {
continue
}
f.Checksums[i] = hashers[i].Sum(nil)
}
return f, nil
}
func joinWriteErrors(errs []error) error {
msgs := []string{}
for i, err := range errs {
if err == nil {
continue
}
msgs = append(msgs, fmt.Sprintf("disk %d: %v", i+1, err))
}
return fmt.Errorf("all stale disks had write errors during healing: %s",
strings.Join(msgs, ", "))
return nil
}