Simplify erasure code by separating bitrot from erasure code (#5959)

2025-11-10 14:09:48 -05:00 · 2018-08-06 15:14:08 -07:00
parent 37de2dbd3b
commit ce02ab613d
23 changed files with 874 additions and 751 deletions
--- a/cmd/erasure-healfile.go
+++ b/cmd/erasure-healfile.go
@@ -18,171 +18,31 @@ package cmd

 import (
 	"context"
-	"fmt"
-	"hash"
-	"strings"
+	"io"

 	"github.com/minio/minio/cmd/logger"
 )

-// HealFile tries to reconstruct an erasure-coded file spread over all
-// available disks. HealFile will read the valid parts of the file,
-// reconstruct the missing data and write the reconstructed parts back
-// to `staleDisks` at the destination `dstVol/dstPath/`. Parts are
-// verified against the given BitrotAlgorithm and checksums.
-//
-// `staleDisks` is a slice of disks where each non-nil entry has stale
-// or no data, and so will be healed.
-//
-// It is required that `s.disks` have a (read-quorum) majority of
-// disks with valid data for healing to work.
-//
-// In addition, `staleDisks` and `s.disks` must have the same ordering
-// of disks w.r.t. erasure coding of the object.
-//
-// Errors when writing to `staleDisks` are not propagated as long as
-// writes succeed for at least one disk. This allows partial healing
-// despite stale disks being faulty.
-//
-// It returns bitrot checksums for the non-nil staleDisks on which
-// healing succeeded.
-func (s ErasureStorage) HealFile(ctx context.Context, staleDisks []StorageAPI, volume, path string, blocksize int64,
-	dstVol, dstPath string, size int64, alg BitrotAlgorithm, checksums [][]byte) (
-	f ErasureFileInfo, err error) {
-
-	if !alg.Available() {
-		logger.LogIf(ctx, errBitrotHashAlgoInvalid)
-		return f, errBitrotHashAlgoInvalid
-	}
-
-	// Initialization
-	f.Checksums = make([][]byte, len(s.disks))
-	hashers := make([]hash.Hash, len(s.disks))
-	verifiers := make([]*BitrotVerifier, len(s.disks))
-	for i, disk := range s.disks {
-		switch {
-		case staleDisks[i] != nil:
-			hashers[i] = alg.New()
-		case disk == nil:
-			// disregard unavailable disk
-			continue
-		default:
-			verifiers[i] = NewBitrotVerifier(alg, checksums[i])
+// HealFile heals the shard files on non-nil writers. Note that the quorum passed is 1
+// as healing should continue even if it has been successful healing only one shard file.
+func (s ErasureStorage) HealFile(ctx context.Context, readers []*bitrotReader, writers []*bitrotWriter, size int64) error {
+	r, w := io.Pipe()
+	go func() {
+		if err := s.ReadFile(ctx, w, readers, 0, size, size); err != nil {
+			w.CloseWithError(err)
+			return
 		}
-	}
-	writeErrors := make([]error, len(s.disks))
-
-	// Read part file data on each disk
-	chunksize := ceilFrac(blocksize, int64(s.dataBlocks))
-	numBlocks := ceilFrac(size, blocksize)
-
-	readLen := chunksize * (numBlocks - 1)
-
-	lastChunkSize := chunksize
-	hasSmallerLastBlock := size%blocksize != 0
-	if hasSmallerLastBlock {
-		lastBlockLen := size % blocksize
-		lastChunkSize = ceilFrac(lastBlockLen, int64(s.dataBlocks))
-	}
-	readLen += lastChunkSize
-	var buffers [][]byte
-	buffers, _, err = s.readConcurrent(ctx, volume, path, 0, readLen, verifiers)
+		w.Close()
+	}()
+	buf := make([]byte, s.blockSize)
+	// quorum is 1 because CreateFile should continue writing as long as we are writing to even 1 disk.
+	n, err := s.CreateFile(ctx, r, writers, buf, 1)
 	if err != nil {
-		return f, err
+		return err
 	}
-
-	// Scan part files on disk, block-by-block reconstruct it and
-	// write to stale disks.
-	blocks := make([][]byte, len(s.disks))
-
-	if numBlocks > 1 {
-		// Allocate once for all the equal length blocks. The
-		// last block may have a different length - allocation
-		// for this happens inside the for loop below.
-		for i := range blocks {
-			if len(buffers[i]) == 0 {
-				blocks[i] = make([]byte, chunksize)
-			}
-		}
+	if n != size {
+		logger.LogIf(ctx, errLessData)
+		return errLessData
 	}
-
-	var buffOffset int64
-	for blockNumber := int64(0); blockNumber < numBlocks; blockNumber++ {
-		if blockNumber == numBlocks-1 && lastChunkSize != chunksize {
-			for i := range blocks {
-				if len(buffers[i]) == 0 {
-					blocks[i] = make([]byte, lastChunkSize)
-				}
-			}
-		}
-
-		for i := range blocks {
-			if len(buffers[i]) == 0 {
-				blocks[i] = blocks[i][0:0]
-			}
-		}
-
-		csize := chunksize
-		if blockNumber == numBlocks-1 {
-			csize = lastChunkSize
-		}
-		for i := range blocks {
-			if len(buffers[i]) != 0 {
-				blocks[i] = buffers[i][buffOffset : buffOffset+csize]
-			}
-		}
-		buffOffset += csize
-
-		if err = s.ErasureDecodeDataAndParityBlocks(ctx, blocks); err != nil {
-			return f, err
-		}
-
-		// write computed shards as chunks on file in each
-		// stale disk
-		writeSucceeded := false
-		for i, disk := range staleDisks {
-			// skip nil disk or disk that had error on
-			// previous write
-			if disk == nil || writeErrors[i] != nil {
-				continue
-			}
-
-			writeErrors[i] = disk.AppendFile(dstVol, dstPath, blocks[i])
-			if writeErrors[i] == nil {
-				hashers[i].Write(blocks[i])
-				writeSucceeded = true
-			}
-		}
-
-		// If all disks had write errors we quit.
-		if !writeSucceeded {
-			// build error from all write errors
-			err := joinWriteErrors(writeErrors)
-			logger.LogIf(ctx, err)
-			return f, err
-		}
-	}
-
-	// copy computed file hashes into output variable
-	f.Size = size
-	f.Algorithm = alg
-	for i, disk := range staleDisks {
-		if disk == nil || writeErrors[i] != nil {
-			continue
-		}
-		f.Checksums[i] = hashers[i].Sum(nil)
-	}
-	return f, nil
-}
-
-func joinWriteErrors(errs []error) error {
-	msgs := []string{}
-	for i, err := range errs {
-		if err == nil {
-			continue
-		}
-		msgs = append(msgs, fmt.Sprintf("disk %d: %v", i+1, err))
-	}
-	return fmt.Errorf("all stale disks had write errors during healing: %s",
-		strings.Join(msgs, ", "))
+	return nil
 }