From acc452b7cef9c0023f8909c120f5c48d16de46d4 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Thu, 27 May 2021 20:38:09 +0200
Subject: [PATCH] Add more erasure codes on degraded systems. (#11852)

In cases where a cluster is degraded, we do not uphold our consistency
guarantee and we will write fewer erasure codes and rely on healing
to recreate the missing shards.

In practice, replacing known bad disks can take days.
We want to change the behavior of a known degraded system to keep
the erasure code promise of the storage class for each object.

This will create the objects with the same confidence as a fully
functional cluster. The tradeoff will be that objects created
during a partial outage will take up slightly more space.

This means that when the storage class is EC:4, 4 parity shards should
always be written, even if some disks are unavailable.

When an object is created on a set, the disks are immediately
checked. If any disks are unavailable, additional parity shards
will be made for each offline disk, up to 50% of the number of disks.

We add an internal metadata field with the actual and intended
erasure code level. This can optionally be picked up later by
the scanner if we decide that data like this should be re-sharded.
---
 cmd/erasure-multipart.go   |  19 +++++
 cmd/erasure-object.go      |  18 +++++
 cmd/erasure-object_test.go | 156 +++++++++++++++++++++++++++++++++++--
 cmd/http/headers.go        |   3 +
 docs/distributed/README.md |   2 +
 docs/distributed/SIZING.md |   5 ++
 6 files changed, 197 insertions(+), 6 deletions(-)

diff --git a/cmd/erasure-multipart.go b/cmd/erasure-multipart.go
index 621455a95..6015080dd 100644
--- a/cmd/erasure-multipart.go
+++ b/cmd/erasure-multipart.go
@@ -289,7 +289,26 @@ func (er erasureObjects) newMultipartUpload(ctx context.Context, bucket string,
 		parityDrives = er.defaultParityCount
 	}
 
+	ecOrg := parityDrives
+	for _, disk := range onlineDisks {
+		if parityDrives >= len(onlineDisks)/2 {
+			break
+		}
+		if disk == nil {
+			parityDrives++
+			continue
+		}
+		di, err := disk.DiskInfo(ctx)
+		if err != nil || di.ID == "" {
+			parityDrives++
+		}
+	}
+	if ecOrg != parityDrives {
+		opts.UserDefined[xhttp.MinIOErasureUpgraded] = fmt.Sprintf("%d->%d", ecOrg, parityDrives)
+	}
+
 	dataDrives := len(onlineDisks) - parityDrives
+
 	// we now know the number of blocks this object needs for data and parity.
 	// establish the writeQuorum using this data
 	writeQuorum := dataDrives
diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go
index e24dac294..cb4ed73b7 100644
--- a/cmd/erasure-object.go
+++ b/cmd/erasure-object.go
@@ -599,6 +599,24 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
 		if parityDrives <= 0 {
 			parityDrives = er.defaultParityCount
 		}
+
+		// Add one parity shard per offline disk (nil, erroring or without an ID), capped at half the set.
+		ecOrg := parityDrives
+		for _, disk := range storageDisks {
+			if parityDrives >= len(storageDisks)/2 {
+				break
+			}
+			if disk == nil {
+				parityDrives++
+				continue // nil disk: already counted, and calling DiskInfo on it would panic
+			}
+			if di, err := disk.DiskInfo(ctx); err != nil || di.ID == "" {
+				parityDrives++
+			}
+		}
+		if ecOrg != parityDrives {
+			opts.UserDefined[xhttp.MinIOErasureUpgraded] = fmt.Sprintf("%d->%d", ecOrg, parityDrives)
+		}
 	}
 	dataDrives := len(storageDisks) - parityDrives
 
diff --git a/cmd/erasure-object_test.go b/cmd/erasure-object_test.go
index 97c26bdba..3a48ddd05 100644
--- a/cmd/erasure-object_test.go
+++ b/cmd/erasure-object_test.go
@@ -238,8 +238,151 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	// for a 16 disk setup, quorum is 9. To simulate disks not found yet
-	// quorum is available, we remove disks leaving quorum disks behind.
+
+	erasureDisks := xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		for i := range erasureDisks[:6] {
+			erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
+		}
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Fatal(err)
+	}
+
+	// Create "obj" under "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Take 2 more drives offline to 'lose' quorum.
+	erasureDisks = xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		erasureDisks[7] = nil
+		erasureDisks[8] = nil
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	// since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	}
+}
+
+func TestErasureDeleteObjectDiskNotFoundErasure4(t *testing.T) {
+	restoreGlobalStorageClass := globalStorageClass
+	defer func() {
+		globalStorageClass = restoreGlobalStorageClass
+	}()
+
+	globalStorageClass = storageclass.Config{}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Create an instance of xl backend.
+	obj, fsDirs, err := prepareErasure16(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Cleanup backend directories
+	defer obj.Shutdown(context.Background())
+	defer removeRoots(fsDirs)
+
+	z := obj.(*erasureServerPools)
+	xl := z.serverPools[0].sets[0]
+
+	// Create "bucket"
+	err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	bucket := "bucket"
+	object := "object"
+	opts := ObjectOptions{}
+	// Create object "object" under bucket "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Delete the object before any disk has been tampered with; this must succeed.
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Re-create "object" under "bucket" so there is something to delete below.
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// 'Lose' quorum for the object by wrapping the first 5 disks with newNaughtyDisk(..., errFaultyDisk).
+	erasureDisks := xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		for i := range erasureDisks[:5] { // NOTE(review): runs on every getDisks call, so disks are re-wrapped each time — confirm intended
+			erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
+		}
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	// with 5 disks wrapped as faulty, write quorum is expected to be unreachable, hence errErasureWriteQuorum
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	}
+}
+
+func TestErasureDeleteObjectDiskNotFoundErr(t *testing.T) {
+	restoreGlobalStorageClass := globalStorageClass
+	defer func() {
+		globalStorageClass = restoreGlobalStorageClass
+	}()
+
+	globalStorageClass = storageclass.Config{}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Create an instance of xl backend.
+	obj, fsDirs, err := prepareErasure16(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Cleanup backend directories
+	defer obj.Shutdown(context.Background())
+	defer removeRoots(fsDirs)
+
+	z := obj.(*erasureServerPools)
+	xl := z.serverPools[0].sets[0]
+
+	// Create "bucket"
+	err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	bucket := "bucket"
+	object := "object"
+	opts := ObjectOptions{}
+	// Create object "obj" under bucket "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// for a 16 disk setup, EC is 4, but will be upgraded up to 8.
+	// Remove 4 disks.
 	erasureDisks := xl.getDisks()
 	z.serverPools[0].erasureDisksMu.Lock()
 	xl.getDisks = func() []StorageAPI {
@@ -261,20 +404,21 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// Remove one more disk to 'lose' quorum, by setting it to nil.
+	// Object was uploaded with 4 known bad drives, so we should still be able to lose 3 drives and still write to the object.
 	erasureDisks = xl.getDisks()
 	z.serverPools[0].erasureDisksMu.Lock()
 	xl.getDisks = func() []StorageAPI {
 		erasureDisks[7] = nil
 		erasureDisks[8] = nil
+		erasureDisks[9] = nil
 		return erasureDisks
 	}
 
 	z.serverPools[0].erasureDisksMu.Unlock()
 	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
-	// since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
-	if !errors.Is(err, errErasureWriteQuorum) {
-		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	// since majority of disks are available, metaquorum achieved.
+	if err != nil {
+		t.Errorf("Expected deleteObject to not fail, but failed with %v", err)
 	}
 }
 
diff --git a/cmd/http/headers.go b/cmd/http/headers.go
index 14f0a5f6e..bd053027a 100644
--- a/cmd/http/headers.go
+++ b/cmd/http/headers.go
@@ -156,6 +156,9 @@ const (
 	// Reports number of drives currently healing
 	MinIOHealingDrives = "x-minio-healing-drives"
 
+	// Object was stored with additional erasure codes due to degraded system at upload time
+	MinIOErasureUpgraded = "x-minio-internal-erasure-upgraded"
+
 	// Header indicates if the delete marker should be preserved by client
 	MinIOSourceDeleteMarker = "x-minio-source-deletemarker"
 
diff --git a/docs/distributed/README.md b/docs/distributed/README.md
index 64a86d729..c9bfe1841 100644
--- a/docs/distributed/README.md
+++ b/docs/distributed/README.md
@@ -10,6 +10,8 @@ MinIO in distributed mode can help you setup a highly-available storage system w
 
 Distributed MinIO provides protection against multiple node/drive failures and [bit rot](https://github.com/minio/minio/blob/master/docs/erasure/README.md#what-is-bit-rot-protection) using [erasure code](https://docs.min.io/docs/minio-erasure-code-quickstart-guide). As the minimum disks required for distributed MinIO is 4 (same as minimum disks required for erasure coding), erasure code automatically kicks in as you launch distributed MinIO.
 
+If one or more disks are offline at the start of a PutObject or NewMultipartUpload operation, the object will have additional data protection bits added automatically to provide additional safety for these objects.
+
 ### High availability
 
 A stand-alone MinIO server would go down if the server hosting the disks goes offline. In contrast, a distributed MinIO setup with _m_ servers and _n_ disks will have your data safe as long as _m/2_ servers or _m*n_/2 or more disks are online.
diff --git a/docs/distributed/SIZING.md b/docs/distributed/SIZING.md
index da3448cf8..3822d9560 100644
--- a/docs/distributed/SIZING.md
+++ b/docs/distributed/SIZING.md
@@ -32,3 +32,8 @@ Capacity constrained environments, MinIO will work but not recommended for produ
 |      15 |                 2 |          15 |                       4 |                             4 |                              4 |
 |      16 |                 2 |          16 |                       4 |                             4 |                              4 |
 
+If one or more disks are offline at the start of a PutObject or NewMultipartUpload operation, the object will have additional data
+protection bits added automatically to provide the regular safety for these objects up to 50% of the number of disks.
+This will allow normal write operations to take place on systems that exceed the write tolerance.
+
+This means that in the examples above the system will always write 4 parity shards at the expense of slightly higher disk usage.