Add more erasure codes on degraded systems. (#11852)

In cases where a cluster is degraded, we currently do not uphold our consistency
guarantee: we write fewer parity shards and rely on healing
to recreate the missing shards.

In practice, replacing known bad disks can take days.
We therefore change the behavior of a known-degraded system to keep
the erasure-code promise of the storage class for each object.

This creates objects with the same confidence as a fully
functional cluster. The tradeoff is that objects created
during a partial outage take up slightly more space.

This means that with storage class EC:4, four parity shards are
always written, even if some disks are unavailable.

When an object is created on a set, the disks are checked immediately.
If any disks are unavailable, an additional parity shard is written
for each offline disk, up to 50% of the number of disks in the set.
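
A minimal sketch of that rule (illustrative names in a standalone program, not the actual MinIO implementation; it assumes the data+parity total stays equal to the set size, so extra parity displaces data shards, which is where the extra space comes from):

package main

import "fmt"

// upgradedParity sketches the rule described above: start from the storage
// class parity and add one parity shard per offline disk, capped at half
// the drives in the erasure set.
func upgradedParity(setDriveCount, standardParity, offlineDisks int) int {
    parity := standardParity + offlineDisks
    if half := setDriveCount / 2; parity > half {
        parity = half
    }
    return parity
}

func main() {
    // A 16-drive set with EC:4: healthy writes use 4 parity shards,
    // 2 offline drives bump that to 6, and 10 offline drives cap it at 8.
    fmt.Println(upgradedParity(16, 4, 0))  // 4
    fmt.Println(upgradedParity(16, 4, 2))  // 6
    fmt.Println(upgradedParity(16, 4, 10)) // 8
}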

We add an internal metadata field recording both the actual and the intended
erasure-code level; the scanner can optionally pick this up later
if we decide that such data should be re-sharded.
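
For illustration only (the metadata key name and value format below are assumptions, not necessarily what the commit writes into xl.meta), recording the intended versus actual parity could look like this:

package main

import "fmt"

// Hypothetical internal key; the real field name used by MinIO may differ.
const erasureUpgradedKey = "x-minio-internal-erasure-upgraded"

// markUpgraded records the intended and actual parity when they differ,
// so a later scan could decide to re-shard the object.
func markUpgraded(meta map[string]string, intendedParity, actualParity int) {
    if actualParity != intendedParity {
        meta[erasureUpgradedKey] = fmt.Sprintf("%d->%d", intendedParity, actualParity)
    }
}

func main() {
    meta := map[string]string{}
    markUpgraded(meta, 4, 6) // an EC:4 object written with 6 parity shards
    fmt.Println(meta)
}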

Authored by Klaus Post on 2021-05-27 20:38:09 +02:00; committed by GitHub.
parent be541dba8a · commit acc452b7ce
6 changed files with 197 additions and 6 deletions

@@ -238,8 +238,151 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
    if err != nil {
        t.Fatal(err)
    }

    // for a 16 disk setup, quorum is 9. To simulate disks not found yet
    // quorum is available, we remove disks leaving quorum disks behind.
    erasureDisks := xl.getDisks()
    z.serverPools[0].erasureDisksMu.Lock()
    xl.getDisks = func() []StorageAPI {
        for i := range erasureDisks[:6] {
            erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
        }
        return erasureDisks
    }
    z.serverPools[0].erasureDisksMu.Unlock()

    _, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
    if !errors.Is(err, errErasureWriteQuorum) {
        t.Fatal(err)
    }

    // Create "obj" under "bucket".
    _, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
    if err != nil {
        t.Fatal(err)
    }

    // Remove more disks to 'lose' quorum, by taking 2 more drives offline.
    erasureDisks = xl.getDisks()
    z.serverPools[0].erasureDisksMu.Lock()
    xl.getDisks = func() []StorageAPI {
        erasureDisks[7] = nil
        erasureDisks[8] = nil
        return erasureDisks
    }
    z.serverPools[0].erasureDisksMu.Unlock()

    _, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
    // since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
    if !errors.Is(err, errErasureWriteQuorum) {
        t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
    }
}

func TestErasureDeleteObjectDiskNotFoundErasure4(t *testing.T) {
    restoreGlobalStorageClass := globalStorageClass
    defer func() {
        globalStorageClass = restoreGlobalStorageClass
    }()
    globalStorageClass = storageclass.Config{}

    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    // Create an instance of xl backend.
    obj, fsDirs, err := prepareErasure16(ctx)
    if err != nil {
        t.Fatal(err)
    }
    // Cleanup backend directories
    defer obj.Shutdown(context.Background())
    defer removeRoots(fsDirs)

    z := obj.(*erasureServerPools)
    xl := z.serverPools[0].sets[0]

    // Create "bucket"
    err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
    if err != nil {
        t.Fatal(err)
    }

    bucket := "bucket"
    object := "object"
    opts := ObjectOptions{}
    // Create object "obj" under bucket "bucket".
    _, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
    if err != nil {
        t.Fatal(err)
    }

    // Delete the object while all disks are online.
    _, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
    if err != nil {
        t.Fatal(err)
    }

    // Create "obj" under "bucket" again.
    _, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
    if err != nil {
        t.Fatal(err)
    }

    // Remove disks to 'lose' quorum for the object, by making the first 5 disks faulty.
    erasureDisks := xl.getDisks()
    z.serverPools[0].erasureDisksMu.Lock()
    xl.getDisks = func() []StorageAPI {
        for i := range erasureDisks[:5] {
            erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
        }
        return erasureDisks
    }
    z.serverPools[0].erasureDisksMu.Unlock()

    _, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
    // since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
    if !errors.Is(err, errErasureWriteQuorum) {
        t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
    }
}

func TestErasureDeleteObjectDiskNotFoundErr(t *testing.T) {
    restoreGlobalStorageClass := globalStorageClass
    defer func() {
        globalStorageClass = restoreGlobalStorageClass
    }()
    globalStorageClass = storageclass.Config{}

    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()
    // Create an instance of xl backend.
    obj, fsDirs, err := prepareErasure16(ctx)
    if err != nil {
        t.Fatal(err)
    }
    // Cleanup backend directories
    defer obj.Shutdown(context.Background())
    defer removeRoots(fsDirs)

    z := obj.(*erasureServerPools)
    xl := z.serverPools[0].sets[0]

    // Create "bucket"
    err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
    if err != nil {
        t.Fatal(err)
    }

    bucket := "bucket"
    object := "object"
    opts := ObjectOptions{}
    // Create object "obj" under bucket "bucket".
    _, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
    if err != nil {
        t.Fatal(err)
    }

    // for a 16 disk setup, EC is 4, but will be upgraded up to 8.
    // Remove 4 disks.
    erasureDisks := xl.getDisks()
    z.serverPools[0].erasureDisksMu.Lock()
    xl.getDisks = func() []StorageAPI {
@@ -261,20 +404,21 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
        t.Fatal(err)
    }

    // Remove one more disk to 'lose' quorum, by setting it to nil.
    // Object was uploaded with 4 known bad drives, so we should still be able to lose 3 drives and still write to the object.
    erasureDisks = xl.getDisks()
    z.serverPools[0].erasureDisksMu.Lock()
    xl.getDisks = func() []StorageAPI {
        erasureDisks[7] = nil
        erasureDisks[8] = nil
        erasureDisks[9] = nil
        return erasureDisks
    }
    z.serverPools[0].erasureDisksMu.Unlock()

    _, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
    // since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
    if !errors.Is(err, errErasureWriteQuorum) {
        t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
    // since majority of disks are available, metaquorum achieved.
    if err != nil {
        t.Errorf("Expected deleteObject to not fail, but failed with %v", err)
    }
}