diff --git a/cmd/erasure-multipart.go b/cmd/erasure-multipart.go
index 621455a95..6015080dd 100644
--- a/cmd/erasure-multipart.go
+++ b/cmd/erasure-multipart.go
@@ -289,7 +289,26 @@ func (er erasureObjects) newMultipartUpload(ctx context.Context, bucket string,
 		parityDrives = er.defaultParityCount
 	}
 
+	ecOrg := parityDrives
+	for _, disk := range onlineDisks {
+		if parityDrives >= len(onlineDisks)/2 {
+			break
+		}
+		if disk == nil {
+			parityDrives++
+			continue
+		}
+		di, err := disk.DiskInfo(ctx)
+		if err != nil || di.ID == "" {
+			parityDrives++
+		}
+	}
+	if ecOrg != parityDrives {
+		opts.UserDefined[xhttp.MinIOErasureUpgraded] = fmt.Sprintf("%d->%d", ecOrg, parityDrives)
+	}
+
 	dataDrives := len(onlineDisks) - parityDrives
+
 	// we now know the number of blocks this object needs for data and parity.
 	// establish the writeQuorum using this data
 	writeQuorum := dataDrives
diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go
index e24dac294..cb4ed73b7 100644
--- a/cmd/erasure-object.go
+++ b/cmd/erasure-object.go
@@ -599,6 +599,25 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st
 		if parityDrives <= 0 {
 			parityDrives = er.defaultParityCount
 		}
+
+		// If we have offline disks, upgrade the number of erasure codes for this object.
+		ecOrg := parityDrives
+		for _, disk := range storageDisks {
+			if parityDrives >= len(storageDisks)/2 {
+				break
+			}
+			if disk == nil {
+				parityDrives++
+				continue
+			}
+			di, err := disk.DiskInfo(ctx)
+			if err != nil || di.ID == "" {
+				parityDrives++
+			}
+		}
+		if ecOrg != parityDrives {
+			opts.UserDefined[xhttp.MinIOErasureUpgraded] = fmt.Sprintf("%d->%d", ecOrg, parityDrives)
+		}
 	}
 
 	dataDrives := len(storageDisks) - parityDrives
diff --git a/cmd/erasure-object_test.go b/cmd/erasure-object_test.go
index 97c26bdba..3a48ddd05 100644
--- a/cmd/erasure-object_test.go
+++ b/cmd/erasure-object_test.go
@@ -238,8 +238,151 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
-	// for a 16 disk setup, quorum is 9. To simulate disks not found yet
-	// quorum is available, we remove disks leaving quorum disks behind.
+
+	erasureDisks := xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		for i := range erasureDisks[:6] {
+			erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
+		}
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Fatal(err)
+	}
+
+	// Create "obj" under "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Remove two more disks to 'lose' quorum, by taking them offline.
+	erasureDisks = xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		erasureDisks[7] = nil
+		erasureDisks[8] = nil
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	// since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	}
+}
+
+func TestErasureDeleteObjectDiskNotFoundErasure4(t *testing.T) {
+	restoreGlobalStorageClass := globalStorageClass
+	defer func() {
+		globalStorageClass = restoreGlobalStorageClass
+	}()
+
+	globalStorageClass = storageclass.Config{}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Create an instance of xl backend.
+	obj, fsDirs, err := prepareErasure16(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Cleanup backend directories
+	defer obj.Shutdown(context.Background())
+	defer removeRoots(fsDirs)
+
+	z := obj.(*erasureServerPools)
+	xl := z.serverPools[0].sets[0]
+
+	// Create "bucket"
+	err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	bucket := "bucket"
+	object := "object"
+	opts := ObjectOptions{}
+	// Create object "obj" under bucket "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Delete the object while all disks are still online.
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Create "obj" under "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Remove disks to 'lose' quorum for the object, by replacing 5 of them with faulty disks.
+	erasureDisks := xl.getDisks()
+	z.serverPools[0].erasureDisksMu.Lock()
+	xl.getDisks = func() []StorageAPI {
+		for i := range erasureDisks[:5] {
+			erasureDisks[i] = newNaughtyDisk(erasureDisks[i], nil, errFaultyDisk)
+		}
+		return erasureDisks
+	}
+
+	z.serverPools[0].erasureDisksMu.Unlock()
+	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
+	// since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
+	if !errors.Is(err, errErasureWriteQuorum) {
+		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	}
+}
+
+func TestErasureDeleteObjectDiskNotFoundErr(t *testing.T) {
+	restoreGlobalStorageClass := globalStorageClass
+	defer func() {
+		globalStorageClass = restoreGlobalStorageClass
+	}()
+
+	globalStorageClass = storageclass.Config{}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Create an instance of xl backend.
+	obj, fsDirs, err := prepareErasure16(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Cleanup backend directories
+	defer obj.Shutdown(context.Background())
+	defer removeRoots(fsDirs)
+
+	z := obj.(*erasureServerPools)
+	xl := z.serverPools[0].sets[0]
+
+	// Create "bucket"
+	err = obj.MakeBucketWithLocation(ctx, "bucket", BucketOptions{})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	bucket := "bucket"
+	object := "object"
+	opts := ObjectOptions{}
+	// Create object "obj" under bucket "bucket".
+	_, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// for a 16 disk setup, EC is 4, but will be upgraded up to 8.
+	// Remove 4 disks.
 	erasureDisks := xl.getDisks()
 	z.serverPools[0].erasureDisksMu.Lock()
 	xl.getDisks = func() []StorageAPI {
@@ -261,20 +404,21 @@ func TestErasureDeleteObjectDiskNotFound(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	// Remove one more disk to 'lose' quorum, by setting it to nil.
+	// Object was uploaded with 4 known bad drives, so we should be able to take 3 more drives offline and still delete the object.
 	erasureDisks = xl.getDisks()
 	z.serverPools[0].erasureDisksMu.Lock()
 	xl.getDisks = func() []StorageAPI {
 		erasureDisks[7] = nil
 		erasureDisks[8] = nil
+		erasureDisks[9] = nil
 		return erasureDisks
 	}
 
 	z.serverPools[0].erasureDisksMu.Unlock()
 	_, err = obj.DeleteObject(ctx, bucket, object, ObjectOptions{})
-	// since majority of disks are not available, metaquorum is not achieved and hence errErasureWriteQuorum error
-	if !errors.Is(err, errErasureWriteQuorum) {
-		t.Errorf("Expected deleteObject to fail with %v, but failed with %v", toObjectErr(errErasureWriteQuorum, bucket, object), err)
+	// since majority of disks are available, metaquorum is achieved and the delete should succeed.
+	if err != nil {
+		t.Errorf("Expected deleteObject to not fail, but failed with %v", err)
 	}
 }
 
diff --git a/cmd/http/headers.go b/cmd/http/headers.go
index 14f0a5f6e..bd053027a 100644
--- a/cmd/http/headers.go
+++ b/cmd/http/headers.go
@@ -156,6 +156,9 @@ const (
 	// Reports number of drives currently healing
 	MinIOHealingDrives = "x-minio-healing-drives"
 
+	// Object was stored with additional erasure codes due to a degraded system at upload time
+	MinIOErasureUpgraded = "x-minio-internal-erasure-upgraded"
+
 	// Header indicates if the delete marker should be preserved by client
 	MinIOSourceDeleteMarker = "x-minio-source-deletemarker"
 
diff --git a/docs/distributed/README.md b/docs/distributed/README.md
index 64a86d729..c9bfe1841 100644
--- a/docs/distributed/README.md
+++ b/docs/distributed/README.md
@@ -10,6 +10,8 @@ MinIO in distributed mode can help you setup a highly-available storage system w
 
 Distributed MinIO provides protection against multiple node/drive failures and [bit rot](https://github.com/minio/minio/blob/master/docs/erasure/README.md#what-is-bit-rot-protection) using [erasure code](https://docs.min.io/docs/minio-erasure-code-quickstart-guide). As the minimum disks required for distributed MinIO is 4 (same as minimum disks required for erasure coding), erasure code automatically kicks in as you launch distributed MinIO.
 
+If one or more disks are offline at the start of a PutObject or NewMultipartUpload operation, the object will have additional data protection bits added automatically to provide extra safety for these objects.
+
 ### High availability
 
 A stand-alone MinIO server would go down if the server hosting the disks goes offline. In contrast, a distributed MinIO setup with _m_ servers and _n_ disks will have your data safe as long as _m/2_ servers or _m*n_/2 or more disks are online.
diff --git a/docs/distributed/SIZING.md b/docs/distributed/SIZING.md
index da3448cf8..3822d9560 100644
--- a/docs/distributed/SIZING.md
+++ b/docs/distributed/SIZING.md
@@ -32,3 +32,8 @@ Capacity constrained environments, MinIO will work but not recommended for produ
 | 15 | 2 | 15 | 4 | 4 | 4 |
 | 16 | 2 | 16 | 4 | 4 | 4 |
 
+If one or more disks are offline at the start of a PutObject or NewMultipartUpload operation, the object will have additional data
+protection bits added automatically to preserve the regular level of safety for these objects, up to 50% of the number of disks.
+This allows normal write operations to take place on systems that exceed the usual write tolerance.
+
+This means that in the examples above the system will always write 4 parity shards at the expense of slightly higher disk usage.
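
Note (reviewer illustration, not part of the patch): the parity-upgrade rule added in `cmd/erasure-object.go` and `cmd/erasure-multipart.go` amounts to "add one parity drive per offline drive, capped at half the erasure set", and records the change as `"<old>-><new>"` under the internal `x-minio-internal-erasure-upgraded` metadata key. The standalone Go sketch below mirrors that rule with simplified inputs: it uses a plain `[]bool` of offline flags instead of `StorageAPI.DiskInfo`, and the `upgradeParity` helper name is illustrative, not MinIO API.

```go
package main

import "fmt"

// upgradeParity mirrors the rule in the diff above: bump parity by one for
// every offline drive, but never beyond half of the erasure set.
func upgradeParity(totalDrives, parity int, offline []bool) (newParity int, upgraded bool) {
	newParity = parity
	for _, isOffline := range offline {
		if newParity >= totalDrives/2 {
			break // never exceed 50% parity
		}
		if isOffline {
			newParity++
		}
	}
	return newParity, newParity != parity
}

func main() {
	offline := make([]bool, 16)
	offline[3], offline[9] = true, true // two drives down at write time

	parity, upgraded := upgradeParity(16, 4, offline)
	if upgraded {
		// MinIO records the upgrade in internal object metadata as "<old>-><new>".
		fmt.Printf("x-minio-internal-erasure-upgraded: %d->%d\n", 4, parity)
	}
	fmt.Println("data shards:", 16-parity, "parity shards:", parity) // 10 data, 6 parity
}
```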
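Worked example for the SIZING.md claim (illustrative numbers, assuming a 16-drive erasure set with the default EC:4 from the table above): a healthy set writes 12 data and 4 parity shards, a storage overhead of 16/12 ≈ 1.33x. If 2 drives are offline at write time, parity is upgraded to EC:6, so the object is laid out as 10 data and 6 parity shards (overhead 16/10 = 1.6x); the 2 shards destined for the offline drives are simply missing until healed, leaving 10 data and 4 parity shards on the online drives. The object therefore keeps the usual 4-shard safety margin, at the cost of the slightly higher disk usage mentioned in the doc change.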