From 9c605ad153fec94df496906bb94ec8733b5620df Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 27 Jun 2022 20:22:18 -0700 Subject: [PATCH] allow support for parity '0', '1' enabling support for 2,3 drive setups (#15171) allows for further granular setups - 2 drives (1 parity, 1 data) - 3 drives (1 parity, 2 data) Bonus: allows '0' parity as well. --- cmd/endpoint-ellipses.go | 2 +- cmd/endpoint-ellipses_test.go | 24 +++++------ cmd/erasure-metadata.go | 2 +- cmd/erasure-multipart.go | 2 +- cmd/erasure-object.go | 4 +- cmd/erasure-object_test.go | 19 ++++++--- cmd/erasure-server-pool.go | 2 +- cmd/format-erasure.go | 19 ++------- cmd/utils.go | 9 ---- docs/distributed/DESIGN.md | 2 +- docs/distributed/README.md | 11 +++-- docs/erasure/README.md | 2 +- docs/erasure/storage-class/README.md | 4 +- docs/minio-limits.md | 6 +-- internal/config/errors.go | 4 +- internal/config/storageclass/storage-class.go | 41 ++++++++++++++----- .../config/storageclass/storage-class_test.go | 5 ++- 17 files changed, 82 insertions(+), 76 deletions(-) diff --git a/cmd/endpoint-ellipses.go b/cmd/endpoint-ellipses.go index eedd1435b..72d2a7d0d 100644 --- a/cmd/endpoint-ellipses.go +++ b/cmd/endpoint-ellipses.go @@ -42,7 +42,7 @@ type endpointSet struct { // Supported set sizes this is used to find the optimal // single set size. -var setSizes = []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} +var setSizes = []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} // getDivisibleSize - returns a greatest common divisor of // all the ellipses sizes. diff --git a/cmd/endpoint-ellipses_test.go b/cmd/endpoint-ellipses_test.go index 1cf317c09..1977100e0 100644 --- a/cmd/endpoint-ellipses_test.go +++ b/cmd/endpoint-ellipses_test.go @@ -201,18 +201,6 @@ func TestGetSetIndexes(t *testing.T) { success bool }{ // Invalid inputs. 
- { - []string{"data{1...3}"}, - []uint64{3}, - nil, - false, - }, - { - []string{"data/controller1/export{1...2}, data/controller2/export{1...4}, data/controller3/export{1...8}"}, - []uint64{2, 4, 8}, - nil, - false, - }, { []string{"data{1...17}/export{1...52}"}, []uint64{14144}, @@ -220,6 +208,18 @@ func TestGetSetIndexes(t *testing.T) { false, }, // Valid inputs. + { + []string{"data{1...3}"}, + []uint64{3}, + [][]uint64{{3}}, + true, + }, + { + []string{"data/controller1/export{1...2}, data/controller2/export{1...4}, data/controller3/export{1...8}"}, + []uint64{2, 4, 8}, + [][]uint64{{2}, {2, 2}, {2, 2, 2, 2}}, + true, + }, { []string{"data{1...27}"}, []uint64{27}, diff --git a/cmd/erasure-metadata.go b/cmd/erasure-metadata.go index a92a55fe7..521d3f182 100644 --- a/cmd/erasure-metadata.go +++ b/cmd/erasure-metadata.go @@ -417,7 +417,7 @@ func objectQuorumFromMeta(ctx context.Context, partsMetaData []FileInfo, errs [] } parityBlocks := globalStorageClass.GetParityForSC(latestFileInfo.Metadata[xhttp.AmzStorageClass]) - if parityBlocks <= 0 { + if parityBlocks < 0 { parityBlocks = defaultParityCount } diff --git a/cmd/erasure-multipart.go b/cmd/erasure-multipart.go index eb9c185a3..0b02eedae 100644 --- a/cmd/erasure-multipart.go +++ b/cmd/erasure-multipart.go @@ -290,7 +290,7 @@ func (er erasureObjects) newMultipartUpload(ctx context.Context, bucket string, onlineDisks := er.getDisks() parityDrives := globalStorageClass.GetParityForSC(userDefined[xhttp.AmzStorageClass]) - if parityDrives <= 0 { + if parityDrives < 0 { parityDrives = er.defaultParityCount } diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go index 3e96fcdab..a78518b0f 100644 --- a/cmd/erasure-object.go +++ b/cmd/erasure-object.go @@ -736,7 +736,7 @@ func (er erasureObjects) putMetacacheObject(ctx context.Context, key string, r * storageDisks := er.getDisks() // Get parity and data drive count based on storage class metadata parityDrives := 
globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass]) - if parityDrives <= 0 { + if parityDrives < 0 { parityDrives = er.defaultParityCount } dataDrives := len(storageDisks) - parityDrives @@ -885,7 +885,7 @@ func (er erasureObjects) putObject(ctx context.Context, bucket string, object st if !opts.MaxParity { // Get parity and data drive count based on storage class metadata parityDrives = globalStorageClass.GetParityForSC(userDefined[xhttp.AmzStorageClass]) - if parityDrives <= 0 { + if parityDrives < 0 { parityDrives = er.defaultParityCount } diff --git a/cmd/erasure-object_test.go b/cmd/erasure-object_test.go index 7ca3849bd..b2dae1928 100644 --- a/cmd/erasure-object_test.go +++ b/cmd/erasure-object_test.go @@ -893,6 +893,14 @@ func testObjectQuorumFromMeta(obj ObjectLayer, instanceType string, dirs []strin // Object for test case 1 - No StorageClass defined, no MetaData in PutObject object1 := "object1" + globalStorageClass = storageclass.Config{ + RRS: storageclass.StorageClass{ + Parity: 2, + }, + Standard: storageclass.StorageClass{ + Parity: 4, + }, + } _, err = obj.PutObject(ctx, bucket, object1, mustGetPutObjReader(t, bytes.NewReader(data), int64(len(data)), "", ""), opts) if err != nil { t.Fatalf("Failed to putObject %v", err) @@ -964,17 +972,16 @@ func testObjectQuorumFromMeta(obj ObjectLayer, instanceType string, dirs []strin } parts5, errs5 := readAllFileInfo(ctx, erasureDisks, bucket, object5, "", false) - parts5SC := storageclass.Config{ - RRS: storageclass.StorageClass{ - Parity: 2, - }, - } + parts5SC := globalStorageClass // Object for test case 6 - RRS StorageClass defined as Parity 2, MetaData in PutObject requesting Standard Storage Class object6 := "object6" metadata6 := make(map[string]string) metadata6["x-amz-storage-class"] = storageclass.STANDARD globalStorageClass = storageclass.Config{ + Standard: storageclass.StorageClass{ + Parity: 4, + }, RRS: storageclass.StorageClass{ Parity: 2, }, @@ -1035,7 +1042,7 @@ func 
testObjectQuorumFromMeta(obj ObjectLayer, instanceType string, dirs []strin tt := tt t.(*testing.T).Run("", func(t *testing.T) { globalStorageClass = tt.storageClassCfg - actualReadQuorum, actualWriteQuorum, err := objectQuorumFromMeta(ctx, tt.parts, tt.errs, getDefaultParityBlocks(len(erasureDisks))) + actualReadQuorum, actualWriteQuorum, err := objectQuorumFromMeta(ctx, tt.parts, tt.errs, storageclass.DefaultParityBlocks(len(erasureDisks))) if tt.expectedError != nil && err == nil { t.Errorf("Expected %s, got %s", tt.expectedError, err) } diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go index 82812e0ba..475706222 100644 --- a/cmd/erasure-server-pool.go +++ b/cmd/erasure-server-pool.go @@ -525,7 +525,7 @@ func (z *erasureServerPools) BackendInfo() (b madmin.BackendInfo) { b.Type = madmin.Erasure scParity := globalStorageClass.GetParityForSC(storageclass.STANDARD) - if scParity <= 0 { + if scParity < 0 { scParity = z.serverPools[0].defaultParityCount } rrSCParity := globalStorageClass.GetParityForSC(storageclass.RRS) diff --git a/cmd/format-erasure.go b/cmd/format-erasure.go index 2097da096..39a6610eb 100644 --- a/cmd/format-erasure.go +++ b/cmd/format-erasure.go @@ -643,7 +643,7 @@ func saveFormatErasureAll(ctx context.Context, storageDisks []StorageAPI, format }, index) } - writeQuorum := getWriteQuorum(len(storageDisks)) + writeQuorum := (len(storageDisks) + 1/2) // Wait for the routines to finish. return reduceWriteQuorumErrs(ctx, g.Wait(), nil, writeQuorum) } @@ -805,26 +805,13 @@ func initFormatErasure(ctx context.Context, storageDisks []StorageAPI, setCount, return getFormatErasureInQuorum(formats) } -func getDefaultParityBlocks(drive int) int { - switch drive { - case 3, 2: - return 1 - case 4, 5: - return 2 - case 6, 7: - return 3 - default: - return 4 - } -} - // ecDrivesNoConfig returns the erasure coded drives in a set if no config has been set. // It will attempt to read it from env variable and fall back to drives/2. 
func ecDrivesNoConfig(setDriveCount int) int { sc, _ := storageclass.LookupConfig(config.KVS{}, setDriveCount) ecDrives := sc.GetParityForSC(storageclass.STANDARD) - if ecDrives <= 0 { - ecDrives = getDefaultParityBlocks(setDriveCount) + if ecDrives < 0 { + ecDrives = storageclass.DefaultParityBlocks(setDriveCount) } return ecDrives } diff --git a/cmd/utils.go b/cmd/utils.go index 5dcbd6fd8..2148a78d3 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -121,15 +121,6 @@ func path2BucketObject(s string) (bucket, prefix string) { return path2BucketObjectWithBasePath("", s) } -func getWriteQuorum(drive int) int { - parity := getDefaultParityBlocks(drive) - quorum := drive - parity - if quorum == parity { - quorum++ - } - return quorum -} - // CloneMSS is an exposed function of cloneMSS for gateway usage. var CloneMSS = cloneMSS diff --git a/docs/distributed/DESIGN.md b/docs/distributed/DESIGN.md index e241d9ac3..5fb59c12e 100644 --- a/docs/distributed/DESIGN.md +++ b/docs/distributed/DESIGN.md @@ -41,7 +41,7 @@ Expansion of ellipses and choice of erasure sets based on this expansion is an a - Erasure coding used by MinIO is [Reed-Solomon](https://github.com/klauspost/reedsolomon) erasure coding scheme, which has a total shard maximum of 256 i.e 128 data and 128 parity. MinIO design goes beyond this limitation by doing some practical architecture choices. -- Erasure set is a single erasure coding unit within a MinIO deployment. An object is sharded within an erasure set. Erasure set size is automatically calculated based on the number of disks. MinIO supports unlimited number of disks but each erasure set can be upto 16 disks and a minimum of 4 disks. +- Erasure set is a single erasure coding unit within a MinIO deployment. An object is sharded within an erasure set. Erasure set size is automatically calculated based on the number of disks. MinIO supports unlimited number of disks but each erasure set can be up to 16 disks and a minimum of 2 disks.
- We limited the number of drives to 16 for erasure set because, erasure code shards more than 16 can become chatty and do not have any performance advantages. Additionally since 16 drive erasure set gives you tolerance of 8 disks per object by default which is plenty in any practical scenario. diff --git a/docs/distributed/README.md b/docs/distributed/README.md index ae5e01e64..470489ec1 100644 --- a/docs/distributed/README.md +++ b/docs/distributed/README.md @@ -8,7 +8,7 @@ MinIO in distributed mode can help you setup a highly-available storage system w ### Data protection -Distributed MinIO provides protection against multiple node/drive failures and [bit rot](https://github.com/minio/minio/blob/master/docs/erasure/README.md#what-is-bit-rot-protection) using [erasure code](https://docs.min.io/docs/minio-erasure-code-quickstart-guide). As the minimum disks required for distributed MinIO is 4 (same as minimum disks required for erasure coding), erasure code automatically kicks in as you launch distributed MinIO. +Distributed MinIO provides protection against multiple node/drive failures and [bit rot](https://github.com/minio/minio/blob/master/docs/erasure/README.md#what-is-bit-rot-protection) using [erasure code](https://docs.min.io/docs/minio-erasure-code-quickstart-guide). As the minimum disks required for distributed MinIO is 2 (same as minimum disks required for erasure coding), erasure code automatically kicks in as you launch distributed MinIO. If one or more disks are offline at the start of a PutObject or NewMultipartUpload operation the object will have additional data protection bits added automatically to provide additional safety for these objects. @@ -22,9 +22,12 @@ Refer to sizing guide for more understanding on default values chosen depending ### Consistency Guarantees -MinIO follows strict **read-after-write** and **list-after-write** consistency model for all i/o operations both in distributed and standalone modes. 
This consistency model is only guaranteed if you use disk filesystems such as xfs, ext4 or zfs etc.. for distributed setup. +MinIO follows strict **read-after-write** and **list-after-write** consistency model for all i/o operations both in distributed and standalone modes. This consistency model is only guaranteed if you use disk filesystems such as xfs, zfs or btrfs etc. for distributed setup. + +**In our tests we also found that ext4 does not honor POSIX O_DIRECT/Fdatasync semantics; ext4 trades consistency guarantees for performance. Please avoid ext4 in your setup.** + +**If MinIO distributed setup is using NFS volumes underneath it is not guaranteed MinIO will provide these consistency guarantees since NFS is not strictly consistent (If you must use NFS we recommend that you at least use NFSv4 instead of NFSv3 for relatively better outcomes).** -**If MinIO distributed setup is using NFS volumes underneath it is not guaranteed MinIO will provide these consistency guarantees since NFS is not consistent filesystem by design (If you must use NFS we recommend that you atleast use NFSv4 instead of NFSv3).** ## Get started @@ -41,7 +44,7 @@ To start a distributed MinIO instance, you just need to pass drive locations as **NOTE:** - All the nodes running distributed MinIO should share a common root credentials, for the nodes to connect and trust each other. To achieve this, it is **recommended** to export root user and root password as environment variables, `MINIO_ROOT_USER` and `MINIO_ROOT_PASSWORD`, on all the nodes before executing MinIO server command. If not exported, default `minioadmin/minioadmin` credentials shall be used. -- **MinIO creates erasure-coding sets of _4_ to _16_ drives per set. The number of drives you provide in total must be a multiple of one of those numbers.** +- **MinIO creates erasure-coding sets of _2_ to _16_ drives per set.
The number of drives you provide in total must be a multiple of one of those numbers.** - **MinIO chooses the largest EC set size which divides into the total number of drives or total number of nodes given - making sure to keep the uniform distribution i.e each node participates equal number of drives per set**. - **Each object is written to a single EC set, and therefore is spread over no more than 16 drives.** - **All the nodes running distributed MinIO setup are recommended to be homogeneous, i.e. same operating system, same number of disks and same network interconnects.** diff --git a/docs/erasure/README.md b/docs/erasure/README.md index fec381583..fa2625b0d 100644 --- a/docs/erasure/README.md +++ b/docs/erasure/README.md @@ -24,7 +24,7 @@ MinIO's erasure coded backend uses high speed [HighwayHash](https://github.com/m ## How are drives used for Erasure Code? -MinIO divides the drives you provide into erasure-coding sets of *4 to 16* drives. Therefore, the number of drives you present must be a multiple of one of these numbers. Each object is written to a single erasure-coding set. +MinIO divides the drives you provide into erasure-coding sets of *2 to 16* drives. Therefore, the number of drives you present must be a multiple of one of these numbers. Each object is written to a single erasure-coding set. Minio uses the largest possible EC set size which divides into the number of drives given. For example, *18 drives* are configured as *2 sets of 9 drives*, and *24 drives* are configured as *2 sets of 12 drives*. This is true for scenarios when running MinIO as a standalone erasure coded deployment. In [distributed setup however node (affinity) based](https://docs.minio.io/docs/distributed-minio-quickstart-guide.html) erasure stripe sizes are chosen. 
diff --git a/docs/erasure/storage-class/README.md b/docs/erasure/storage-class/README.md index dee2dc941..8e6e55440 100644 --- a/docs/erasure/storage-class/README.md +++ b/docs/erasure/storage-class/README.md @@ -62,9 +62,7 @@ For more complete documentation on Erasure Set sizing, see the [MinIO Documentat - Less than N/2, if `STANDARD` parity is not set. - Less than `STANDARD` Parity, if it is set. -As parity below 2 is not recommended, `REDUCED_REDUNDANCY` storage class is not supported for 4 disks erasure coding setup. - -Default value for `REDUCED_REDUNDANCY` storage class is `2`. +Default value for `REDUCED_REDUNDANCY` storage class is `1`. ## Get started with Storage Class diff --git a/docs/minio-limits.md b/docs/minio-limits.md index 12e3aff19..1d64fdf4d 100644 --- a/docs/minio-limits.md +++ b/docs/minio-limits.md @@ -9,8 +9,8 @@ For best deployment experience MinIO recommends operating systems RHEL/CentOS 8. | Maximum number of servers per cluster | no-limit | | Maximum number of federated clusters | no-limit | | Minimum number of servers | 02 | -| Minimum number of drives per server when server count is 1 | 04 | -| Minimum number of drives per server when server count is 2 or 3 | 02 | +| Minimum number of drives per server when server count is 1 | 02 | +| Minimum number of drives per server when server count is 2 or 3 | 01 | | Minimum number of drives per server when server count is 4 | 01 | | Maximum number of drives per server | no-limit | | Read quorum | N/2 | @@ -53,7 +53,7 @@ We found the following APIs to be redundant or less useful outside of AWS S3. If ## Object name restrictions on MinIO - Object names that contain characters `^*|\/&";` are unsupported on Windows platform or any other file systems that do not support filenames with special charaters. **This list is non exhaustive, it depends on the operating system and filesystem under use - please consult your operating system vendor**. 
MinIO recommends using Linux based deployments for production workloads. -- Objects should not have conflicting objects as parents, applications using this behavior should change their behavior and use proper unique keys, for example situations such as following conflicting key patterns are not supported. +- Objects should not have conflicting objects as parent objects, applications using this behavior should change their behavior and use proper unique keys, for example situations such as following conflicting key patterns are not supported. ``` PUT /a/b/1.txt diff --git a/internal/config/errors.go b/internal/config/errors.go index 0c63d3459..521fc4b10 100644 --- a/internal/config/errors.go +++ b/internal/config/errors.go @@ -58,7 +58,7 @@ var ( ErrInvalidErasureSetSize = newErrFn( "Invalid erasure set size", "Please check the passed value", - "Erasure set can only accept any of [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] values", + "Erasure set can only accept any of [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] values", ) ErrInvalidWormValue = newErrFn( @@ -177,7 +177,7 @@ var ( ErrInvalidNumberOfErasureEndpoints = newErrFn( "Invalid total number of endpoints for erasure mode", - "Please provide an even number of endpoints greater or equal to 4", + "Please provide number of endpoints greater or equal to 2", "For more information, please refer to https://docs.min.io/docs/minio-erasure-code-quickstart-guide", ) diff --git a/internal/config/storageclass/storage-class.go b/internal/config/storageclass/storage-class.go index f39965b67..23ec357e3 100644 --- a/internal/config/storageclass/storage-class.go +++ b/internal/config/storageclass/storage-class.go @@ -50,10 +50,10 @@ const ( schemePrefix = "EC" // Min parity disks - minParityDisks = 2 + minParityDisks = 0 // Default RRS parity is always minimum parity. 
- defaultRRSParity = minParityDisks + defaultRRSParity = 1 ) // DefaultKVS - default storage class config @@ -65,7 +65,7 @@ var ( }, config.KV{ Key: ClassRRS, - Value: "EC:2", + Value: "EC:1", }, } ) @@ -76,7 +76,7 @@ type StorageClass struct { } // ConfigLock is a global lock for storage-class config -var ConfigLock = sync.RWMutex{} +var ConfigLock sync.RWMutex // Config storage class configuration type Config struct { @@ -154,7 +154,9 @@ func parseStorageClass(storageClassEnv string) (sc StorageClass, err error) { if err != nil { return StorageClass{}, config.ErrStorageClassValue(err) } - + if parityDisks < 0 { + return StorageClass{}, config.ErrStorageClassValue(nil).Msg("Unsupported parity value " + s[1] + " provided") + } return StorageClass{ Parity: parityDisks, }, nil @@ -196,7 +198,7 @@ func validateParity(ssParity, rrsParity, setDriveCount int) (err error) { } if rrsParity > setDriveCount/2 { - return fmt.Errorf("Reduced redundancy storage class parity %d should be less than or equal to %d", rrsParity, setDriveCount/2) + return fmt.Errorf("Reduced redundancy storage class parity %d should be less than or equal to %d", rrsParity, setDriveCount/2) } if ssParity > 0 && rrsParity > 0 { @@ -223,10 +225,6 @@ func (sCfg Config) GetParityForSC(sc string) (parity int) { defer ConfigLock.RUnlock() switch strings.TrimSpace(sc) { case RRS: - // set the rrs parity if available - if sCfg.RRS.Parity == 0 { - return defaultRRSParity - } return sCfg.RRS.Parity default: return sCfg.Standard.Parity @@ -248,6 +246,22 @@ func Enabled(kvs config.KVS) bool { return ssc != "" || rrsc != "" } +// DefaultParityBlocks returns default parity blocks for 'drive' count +func DefaultParityBlocks(drive int) int { + switch drive { + case 1: + return 0 + case 3, 2: + return 1 + case 4, 5: + return 2 + case 6, 7: + return 3 + default: + return 4 + } +} + // LookupConfig - lookup storage class config and override with valid environment settings if any. 
func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) { cfg = Config{} @@ -274,10 +288,15 @@ func LookupConfig(kvs config.KVS, setDriveCount int) (cfg Config, err error) { return Config{}, err } } - if cfg.RRS.Parity == 0 { + + if cfg.RRS.Parity == 0 && rrsc == "" { cfg.RRS.Parity = defaultRRSParity } + if cfg.Standard.Parity == 0 && ssc == "" { + cfg.Standard.Parity = DefaultParityBlocks(setDriveCount) + } + // Validation is done after parsing both the storage classes. This is needed because we need one // storage class value to deduce the correct value of the other storage class. if err = validateParity(cfg.Standard.Parity, cfg.RRS.Parity, setDriveCount); err != nil { diff --git a/internal/config/storageclass/storage-class_test.go b/internal/config/storageclass/storage-class_test.go index 375e99722..79323f7a5 100644 --- a/internal/config/storageclass/storage-class_test.go +++ b/internal/config/storageclass/storage-class_test.go @@ -102,7 +102,8 @@ func TestValidateParity(t *testing.T) { {2, 4, true, 16}, {3, 3, true, 16}, {0, 0, true, 16}, - {1, 4, false, 16}, + {1, 4, true, 16}, + {0, 4, true, 16}, {7, 6, false, 16}, {9, 0, false, 16}, {9, 9, false, 16}, @@ -140,7 +141,7 @@ func TestParityCount(t *testing.T) { Parity: 8, }, RRS: StorageClass{ - Parity: 0, + Parity: 2, }, } // Set env var for test case 4