mirror of
https://github.com/minio/minio.git
fb96779a8a
This PR implements an object layer that combines multiple erasure sets of XL layers into a unified namespace. It extends the existing erasure-coded implementation. The design assumes that providing more than 16 disks is a static configuration as well: if you start the setup with 32 disks as 4 sets of 8 disks each, you must always provide those 4 sets.

Some design details and restrictions:
- Objects are distributed to a unique erasure-coded set using a consistent ordering (a minimal sketch of this kind of placement follows below).
- Each set has its own dsync, so locks are synchronized properly at the set (erasure layer) level.
- Each set still has a maximum of 16 disks; you can start with multiple such sets statically.
- Sets of disks are static and cannot be changed; no elastic expansion is allowed.
- Sets of disks are static and cannot be changed; no elastic removal is allowed.
- ListObjects() across sets can be noticeably slower, since listing happens on all servers and is merged at the sets layer.

Fixes #5465
Fixes #5464
Fixes #5461
Fixes #5460
Fixes #5459
Fixes #5458
Fixes #5488
Fixes #5489
Fixes #5497
Fixes #5496
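The "consistent ordering to a unique erasure coded layer" described above can be illustrated with a minimal sketch: hash the object key and reduce it modulo the number of sets, so a given key always lands on the same set. The helper below (pickErasureSet, crc32-based) is an assumption for illustration only, not MinIO's actual placement code.

package main

import (
	"fmt"
	"hash/crc32"
)

// pickErasureSet deterministically maps an object key to one of setCount
// erasure sets by hashing the key and reducing it modulo the set count.
func pickErasureSet(objectKey string, setCount int) int {
	return int(crc32.ChecksumIEEE([]byte(objectKey))) % setCount
}

func main() {
	// With 4 sets of 8 drives each (32 drives total), every object key maps
	// to exactly one of the 4 sets, and always the same one.
	fmt.Println(pickErasureSet("photos/2018/01/beach.jpg", 4))
}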
218 lines
7.2 KiB
Go
/*
 * Minio Cloud Storage, (C) 2017 Minio, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"encoding/json"
	"errors"
	"fmt"
	"strconv"
	"strings"
)

const (
	// Metadata entry for storage class
	amzStorageClass = "x-amz-storage-class"
	// Canonical metadata entry for storage class
	amzStorageClassCanonical = "X-Amz-Storage-Class"
	// Reduced redundancy storage class
	reducedRedundancyStorageClass = "REDUCED_REDUNDANCY"
	// Standard storage class
	standardStorageClass = "STANDARD"
	// Reduced redundancy storage class environment variable
	reducedRedundancyStorageClassEnv = "MINIO_STORAGE_CLASS_RRS"
	// Standard storage class environment variable
	standardStorageClassEnv = "MINIO_STORAGE_CLASS_STANDARD"
	// Supported storage class scheme is EC
	supportedStorageClassScheme = "EC"
	// Minimum number of parity disks
	minimumParityDisks = 2
	// Default parity for the reduced redundancy storage class
	defaultRRSParity = 2
)
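// Illustrative example (not part of the original file): using the environment
// variable names above, a deployment might export, say,
//
//	MINIO_STORAGE_CLASS_STANDARD="EC:6"
//	MINIO_STORAGE_CLASS_RRS="EC:2"
//
// Both values follow the "EC:<parity drives>" format accepted by
// parseStorageClass below; the example values are assumptions, not defaults.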
// Struct to hold storage class
type storageClass struct {
	Scheme string
	Parity int
}

type storageClassConfig struct {
	Standard storageClass `json:"standard"`
	RRS      storageClass `json:"rrs"`
}

// Validate SS and RRS parity when unmarshalling JSON.
func (sCfg *storageClassConfig) UnmarshalJSON(data []byte) error {
	type Alias storageClassConfig
	aux := &struct {
		*Alias
	}{
		Alias: (*Alias)(sCfg),
	}
	if err := json.Unmarshal(data, &aux); err != nil {
		return err
	}
	return validateParity(aux.Standard.Parity, aux.RRS.Parity)
}
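// Illustrative example (not part of the original file): given the json tags on
// storageClassConfig and the UnmarshalText method below, a config fragment for
// this struct would look like
//
//	{"standard": "EC:6", "rrs": "EC:2"}
//
// Unmarshalling such a fragment also runs validateParity on the two parity
// values via the custom UnmarshalJSON above.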
// Validates the storage class found in object metadata.
// Only Standard and RRS storage classes are supported.
func isValidStorageClassMeta(sc string) bool {
	return sc == reducedRedundancyStorageClass || sc == standardStorageClass
}

func (sc *storageClass) UnmarshalText(b []byte) error {
	scStr := string(b)
	if scStr == "" {
		return nil
	}
	s, err := parseStorageClass(scStr)
	if err != nil {
		return err
	}
	sc.Parity = s.Parity
	sc.Scheme = s.Scheme
	return nil
}

func (sc *storageClass) MarshalText() ([]byte, error) {
	if sc.Scheme != "" && sc.Parity != 0 {
		return []byte(fmt.Sprintf("%s:%d", sc.Scheme, sc.Parity)), nil
	}
	return []byte(""), nil
}
// Parses given storageClassEnv and returns a storageClass structure.
// Supported Storage Class format is "Scheme:Number of parity disks".
// Currently the only supported scheme is "EC".
func parseStorageClass(storageClassEnv string) (sc storageClass, err error) {
	s := strings.Split(storageClassEnv, ":")

	// only two elements allowed in the string - "scheme" and "number of parity disks"
	if len(s) > 2 {
		return storageClass{}, errors.New("Too many sections in " + storageClassEnv)
	} else if len(s) < 2 {
		return storageClass{}, errors.New("Too few sections in " + storageClassEnv)
	}

	// only allowed scheme is "EC"
	if s[0] != supportedStorageClassScheme {
		return storageClass{}, errors.New("Unsupported scheme " + s[0] + ". Supported scheme is EC")
	}

	// Number of parity disks should be integer
	parityDisks, err := strconv.Atoi(s[1])
	if err != nil {
		return storageClass{}, err
	}

	sc = storageClass{
		Scheme: s[0],
		Parity: parityDisks,
	}

	return sc, nil
}
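// Worked example (illustrative, not part of the original file):
//
//	parseStorageClass("EC:4")   -> storageClass{Scheme: "EC", Parity: 4}, nil
//	parseStorageClass("EC")     -> error: "Too few sections in EC"
//	parseStorageClass("EC:4:2") -> error: "Too many sections in EC:4:2"
//	parseStorageClass("XX:4")   -> error: "Unsupported scheme XX. Supported scheme is EC"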
// Validates the parity disks.
func validateParity(ssParity, rrsParity int) (err error) {
	if ssParity == 0 && rrsParity == 0 {
		return nil
	}

	if !globalIsXL {
		return fmt.Errorf("Setting storage class only allowed for erasure coding mode")
	}

	// SS parity disks should be greater than or equal to minimumParityDisks. Parity below minimumParityDisks is not recommended.
	if ssParity > 0 && ssParity < minimumParityDisks {
		return fmt.Errorf("Standard storage class parity %d should be greater than or equal to %d", ssParity, minimumParityDisks)
	}

	// RRS parity disks should be greater than or equal to minimumParityDisks. Parity below minimumParityDisks is not recommended.
	if rrsParity > 0 && rrsParity < minimumParityDisks {
		return fmt.Errorf("Reduced redundancy storage class parity %d should be greater than or equal to %d", rrsParity, minimumParityDisks)
	}

	if ssParity > globalXLSetDriveCount/2 {
		return fmt.Errorf("Standard storage class parity %d should be less than or equal to %d", ssParity, globalXLSetDriveCount/2)
	}

	if rrsParity > globalXLSetDriveCount/2 {
		return fmt.Errorf("Reduced redundancy storage class parity %d should be less than or equal to %d", rrsParity, globalXLSetDriveCount/2)
	}

	if ssParity > 0 && rrsParity > 0 {
		if ssParity < rrsParity {
			return fmt.Errorf("Standard storage class parity disks %d should be greater than or equal to Reduced redundancy storage class parity disks %d", ssParity, rrsParity)
		}
	}
	return nil
}
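// Worked example (illustrative, not part of the original file): assuming the
// server runs in erasure-coding (XL) mode with a set size
// (globalXLSetDriveCount) of 16, any non-zero parity must fall between
// minimumParityDisks (2) and 16/2 = 8, and standard parity may not be lower
// than RRS parity. So validateParity(6, 2) passes, while validateParity(1, 2),
// validateParity(10, 2) and validateParity(2, 4) are all rejected.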
// Returns the data and parity drive count based on storage class.
// If a storage class is set using the env vars MINIO_STORAGE_CLASS_RRS and MINIO_STORAGE_CLASS_STANDARD
// or the corresponding config.json fields, the configured values are returned.
// If no storage class was set during startup, default values are returned:
// -- default for the Reduced Redundancy storage class is parity = 2 and data = N - parity
// -- default for the Standard storage class is parity = N/2 and data = N/2
// If the storage class is empty, the Standard storage class is assumed and its
// data and parity counts are returned.
func getRedundancyCount(sc string, totalDisks int) (data, parity int) {
	parity = totalDisks / 2
	switch sc {
	case reducedRedundancyStorageClass:
		if globalRRStorageClass.Parity != 0 {
			// set the rrs parity if available
			parity = globalRRStorageClass.Parity
		} else {
			// else fall back to default value
			parity = defaultRRSParity
		}
	case standardStorageClass, "":
		if globalStandardStorageClass.Parity != 0 {
			// set the standard parity if available
			parity = globalStandardStorageClass.Parity
		}
	}
	// data is always totalDisks - parity
	return totalDisks - parity, parity
}
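// Worked example (illustrative, not part of the original file): with
// totalDisks = 16 and no storage class configured at startup,
//
//	getRedundancyCount(standardStorageClass, 16)          -> data=8,  parity=8
//	getRedundancyCount("", 16)                            -> data=8,  parity=8
//	getRedundancyCount(reducedRedundancyStorageClass, 16) -> data=14, parity=2
//
// If MINIO_STORAGE_CLASS_STANDARD were set to "EC:6", the first two calls
// would instead return data=10, parity=6.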
// Returns per object readQuorum and writeQuorum.
// readQuorum is the minimum required disks to read data.
// writeQuorum is the minimum required disks to write data.
func objectQuorumFromMeta(xl xlObjects, partsMetaData []xlMetaV1, errs []error) (objectReadQuorum, objectWriteQuorum int, err error) {

	// get the latest updated Metadata and a count of all the latest updated xlMeta(s)
	latestXLMeta, count := getLatestXLMeta(partsMetaData, errs)

	// latestXLMeta is updated most recently.
	// We implicitly assume that all the xlMeta(s) have same dataBlocks and parityBlocks.
	// We now check that at least dataBlocks number of xlMeta is available. This means count
	// should be greater than or equal to dataBlocks field of latestXLMeta. If not we throw read quorum error.
	if count < latestXLMeta.Erasure.DataBlocks {
		// This is the case when we can't reliably deduce object quorum
		return 0, 0, errXLReadQuorum
	}

	// Since all the valid erasure code meta updated at the same time are equivalent, pass dataBlocks
	// from latestXLMeta to get the quorum
	return latestXLMeta.Erasure.DataBlocks, latestXLMeta.Erasure.DataBlocks + 1, nil
}
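// Worked example (illustrative, not part of the original file): for an object
// whose latest xlMeta records dataBlocks=8 (e.g. a 16-drive set with the
// standard storage class), at least 8 parts-metadata entries must match the
// latest one; the function then returns readQuorum=8 and writeQuorum=9.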