Mirror of https://github.com/minio/minio.git
Add large bucket support for erasure coded backend (#5160)
This PR implements an object layer which combines the input erasure sets of XL layers into a unified namespace. This object layer extends the existing erasure coded implementation. The design assumes that providing more than 16 disks is a static configuration: if you started the setup with 32 disks as 4 sets of 8 disks per pack, then you always need to provide 4 sets.

Some design details and restrictions:

- Objects are distributed using consistent ordering to a unique erasure coded layer (see the routing sketch below).
- Each pack has its own dsync, so locks are synchronized properly at the pack (erasure layer).
- Each pack still has a maximum of 16 disks; you can start with multiple such sets statically.
- Sets are a static collection of disks and cannot be changed; there is no elastic expansion.
- Sets are a static collection of disks and cannot be changed; there is no elastic removal.
- ListObjects() across sets can be noticeably slower, since listing happens on all servers and the results are merged at the sets layer.

Fixes #5465 Fixes #5464 Fixes #5461 Fixes #5460 Fixes #5459 Fixes #5458 Fixes #5488 Fixes #5489 Fixes #5497 Fixes #5496
Committed by: kannappanr
Parent: dd80256151
Commit: fb96779a8a
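A minimal sketch of the set-routing idea mentioned in the commit message above: hash the object key and map it to one of the statically configured erasure sets, so the same key always resolves to the same set. The helper name and the crc32-based hash are illustrative assumptions, not MinIO's actual routing code:

package main

import (
    "fmt"
    "hash/crc32"
)

// getSetIndex maps an object key to one of the statically configured erasure
// sets; the same key always resolves to the same set index.
func getSetIndex(object string, setCount int) int {
    return int(crc32.ChecksumIEEE([]byte(object))) % setCount
}

func main() {
    // Example layout from the commit message: 32 disks started as 4 sets of 8.
    const setCount = 4
    for _, object := range []string{"photos/a.jpg", "photos/b.jpg", "backups/db.tar"} {
        fmt.Printf("%s -> set %d\n", object, getSetIndex(object, setCount))
    }
}

Because the mapping depends only on the object key and the fixed set count, the layout must stay static, which is why elastic expansion or removal of sets is not allowed.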
@@ -36,9 +36,9 @@ var objectOpIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied)
 func (xl xlObjects) putObjectDir(bucket, object string, writeQuorum int) error {
     var wg = &sync.WaitGroup{}
 
-    errs := make([]error, len(xl.storageDisks))
+    errs := make([]error, len(xl.getDisks()))
     // Prepare object creation in all disks
-    for index, disk := range xl.storageDisks {
+    for index, disk := range xl.getDisks() {
         if disk == nil {
             continue
         }
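The recurring change in this diff replaces direct reads of xl.storageDisks with calls to xl.getDisks(). A minimal sketch of that accessor pattern, assuming getDisks is a function field supplied by the enclosing sets layer (the type and constructor names below are illustrative, not MinIO's exact definitions):

package main

import "fmt"

// StorageAPI stands in for MinIO's storage interface in this sketch.
type StorageAPI interface{ String() string }

type mockDisk string

func (d mockDisk) String() string { return string(d) }

// xlObjectsSketch reaches its disks through a function value rather than a
// fixed field, so the sets layer that owns the disks can hand each erasure
// set its own slice.
type xlObjectsSketch struct {
    getDisks func() []StorageAPI
}

func main() {
    disks := []StorageAPI{mockDisk("disk1"), mockDisk("disk2")}
    xl := xlObjectsSketch{getDisks: func() []StorageAPI { return disks }}
    fmt.Println("disks in this set:", len(xl.getDisks()))
}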
@@ -114,7 +114,7 @@ func (xl xlObjects) CopyObject(srcBucket, srcObject, dstBucket, dstObject string
     }
 
     // Read metadata associated with the object from all disks.
-    metaArr, errs := readAllXLMetadata(xl.storageDisks, srcBucket, srcObject)
+    metaArr, errs := readAllXLMetadata(xl.getDisks(), srcBucket, srcObject)
 
     // get Quorum for this object
     readQuorum, writeQuorum, err := objectQuorumFromMeta(xl, metaArr, errs)
@@ -127,7 +127,7 @@ func (xl xlObjects) CopyObject(srcBucket, srcObject, dstBucket, dstObject string
     }
 
     // List all online disks.
-    onlineDisks, modTime := listOnlineDisks(xl.storageDisks, metaArr, errs)
+    onlineDisks, modTime := listOnlineDisks(xl.getDisks(), metaArr, errs)
 
     // Pick latest valid metadata.
     xlMeta, err := pickValidXLMeta(metaArr, modTime)
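pickValidXLMeta above selects one usable metadata entry that matches the modification time agreed on by the online disks. A rough, hypothetical sketch of that selection (the real helper also validates the xl.json format and erasure info):

package main

import (
    "fmt"
    "time"
)

// fileMeta is a simplified stand-in for xlMetaV1 in this sketch.
type fileMeta struct {
    ModTime time.Time
    Valid   bool
}

// pickLatestValid returns one metadata entry whose modification time matches
// the agreed-upon modTime.
func pickLatestValid(metas []fileMeta, modTime time.Time) (fileMeta, bool) {
    for _, m := range metas {
        if m.Valid && m.ModTime.Equal(modTime) {
            return m, true
        }
    }
    return fileMeta{}, false
}

func main() {
    now := time.Now()
    metas := []fileMeta{
        {ModTime: now.Add(-time.Minute), Valid: true}, // stale copy on a lagging disk
        {ModTime: now, Valid: true},                   // current copy
    }
    if m, ok := pickLatestValid(metas, now); ok {
        fmt.Println("picked metadata with modTime:", m.ModTime)
    }
}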
@@ -145,7 +145,7 @@ func (xl xlObjects) CopyObject(srcBucket, srcObject, dstBucket, dstObject string
     cpMetadataOnly := isStringEqual(pathJoin(srcBucket, srcObject), pathJoin(dstBucket, dstObject))
     if cpMetadataOnly {
         xlMeta.Meta = metadata
-        partsMetadata := make([]xlMetaV1, len(xl.storageDisks))
+        partsMetadata := make([]xlMetaV1, len(xl.getDisks()))
         // Update `xl.json` content on each disks.
         for index := range partsMetadata {
             partsMetadata[index] = xlMeta
@@ -233,7 +233,7 @@ func (xl xlObjects) getObject(bucket, object string, startOffset int64, length i
     }
 
     // Read metadata associated with the object from all disks.
-    metaArr, errs := readAllXLMetadata(xl.storageDisks, bucket, object)
+    metaArr, errs := readAllXLMetadata(xl.getDisks(), bucket, object)
 
     // get Quorum for this object
     readQuorum, _, err := objectQuorumFromMeta(xl, metaArr, errs)
@@ -246,7 +246,7 @@ func (xl xlObjects) getObject(bucket, object string, startOffset int64, length i
     }
 
     // List all online disks.
-    onlineDisks, modTime := listOnlineDisks(xl.storageDisks, metaArr, errs)
+    onlineDisks, modTime := listOnlineDisks(xl.getDisks(), metaArr, errs)
 
     // Pick latest valid metadata.
     xlMeta, err := pickValidXLMeta(metaArr, modTime)
@@ -340,9 +340,9 @@ func (xl xlObjects) getObject(bucket, object string, startOffset int64, length i
 func (xl xlObjects) getObjectInfoDir(bucket, object string) (oi ObjectInfo, err error) {
     var wg = &sync.WaitGroup{}
 
-    errs := make([]error, len(xl.storageDisks))
+    errs := make([]error, len(xl.getDisks()))
     // Prepare object creation in a all disks
-    for index, disk := range xl.storageDisks {
+    for index, disk := range xl.getDisks() {
         if disk == nil {
             continue
         }
@@ -368,7 +368,7 @@ func (xl xlObjects) getObjectInfoDir(bucket, object string) (oi ObjectInfo, err
 
     wg.Wait()
 
-    readQuorum := len(xl.storageDisks) / 2
+    readQuorum := len(xl.getDisks()) / 2
     return dirObjectInfo(bucket, object, 0, map[string]string{}), reduceReadQuorumErrs(errs, objectOpIgnoredErrs, readQuorum)
 }
 
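Directory objects have no xl.json to derive a quorum from, so the read quorum above is simply half the disks returned by getDisks(). A quick worked example, assuming a 16-disk set:

package main

import "fmt"

func main() {
    disks := 16             // hypothetical 16-disk erasure set
    readQuorum := disks / 2 // 8 disks must agree before the directory info is served
    fmt.Println("directory read quorum:", readQuorum)
}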
@@ -403,7 +403,7 @@ func (xl xlObjects) GetObjectInfo(bucket, object string) (oi ObjectInfo, e error
 // getObjectInfo - wrapper for reading object metadata and constructs ObjectInfo.
 func (xl xlObjects) getObjectInfo(bucket, object string) (objInfo ObjectInfo, err error) {
     // Read metadata associated with the object from all disks.
-    metaArr, errs := readAllXLMetadata(xl.storageDisks, bucket, object)
+    metaArr, errs := readAllXLMetadata(xl.getDisks(), bucket, object)
 
     // get Quorum for this object
     readQuorum, _, err := objectQuorumFromMeta(xl, metaArr, errs)
@@ -509,7 +509,7 @@ func rename(disks []StorageAPI, srcBucket, srcEntry, dstBucket, dstEntry string,
     // Wait for all renames to finish.
     wg.Wait()
 
-    // We can safely allow RenameFile errors up to len(xl.storageDisks) - writeQuorum
+    // We can safely allow RenameFile errors up to len(xl.getDisks()) - writeQuorum
     // otherwise return failure. Cleanup successful renames.
     err := reduceWriteQuorumErrs(errs, objectOpIgnoredErrs, writeQuorum)
     if errors.Cause(err) == errXLWriteQuorum {
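The comment changed in this hunk states the rule that reduceWriteQuorumErrs enforces: up to len(disks) - writeQuorum rename failures are tolerated. A hypothetical sketch of that counting logic (the real helper also treats certain errors as ignorable):

package main

import (
    "errors"
    "fmt"
)

var errWriteQuorumSketch = errors.New("write quorum not met")

// reduceWriteErrorsSketch succeeds only if at least writeQuorum operations
// returned no error.
func reduceWriteErrorsSketch(errs []error, writeQuorum int) error {
    success := 0
    for _, err := range errs {
        if err == nil {
            success++
        }
    }
    if success < writeQuorum {
        return errWriteQuorumSketch
    }
    return nil
}

func main() {
    // 16 disks with writeQuorum 9: up to 16-9 = 7 failed renames are tolerated.
    errs := make([]error, 16)
    errs[0] = errors.New("disk offline")
    fmt.Println(reduceWriteErrorsSketch(errs, 9)) // <nil>, quorum still met
}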
@@ -566,7 +566,7 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
     }
 
     // Get parity and data drive count based on storage class metadata
-    dataDrives, parityDrives := getRedundancyCount(metadata[amzStorageClass], len(xl.storageDisks))
+    dataDrives, parityDrives := getRedundancyCount(metadata[amzStorageClass], len(xl.getDisks()))
 
     // we now know the number of blocks this object needs for data and parity.
     // writeQuorum is dataBlocks + 1
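getRedundancyCount splits the set into data and parity drives based on the requested storage class, and the write quorum is then dataBlocks + 1 (per the comment above). A worked example, assuming a 16-disk set with the default even split:

package main

import "fmt"

func main() {
    disks := 16
    dataDrives, parityDrives := disks/2, disks/2 // default split: 8 data, 8 parity
    writeQuorum := dataDrives + 1                // 9 with this split; other storage classes may use fewer parity drives
    fmt.Println(dataDrives, parityDrives, writeQuorum)
}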
@@ -593,7 +593,7 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
     }
 
     // Rename the successfully written temporary object to final location.
-    if _, err = renameObject(xl.storageDisks, minioMetaTmpBucket, tempObj, bucket, object, writeQuorum); err != nil {
+    if _, err = renameObject(xl.getDisks(), minioMetaTmpBucket, tempObj, bucket, object, writeQuorum); err != nil {
         return ObjectInfo{}, toObjectErr(err, bucket, object)
     }
 
@@ -621,7 +621,7 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
     var reader io.Reader = data
 
     // Initialize parts metadata
-    partsMetadata := make([]xlMetaV1, len(xl.storageDisks))
+    partsMetadata := make([]xlMetaV1, len(xl.getDisks()))
 
     xlMeta := newXLMetaV1(object, dataDrives, parityDrives)
 
@@ -631,7 +631,7 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
     }
 
     // Order disks according to erasure distribution
-    onlineDisks := shuffleDisks(xl.storageDisks, partsMetadata[0].Erasure.Distribution)
+    onlineDisks := shuffleDisks(xl.getDisks(), partsMetadata[0].Erasure.Distribution)
 
     // Total size of the written object
     var sizeWritten int64
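shuffleDisks orders the disk slice according to the object's erasure distribution so that each data or parity block lands on the disk the distribution assigns to it. A hypothetical sketch of that reordering, assuming the Distribution field is a 1-based permutation:

package main

import "fmt"

// shuffleSketch places disks[i] at position distribution[i]-1 in the result,
// treating distribution as a 1-based permutation of the disk indices.
func shuffleSketch(disks []string, distribution []int) []string {
    shuffled := make([]string, len(disks))
    for i, blockIndex := range distribution {
        shuffled[blockIndex-1] = disks[i]
    }
    return shuffled
}

func main() {
    disks := []string{"d1", "d2", "d3", "d4"}
    distribution := []int{3, 1, 4, 2} // hypothetical per-object permutation
    fmt.Println(shuffleSketch(disks, distribution)) // [d2 d4 d1 d3]
}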
@@ -641,8 +641,9 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
         return ObjectInfo{}, toObjectErr(err, bucket, object)
     }
 
-    // Alloc additional space for parity blocks created while erasure coding
-    buffer := make([]byte, xlMeta.Erasure.BlockSize, 2*xlMeta.Erasure.BlockSize)
+    // Fetch buffer for I/O, returns from the pool if not allocates a new one and returns.
+    buffer := xl.bp.Get()
+    defer xl.bp.Put(buffer)
 
     // Read data and split into parts - similar to multipart mechanism
     for partIdx := 1; ; partIdx++ {
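The second change in this hunk swaps a per-request buffer allocation for xl.bp.Get()/xl.bp.Put(), so erasure-coding buffers are borrowed from a shared pool. A minimal sketch of such a byte-buffer pool built on sync.Pool (MinIO's bpool implementation may differ in detail):

package main

import (
    "fmt"
    "sync"
)

// blockSize stands in for xlMeta.Erasure.BlockSize in this sketch.
const blockSize = 10 * 1024 * 1024

// bufferPool hands out reusable byte slices with extra capacity reserved for
// the parity blocks produced while erasure coding.
var bufferPool = sync.Pool{
    New: func() interface{} {
        return make([]byte, blockSize, 2*blockSize)
    },
}

func main() {
    buffer := bufferPool.Get().([]byte)
    defer bufferPool.Put(buffer)
    fmt.Println("buffer len:", len(buffer), "cap:", cap(buffer))
}

Reusing buffers avoids allocating, and later garbage collecting, a multi-megabyte slice on every putObject call.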
@@ -723,7 +724,7 @@ func (xl xlObjects) putObject(bucket string, object string, data *hash.Reader, m
     // NOTE: Do not use online disks slice here.
     // The reason is that existing object should be purged
     // regardless of `xl.json` status and rolled back in case of errors.
-    _, err = renameObject(xl.storageDisks, bucket, object, minioMetaTmpBucket, newUniqueID, writeQuorum)
+    _, err = renameObject(xl.getDisks(), bucket, object, minioMetaTmpBucket, newUniqueID, writeQuorum)
     if err != nil {
         return ObjectInfo{}, toObjectErr(err, bucket, object)
     }
@@ -779,7 +780,7 @@ func (xl xlObjects) deleteObject(bucket, object string) error {
     // If its a directory request, no need to read metadata.
     if !hasSuffix(object, slashSeparator) {
         // Read metadata associated with the object from all disks.
-        metaArr, errs := readAllXLMetadata(xl.storageDisks, bucket, object)
+        metaArr, errs := readAllXLMetadata(xl.getDisks(), bucket, object)
 
         // get Quorum for this object
         _, writeQuorum, err = objectQuorumFromMeta(xl, metaArr, errs)
@@ -787,13 +788,13 @@ func (xl xlObjects) deleteObject(bucket, object string) error {
             return err
         }
     } else {
-        writeQuorum = len(xl.storageDisks)/2 + 1
+        writeQuorum = len(xl.getDisks())/2 + 1
     }
 
     // Initialize list of errors.
-    var dErrs = make([]error, len(xl.storageDisks))
+    var dErrs = make([]error, len(xl.getDisks()))
 
-    for index, disk := range xl.storageDisks {
+    for index, disk := range xl.getDisks() {
         if disk == nil {
             dErrs[index] = errors.Trace(errDiskNotFound)
             continue
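For directory deletes there is again no per-object metadata, so the write quorum above falls back to a simple majority of the set. A quick worked example, assuming a 16-disk set:

package main

import "fmt"

func main() {
    disks := 16
    writeQuorum := disks/2 + 1 // 9 disks must acknowledge the directory delete
    fmt.Println("directory delete write quorum:", writeQuorum)
}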