Add large bucket support for erasure coded backend (#5160)

This PR implements an object layer which combines input erasure sets of XL layers into a unified namespace. This object layer extends the existing erasure coded implementation, it is assumed in this design that providing > 16 disks is a static configuration as well i.e if you started the setup with 32 disks with 4 sets 8 disks per pack then you would need to provide 4 sets always. Some design details and restrictions: - Objects are distributed using consistent ordering to a unique erasure coded layer. - Each pack has its own dsync so locks are synchronized properly at pack (erasure layer). - Each pack still has a maximum of 16 disks requirement, you can start with multiple such sets statically. - Static sets set of disks and cannot be changed, there is no elastic expansion allowed. - Static sets set of disks and cannot be changed, there is no elastic removal allowed. - ListObjects() across sets can be noticeably slower since List happens on all servers, and is merged at this sets layer. Fixes #5465 Fixes #5464 Fixes #5461 Fixes #5460 Fixes #5459 Fixes #5458 Fixes #5460 Fixes #5488 Fixes #5489 Fixes #5497 Fixes #5496
2025-11-06 20:33:07 -05:00 · 2018-02-15 17:45:57 -08:00
parent dd80256151
commit fb96779a8a
82 changed files with 5046 additions and 4771 deletions
--- a/cmd/xl-v1.go
+++ b/cmd/xl-v1.go
@@ -17,11 +17,10 @@
 package cmd

 import (
-	"fmt"
 	"sort"
-	"sync"
 	"time"

+	"github.com/minio/minio/pkg/bpool"
 	"github.com/minio/minio/pkg/disk"
 	"github.com/minio/minio/pkg/errors"
 )
@@ -33,107 +32,36 @@ const (

 	// Uploads metadata file carries per multipart object metadata.
 	uploadsJSONFile = "uploads.json"
-
-	// Maximum erasure blocks.
-	maxErasureBlocks = 32
-
-	// Minimum erasure blocks.
-	minErasureBlocks = 4
 )

 // xlObjects - Implements XL object layer.
 type xlObjects struct {
-	mutex        *sync.Mutex
-	storageDisks []StorageAPI // Collection of initialized backend disks.
-
-	// ListObjects pool management.
-	listPool *treeWalkPool
-
-	// name space mutex for object layer
+	// name space mutex for object layer.
 	nsMutex *nsLockMap

+	// getDisks returns list of storageAPIs.
+	getDisks func() []StorageAPI
+
+	// Byte pools used for temporary i/o buffers.
+	bp *bpool.BytePoolCap
+
 	// Variable represents bucket policies in memory.
 	bucketPolicies *bucketPolicies
+
+	// TODO: Deprecated only kept here for tests, should be removed in future.
+	storageDisks []StorageAPI
+
+	// TODO: ListObjects pool management, should be removed in future.
+	listPool *treeWalkPool
 }

 // list of all errors that can be ignored in tree walk operation in XL
 var xlTreeWalkIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errVolumeNotFound, errFileNotFound)

-// newXLObjectLayer - initialize any object layer depending on the number of disks.
-func newXLObjectLayer(storageDisks []StorageAPI) (ObjectLayer, error) {
-	// Initialize XL object layer.
-	objAPI, err := newXLObjects(storageDisks)
-	fatalIf(err, "Unable to initialize XL object layer.")
-
-	// Initialize and load bucket policies.
-	err = initBucketPolicies(objAPI)
-	fatalIf(err, "Unable to load all bucket policies.")
-
-	// Initialize a new event notifier.
-	err = initEventNotifier(objAPI)
-	fatalIf(err, "Unable to initialize event notification.")
-
-	// Success.
-	return objAPI, nil
-}
-
-// newXLObjects - initialize new xl object layer.
-func newXLObjects(storageDisks []StorageAPI) (ObjectLayer, error) {
-	if storageDisks == nil {
-		return nil, errInvalidArgument
-	}
-
-	// figure out readQuorum for erasure format.json
-	readQuorum := len(storageDisks) / 2
-	writeQuorum := len(storageDisks)/2 + 1
-
-	// Load saved XL format.json and validate.
-	newStorageDisks, err := loadFormatXL(storageDisks, readQuorum)
-	if err != nil {
-		return nil, fmt.Errorf("Unable to recognize backend format, %s", err)
-	}
-
-	// Initialize list pool.
-	listPool := newTreeWalkPool(globalLookupTimeout)
-
-	// Initialize xl objects.
-	xl := &xlObjects{
-		mutex:        &sync.Mutex{},
-		storageDisks: newStorageDisks,
-		listPool:     listPool,
-		nsMutex:      newNSLock(globalIsDistXL),
-	}
-
-	// Initialize meta volume, if volume already exists ignores it.
-	if err = initMetaVolume(xl.storageDisks); err != nil {
-		return nil, fmt.Errorf("Unable to initialize '.minio.sys' meta volume, %s", err)
-	}
-
-	// If the number of offline servers is equal to the readQuorum
-	// (i.e. the number of online servers also equals the
-	// readQuorum), we cannot perform quick-heal (no
-	// write-quorum). However reads may still be possible, so we
-	// skip quick-heal in this case, and continue.
-	offlineCount := len(newStorageDisks) - diskCount(newStorageDisks)
-	if offlineCount == readQuorum {
-		return xl, nil
-	}
-
-	// Perform a quick heal on the buckets and bucket metadata for any discrepancies.
-	if err = quickHeal(*xl, writeQuorum, readQuorum); err != nil {
-		return nil, err
-	}
-
-	// Start background process to cleanup old multipart objects in `.minio.sys`.
-	go cleanupStaleMultipartUploads(multipartCleanupInterval, multipartExpiry, xl, xl.listMultipartUploadsCleanup, globalServiceDoneCh)
-
-	return xl, nil
-}
-
 // Shutdown function for object storage interface.
 func (xl xlObjects) Shutdown() error {
 	// Add any object layer shutdown activities here.
-	for _, disk := range xl.storageDisks {
+	for _, disk := range xl.getDisks() {
 		// This closes storage rpc client connections if any.
 		// Otherwise this is a no-op.
 		if disk == nil {
@@ -291,6 +219,5 @@ func getStorageInfo(disks []StorageAPI) StorageInfo {

 // StorageInfo - returns underlying storage statistics.
 func (xl xlObjects) StorageInfo() StorageInfo {
-	storageInfo := getStorageInfo(xl.storageDisks)
-	return storageInfo
+	return getStorageInfo(xl.getDisks())
 }