Add large bucket support for erasure coded backend (#5160)

This PR implements an object layer which
combines input erasure sets of XL layers
into a unified namespace.

This object layer extends the existing
erasure coded implementation, it is assumed
in this design that providing > 16 disks is
a static configuration as well i.e if you started
the setup with 32 disks with 4 sets 8 disks per
pack then you would need to provide 4 sets always.

Some design details and restrictions:

- Objects are distributed using consistent ordering
  to a unique erasure coded layer.
- Each pack has its own dsync so locks are synchronized
  properly at pack (erasure layer).
- Each pack still has a maximum of 16 disks
  requirement, you can start with multiple
  such sets statically.
- Static sets set of disks and cannot be
  changed, there is no elastic expansion allowed.
- Static sets set of disks and cannot be
  changed, there is no elastic removal allowed.
- ListObjects() across sets can be noticeably
  slower since List happens on all servers,
  and is merged at this sets layer.

Fixes #5465
Fixes #5464
Fixes #5461
Fixes #5460
Fixes #5459
Fixes #5458
Fixes #5460
Fixes #5488
Fixes #5489
Fixes #5497
Fixes #5496
This commit is contained in:
Harshavardhana
2018-02-15 17:45:57 -08:00
committed by kannappanr
parent dd80256151
commit fb96779a8a
82 changed files with 5046 additions and 4771 deletions

View File

@@ -17,11 +17,10 @@
package cmd
import (
"fmt"
"sort"
"sync"
"time"
"github.com/minio/minio/pkg/bpool"
"github.com/minio/minio/pkg/disk"
"github.com/minio/minio/pkg/errors"
)
@@ -33,107 +32,36 @@ const (
// Uploads metadata file carries per multipart object metadata.
uploadsJSONFile = "uploads.json"
// Maximum erasure blocks.
maxErasureBlocks = 32
// Minimum erasure blocks.
minErasureBlocks = 4
)
// xlObjects - Implements XL object layer.
type xlObjects struct {
mutex *sync.Mutex
storageDisks []StorageAPI // Collection of initialized backend disks.
// ListObjects pool management.
listPool *treeWalkPool
// name space mutex for object layer
// name space mutex for object layer.
nsMutex *nsLockMap
// getDisks returns list of storageAPIs.
getDisks func() []StorageAPI
// Byte pools used for temporary i/o buffers.
bp *bpool.BytePoolCap
// Variable represents bucket policies in memory.
bucketPolicies *bucketPolicies
// TODO: Deprecated only kept here for tests, should be removed in future.
storageDisks []StorageAPI
// TODO: ListObjects pool management, should be removed in future.
listPool *treeWalkPool
}
// list of all errors that can be ignored in tree walk operation in XL
var xlTreeWalkIgnoredErrs = append(baseIgnoredErrs, errDiskAccessDenied, errVolumeNotFound, errFileNotFound)
// newXLObjectLayer - initialize any object layer depending on the number of disks.
func newXLObjectLayer(storageDisks []StorageAPI) (ObjectLayer, error) {
// Initialize XL object layer.
objAPI, err := newXLObjects(storageDisks)
fatalIf(err, "Unable to initialize XL object layer.")
// Initialize and load bucket policies.
err = initBucketPolicies(objAPI)
fatalIf(err, "Unable to load all bucket policies.")
// Initialize a new event notifier.
err = initEventNotifier(objAPI)
fatalIf(err, "Unable to initialize event notification.")
// Success.
return objAPI, nil
}
// newXLObjects - initialize new xl object layer.
func newXLObjects(storageDisks []StorageAPI) (ObjectLayer, error) {
if storageDisks == nil {
return nil, errInvalidArgument
}
// figure out readQuorum for erasure format.json
readQuorum := len(storageDisks) / 2
writeQuorum := len(storageDisks)/2 + 1
// Load saved XL format.json and validate.
newStorageDisks, err := loadFormatXL(storageDisks, readQuorum)
if err != nil {
return nil, fmt.Errorf("Unable to recognize backend format, %s", err)
}
// Initialize list pool.
listPool := newTreeWalkPool(globalLookupTimeout)
// Initialize xl objects.
xl := &xlObjects{
mutex: &sync.Mutex{},
storageDisks: newStorageDisks,
listPool: listPool,
nsMutex: newNSLock(globalIsDistXL),
}
// Initialize meta volume, if volume already exists ignores it.
if err = initMetaVolume(xl.storageDisks); err != nil {
return nil, fmt.Errorf("Unable to initialize '.minio.sys' meta volume, %s", err)
}
// If the number of offline servers is equal to the readQuorum
// (i.e. the number of online servers also equals the
// readQuorum), we cannot perform quick-heal (no
// write-quorum). However reads may still be possible, so we
// skip quick-heal in this case, and continue.
offlineCount := len(newStorageDisks) - diskCount(newStorageDisks)
if offlineCount == readQuorum {
return xl, nil
}
// Perform a quick heal on the buckets and bucket metadata for any discrepancies.
if err = quickHeal(*xl, writeQuorum, readQuorum); err != nil {
return nil, err
}
// Start background process to cleanup old multipart objects in `.minio.sys`.
go cleanupStaleMultipartUploads(multipartCleanupInterval, multipartExpiry, xl, xl.listMultipartUploadsCleanup, globalServiceDoneCh)
return xl, nil
}
// Shutdown function for object storage interface.
func (xl xlObjects) Shutdown() error {
// Add any object layer shutdown activities here.
for _, disk := range xl.storageDisks {
for _, disk := range xl.getDisks() {
// This closes storage rpc client connections if any.
// Otherwise this is a no-op.
if disk == nil {
@@ -291,6 +219,5 @@ func getStorageInfo(disks []StorageAPI) StorageInfo {
// StorageInfo - returns underlying storage statistics.
func (xl xlObjects) StorageInfo() StorageInfo {
storageInfo := getStorageInfo(xl.storageDisks)
return storageInfo
return getStorageInfo(xl.getDisks())
}