Add large bucket support for erasure coded backend (#5160)

This PR implements an object layer which
combines input erasure sets of XL layers
into a unified namespace.

This object layer extends the existing
erasure coded implementation, it is assumed
in this design that providing > 16 disks is
a static configuration as well i.e if you started
the setup with 32 disks with 4 sets 8 disks per
pack then you would need to provide 4 sets always.

Some design details and restrictions:

- Objects are distributed using consistent ordering
  to a unique erasure coded layer.
- Each pack has its own dsync so locks are synchronized
  properly at pack (erasure layer).
- Each pack still has a maximum of 16 disks
  requirement, you can start with multiple
  such sets statically.
- Static sets set of disks and cannot be
  changed, there is no elastic expansion allowed.
- Static sets set of disks and cannot be
  changed, there is no elastic removal allowed.
- ListObjects() across sets can be noticeably
  slower since List happens on all servers,
  and is merged at this sets layer.

Fixes #5465
Fixes #5464
Fixes #5461
Fixes #5460
Fixes #5459
Fixes #5458
Fixes #5460
Fixes #5488
Fixes #5489
Fixes #5497
Fixes #5496
This commit is contained in:
Harshavardhana
2018-02-15 17:45:57 -08:00
committed by kannappanr
parent dd80256151
commit fb96779a8a
82 changed files with 5046 additions and 4771 deletions

View File

@@ -34,8 +34,8 @@ import (
const (
// Admin service names
signalServiceRPC = "Admin.SignalService"
reInitFormatRPC = "Admin.ReInitFormat"
listLocksRPC = "Admin.ListLocks"
reInitDisksRPC = "Admin.ReInitDisks"
serverInfoDataRPC = "Admin.ServerInfoData"
getConfigRPC = "Admin.GetConfig"
writeTmpConfigRPC = "Admin.WriteTmpConfig"
@@ -56,8 +56,8 @@ type remoteAdminClient struct {
// commands like service stop and service restart.
type adminCmdRunner interface {
SignalService(s serviceSignal) error
ReInitFormat(dryRun bool) error
ListLocks(bucket, prefix string, duration time.Duration) ([]VolumeLockInfo, error)
ReInitDisks() error
ServerInfoData() (ServerInfoData, error)
GetConfig() ([]byte, error)
WriteTmpConfig(tmpFileName string, configBytes []byte) error
@@ -77,6 +77,16 @@ func (lc localAdminClient) SignalService(s serviceSignal) error {
return nil
}
// ReInitFormat - re-initialize disk format.
func (lc localAdminClient) ReInitFormat(dryRun bool) error {
objectAPI := newObjectLayerFn()
if objectAPI == nil {
return errServerNotInitialized
}
_, err := objectAPI.HealFormat(dryRun)
return err
}
// ListLocks - Fetches lock information from local lock instrumentation.
func (lc localAdminClient) ListLocks(bucket, prefix string, duration time.Duration) ([]VolumeLockInfo, error) {
return listLocksInfo(bucket, prefix, duration), nil
@@ -92,7 +102,14 @@ func (rc remoteAdminClient) SignalService(s serviceSignal) (err error) {
err = errUnsupportedSignal
}
return err
}
// ReInitFormat - re-initialize disk format, remotely.
func (rc remoteAdminClient) ReInitFormat(dryRun bool) error {
reply := AuthRPCReply{}
return rc.Call(reInitFormatRPC, &ReInitFormatArgs{
DryRun: dryRun,
}, &reply)
}
// ListLocks - Sends list locks command to remote server via RPC.
@@ -109,20 +126,6 @@ func (rc remoteAdminClient) ListLocks(bucket, prefix string, duration time.Durat
return reply.VolLocks, nil
}
// ReInitDisks - There is nothing to do here, heal format REST API
// handler has already formatted and reinitialized the local disks.
func (lc localAdminClient) ReInitDisks() error {
return nil
}
// ReInitDisks - Signals peers via RPC to reinitialize their disks and
// object layer.
func (rc remoteAdminClient) ReInitDisks() error {
args := AuthRPCArgs{}
reply := AuthRPCReply{}
return rc.Call(reInitDisksRPC, &args, &reply)
}
// ServerInfoData - Returns the server info of this server.
func (lc localAdminClient) ServerInfoData() (sid ServerInfoData, e error) {
if globalBootTime.IsZero() {
@@ -240,6 +243,7 @@ func (rc remoteAdminClient) CommitConfig(tmpFileName string) error {
type adminPeer struct {
addr string
cmdRunner adminCmdRunner
isLocal bool
}
// type alias for a collection of adminPeer.
@@ -254,6 +258,7 @@ func makeAdminPeers(endpoints EndpointList) (adminPeerList adminPeers) {
adminPeerList = append(adminPeerList, adminPeer{
thisPeer,
localAdminClient{},
true,
})
hostSet := set.CreateStringSet(globalMinioAddr)
@@ -280,6 +285,26 @@ func makeAdminPeers(endpoints EndpointList) (adminPeerList adminPeers) {
return adminPeerList
}
// peersReInitFormat - reinitialize remote object layers to new format.
func peersReInitFormat(peers adminPeers, dryRun bool) error {
errs := make([]error, len(peers))
// Send ReInitFormat RPC call to all nodes.
// for local adminPeer this is a no-op.
wg := sync.WaitGroup{}
for i, peer := range peers {
wg.Add(1)
go func(idx int, peer adminPeer) {
defer wg.Done()
if !peer.isLocal {
errs[idx] = peer.cmdRunner.ReInitFormat(dryRun)
}
}(i, peer)
}
wg.Wait()
return nil
}
// Initialize global adminPeer collection.
func initGlobalAdminPeers(endpoints EndpointList) {
globalAdminPeers = makeAdminPeers(endpoints)
@@ -363,24 +388,6 @@ func listPeerLocksInfo(peers adminPeers, bucket, prefix string, duration time.Du
return groupedLockInfos, nil
}
// reInitPeerDisks - reinitialize disks and object layer on peer servers to use the new format.
func reInitPeerDisks(peers adminPeers) error {
errs := make([]error, len(peers))
// Send ReInitDisks RPC call to all nodes.
// for local adminPeer this is a no-op.
wg := sync.WaitGroup{}
for i, peer := range peers {
wg.Add(1)
go func(idx int, peer adminPeer) {
defer wg.Done()
errs[idx] = peer.cmdRunner.ReInitDisks()
}(i, peer)
}
wg.Wait()
return nil
}
// uptimeSlice - used to sort uptimes in chronological order.
type uptimeSlice []struct {
err error