mirror of https://github.com/minio/minio.git
1747 lines
50 KiB
Go
1747 lines
50 KiB
Go
/*
|
|
* MinIO Cloud Storage, (C) 2019,2020 MinIO, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"math/rand"
|
|
"net/http"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/minio/minio-go/v7/pkg/set"
|
|
"github.com/minio/minio-go/v7/pkg/tags"
|
|
"github.com/minio/minio/cmd/config/storageclass"
|
|
"github.com/minio/minio/cmd/logger"
|
|
"github.com/minio/minio/pkg/madmin"
|
|
"github.com/minio/minio/pkg/sync/errgroup"
|
|
)
|
|
|
|
type erasureServerPools struct {
|
|
GatewayUnsupported
|
|
|
|
serverPools []*erasureSets
|
|
|
|
// Shut down async operations
|
|
shutdown context.CancelFunc
|
|
}
|
|
|
|
func (z *erasureServerPools) SinglePool() bool {
|
|
return len(z.serverPools) == 1
|
|
}
|
|
|
|
// Initialize new pool of erasure sets.
|
|
func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServerPools) (ObjectLayer, error) {
|
|
var (
|
|
deploymentID string
|
|
distributionAlgo string
|
|
commonParityDrives int
|
|
err error
|
|
|
|
formats = make([]*formatErasureV3, len(endpointServerPools))
|
|
storageDisks = make([][]StorageAPI, len(endpointServerPools))
|
|
z = &erasureServerPools{serverPools: make([]*erasureSets, len(endpointServerPools))}
|
|
)
|
|
|
|
var localDrives []string
|
|
|
|
local := endpointServerPools.FirstLocal()
|
|
for i, ep := range endpointServerPools {
|
|
for _, endpoint := range ep.Endpoints {
|
|
if endpoint.IsLocal {
|
|
localDrives = append(localDrives, endpoint.Path)
|
|
}
|
|
}
|
|
|
|
// If storage class is not set during startup, default values are used
|
|
// -- Default for Reduced Redundancy Storage class is, parity = 2
|
|
// -- Default for Standard Storage class is, parity = 2 - disks 4, 5
|
|
// -- Default for Standard Storage class is, parity = 3 - disks 6, 7
|
|
// -- Default for Standard Storage class is, parity = 4 - disks 8 to 16
|
|
if commonParityDrives == 0 {
|
|
commonParityDrives = ecDrivesNoConfig(ep.DrivesPerSet)
|
|
}
|
|
|
|
if err = storageclass.ValidateParity(commonParityDrives, ep.DrivesPerSet); err != nil {
|
|
return nil, fmt.Errorf("All current serverPools should have same parity ratio - expected %d, got %d", commonParityDrives, ecDrivesNoConfig(ep.DrivesPerSet))
|
|
}
|
|
|
|
storageDisks[i], formats[i], err = waitForFormatErasure(local, ep.Endpoints, i+1,
|
|
ep.SetCount, ep.DrivesPerSet, deploymentID, distributionAlgo)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if deploymentID == "" {
|
|
// all zones should have same deployment ID
|
|
deploymentID = formats[i].ID
|
|
}
|
|
|
|
if distributionAlgo == "" {
|
|
distributionAlgo = formats[i].Erasure.DistributionAlgo
|
|
}
|
|
|
|
// Validate if users brought different DeploymentID pools.
|
|
if deploymentID != formats[i].ID {
|
|
return nil, fmt.Errorf("All serverPools should have same deployment ID expected %s, got %s", deploymentID, formats[i].ID)
|
|
}
|
|
|
|
z.serverPools[i], err = newErasureSets(ctx, ep.Endpoints, storageDisks[i], formats[i], commonParityDrives, i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
ctx, z.shutdown = context.WithCancel(ctx)
|
|
go intDataUpdateTracker.start(ctx, localDrives...)
|
|
return z, nil
|
|
}
|
|
|
|
func (z *erasureServerPools) NewNSLock(bucket string, objects ...string) RWLocker {
|
|
return z.serverPools[0].NewNSLock(bucket, objects...)
|
|
}
|
|
|
|
// GetDisksID will return disks by their ID.
|
|
func (z *erasureServerPools) GetDisksID(ids ...string) []StorageAPI {
|
|
idMap := make(map[string]struct{})
|
|
for _, id := range ids {
|
|
idMap[id] = struct{}{}
|
|
}
|
|
res := make([]StorageAPI, 0, len(idMap))
|
|
for _, s := range z.serverPools {
|
|
s.erasureDisksMu.RLock()
|
|
defer s.erasureDisksMu.RUnlock()
|
|
for _, disks := range s.erasureDisks {
|
|
for _, disk := range disks {
|
|
if disk == OfflineDisk {
|
|
continue
|
|
}
|
|
if id, _ := disk.GetDiskID(); id != "" {
|
|
if _, ok := idMap[id]; ok {
|
|
res = append(res, disk)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return res
|
|
}
|
|
|
|
func (z *erasureServerPools) SetDriveCounts() []int {
|
|
setDriveCounts := make([]int, len(z.serverPools))
|
|
for i := range z.serverPools {
|
|
setDriveCounts[i] = z.serverPools[i].SetDriveCount()
|
|
}
|
|
return setDriveCounts
|
|
}
|
|
|
|
type serverPoolsAvailableSpace []poolAvailableSpace
|
|
|
|
type poolAvailableSpace struct {
|
|
Index int
|
|
Available uint64
|
|
}
|
|
|
|
// TotalAvailable - total available space
|
|
func (p serverPoolsAvailableSpace) TotalAvailable() uint64 {
|
|
total := uint64(0)
|
|
for _, z := range p {
|
|
total += z.Available
|
|
}
|
|
return total
|
|
}
|
|
|
|
// getAvailablePoolIdx will return an index that can hold size bytes.
|
|
// -1 is returned if no serverPools have available space for the size given.
|
|
func (z *erasureServerPools) getAvailablePoolIdx(ctx context.Context, size int64) int {
|
|
serverPools := z.getServerPoolsAvailableSpace(ctx, size)
|
|
total := serverPools.TotalAvailable()
|
|
if total == 0 {
|
|
return -1
|
|
}
|
|
// choose when we reach this many
|
|
choose := rand.Uint64() % total
|
|
atTotal := uint64(0)
|
|
for _, pool := range serverPools {
|
|
atTotal += pool.Available
|
|
if atTotal > choose && pool.Available > 0 {
|
|
return pool.Index
|
|
}
|
|
}
|
|
// Should not happen, but print values just in case.
|
|
logger.LogIf(ctx, fmt.Errorf("reached end of serverPools (total: %v, atTotal: %v, choose: %v)", total, atTotal, choose))
|
|
return -1
|
|
}
|
|
|
|
// getServerPoolsAvailableSpace will return the available space of each pool after storing the content.
|
|
// If there is not enough space the pool will return 0 bytes available.
|
|
// Negative sizes are seen as 0 bytes.
|
|
func (z *erasureServerPools) getServerPoolsAvailableSpace(ctx context.Context, size int64) serverPoolsAvailableSpace {
|
|
if size < 0 {
|
|
size = 0
|
|
}
|
|
var serverPools = make(serverPoolsAvailableSpace, len(z.serverPools))
|
|
|
|
storageInfos := make([]StorageInfo, len(z.serverPools))
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
storageInfos[index] = z.serverPools[index].StorageUsageInfo(ctx)
|
|
return nil
|
|
}, index)
|
|
}
|
|
|
|
// Wait for the go routines.
|
|
g.Wait()
|
|
|
|
for i, zinfo := range storageInfos {
|
|
var available uint64
|
|
var total uint64
|
|
for _, disk := range zinfo.Disks {
|
|
total += disk.TotalSpace
|
|
available += disk.TotalSpace - disk.UsedSpace
|
|
}
|
|
// Make sure we can fit "size" on to the disk without getting above the diskFillFraction
|
|
if available < uint64(size) {
|
|
available = 0
|
|
}
|
|
if available > 0 {
|
|
// How much will be left after adding the file.
|
|
available -= -uint64(size)
|
|
|
|
// wantLeft is how much space there at least must be left.
|
|
wantLeft := uint64(float64(total) * (1.0 - diskFillFraction))
|
|
if available <= wantLeft {
|
|
available = 0
|
|
}
|
|
}
|
|
serverPools[i] = poolAvailableSpace{
|
|
Index: i,
|
|
Available: available,
|
|
}
|
|
}
|
|
return serverPools
|
|
}
|
|
|
|
// getPoolIdx returns the found previous object and its corresponding pool idx,
|
|
// if none are found falls back to most available space pool.
|
|
func (z *erasureServerPools) getPoolIdx(ctx context.Context, bucket, object string, size int64) (idx int, err error) {
|
|
if z.SinglePool() {
|
|
return 0, nil
|
|
}
|
|
|
|
errs := make([]error, len(z.serverPools))
|
|
objInfos := make([]ObjectInfo, len(z.serverPools))
|
|
|
|
var wg sync.WaitGroup
|
|
for i, pool := range z.serverPools {
|
|
wg.Add(1)
|
|
go func(i int, pool *erasureSets) {
|
|
defer wg.Done()
|
|
objInfos[i], errs[i] = pool.GetObjectInfo(ctx, bucket, object, ObjectOptions{})
|
|
}(i, pool)
|
|
}
|
|
wg.Wait()
|
|
|
|
for i, err := range errs {
|
|
if err != nil && !isErrObjectNotFound(err) {
|
|
return -1, err
|
|
}
|
|
if isErrObjectNotFound(err) {
|
|
// No object exists or its a delete marker,
|
|
// check objInfo to confirm.
|
|
if objInfos[i].DeleteMarker && objInfos[i].Name != "" {
|
|
return i, nil
|
|
}
|
|
// objInfo is not valid, truly the object doesn't
|
|
// exist proceed to next pool.
|
|
continue
|
|
}
|
|
// object exists at this pool.
|
|
return i, nil
|
|
}
|
|
|
|
// We multiply the size by 2 to account for erasure coding.
|
|
idx = z.getAvailablePoolIdx(ctx, size*2)
|
|
if idx < 0 {
|
|
return -1, toObjectErr(errDiskFull)
|
|
}
|
|
return idx, nil
|
|
}
|
|
|
|
func (z *erasureServerPools) Shutdown(ctx context.Context) error {
|
|
defer z.shutdown()
|
|
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
return z.serverPools[index].Shutdown(ctx)
|
|
}, index)
|
|
}
|
|
|
|
for _, err := range g.Wait() {
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
}
|
|
// let's the rest shutdown
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (z *erasureServerPools) BackendInfo() (b madmin.BackendInfo) {
|
|
b.Type = madmin.Erasure
|
|
|
|
scParity := globalStorageClass.GetParityForSC(storageclass.STANDARD)
|
|
if scParity <= 0 {
|
|
scParity = z.serverPools[0].defaultParityCount
|
|
}
|
|
rrSCParity := globalStorageClass.GetParityForSC(storageclass.RRS)
|
|
|
|
// Data blocks can vary per pool, but parity is same.
|
|
for _, setDriveCount := range z.SetDriveCounts() {
|
|
b.StandardSCData = append(b.StandardSCData, setDriveCount-scParity)
|
|
b.RRSCData = append(b.RRSCData, setDriveCount-rrSCParity)
|
|
}
|
|
|
|
b.StandardSCParity = scParity
|
|
b.RRSCParity = rrSCParity
|
|
return
|
|
}
|
|
|
|
func (z *erasureServerPools) LocalStorageInfo(ctx context.Context) (StorageInfo, []error) {
|
|
var storageInfo StorageInfo
|
|
|
|
storageInfos := make([]StorageInfo, len(z.serverPools))
|
|
storageInfosErrs := make([][]error, len(z.serverPools))
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
storageInfos[index], storageInfosErrs[index] = z.serverPools[index].LocalStorageInfo(ctx)
|
|
return nil
|
|
}, index)
|
|
}
|
|
|
|
// Wait for the go routines.
|
|
g.Wait()
|
|
|
|
storageInfo.Backend = z.BackendInfo()
|
|
for _, lstorageInfo := range storageInfos {
|
|
storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
|
|
}
|
|
|
|
var errs []error
|
|
for i := range z.serverPools {
|
|
errs = append(errs, storageInfosErrs[i]...)
|
|
}
|
|
return storageInfo, errs
|
|
}
|
|
|
|
func (z *erasureServerPools) StorageInfo(ctx context.Context) (StorageInfo, []error) {
|
|
var storageInfo StorageInfo
|
|
|
|
storageInfos := make([]StorageInfo, len(z.serverPools))
|
|
storageInfosErrs := make([][]error, len(z.serverPools))
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
storageInfos[index], storageInfosErrs[index] = z.serverPools[index].StorageInfo(ctx)
|
|
return nil
|
|
}, index)
|
|
}
|
|
|
|
// Wait for the go routines.
|
|
g.Wait()
|
|
|
|
storageInfo.Backend = z.BackendInfo()
|
|
for _, lstorageInfo := range storageInfos {
|
|
storageInfo.Disks = append(storageInfo.Disks, lstorageInfo.Disks...)
|
|
}
|
|
|
|
var errs []error
|
|
for i := range z.serverPools {
|
|
errs = append(errs, storageInfosErrs[i]...)
|
|
}
|
|
return storageInfo, errs
|
|
}
|
|
|
|
func (z *erasureServerPools) NSScanner(ctx context.Context, bf *bloomFilter, updates chan<- DataUsageInfo) error {
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
var wg sync.WaitGroup
|
|
var mu sync.Mutex
|
|
var results []dataUsageCache
|
|
var firstErr error
|
|
|
|
allBuckets, err := z.ListBuckets(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if len(allBuckets) == 0 {
|
|
updates <- DataUsageInfo{} // no buckets found update data usage to reflect latest state
|
|
return nil
|
|
}
|
|
|
|
// Scanner latest allBuckets first.
|
|
sort.Slice(allBuckets, func(i, j int) bool {
|
|
return allBuckets[i].Created.After(allBuckets[j].Created)
|
|
})
|
|
|
|
// Collect for each set in serverPools.
|
|
for _, z := range z.serverPools {
|
|
for _, erObj := range z.sets {
|
|
wg.Add(1)
|
|
results = append(results, dataUsageCache{})
|
|
go func(i int, erObj *erasureObjects) {
|
|
updates := make(chan dataUsageCache, 1)
|
|
defer close(updates)
|
|
// Start update collector.
|
|
go func() {
|
|
defer wg.Done()
|
|
for info := range updates {
|
|
mu.Lock()
|
|
results[i] = info
|
|
mu.Unlock()
|
|
}
|
|
}()
|
|
// Start scanner. Blocks until done.
|
|
err := erObj.nsScanner(ctx, allBuckets, bf, updates)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
mu.Lock()
|
|
if firstErr == nil {
|
|
firstErr = err
|
|
}
|
|
// Cancel remaining...
|
|
cancel()
|
|
mu.Unlock()
|
|
return
|
|
}
|
|
}(len(results)-1, erObj)
|
|
}
|
|
}
|
|
updateCloser := make(chan chan struct{})
|
|
go func() {
|
|
updateTicker := time.NewTicker(30 * time.Second)
|
|
defer updateTicker.Stop()
|
|
var lastUpdate time.Time
|
|
|
|
// We need to merge since we will get the same buckets from each pool.
|
|
// Therefore to get the exact bucket sizes we must merge before we can convert.
|
|
var allMerged dataUsageCache
|
|
|
|
update := func() {
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
|
|
allMerged = dataUsageCache{Info: dataUsageCacheInfo{Name: dataUsageRoot}}
|
|
for _, info := range results {
|
|
if info.Info.LastUpdate.IsZero() {
|
|
// Not filled yet.
|
|
return
|
|
}
|
|
allMerged.merge(info)
|
|
}
|
|
if allMerged.root() != nil && allMerged.Info.LastUpdate.After(lastUpdate) {
|
|
updates <- allMerged.dui(allMerged.Info.Name, allBuckets)
|
|
lastUpdate = allMerged.Info.LastUpdate
|
|
}
|
|
}
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case v := <-updateCloser:
|
|
update()
|
|
// Enforce quotas when all is done.
|
|
if firstErr == nil {
|
|
for _, b := range allBuckets {
|
|
enforceFIFOQuotaBucket(ctx, z, b.Name, allMerged.bucketUsageInfo(b.Name))
|
|
}
|
|
}
|
|
close(v)
|
|
return
|
|
case <-updateTicker.C:
|
|
update()
|
|
}
|
|
}
|
|
}()
|
|
|
|
wg.Wait()
|
|
ch := make(chan struct{})
|
|
select {
|
|
case updateCloser <- ch:
|
|
<-ch
|
|
case <-ctx.Done():
|
|
if firstErr == nil {
|
|
firstErr = ctx.Err()
|
|
}
|
|
}
|
|
return firstErr
|
|
}
|
|
|
|
// MakeBucketWithLocation - creates a new bucket across all serverPools simultaneously
|
|
// even if one of the sets fail to create buckets, we proceed all the successful
|
|
// operations.
|
|
func (z *erasureServerPools) MakeBucketWithLocation(ctx context.Context, bucket string, opts BucketOptions) error {
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
|
|
// Create buckets in parallel across all sets.
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
return z.serverPools[index].MakeBucketWithLocation(ctx, bucket, opts)
|
|
}, index)
|
|
}
|
|
|
|
errs := g.Wait()
|
|
// Return the first encountered error
|
|
for _, err := range errs {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// If it doesn't exist we get a new, so ignore errors
|
|
meta := newBucketMetadata(bucket)
|
|
if opts.LockEnabled {
|
|
meta.VersioningConfigXML = enabledBucketVersioningConfig
|
|
meta.ObjectLockConfigXML = enabledBucketObjectLockConfig
|
|
}
|
|
|
|
if err := meta.Save(ctx, z); err != nil {
|
|
return toObjectErr(err, bucket)
|
|
}
|
|
|
|
globalBucketMetadataSys.Set(bucket, meta)
|
|
|
|
// Success.
|
|
return nil
|
|
|
|
}
|
|
|
|
func (z *erasureServerPools) GetObjectNInfo(ctx context.Context, bucket, object string, rs *HTTPRangeSpec, h http.Header, lockType LockType, opts ObjectOptions) (gr *GetObjectReader, err error) {
|
|
if err = checkGetObjArgs(ctx, bucket, object); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
object = encodeDirObject(object)
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].GetObjectNInfo(ctx, bucket, object, rs, h, lockType, opts)
|
|
}
|
|
|
|
var unlockOnDefer bool
|
|
var nsUnlocker = func() {}
|
|
defer func() {
|
|
if unlockOnDefer {
|
|
nsUnlocker()
|
|
}
|
|
}()
|
|
|
|
// Acquire lock
|
|
if lockType != noLock {
|
|
lock := z.NewNSLock(bucket, object)
|
|
switch lockType {
|
|
case writeLock:
|
|
ctx, err = lock.GetLock(ctx, globalOperationTimeout)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nsUnlocker = lock.Unlock
|
|
case readLock:
|
|
ctx, err = lock.GetRLock(ctx, globalOperationTimeout)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
nsUnlocker = lock.RUnlock
|
|
}
|
|
unlockOnDefer = true
|
|
}
|
|
|
|
errs := make([]error, len(z.serverPools))
|
|
grs := make([]*GetObjectReader, len(z.serverPools))
|
|
|
|
lockType = noLock // do not take locks at lower levels
|
|
var wg sync.WaitGroup
|
|
for i, pool := range z.serverPools {
|
|
wg.Add(1)
|
|
go func(i int, pool *erasureSets) {
|
|
defer wg.Done()
|
|
grs[i], errs[i] = pool.GetObjectNInfo(ctx, bucket, object, rs, h, lockType, opts)
|
|
}(i, pool)
|
|
}
|
|
wg.Wait()
|
|
|
|
var found int = -1
|
|
for i, err := range errs {
|
|
if err == nil {
|
|
found = i
|
|
break
|
|
}
|
|
if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
|
|
for _, grr := range grs {
|
|
if grr != nil {
|
|
grr.Close()
|
|
}
|
|
}
|
|
return gr, err
|
|
}
|
|
}
|
|
|
|
if found >= 0 {
|
|
return grs[found], nil
|
|
}
|
|
|
|
object = decodeDirObject(object)
|
|
if opts.VersionID != "" {
|
|
return gr, VersionNotFound{Bucket: bucket, Object: object, VersionID: opts.VersionID}
|
|
}
|
|
return gr, ObjectNotFound{Bucket: bucket, Object: object}
|
|
}
|
|
|
|
func (z *erasureServerPools) GetObjectInfo(ctx context.Context, bucket, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
|
|
if err = checkGetObjArgs(ctx, bucket, object); err != nil {
|
|
return objInfo, err
|
|
}
|
|
|
|
object = encodeDirObject(object)
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].GetObjectInfo(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// Lock the object before reading.
|
|
lk := z.NewNSLock(bucket, object)
|
|
ctx, err = lk.GetRLock(ctx, globalOperationTimeout)
|
|
if err != nil {
|
|
return ObjectInfo{}, err
|
|
}
|
|
defer lk.RUnlock()
|
|
|
|
errs := make([]error, len(z.serverPools))
|
|
objInfos := make([]ObjectInfo, len(z.serverPools))
|
|
|
|
opts.NoLock = true // avoid taking locks at lower levels for multi-pool setups.
|
|
var wg sync.WaitGroup
|
|
for i, pool := range z.serverPools {
|
|
wg.Add(1)
|
|
go func(i int, pool *erasureSets) {
|
|
defer wg.Done()
|
|
objInfos[i], errs[i] = pool.GetObjectInfo(ctx, bucket, object, opts)
|
|
}(i, pool)
|
|
}
|
|
wg.Wait()
|
|
|
|
var found int = -1
|
|
for i, err := range errs {
|
|
if err == nil {
|
|
found = i
|
|
break
|
|
}
|
|
if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
|
|
// some errors such as MethodNotAllowed for delete marker
|
|
// should be returned upwards.
|
|
return objInfos[i], err
|
|
}
|
|
}
|
|
|
|
if found >= 0 {
|
|
return objInfos[found], nil
|
|
}
|
|
|
|
object = decodeDirObject(object)
|
|
if opts.VersionID != "" {
|
|
return objInfo, VersionNotFound{Bucket: bucket, Object: object, VersionID: opts.VersionID}
|
|
}
|
|
return objInfo, ObjectNotFound{Bucket: bucket, Object: object}
|
|
}
|
|
|
|
// PutObject - writes an object to least used erasure pool.
|
|
func (z *erasureServerPools) PutObject(ctx context.Context, bucket string, object string, data *PutObjReader, opts ObjectOptions) (ObjectInfo, error) {
|
|
// Validate put object input args.
|
|
if err := checkPutObjectArgs(ctx, bucket, object, z); err != nil {
|
|
return ObjectInfo{}, err
|
|
}
|
|
|
|
object = encodeDirObject(object)
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].PutObject(ctx, bucket, object, data, opts)
|
|
}
|
|
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, data.Size())
|
|
if err != nil {
|
|
return ObjectInfo{}, err
|
|
}
|
|
|
|
// Overwrite the object at the right pool
|
|
return z.serverPools[idx].PutObject(ctx, bucket, object, data, opts)
|
|
}
|
|
|
|
func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, object string, opts ObjectOptions) (objInfo ObjectInfo, err error) {
|
|
if err = checkDelObjArgs(ctx, bucket, object); err != nil {
|
|
return objInfo, err
|
|
}
|
|
|
|
object = encodeDirObject(object)
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].DeleteObject(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// We don't know the size here set 1GiB atleast.
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, 1<<30)
|
|
if err != nil {
|
|
return objInfo, err
|
|
}
|
|
|
|
return z.serverPools[idx].DeleteObject(ctx, bucket, object, opts)
|
|
}
|
|
|
|
func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
|
|
derrs := make([]error, len(objects))
|
|
dobjects := make([]DeletedObject, len(objects))
|
|
objSets := set.NewStringSet()
|
|
for i := range derrs {
|
|
objects[i].ObjectName = encodeDirObject(objects[i].ObjectName)
|
|
|
|
derrs[i] = checkDelObjArgs(ctx, bucket, objects[i].ObjectName)
|
|
objSets.Add(objects[i].ObjectName)
|
|
}
|
|
|
|
poolObjIdxMap := map[int][]ObjectToDelete{}
|
|
origIndexMap := map[int][]int{}
|
|
if !z.SinglePool() {
|
|
for j, obj := range objects {
|
|
idx, err := z.getPoolIdx(ctx, bucket, obj.ObjectName, 1<<30)
|
|
if err != nil {
|
|
// Unhandled errors return right here.
|
|
for i := range derrs {
|
|
derrs[i] = err
|
|
}
|
|
return dobjects, derrs
|
|
}
|
|
poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
|
|
origIndexMap[idx] = append(origIndexMap[idx], j)
|
|
}
|
|
}
|
|
|
|
var err error
|
|
// Acquire a bulk write lock across 'objects'
|
|
multiDeleteLock := z.NewNSLock(bucket, objSets.ToSlice()...)
|
|
ctx, err = multiDeleteLock.GetLock(ctx, globalOperationTimeout)
|
|
if err != nil {
|
|
for i := range derrs {
|
|
derrs[i] = err
|
|
}
|
|
return dobjects, derrs
|
|
}
|
|
defer multiDeleteLock.Unlock()
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].DeleteObjects(ctx, bucket, objects, opts)
|
|
}
|
|
|
|
for idx, pool := range z.serverPools {
|
|
objs := poolObjIdxMap[idx]
|
|
orgIndexes := origIndexMap[idx]
|
|
deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
|
|
for i, derr := range errs {
|
|
if derr != nil {
|
|
derrs[orgIndexes[i]] = derr
|
|
}
|
|
dobjects[orgIndexes[i]] = deletedObjects[i]
|
|
}
|
|
}
|
|
return dobjects, derrs
|
|
}
|
|
|
|
func (z *erasureServerPools) CopyObject(ctx context.Context, srcBucket, srcObject, dstBucket, dstObject string, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (objInfo ObjectInfo, err error) {
|
|
srcObject = encodeDirObject(srcObject)
|
|
dstObject = encodeDirObject(dstObject)
|
|
|
|
cpSrcDstSame := isStringEqual(pathJoin(srcBucket, srcObject), pathJoin(dstBucket, dstObject))
|
|
|
|
poolIdx, err := z.getPoolIdx(ctx, dstBucket, dstObject, srcInfo.Size)
|
|
if err != nil {
|
|
return objInfo, err
|
|
}
|
|
|
|
if cpSrcDstSame && srcInfo.metadataOnly {
|
|
// Version ID is set for the destination and source == destination version ID.
|
|
if dstOpts.VersionID != "" && srcOpts.VersionID == dstOpts.VersionID {
|
|
return z.serverPools[poolIdx].CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
|
|
}
|
|
// Destination is not versioned and source version ID is empty
|
|
// perform an in-place update.
|
|
if !dstOpts.Versioned && srcOpts.VersionID == "" {
|
|
return z.serverPools[poolIdx].CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
|
|
}
|
|
// Destination is versioned, source is not destination version,
|
|
// as a special case look for if the source object is not legacy
|
|
// from older format, for older format we will rewrite them as
|
|
// newer using PutObject() - this is an optimization to save space
|
|
if dstOpts.Versioned && srcOpts.VersionID != dstOpts.VersionID && !srcInfo.Legacy {
|
|
// CopyObject optimization where we don't create an entire copy
|
|
// of the content, instead we add a reference.
|
|
srcInfo.versionOnly = true
|
|
return z.serverPools[poolIdx].CopyObject(ctx, srcBucket, srcObject, dstBucket, dstObject, srcInfo, srcOpts, dstOpts)
|
|
}
|
|
}
|
|
|
|
putOpts := ObjectOptions{
|
|
ServerSideEncryption: dstOpts.ServerSideEncryption,
|
|
UserDefined: srcInfo.UserDefined,
|
|
Versioned: dstOpts.Versioned,
|
|
VersionID: dstOpts.VersionID,
|
|
MTime: dstOpts.MTime,
|
|
}
|
|
|
|
return z.serverPools[poolIdx].PutObject(ctx, dstBucket, dstObject, srcInfo.PutObjReader, putOpts)
|
|
}
|
|
|
|
func (z *erasureServerPools) ListObjectsV2(ctx context.Context, bucket, prefix, continuationToken, delimiter string, maxKeys int, fetchOwner bool, startAfter string) (ListObjectsV2Info, error) {
|
|
marker := continuationToken
|
|
if marker == "" {
|
|
marker = startAfter
|
|
}
|
|
|
|
loi, err := z.ListObjects(ctx, bucket, prefix, marker, delimiter, maxKeys)
|
|
if err != nil {
|
|
return ListObjectsV2Info{}, err
|
|
}
|
|
|
|
listObjectsV2Info := ListObjectsV2Info{
|
|
IsTruncated: loi.IsTruncated,
|
|
ContinuationToken: continuationToken,
|
|
NextContinuationToken: loi.NextMarker,
|
|
Objects: loi.Objects,
|
|
Prefixes: loi.Prefixes,
|
|
}
|
|
return listObjectsV2Info, err
|
|
}
|
|
|
|
func (z *erasureServerPools) ListObjectVersions(ctx context.Context, bucket, prefix, marker, versionMarker, delimiter string, maxKeys int) (ListObjectVersionsInfo, error) {
|
|
loi := ListObjectVersionsInfo{}
|
|
if marker == "" && versionMarker != "" {
|
|
return loi, NotImplemented{}
|
|
}
|
|
|
|
opts := listPathOptions{
|
|
Bucket: bucket,
|
|
Prefix: prefix,
|
|
Separator: delimiter,
|
|
Limit: maxKeysPlusOne(maxKeys, marker != ""),
|
|
Marker: marker,
|
|
InclDeleted: true,
|
|
AskDisks: globalAPIConfig.getListQuorum(),
|
|
}
|
|
|
|
// Shortcut for APN/1.0 Veeam/1.0 Backup/10.0
|
|
// It requests unique blocks with a specific prefix.
|
|
// We skip scanning the parent directory for
|
|
// more objects matching the prefix.
|
|
ri := logger.GetReqInfo(ctx)
|
|
if ri != nil && strings.Contains(ri.UserAgent, `1.0 Veeam/1.0 Backup`) && strings.HasSuffix(prefix, ".blk") {
|
|
opts.discardResult = true
|
|
opts.Transient = true
|
|
}
|
|
|
|
merged, err := z.listPath(ctx, opts)
|
|
if err != nil && err != io.EOF {
|
|
return loi, err
|
|
}
|
|
if versionMarker == "" {
|
|
// If we are not looking for a specific version skip it.
|
|
marker, _ = parseMarker(marker)
|
|
merged.forwardPast(marker)
|
|
}
|
|
objects := merged.fileInfoVersions(bucket, prefix, delimiter, versionMarker)
|
|
loi.IsTruncated = err == nil && len(objects) > 0
|
|
if maxKeys > 0 && len(objects) > maxKeys {
|
|
objects = objects[:maxKeys]
|
|
loi.IsTruncated = true
|
|
}
|
|
for _, obj := range objects {
|
|
if obj.IsDir && obj.ModTime.IsZero() && delimiter != "" {
|
|
loi.Prefixes = append(loi.Prefixes, obj.Name)
|
|
} else {
|
|
loi.Objects = append(loi.Objects, obj)
|
|
}
|
|
}
|
|
if loi.IsTruncated {
|
|
last := objects[len(objects)-1]
|
|
loi.NextMarker = encodeMarker(last.Name, merged.listID)
|
|
loi.NextVersionIDMarker = last.VersionID
|
|
}
|
|
return loi, nil
|
|
}
|
|
|
|
func maxKeysPlusOne(maxKeys int, addOne bool) int {
|
|
if maxKeys < 0 || maxKeys > maxObjectList {
|
|
maxKeys = maxObjectList
|
|
}
|
|
if addOne {
|
|
maxKeys++
|
|
}
|
|
return maxKeys
|
|
}
|
|
|
|
func (z *erasureServerPools) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (ListObjectsInfo, error) {
|
|
var loi ListObjectsInfo
|
|
|
|
merged, err := z.listPath(ctx, listPathOptions{
|
|
Bucket: bucket,
|
|
Prefix: prefix,
|
|
Separator: delimiter,
|
|
Limit: maxKeysPlusOne(maxKeys, marker != ""),
|
|
Marker: marker,
|
|
InclDeleted: false,
|
|
AskDisks: globalAPIConfig.getListQuorum(),
|
|
})
|
|
if err != nil && err != io.EOF {
|
|
logger.LogIf(ctx, err)
|
|
return loi, err
|
|
}
|
|
marker, _ = parseMarker(marker)
|
|
merged.forwardPast(marker)
|
|
|
|
// Default is recursive, if delimiter is set then list non recursive.
|
|
objects := merged.fileInfos(bucket, prefix, delimiter)
|
|
loi.IsTruncated = err == nil && len(objects) > 0
|
|
if maxKeys > 0 && len(objects) > maxKeys {
|
|
objects = objects[:maxKeys]
|
|
loi.IsTruncated = true
|
|
}
|
|
for _, obj := range objects {
|
|
if obj.IsDir && obj.ModTime.IsZero() && delimiter != "" {
|
|
loi.Prefixes = append(loi.Prefixes, obj.Name)
|
|
} else {
|
|
loi.Objects = append(loi.Objects, obj)
|
|
}
|
|
}
|
|
if loi.IsTruncated {
|
|
last := objects[len(objects)-1]
|
|
loi.NextMarker = encodeMarker(last.Name, merged.listID)
|
|
}
|
|
return loi, nil
|
|
}
|
|
|
|
func (z *erasureServerPools) ListMultipartUploads(ctx context.Context, bucket, prefix, keyMarker, uploadIDMarker, delimiter string, maxUploads int) (ListMultipartsInfo, error) {
|
|
if err := checkListMultipartArgs(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, z); err != nil {
|
|
return ListMultipartsInfo{}, err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker, delimiter, maxUploads)
|
|
}
|
|
|
|
var poolResult = ListMultipartsInfo{}
|
|
poolResult.MaxUploads = maxUploads
|
|
poolResult.KeyMarker = keyMarker
|
|
poolResult.Prefix = prefix
|
|
poolResult.Delimiter = delimiter
|
|
for _, pool := range z.serverPools {
|
|
result, err := pool.ListMultipartUploads(ctx, bucket, prefix, keyMarker, uploadIDMarker,
|
|
delimiter, maxUploads)
|
|
if err != nil {
|
|
return result, err
|
|
}
|
|
poolResult.Uploads = append(poolResult.Uploads, result.Uploads...)
|
|
}
|
|
return poolResult, nil
|
|
}
|
|
|
|
// Initiate a new multipart upload on a hashedSet based on object name.
|
|
func (z *erasureServerPools) NewMultipartUpload(ctx context.Context, bucket, object string, opts ObjectOptions) (string, error) {
|
|
if err := checkNewMultipartArgs(ctx, bucket, object, z); err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].NewMultipartUpload(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// We don't know the exact size, so we ask for at least 1GiB file.
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, 1<<30)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return z.serverPools[idx].NewMultipartUpload(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// Copies a part of an object from source hashedSet to destination hashedSet.
|
|
func (z *erasureServerPools) CopyObjectPart(ctx context.Context, srcBucket, srcObject, destBucket, destObject string, uploadID string, partID int, startOffset int64, length int64, srcInfo ObjectInfo, srcOpts, dstOpts ObjectOptions) (PartInfo, error) {
|
|
if err := checkNewMultipartArgs(ctx, srcBucket, srcObject, z); err != nil {
|
|
return PartInfo{}, err
|
|
}
|
|
|
|
return z.PutObjectPart(ctx, destBucket, destObject, uploadID, partID,
|
|
NewPutObjReader(srcInfo.Reader), dstOpts)
|
|
}
|
|
|
|
// PutObjectPart - writes part of an object to hashedSet based on the object name.
|
|
func (z *erasureServerPools) PutObjectPart(ctx context.Context, bucket, object, uploadID string, partID int, data *PutObjReader, opts ObjectOptions) (PartInfo, error) {
|
|
if err := checkPutObjectPartArgs(ctx, bucket, object, z); err != nil {
|
|
return PartInfo{}, err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts)
|
|
}
|
|
|
|
for _, pool := range z.serverPools {
|
|
_, err := pool.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
|
|
if err == nil {
|
|
return pool.PutObjectPart(ctx, bucket, object, uploadID, partID, data, opts)
|
|
}
|
|
switch err.(type) {
|
|
case InvalidUploadID:
|
|
// Look for information on the next pool
|
|
continue
|
|
}
|
|
// Any other unhandled errors such as quorum return.
|
|
return PartInfo{}, err
|
|
}
|
|
|
|
return PartInfo{}, InvalidUploadID{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
UploadID: uploadID,
|
|
}
|
|
}
|
|
|
|
func (z *erasureServerPools) GetMultipartInfo(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) (MultipartInfo, error) {
|
|
if err := checkListPartsArgs(ctx, bucket, object, z); err != nil {
|
|
return MultipartInfo{}, err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].GetMultipartInfo(ctx, bucket, object, uploadID, opts)
|
|
}
|
|
for _, pool := range z.serverPools {
|
|
mi, err := pool.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
|
|
if err == nil {
|
|
return mi, nil
|
|
}
|
|
switch err.(type) {
|
|
case InvalidUploadID:
|
|
// upload id not found, continue to the next pool.
|
|
continue
|
|
}
|
|
// any other unhandled error return right here.
|
|
return MultipartInfo{}, err
|
|
}
|
|
return MultipartInfo{}, InvalidUploadID{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
UploadID: uploadID,
|
|
}
|
|
|
|
}
|
|
|
|
// ListObjectParts - lists all uploaded parts to an object in hashedSet.
|
|
func (z *erasureServerPools) ListObjectParts(ctx context.Context, bucket, object, uploadID string, partNumberMarker int, maxParts int, opts ObjectOptions) (ListPartsInfo, error) {
|
|
if err := checkListPartsArgs(ctx, bucket, object, z); err != nil {
|
|
return ListPartsInfo{}, err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts)
|
|
}
|
|
for _, pool := range z.serverPools {
|
|
_, err := pool.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
|
|
if err == nil {
|
|
return pool.ListObjectParts(ctx, bucket, object, uploadID, partNumberMarker, maxParts, opts)
|
|
}
|
|
switch err.(type) {
|
|
case InvalidUploadID:
|
|
continue
|
|
}
|
|
return ListPartsInfo{}, err
|
|
}
|
|
return ListPartsInfo{}, InvalidUploadID{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
UploadID: uploadID,
|
|
}
|
|
}
|
|
|
|
// Aborts an in-progress multipart operation on hashedSet based on the object name.
|
|
func (z *erasureServerPools) AbortMultipartUpload(ctx context.Context, bucket, object, uploadID string, opts ObjectOptions) error {
|
|
if err := checkAbortMultipartArgs(ctx, bucket, object, z); err != nil {
|
|
return err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].AbortMultipartUpload(ctx, bucket, object, uploadID, opts)
|
|
}
|
|
|
|
for _, pool := range z.serverPools {
|
|
_, err := pool.GetMultipartInfo(ctx, bucket, object, uploadID, opts)
|
|
if err == nil {
|
|
return pool.AbortMultipartUpload(ctx, bucket, object, uploadID, opts)
|
|
}
|
|
switch err.(type) {
|
|
case InvalidUploadID:
|
|
// upload id not found move to next pool
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
return InvalidUploadID{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
UploadID: uploadID,
|
|
}
|
|
}
|
|
|
|
// CompleteMultipartUpload - completes a pending multipart transaction, on hashedSet based on object name.
|
|
func (z *erasureServerPools) CompleteMultipartUpload(ctx context.Context, bucket, object, uploadID string, uploadedParts []CompletePart, opts ObjectOptions) (objInfo ObjectInfo, err error) {
|
|
if err = checkCompleteMultipartArgs(ctx, bucket, object, z); err != nil {
|
|
return objInfo, err
|
|
}
|
|
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
|
|
}
|
|
|
|
// Purge any existing object.
|
|
for _, pool := range z.serverPools {
|
|
pool.DeleteObject(ctx, bucket, object, opts)
|
|
}
|
|
|
|
for _, pool := range z.serverPools {
|
|
result, err := pool.ListMultipartUploads(ctx, bucket, object, "", "", "", maxUploadsList)
|
|
if err != nil {
|
|
return objInfo, err
|
|
}
|
|
if result.Lookup(uploadID) {
|
|
return pool.CompleteMultipartUpload(ctx, bucket, object, uploadID, uploadedParts, opts)
|
|
}
|
|
}
|
|
return objInfo, InvalidUploadID{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
UploadID: uploadID,
|
|
}
|
|
}
|
|
|
|
// GetBucketInfo - returns bucket info from one of the erasure coded serverPools.
|
|
func (z *erasureServerPools) GetBucketInfo(ctx context.Context, bucket string) (bucketInfo BucketInfo, err error) {
|
|
if z.SinglePool() {
|
|
bucketInfo, err = z.serverPools[0].GetBucketInfo(ctx, bucket)
|
|
if err != nil {
|
|
return bucketInfo, err
|
|
}
|
|
meta, err := globalBucketMetadataSys.Get(bucket)
|
|
if err == nil {
|
|
bucketInfo.Created = meta.Created
|
|
}
|
|
return bucketInfo, nil
|
|
}
|
|
for _, pool := range z.serverPools {
|
|
bucketInfo, err = pool.GetBucketInfo(ctx, bucket)
|
|
if err != nil {
|
|
if isErrBucketNotFound(err) {
|
|
continue
|
|
}
|
|
return bucketInfo, err
|
|
}
|
|
meta, err := globalBucketMetadataSys.Get(bucket)
|
|
if err == nil {
|
|
bucketInfo.Created = meta.Created
|
|
}
|
|
return bucketInfo, nil
|
|
}
|
|
return bucketInfo, BucketNotFound{
|
|
Bucket: bucket,
|
|
}
|
|
}
|
|
|
|
// IsNotificationSupported returns whether bucket notification is applicable for this layer.
|
|
func (z *erasureServerPools) IsNotificationSupported() bool {
|
|
return true
|
|
}
|
|
|
|
// IsListenSupported returns whether listen bucket notification is applicable for this layer.
|
|
func (z *erasureServerPools) IsListenSupported() bool {
|
|
return true
|
|
}
|
|
|
|
// IsEncryptionSupported returns whether server side encryption is implemented for this layer.
|
|
func (z *erasureServerPools) IsEncryptionSupported() bool {
|
|
return true
|
|
}
|
|
|
|
// IsCompressionSupported returns whether compression is applicable for this layer.
|
|
func (z *erasureServerPools) IsCompressionSupported() bool {
|
|
return true
|
|
}
|
|
|
|
func (z *erasureServerPools) IsTaggingSupported() bool {
|
|
return true
|
|
}
|
|
|
|
// DeleteBucket - deletes a bucket on all serverPools simultaneously,
|
|
// even if one of the serverPools fail to delete buckets, we proceed to
|
|
// undo a successful operation.
|
|
func (z *erasureServerPools) DeleteBucket(ctx context.Context, bucket string, forceDelete bool) error {
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].DeleteBucket(ctx, bucket, forceDelete)
|
|
}
|
|
g := errgroup.WithNErrs(len(z.serverPools))
|
|
|
|
// Delete buckets in parallel across all serverPools.
|
|
for index := range z.serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
return z.serverPools[index].DeleteBucket(ctx, bucket, forceDelete)
|
|
}, index)
|
|
}
|
|
|
|
errs := g.Wait()
|
|
|
|
// For any write quorum failure, we undo all the delete
|
|
// buckets operation by creating all the buckets again.
|
|
for _, err := range errs {
|
|
if err != nil {
|
|
if _, ok := err.(InsufficientWriteQuorum); ok {
|
|
undoDeleteBucketServerPools(ctx, bucket, z.serverPools, errs)
|
|
}
|
|
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Success.
|
|
return nil
|
|
}
|
|
|
|
// deleteAll will delete a bucket+prefix unconditionally across all disks.
|
|
// Note that set distribution is ignored so it should only be used in cases where
|
|
// data is not distributed across sets.
|
|
// Errors are logged but individual disk failures are not returned.
|
|
func (z *erasureServerPools) deleteAll(ctx context.Context, bucket, prefix string) {
|
|
for _, servers := range z.serverPools {
|
|
for _, set := range servers.sets {
|
|
set.deleteAll(ctx, bucket, prefix)
|
|
}
|
|
}
|
|
}
|
|
|
|
// renameAll will rename bucket+prefix unconditionally across all disks to
|
|
// minioMetaTmpBucket + unique uuid,
|
|
// Note that set distribution is ignored so it should only be used in cases where
|
|
// data is not distributed across sets. Errors are logged but individual
|
|
// disk failures are not returned.
|
|
func (z *erasureServerPools) renameAll(ctx context.Context, bucket, prefix string) {
|
|
for _, servers := range z.serverPools {
|
|
for _, set := range servers.sets {
|
|
set.renameAll(ctx, bucket, prefix)
|
|
}
|
|
}
|
|
}
|
|
|
|
// This function is used to undo a successful DeleteBucket operation.
|
|
func undoDeleteBucketServerPools(ctx context.Context, bucket string, serverPools []*erasureSets, errs []error) {
|
|
g := errgroup.WithNErrs(len(serverPools))
|
|
|
|
// Undo previous delete bucket on all underlying serverPools.
|
|
for index := range serverPools {
|
|
index := index
|
|
g.Go(func() error {
|
|
if errs[index] == nil {
|
|
return serverPools[index].MakeBucketWithLocation(ctx, bucket, BucketOptions{})
|
|
}
|
|
return nil
|
|
}, index)
|
|
}
|
|
|
|
g.Wait()
|
|
}
|
|
|
|
// List all buckets from one of the serverPools, we are not doing merge
|
|
// sort here just for simplification. As per design it is assumed
|
|
// that all buckets are present on all serverPools.
|
|
func (z *erasureServerPools) ListBuckets(ctx context.Context) (buckets []BucketInfo, err error) {
|
|
if z.SinglePool() {
|
|
buckets, err = z.serverPools[0].ListBuckets(ctx)
|
|
} else {
|
|
for _, pool := range z.serverPools {
|
|
buckets, err = pool.ListBuckets(ctx)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for i := range buckets {
|
|
meta, err := globalBucketMetadataSys.Get(buckets[i].Name)
|
|
if err == nil {
|
|
buckets[i].Created = meta.Created
|
|
}
|
|
}
|
|
return buckets, nil
|
|
}
|
|
|
|
func (z *erasureServerPools) HealFormat(ctx context.Context, dryRun bool) (madmin.HealResultItem, error) {
|
|
var err error
|
|
// Acquire lock on format.json
|
|
formatLock := z.NewNSLock(minioMetaBucket, formatConfigFile)
|
|
ctx, err = formatLock.GetLock(ctx, globalOperationTimeout)
|
|
if err != nil {
|
|
return madmin.HealResultItem{}, err
|
|
}
|
|
defer formatLock.Unlock()
|
|
|
|
var r = madmin.HealResultItem{
|
|
Type: madmin.HealItemMetadata,
|
|
Detail: "disk-format",
|
|
}
|
|
|
|
var countNoHeal int
|
|
for _, pool := range z.serverPools {
|
|
result, err := pool.HealFormat(ctx, dryRun)
|
|
if err != nil && !errors.Is(err, errNoHealRequired) {
|
|
logger.LogIf(ctx, err)
|
|
continue
|
|
}
|
|
// Count errNoHealRequired across all serverPools,
|
|
// to return appropriate error to the caller
|
|
if errors.Is(err, errNoHealRequired) {
|
|
countNoHeal++
|
|
}
|
|
r.DiskCount += result.DiskCount
|
|
r.SetCount += result.SetCount
|
|
r.Before.Drives = append(r.Before.Drives, result.Before.Drives...)
|
|
r.After.Drives = append(r.After.Drives, result.After.Drives...)
|
|
}
|
|
|
|
// No heal returned by all serverPools, return errNoHealRequired
|
|
if countNoHeal == len(z.serverPools) {
|
|
return r, errNoHealRequired
|
|
}
|
|
|
|
return r, nil
|
|
}
|
|
|
|
func (z *erasureServerPools) HealBucket(ctx context.Context, bucket string, opts madmin.HealOpts) (madmin.HealResultItem, error) {
|
|
var r = madmin.HealResultItem{
|
|
Type: madmin.HealItemBucket,
|
|
Bucket: bucket,
|
|
}
|
|
|
|
// Attempt heal on the bucket metadata, ignore any failures
|
|
_, _ = z.HealObject(ctx, minioMetaBucket, pathJoin(bucketConfigPrefix, bucket, bucketMetadataFile), "", opts)
|
|
|
|
for _, pool := range z.serverPools {
|
|
result, err := pool.HealBucket(ctx, bucket, opts)
|
|
if err != nil {
|
|
switch err.(type) {
|
|
case BucketNotFound:
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
r.DiskCount += result.DiskCount
|
|
r.SetCount += result.SetCount
|
|
r.Before.Drives = append(r.Before.Drives, result.Before.Drives...)
|
|
r.After.Drives = append(r.After.Drives, result.After.Drives...)
|
|
}
|
|
|
|
return r, nil
|
|
}
|
|
|
|
// Walk a bucket, optionally prefix recursively, until we have returned
|
|
// all the content to objectInfo channel, it is callers responsibility
|
|
// to allocate a receive channel for ObjectInfo, upon any unhandled
|
|
// error walker returns error. Optionally if context.Done() is received
|
|
// then Walk() stops the walker.
|
|
func (z *erasureServerPools) Walk(ctx context.Context, bucket, prefix string, results chan<- ObjectInfo, opts ObjectOptions) error {
|
|
if err := checkListObjsArgs(ctx, bucket, prefix, "", z); err != nil {
|
|
// Upon error close the channel.
|
|
close(results)
|
|
return err
|
|
}
|
|
|
|
if opts.WalkVersions {
|
|
go func() {
|
|
defer close(results)
|
|
|
|
var marker, versionIDMarker string
|
|
for {
|
|
loi, err := z.ListObjectVersions(ctx, bucket, prefix, marker, versionIDMarker, "", 1000)
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
for _, obj := range loi.Objects {
|
|
results <- obj
|
|
}
|
|
|
|
if !loi.IsTruncated {
|
|
break
|
|
}
|
|
|
|
marker = loi.NextMarker
|
|
versionIDMarker = loi.NextVersionIDMarker
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
go func() {
|
|
defer close(results)
|
|
|
|
var marker string
|
|
for {
|
|
loi, err := z.ListObjects(ctx, bucket, prefix, marker, "", 1000)
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
for _, obj := range loi.Objects {
|
|
results <- obj
|
|
}
|
|
|
|
if !loi.IsTruncated {
|
|
break
|
|
}
|
|
|
|
marker = loi.NextMarker
|
|
}
|
|
}()
|
|
|
|
return nil
|
|
}
|
|
|
|
// HealObjectFn closure function heals the object.
|
|
type HealObjectFn func(bucket, object, versionID string) error
|
|
|
|
func (z *erasureServerPools) HealObjects(ctx context.Context, bucket, prefix string, opts madmin.HealOpts, healObject HealObjectFn) error {
|
|
// If listing did not return any entries upon first attempt, we
|
|
// return `ObjectNotFound`, to indicate the caller for any
|
|
// actions they may want to take as if `prefix` is missing.
|
|
err := toObjectErr(errFileNotFound, bucket, prefix)
|
|
for _, erasureSet := range z.serverPools {
|
|
for _, set := range erasureSet.sets {
|
|
var entryChs []FileInfoVersionsCh
|
|
var mu sync.Mutex
|
|
var wg sync.WaitGroup
|
|
for _, disk := range set.getOnlineDisks() {
|
|
disk := disk
|
|
wg.Add(1)
|
|
go func() {
|
|
defer wg.Done()
|
|
entryCh, err := disk.WalkVersions(ctx, bucket, prefix, "", true, ctx.Done())
|
|
if err != nil {
|
|
// Disk walk returned error, ignore it.
|
|
return
|
|
}
|
|
mu.Lock()
|
|
entryChs = append(entryChs, FileInfoVersionsCh{
|
|
Ch: entryCh,
|
|
})
|
|
mu.Unlock()
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
|
|
entriesValid := make([]bool, len(entryChs))
|
|
entries := make([]FileInfoVersions, len(entryChs))
|
|
|
|
for {
|
|
entry, quorumCount, ok := lexicallySortedEntryVersions(entryChs, entries, entriesValid)
|
|
if !ok {
|
|
break
|
|
}
|
|
|
|
// Remove empty directories if found - they have no meaning.
|
|
// Can be left over from highly concurrent put/remove.
|
|
if quorumCount > set.setDriveCount/2 && entry.IsEmptyDir {
|
|
if !opts.DryRun && opts.Remove {
|
|
set.deleteEmptyDir(ctx, bucket, entry.Name)
|
|
}
|
|
}
|
|
|
|
// Indicate that first attempt was a success and subsequent loop
|
|
// knows that its not our first attempt at 'prefix'
|
|
err = nil
|
|
|
|
if quorumCount == set.setDriveCount && opts.ScanMode == madmin.HealNormalScan {
|
|
continue
|
|
}
|
|
|
|
for _, version := range entry.Versions {
|
|
if err := healObject(bucket, version.Name, version.VersionID); err != nil {
|
|
return toObjectErr(err, bucket, version.Name)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (z *erasureServerPools) HealObject(ctx context.Context, bucket, object, versionID string, opts madmin.HealOpts) (madmin.HealResultItem, error) {
|
|
object = encodeDirObject(object)
|
|
|
|
for _, pool := range z.serverPools {
|
|
result, err := pool.HealObject(ctx, bucket, object, versionID, opts)
|
|
result.Object = decodeDirObject(result.Object)
|
|
if err != nil {
|
|
if isErrObjectNotFound(err) || isErrVersionNotFound(err) {
|
|
continue
|
|
}
|
|
return result, err
|
|
}
|
|
return result, nil
|
|
}
|
|
if versionID != "" {
|
|
return madmin.HealResultItem{}, VersionNotFound{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
VersionID: versionID,
|
|
}
|
|
}
|
|
return madmin.HealResultItem{}, ObjectNotFound{
|
|
Bucket: bucket,
|
|
Object: object,
|
|
}
|
|
}
|
|
|
|
// GetMetrics - no op
|
|
func (z *erasureServerPools) GetMetrics(ctx context.Context) (*BackendMetrics, error) {
|
|
logger.LogIf(ctx, NotImplemented{})
|
|
return &BackendMetrics{}, NotImplemented{}
|
|
}
|
|
|
|
func (z *erasureServerPools) getPoolAndSet(id string) (poolIdx, setIdx, diskIdx int, err error) {
|
|
for poolIdx := range z.serverPools {
|
|
format := z.serverPools[poolIdx].format
|
|
for setIdx, set := range format.Erasure.Sets {
|
|
for i, diskID := range set {
|
|
if diskID == id {
|
|
return poolIdx, setIdx, i, nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return -1, -1, -1, fmt.Errorf("DiskID(%s) %w", id, errDiskNotFound)
|
|
}
|
|
|
|
// HealthOptions takes input options to return sepcific information
|
|
type HealthOptions struct {
|
|
Maintenance bool
|
|
}
|
|
|
|
// HealthResult returns the current state of the system, also
|
|
// additionally with any specific heuristic information which
|
|
// was queried
|
|
type HealthResult struct {
|
|
Healthy bool
|
|
HealingDrives int
|
|
PoolID, SetID int
|
|
WriteQuorum int
|
|
}
|
|
|
|
// ReadHealth returns if the cluster can serve read requests
|
|
func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
|
|
erasureSetUpCount := make([][]int, len(z.serverPools))
|
|
for i := range z.serverPools {
|
|
erasureSetUpCount[i] = make([]int, len(z.serverPools[i].sets))
|
|
}
|
|
|
|
diskIDs := globalNotificationSys.GetLocalDiskIDs(ctx)
|
|
diskIDs = append(diskIDs, getLocalDiskIDs(z))
|
|
|
|
for _, localDiskIDs := range diskIDs {
|
|
for _, id := range localDiskIDs {
|
|
poolIdx, setIdx, _, err := z.getPoolAndSet(id)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
continue
|
|
}
|
|
erasureSetUpCount[poolIdx][setIdx]++
|
|
}
|
|
}
|
|
|
|
b := z.BackendInfo()
|
|
readQuorum := b.StandardSCData[0]
|
|
|
|
for poolIdx := range erasureSetUpCount {
|
|
for setIdx := range erasureSetUpCount[poolIdx] {
|
|
if erasureSetUpCount[poolIdx][setIdx] < readQuorum {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// Health - returns current status of the object layer health,
|
|
// provides if write access exists across sets, additionally
|
|
// can be used to query scenarios if health may be lost
|
|
// if this node is taken down by an external orchestrator.
|
|
func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
|
|
erasureSetUpCount := make([][]int, len(z.serverPools))
|
|
for i := range z.serverPools {
|
|
erasureSetUpCount[i] = make([]int, len(z.serverPools[i].sets))
|
|
}
|
|
|
|
diskIDs := globalNotificationSys.GetLocalDiskIDs(ctx)
|
|
if !opts.Maintenance {
|
|
diskIDs = append(diskIDs, getLocalDiskIDs(z))
|
|
}
|
|
|
|
for _, localDiskIDs := range diskIDs {
|
|
for _, id := range localDiskIDs {
|
|
poolIdx, setIdx, _, err := z.getPoolAndSet(id)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
continue
|
|
}
|
|
erasureSetUpCount[poolIdx][setIdx]++
|
|
}
|
|
}
|
|
|
|
reqInfo := (&logger.ReqInfo{}).AppendTags("maintenance", strconv.FormatBool(opts.Maintenance))
|
|
|
|
b := z.BackendInfo()
|
|
writeQuorum := b.StandardSCData[0]
|
|
if writeQuorum == b.StandardSCParity {
|
|
writeQuorum++
|
|
}
|
|
|
|
var aggHealStateResult madmin.BgHealState
|
|
if opts.Maintenance {
|
|
// check if local disks are being healed, if they are being healed
|
|
// we need to tell healthy status as 'false' so that this server
|
|
// is not taken down for maintenance
|
|
var err error
|
|
aggHealStateResult, err = getAggregatedBackgroundHealState(ctx, nil)
|
|
if err != nil {
|
|
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
|
|
return HealthResult{
|
|
Healthy: false,
|
|
}
|
|
}
|
|
|
|
if len(aggHealStateResult.HealDisks) > 0 {
|
|
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
|
|
}
|
|
}
|
|
|
|
for poolIdx := range erasureSetUpCount {
|
|
for setIdx := range erasureSetUpCount[poolIdx] {
|
|
if erasureSetUpCount[poolIdx][setIdx] < writeQuorum {
|
|
logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
|
|
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
|
poolIdx, setIdx, writeQuorum))
|
|
return HealthResult{
|
|
Healthy: false,
|
|
HealingDrives: len(aggHealStateResult.HealDisks),
|
|
PoolID: poolIdx,
|
|
SetID: setIdx,
|
|
WriteQuorum: writeQuorum,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// when maintenance is not specified we don't have
|
|
// to look at the healing side of the code.
|
|
if !opts.Maintenance {
|
|
return HealthResult{
|
|
Healthy: true,
|
|
WriteQuorum: writeQuorum,
|
|
}
|
|
}
|
|
|
|
return HealthResult{
|
|
Healthy: len(aggHealStateResult.HealDisks) == 0,
|
|
HealingDrives: len(aggHealStateResult.HealDisks),
|
|
WriteQuorum: writeQuorum,
|
|
}
|
|
}
|
|
|
|
// PutObjectTags - replace or add tags to an existing object
|
|
func (z *erasureServerPools) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) {
|
|
object = encodeDirObject(object)
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].PutObjectTags(ctx, bucket, object, tags, opts)
|
|
}
|
|
|
|
// We don't know the size here set 1GiB atleast.
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, 1<<30)
|
|
if err != nil {
|
|
return ObjectInfo{}, err
|
|
}
|
|
|
|
return z.serverPools[idx].PutObjectTags(ctx, bucket, object, tags, opts)
|
|
}
|
|
|
|
// DeleteObjectTags - delete object tags from an existing object
|
|
func (z *erasureServerPools) DeleteObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
|
|
object = encodeDirObject(object)
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].DeleteObjectTags(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// We don't know the size here set 1GiB atleast.
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, 1<<30)
|
|
if err != nil {
|
|
return ObjectInfo{}, err
|
|
}
|
|
|
|
return z.serverPools[idx].DeleteObjectTags(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// GetObjectTags - get object tags from an existing object
|
|
func (z *erasureServerPools) GetObjectTags(ctx context.Context, bucket, object string, opts ObjectOptions) (*tags.Tags, error) {
|
|
object = encodeDirObject(object)
|
|
if z.SinglePool() {
|
|
return z.serverPools[0].GetObjectTags(ctx, bucket, object, opts)
|
|
}
|
|
|
|
// We don't know the size here set 1GiB atleast.
|
|
idx, err := z.getPoolIdx(ctx, bucket, object, 1<<30)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return z.serverPools[idx].GetObjectTags(ctx, bucket, object, opts)
|
|
}
|