DeleteObjects: Send delete to all pools (#172) (#20821)

Currently, DeleteObjects() tries to find the object's pool before
sending a delete request. This does not work well when an object has
versions spread across different pools, because the pool lookup does
not take the version-id into account: when an S3 client asks to remove
a version-id that lives in pool 2, the delete request can be routed to
pool 1 instead, simply because pool 1 holds another version of the same
object.

This commit removes the pool-lookup logic and sends the delete request
to all pools in parallel. This should not cause a performance
regression in most cases: an object will typically exist in only one
pool, and in that case the cost of asking every pool is similar to the
getPoolIndex() lookup it replaces.
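
For illustration, here is a minimal sketch of the fan-out idea. The Pool
interface, memPool type, and deleteAcrossPools function are hypothetical
stand-ins (plain sync.WaitGroup instead of MinIO's internal errgroup, and
none of the real serverPools types); it only shows the shape of "send the
whole batch to every pool, then merge per object":

// A sketch only: Pool, memPool and deleteAcrossPools are hypothetical
// stand-ins, not part of MinIO's API.
package main

import (
    "fmt"
    "sync"
)

// Pool abstracts one server pool: it deletes a batch of objects and
// reports, per object, whether the object was found there.
type Pool interface {
    DeleteBatch(objects []string) (found []bool)
}

// memPool is a toy pool backed by a set of object names.
type memPool struct{ objs map[string]bool }

func (p *memPool) DeleteBatch(objects []string) []bool {
    found := make([]bool, len(objects))
    for i, o := range objects {
        if p.objs[o] {
            delete(p.objs, o)
            found[i] = true
        }
    }
    return found
}

// deleteAcrossPools sends the full batch to every pool in parallel,
// then marks an object as deleted if any pool reported it found.
func deleteAcrossPools(pools []Pool, objects []string) []bool {
    foundByPool := make([][]bool, len(pools))
    var wg sync.WaitGroup
    for i, p := range pools {
        wg.Add(1)
        go func(i int, p Pool) {
            defer wg.Done()
            foundByPool[i] = p.DeleteBatch(objects)
        }(i, p)
    }
    wg.Wait()

    deleted := make([]bool, len(objects))
    for obj := range objects {
        for pool := range pools {
            if foundByPool[pool][obj] {
                deleted[obj] = true
                break
            }
        }
    }
    return deleted
}

func main() {
    pools := []Pool{
        &memPool{objs: map[string]bool{"a": true}},
        &memPool{objs: map[string]bool{"b": true}},
    }
    fmt.Println(deleteAcrossPools(pools, []string{"a", "b", "c"})) // [true true false]
}

The actual change below applies the same pattern to pool.DeleteObjects()
and additionally merges per-pool errors and delete-marker results.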
Anis Eleuch 2025-01-28 17:57:18 +01:00 committed by GitHub
parent dcc000ae2c
commit 079d64c801
5 changed files with 124 additions and 84 deletions


@@ -32,6 +32,8 @@ type DeletedObject struct {
     DeleteMarkerMTime DeleteMarkerMTime `xml:"-"`
     // MinIO extensions to support delete marker replication
     ReplicationState ReplicationState `xml:"-"`
+
+    found bool // the object was found during deletion
 }

 // DeleteMarkerMTime is an embedded type containing time.Time for XML marshal


@@ -27,6 +27,8 @@ import (
     "net/http"
     "path"
     "runtime"
+    "slices"
+    "sort"
     "strconv"
     "strings"
     "sync"
@@ -1706,8 +1708,21 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
     }

     dedupVersions := make([]FileInfoVersions, 0, len(versionsMap))
-    for _, version := range versionsMap {
-        dedupVersions = append(dedupVersions, version)
+    for _, fivs := range versionsMap {
+        // Removal of existing versions and adding a delete marker in the same
+        // request is supported. At the same time, we cannot allow adding
+        // two delete markers on top of any object. To avoid this situation,
+        // we will sort deletions to execute existing deletion first,
+        // then add only one delete marker if requested
+        sort.SliceStable(fivs.Versions, func(i, j int) bool {
+            return !fivs.Versions[i].Deleted
+        })
+        if idx := slices.IndexFunc(fivs.Versions, func(fi FileInfo) bool {
+            return fi.Deleted
+        }); idx > -1 {
+            fivs.Versions = fivs.Versions[:idx+1]
+        }
+        dedupVersions = append(dedupVersions, fivs)
     }

     // Initialize list of errors.
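
The sort-and-truncate step above can be seen in isolation with a toy type.
This is a sketch: fileInfo and dedupDeleteMarkers are stand-ins, not the
cmd package's FileInfo; it only demonstrates that plain version deletions
are ordered ahead of delete-marker requests and that everything after the
first marker is dropped, so at most one marker is written:

// Toy illustration of the sort-and-truncate step; fileInfo keeps only
// the Deleted flag from the real FileInfo.
package main

import (
    "fmt"
    "slices"
    "sort"
)

type fileInfo struct {
    VersionID string
    Deleted   bool // true when this entry asks for a delete marker
}

// dedupDeleteMarkers moves plain version deletions ahead of delete-marker
// requests, then keeps at most one delete marker at the end.
func dedupDeleteMarkers(versions []fileInfo) []fileInfo {
    // Stable sort: non-marker deletions first, markers last.
    sort.SliceStable(versions, func(i, j int) bool {
        return !versions[i].Deleted
    })
    // Truncate after the first delete marker so only one survives.
    if idx := slices.IndexFunc(versions, func(fi fileInfo) bool {
        return fi.Deleted
    }); idx > -1 {
        versions = versions[:idx+1]
    }
    return versions
}

func main() {
    in := []fileInfo{
        {VersionID: "", Deleted: true},    // delete-marker request
        {VersionID: "v1", Deleted: false}, // delete an existing version
        {VersionID: "", Deleted: true},    // second marker, gets dropped
        {VersionID: "v2", Deleted: false},
    }
    fmt.Println(dedupDeleteMarkers(in))
    // Version deletions come first; at most one delete marker is kept
    // at the end of the slice.
}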
@@ -1732,12 +1747,6 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
                     continue
                 }
                 for _, v := range dedupVersions[i].Versions {
-                    if err == errFileNotFound || err == errFileVersionNotFound {
-                        if !dobjects[v.Idx].DeleteMarker {
-                            // Not delete marker, if not found, ok.
-                            continue
-                        }
-                    }
                     delObjErrs[index][v.Idx] = err
                 }
             }
@@ -1757,6 +1766,13 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
             }
         }
         err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex])
+        if err == nil {
+            dobjects[objIndex].found = true
+        } else if isErrVersionNotFound(err) || isErrObjectNotFound(err) {
+            if !dobjects[objIndex].DeleteMarker {
+                err = nil
+            }
+        }
         if objects[objIndex].VersionID != "" {
             errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID)
         } else {
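
The added block above normalizes the reduced quorum error: success marks
the object as found, and a not-found error is swallowed unless the request
was supposed to add a delete marker, in which case the error is kept and
surfaced. A minimal sketch of that rule, using hypothetical sentinel errors
and a hypothetical normalizeDeleteErr helper rather than the real
isErrObjectNotFound/isErrVersionNotFound checks:

// Sketch of the error normalization; the sentinels mirror the names used
// in the real code but are plain errors here.
package main

import (
    "errors"
    "fmt"
)

var (
    errFileNotFound        = errors.New("file not found")
    errFileVersionNotFound = errors.New("file version not found")
)

// normalizeDeleteErr decides what a caller should see after a delete
// attempt: quorum success marks the object as found; a not-found error is
// silently accepted unless the request was adding a delete marker.
func normalizeDeleteErr(err error, wantDeleteMarker bool) (found bool, outErr error) {
    switch {
    case err == nil:
        return true, nil
    case errors.Is(err, errFileNotFound), errors.Is(err, errFileVersionNotFound):
        if !wantDeleteMarker {
            return false, nil // nothing to delete here: not an error
        }
        return false, err // marker request: keep the not-found error
    default:
        return false, err // real failure, e.g. write quorum lost
    }
}

func main() {
    fmt.Println(normalizeDeleteErr(nil, false))                    // true <nil>
    fmt.Println(normalizeDeleteErr(errFileVersionNotFound, false)) // false <nil>
    fmt.Println(normalizeDeleteErr(errFileVersionNotFound, true))  // false file version not found
}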


@@ -131,6 +131,75 @@ func TestErasureDeleteObjectBasic(t *testing.T) {
     removeRoots(fsDirs)
 }

+func TestDeleteObjectsVersionedTwoPools(t *testing.T) {
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+
+    obj, fsDirs, err := prepareErasurePools()
+    if err != nil {
+        t.Fatal("Unable to initialize 'Erasure' object layer.", err)
+    }
+    // Remove all dirs.
+    for _, dir := range fsDirs {
+        defer os.RemoveAll(dir)
+    }
+
+    bucketName := "bucket"
+    objectName := "myobject"
+    err = obj.MakeBucket(ctx, bucketName, MakeBucketOptions{
+        VersioningEnabled: true,
+    })
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    z, ok := obj.(*erasureServerPools)
+    if !ok {
+        t.Fatal("unexpected object layer type")
+    }
+
+    versions := make([]string, 2)
+    for i := range z.serverPools {
+        objInfo, err := z.serverPools[i].PutObject(ctx, bucketName, objectName,
+            mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), ObjectOptions{
+                Versioned: true,
+            })
+        if err != nil {
+            t.Fatalf("Erasure Object upload failed: <ERROR> %s", err)
+        }
+        versions[i] = objInfo.VersionID
+    }
+
+    // Remove and check the version in the second pool, then
+    // remove and check the version in the first pool
+    for testIdx, vid := range []string{versions[1], versions[0]} {
+        names := []ObjectToDelete{
+            {
+                ObjectV: ObjectV{
+                    ObjectName: objectName,
+                    VersionID:  vid,
+                },
+            },
+        }
+        _, delErrs := obj.DeleteObjects(ctx, bucketName, names, ObjectOptions{
+            Versioned: true,
+        })
+        for i := range delErrs {
+            if delErrs[i] != nil {
+                t.Errorf("Test %d: Failed to remove object `%v` with the error: `%v`", testIdx, names[i], delErrs[i])
+            }
+            _, statErr := obj.GetObjectInfo(ctx, bucketName, objectName, ObjectOptions{
+                VersionID: names[i].ObjectV.VersionID,
+            })
+            switch statErr.(type) {
+            case VersionNotFound:
+            default:
+                t.Errorf("Test %d: Object %s is not removed", testIdx, objectName)
+            }
+        }
+    }
+}
+
 func TestDeleteObjectsVersioned(t *testing.T) {
     ctx, cancel := context.WithCancel(context.Background())
     defer cancel()


@@ -32,9 +32,9 @@ func prepareErasurePools() (ObjectLayer, []string, error) {
     pools := mustGetPoolEndpoints(0, fsDirs[:16]...)
     pools = append(pools, mustGetPoolEndpoints(1, fsDirs[16:]...)...)

-    // Everything is fine, should return nil
-    objLayer, err := newErasureServerPools(context.Background(), pools)
+    objLayer, _, err := initObjectLayer(context.Background(), pools)
     if err != nil {
+        removeRoots(fsDirs)
         return nil, nil, err
     }
     return objLayer, fsDirs, nil


@@ -1248,89 +1248,42 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
     ctx = lkctx.Context()
     defer multiDeleteLock.Unlock(lkctx)

-    // Fetch location of up to 10 objects concurrently.
-    poolObjIdxMap := map[int][]ObjectToDelete{}
-    origIndexMap := map[int][]int{}
-
-    // Always perform 1/10th of the number of objects per delete
-    concurrent := len(objects) / 10
-    if concurrent <= 10 {
-        // if we cannot get 1/10th then choose the number of
-        // objects as concurrent.
-        concurrent = len(objects)
-    }
-
-    var mu sync.Mutex
-    eg := errgroup.WithNErrs(len(objects)).WithConcurrency(concurrent)
-    for j, obj := range objects {
-        j := j
-        obj := obj
-        eg.Go(func() error {
-            pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
-                NoLock: true,
-            })
-            if err != nil {
-                if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
-                    derrs[j] = err
-                }
-                dobjects[j] = DeletedObject{
-                    ObjectName: decodeDirObject(obj.ObjectName),
-                    VersionID:  obj.VersionID,
-                }
-                return nil
-            }
-
-            // Delete marker already present we are not going to create new delete markers.
-            if pinfo.ObjInfo.DeleteMarker && obj.VersionID == "" {
-                dobjects[j] = DeletedObject{
-                    DeleteMarker:          pinfo.ObjInfo.DeleteMarker,
-                    DeleteMarkerVersionID: pinfo.ObjInfo.VersionID,
-                    DeleteMarkerMTime:     DeleteMarkerMTime{pinfo.ObjInfo.ModTime},
-                    ObjectName:            decodeDirObject(pinfo.ObjInfo.Name),
-                }
-                return nil
-            }
-
-            idx := pinfo.Index
-
-            mu.Lock()
-            defer mu.Unlock()
-            poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
-            origIndexMap[idx] = append(origIndexMap[idx], j)
-            return nil
-        }, j)
-    }
-    eg.Wait() // wait to check all the pools.
-
-    if len(poolObjIdxMap) > 0 {
-        // Delete concurrently in all server pools.
-        var wg sync.WaitGroup
-        wg.Add(len(z.serverPools))
-        for idx, pool := range z.serverPools {
-            go func(idx int, pool *erasureSets) {
-                defer wg.Done()
-                objs := poolObjIdxMap[idx]
-                if len(objs) == 0 {
-                    return
-                }
-                orgIndexes := origIndexMap[idx]
-                deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
-                mu.Lock()
-                for i, derr := range errs {
-                    if derr != nil {
-                        derrs[orgIndexes[i]] = derr
-                    }
-                    deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
-                    dobjects[orgIndexes[i]] = deletedObjects[i]
-                }
-                mu.Unlock()
-            }(idx, pool)
-        }
-        wg.Wait()
-    }
-
+    dObjectsByPool := make([][]DeletedObject, len(z.serverPools))
+    dErrsByPool := make([][]error, len(z.serverPools))
+
+    eg := errgroup.WithNErrs(len(z.serverPools)).WithConcurrency(len(z.serverPools))
+    for i, pool := range z.serverPools {
+        i := i
+        pool := pool
+        eg.Go(func() error {
+            dObjectsByPool[i], dErrsByPool[i] = pool.DeleteObjects(ctx, bucket, objects, opts)
+            return nil
+        }, i)
+    }
+    eg.Wait() // wait to check all the pools.
+
+    for i := range dobjects {
+        // Iterate over pools
+        for pool := range z.serverPools {
+            if dErrsByPool[pool][i] == nil && dObjectsByPool[pool][i].found {
+                // A fast exit when the object is found and removed
+                dobjects[i] = dObjectsByPool[pool][i]
+                derrs[i] = nil
+                break
+            }
+            if derrs[i] == nil {
+                // No error related to this object is found, assign this pool result
+                // whether it is nil because there is no object found or because of
+                // some other errors such erasure quorum errors.
+                dobjects[i] = dObjectsByPool[pool][i]
+                derrs[i] = dErrsByPool[pool][i]
+            }
+        }
+    }
+
+    for i := range dobjects {
+        dobjects[i].ObjectName = decodeDirObject(dobjects[i].ObjectName)
+    }
     return dobjects, derrs
 }
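
The per-object merge across pools boils down to a simple precedence rule,
sketched below with hypothetical poolResult and merge names (the real code
works directly on the DeletedObject and error slices): the first pool that
actually found and deleted the object wins; otherwise the first recorded
error, if any, is what the caller sees.

// Sketch of the per-object merge rule across pool results.
package main

import (
    "errors"
    "fmt"
)

// poolResult is one pool's answer for a single object.
type poolResult struct {
    found bool  // the pool held (and deleted) the object
    err   error // quorum or other deletion error, nil otherwise
}

// merge picks the final answer for one object across all pools.
func merge(results []poolResult) poolResult {
    var out poolResult
    for _, r := range results {
        if r.err == nil && r.found {
            return r // fast exit: object found and removed in this pool
        }
        if out.err == nil {
            // No error recorded yet: keep this pool's result, whether it is
            // a clean "not found" or a real error such as lost quorum.
            out = r
        }
    }
    return out
}

func main() {
    fmt.Println(merge([]poolResult{
        {found: false, err: nil},
        {found: true, err: nil},
    })) // {true <nil>}: the pool that found the version wins
    fmt.Println(merge([]poolResult{
        {found: false, err: errors.New("quorum lost")},
        {found: false, err: nil},
    })) // the quorum error is kept and reported
}

Keeping the "found" result ahead of any not-found answer is what lets a
version that exists in only one pool be reported as deleted even though
every other pool returns not found.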