DeleteObjects: Send delete to all pools (#172) (#20821)

Currently, DeleteObjects() tries to find the object's pool before
sending a delete request. This does not work well when an object has
multiple versions in different pools, since the pool lookup does not
consider the version-id. When an S3 client wants to remove a version-id
that exists in pool 2, the delete request will be directed to pool 1
because pool 1 holds another version of the same object.

This commit removes the pool-lookup logic and sends the delete request
to all pools in parallel. This should not cause a performance regression
in most cases: an object typically exists in only one pool, and probing
every pool with a delete request then costs about the same as the
getPoolIndex() lookup, which also had to query every pool.
Anis Eleuch 2025-01-28 17:57:18 +01:00, committed by GitHub
parent dcc000ae2c, commit 079d64c801
5 changed files with 124 additions and 84 deletions
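In outline, the change replaces a locate-then-delete flow with a fan-out: every pool receives the full delete batch in parallel, and the per-object results are merged afterwards. Below is a minimal sketch of that pattern, not MinIO's actual API: the pool interface, result struct, and deleteAll function are illustrative stand-ins, and the sketch assumes each pool returns one result per requested object, in request order.

package fanout

import "sync"

// result stands in for MinIO's per-object outcome: whether this pool
// actually held (and removed) the requested version, plus any error.
type result struct {
	found bool
	err   error
}

// pool stands in for a server pool that can delete a batch of objects.
type pool interface {
	DeleteObjects(objects []string) []result
}

// deleteAll fans the whole batch out to every pool in parallel, then
// merges per object: a found-and-deleted result wins outright,
// otherwise the first error seen is retained.
func deleteAll(pools []pool, objects []string) []result {
	byPool := make([][]result, len(pools))

	var wg sync.WaitGroup
	for i, p := range pools {
		wg.Add(1)
		go func(i int, p pool) {
			defer wg.Done()
			byPool[i] = p.DeleteObjects(objects) // every pool gets the full batch
		}(i, p)
	}
	wg.Wait()

	merged := make([]result, len(objects))
	for obj := range objects { // obj is the index of the requested object
		for pi := range pools {
			r := byPool[pi][obj]
			if r.err == nil && r.found {
				merged[obj] = r // found and removed: final answer for this object
				break
			}
			if merged[obj].err == nil {
				merged[obj] = r // no error recorded yet: keep this pool's answer
			}
		}
	}
	return merged
}

The same merge rule appears in the committed erasureServerPools.DeleteObjects change below.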

@@ -32,6 +32,8 @@ type DeletedObject struct {
 	DeleteMarkerMTime DeleteMarkerMTime `xml:"-"`
 	// MinIO extensions to support delete marker replication
 	ReplicationState ReplicationState `xml:"-"`
+
+	found bool // the object was found during deletion
 }
 
 // DeleteMarkerMTime is an embedded type containing time.Time for XML marshal

@@ -27,6 +27,8 @@ import (
 	"net/http"
 	"path"
 	"runtime"
+	"slices"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -1706,8 +1708,21 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
 	}
 
 	dedupVersions := make([]FileInfoVersions, 0, len(versionsMap))
-	for _, version := range versionsMap {
-		dedupVersions = append(dedupVersions, version)
+	for _, fivs := range versionsMap {
+		// Removal of existing versions and adding a delete marker in the same
+		// request is supported. At the same time, we cannot allow adding
+		// two delete markers on top of any object. To avoid this situation,
+		// we will sort deletions to execute existing deletion first,
+		// then add only one delete marker if requested
+		sort.SliceStable(fivs.Versions, func(i, j int) bool {
+			return !fivs.Versions[i].Deleted
+		})
+		if idx := slices.IndexFunc(fivs.Versions, func(fi FileInfo) bool {
+			return fi.Deleted
+		}); idx > -1 {
+			fivs.Versions = fivs.Versions[:idx+1]
+		}
+		dedupVersions = append(dedupVersions, fivs)
 	}
 
 	// Initialize list of errors.
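The sort-then-truncate step above is easiest to see on a toy input: deletions of existing versions are moved ahead of delete markers, and everything past the first delete marker is dropped, so at most one marker survives per object. A self-contained illustration, with plain bools standing in for FileInfo.Deleted:

package main

import (
	"fmt"
	"slices"
	"sort"
)

func main() {
	// true = delete-marker entry, false = deletion of an existing version.
	versions := []bool{true, false, true, false}

	// Stable sort moving deletions of existing versions to the front,
	// mirroring the !fivs.Versions[i].Deleted comparator above.
	sort.SliceStable(versions, func(i, j int) bool {
		return !versions[i]
	})

	// Keep everything up to and including the first delete marker.
	if idx := slices.IndexFunc(versions, func(deleted bool) bool { return deleted }); idx > -1 {
		versions = versions[:idx+1]
	}

	fmt.Println(versions) // [false false true]: two version deletions, one marker
}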
@@ -1732,12 +1747,6 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
 			continue
 		}
 		for _, v := range dedupVersions[i].Versions {
-			if err == errFileNotFound || err == errFileVersionNotFound {
-				if !dobjects[v.Idx].DeleteMarker {
-					// Not delete marker, if not found, ok.
-					continue
-				}
-			}
 			delObjErrs[index][v.Idx] = err
 		}
 	}
@@ -1757,6 +1766,13 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
 			}
 		}
 		err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex])
+		if err == nil {
+			dobjects[objIndex].found = true
+		} else if isErrVersionNotFound(err) || isErrObjectNotFound(err) {
+			if !dobjects[objIndex].DeleteMarker {
+				err = nil
+			}
+		}
 		if objects[objIndex].VersionID != "" {
 			errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID)
 		} else {
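The added branch spells out the contract the pools layer relies on: a clean quorum write marks the object as found, while a not-found result only remains an error when the request involved a delete marker. A hypothetical standalone rendering of that rule; normalize and errNotFound are illustrative names standing in for the isErrVersionNotFound/isErrObjectNotFound handling above:

package quorum

import "errors"

// errNotFound stands in for MinIO's file/version not-found errors.
var errNotFound = errors.New("object or version not found")

// normalize maps a per-object quorum error to the (found, err) pair
// reported upward: a nil error means this pool really deleted the
// version; not-found is tolerated unless a delete marker was involved.
func normalize(err error, expectDeleteMarker bool) (found bool, out error) {
	switch {
	case err == nil:
		return true, nil // clean quorum write: the version existed here
	case errors.Is(err, errNotFound) && !expectDeleteMarker:
		return false, nil // this pool never had the version: not an error
	default:
		return false, err
	}
}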

@@ -131,6 +131,75 @@ func TestErasureDeleteObjectBasic(t *testing.T) {
 	removeRoots(fsDirs)
 }
 
+func TestDeleteObjectsVersionedTwoPools(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	obj, fsDirs, err := prepareErasurePools()
+	if err != nil {
+		t.Fatal("Unable to initialize 'Erasure' object layer.", err)
+	}
+	// Remove all dirs.
+	for _, dir := range fsDirs {
+		defer os.RemoveAll(dir)
+	}
+
+	bucketName := "bucket"
+	objectName := "myobject"
+	err = obj.MakeBucket(ctx, bucketName, MakeBucketOptions{
+		VersioningEnabled: true,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	z, ok := obj.(*erasureServerPools)
+	if !ok {
+		t.Fatal("unexpected object layer type")
+	}
+
+	versions := make([]string, 2)
+	for i := range z.serverPools {
+		objInfo, err := z.serverPools[i].PutObject(ctx, bucketName, objectName,
+			mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), ObjectOptions{
+				Versioned: true,
+			})
+		if err != nil {
+			t.Fatalf("Erasure Object upload failed: <ERROR> %s", err)
+		}
+		versions[i] = objInfo.VersionID
+	}
+
+	// Remove and check the version in the second pool, then
+	// remove and check the version in the first pool
+	for testIdx, vid := range []string{versions[1], versions[0]} {
+		names := []ObjectToDelete{
+			{
+				ObjectV: ObjectV{
+					ObjectName: objectName,
+					VersionID:  vid,
+				},
+			},
+		}
+		_, delErrs := obj.DeleteObjects(ctx, bucketName, names, ObjectOptions{
+			Versioned: true,
+		})
+		for i := range delErrs {
+			if delErrs[i] != nil {
+				t.Errorf("Test %d: Failed to remove object `%v` with the error: `%v`", testIdx, names[i], delErrs[i])
+			}
+			_, statErr := obj.GetObjectInfo(ctx, bucketName, objectName, ObjectOptions{
+				VersionID: names[i].ObjectV.VersionID,
+			})
+			switch statErr.(type) {
+			case VersionNotFound:
+			default:
+				t.Errorf("Test %d: Object %s is not removed", testIdx, objectName)
+			}
+		}
+	}
+}
+
 func TestDeleteObjectsVersioned(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

@@ -32,9 +32,9 @@ func prepareErasurePools() (ObjectLayer, []string, error) {
 	pools := mustGetPoolEndpoints(0, fsDirs[:16]...)
 	pools = append(pools, mustGetPoolEndpoints(1, fsDirs[16:]...)...)
 	// Everything is fine, should return nil
-	objLayer, err := newErasureServerPools(context.Background(), pools)
+	objLayer, _, err := initObjectLayer(context.Background(), pools)
 	if err != nil {
 		removeRoots(fsDirs)
 		return nil, nil, err
 	}
 	return objLayer, fsDirs, nil

@@ -1248,89 +1248,42 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
 	ctx = lkctx.Context()
 	defer multiDeleteLock.Unlock(lkctx)
 
-	// Fetch location of up to 10 objects concurrently.
-	poolObjIdxMap := map[int][]ObjectToDelete{}
-	origIndexMap := map[int][]int{}
-
-	// Always perform 1/10th of the number of objects per delete
-	concurrent := len(objects) / 10
-	if concurrent <= 10 {
-		// if we cannot get 1/10th then choose the number of
-		// objects as concurrent.
-		concurrent = len(objects)
-	}
-
-	var mu sync.Mutex
-	eg := errgroup.WithNErrs(len(objects)).WithConcurrency(concurrent)
-	for j, obj := range objects {
-		j := j
-		obj := obj
+	dObjectsByPool := make([][]DeletedObject, len(z.serverPools))
+	dErrsByPool := make([][]error, len(z.serverPools))
+
+	eg := errgroup.WithNErrs(len(z.serverPools)).WithConcurrency(len(z.serverPools))
+	for i, pool := range z.serverPools {
+		i := i
+		pool := pool
 		eg.Go(func() error {
-			pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
-				NoLock: true,
-			})
-			if err != nil {
-				if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
-					derrs[j] = err
-				}
-				dobjects[j] = DeletedObject{
-					ObjectName: decodeDirObject(obj.ObjectName),
-					VersionID:  obj.VersionID,
-				}
-				return nil
-			}
-
-			// Delete marker already present we are not going to create new delete markers.
-			if pinfo.ObjInfo.DeleteMarker && obj.VersionID == "" {
-				dobjects[j] = DeletedObject{
-					DeleteMarker:          pinfo.ObjInfo.DeleteMarker,
-					DeleteMarkerVersionID: pinfo.ObjInfo.VersionID,
-					DeleteMarkerMTime:     DeleteMarkerMTime{pinfo.ObjInfo.ModTime},
-					ObjectName:            decodeDirObject(pinfo.ObjInfo.Name),
-				}
-				return nil
-			}
-
-			idx := pinfo.Index
-			mu.Lock()
-			defer mu.Unlock()
-			poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
-			origIndexMap[idx] = append(origIndexMap[idx], j)
+			dObjectsByPool[i], dErrsByPool[i] = pool.DeleteObjects(ctx, bucket, objects, opts)
 			return nil
-		}, j)
+		}, i)
 	}
 	eg.Wait() // wait to check all the pools.
 
-	if len(poolObjIdxMap) > 0 {
-		// Delete concurrently in all server pools.
-		var wg sync.WaitGroup
-		wg.Add(len(z.serverPools))
-		for idx, pool := range z.serverPools {
-			go func(idx int, pool *erasureSets) {
-				defer wg.Done()
-				objs := poolObjIdxMap[idx]
-				if len(objs) == 0 {
-					return
-				}
-				orgIndexes := origIndexMap[idx]
-				deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
-				mu.Lock()
-				for i, derr := range errs {
-					if derr != nil {
-						derrs[orgIndexes[i]] = derr
-					}
-					deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
-					dobjects[orgIndexes[i]] = deletedObjects[i]
-				}
-				mu.Unlock()
-			}(idx, pool)
-		}
-		wg.Wait()
-	}
+	for i := range dobjects {
+		// Iterate over pools
+		for pool := range z.serverPools {
+			if dErrsByPool[pool][i] == nil && dObjectsByPool[pool][i].found {
+				// A fast exit when the object is found and removed
+				dobjects[i] = dObjectsByPool[pool][i]
+				derrs[i] = nil
+				break
+			}
+			if derrs[i] == nil {
+				// No error related to this object is found, assign this pool result
+				// whether it is nil because there is no object found or because of
+				// some other errors such erasure quorum errors.
+				dobjects[i] = dObjectsByPool[pool][i]
+				derrs[i] = dErrsByPool[pool][i]
+			}
+		}
+	}
+
+	for i := range dobjects {
+		dobjects[i].ObjectName = decodeDirObject(dobjects[i].ObjectName)
+	}
 
 	return dobjects, derrs
 }
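Reading the merge loop per object: pools are scanned in order; a result that is both error-free and found wins immediately; otherwise the first non-nil error sticks, while error-free not-found results fill the slot only as long as no error has been recorded. Worked out for a hypothetical two-pool layout (labels mine):

pool 1 result          pool 2 result          merged outcome
found, no error        anything               pool 1 (version deleted there)
not found, no error    found, no error        pool 2
quorum error           found, no error        pool 2 (a successful delete overrides the earlier error)
quorum error           not found, no error    pool 1's error
not found, no error    not found, no error    not found, reported as success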