DeleteObjects: Send delete to all pools (#172) (#20821)

Currently, DeleteObjects() tries to find the object's pool before
sending a delete request. This does not work well when an object has
versions spread across different pools, because the pool lookup does
not take the version-id into account: when an S3 client asks to remove
a version-id that lives in pool 2, the delete request can be routed to
pool 1 instead, simply because pool 1 holds another version of the same
object.

This commit removes the pool-lookup logic and sends the delete request
to all pools in parallel. This should not cause a performance
regression in most cases: an object will typically exist in only one
pool, and in that case the cost of asking every pool is similar to the
getPoolIndex() lookup it replaces.
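
For illustration, here is a minimal sketch of the fan-out idea. The Pool
interface, memPool type, and deleteAcrossPools function are hypothetical
stand-ins (plain sync.WaitGroup instead of MinIO's internal errgroup, and
none of the real serverPools types); it only shows the shape of "send the
whole batch to every pool, then merge per object":

// A sketch only: Pool, memPool and deleteAcrossPools are hypothetical
// stand-ins, not part of MinIO's API.
package main

import (
    "fmt"
    "sync"
)

// Pool abstracts one server pool: it deletes a batch of objects and
// reports, per object, whether the object was found there.
type Pool interface {
    DeleteBatch(objects []string) (found []bool)
}

// memPool is a toy pool backed by a set of object names.
type memPool struct{ objs map[string]bool }

func (p *memPool) DeleteBatch(objects []string) []bool {
    found := make([]bool, len(objects))
    for i, o := range objects {
        if p.objs[o] {
            delete(p.objs, o)
            found[i] = true
        }
    }
    return found
}

// deleteAcrossPools sends the full batch to every pool in parallel,
// then marks an object as deleted if any pool reported it found.
func deleteAcrossPools(pools []Pool, objects []string) []bool {
    foundByPool := make([][]bool, len(pools))
    var wg sync.WaitGroup
    for i, p := range pools {
        wg.Add(1)
        go func(i int, p Pool) {
            defer wg.Done()
            foundByPool[i] = p.DeleteBatch(objects)
        }(i, p)
    }
    wg.Wait()

    deleted := make([]bool, len(objects))
    for obj := range objects {
        for pool := range pools {
            if foundByPool[pool][obj] {
                deleted[obj] = true
                break
            }
        }
    }
    return deleted
}

func main() {
    pools := []Pool{
        &memPool{objs: map[string]bool{"a": true}},
        &memPool{objs: map[string]bool{"b": true}},
    }
    fmt.Println(deleteAcrossPools(pools, []string{"a", "b", "c"})) // [true true false]
}

The actual change below applies the same pattern to pool.DeleteObjects()
and additionally merges per-pool errors and delete-marker results.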
Anis Eleuch 2025-01-28 17:57:18 +01:00 committed by GitHub
parent dcc000ae2c
commit 079d64c801
5 changed files with 124 additions and 84 deletions


@@ -32,6 +32,8 @@ type DeletedObject struct {
     DeleteMarkerMTime DeleteMarkerMTime `xml:"-"`
     // MinIO extensions to support delete marker replication
     ReplicationState ReplicationState `xml:"-"`
+
+    found bool // the object was found during deletion
 }

 // DeleteMarkerMTime is an embedded type containing time.Time for XML marshal


@@ -27,6 +27,8 @@ import (
     "net/http"
     "path"
     "runtime"
+    "slices"
+    "sort"
     "strconv"
     "strings"
     "sync"
@@ -1706,8 +1708,21 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
     }

     dedupVersions := make([]FileInfoVersions, 0, len(versionsMap))
-    for _, version := range versionsMap {
-        dedupVersions = append(dedupVersions, version)
+    for _, fivs := range versionsMap {
+        // Removal of existing versions and adding a delete marker in the same
+        // request is supported. At the same time, we cannot allow adding
+        // two delete markers on top of any object. To avoid this situation,
+        // we will sort deletions to execute existing deletion first,
+        // then add only one delete marker if requested
+        sort.SliceStable(fivs.Versions, func(i, j int) bool {
+            return !fivs.Versions[i].Deleted
+        })
+        if idx := slices.IndexFunc(fivs.Versions, func(fi FileInfo) bool {
+            return fi.Deleted
+        }); idx > -1 {
+            fivs.Versions = fivs.Versions[:idx+1]
+        }
+        dedupVersions = append(dedupVersions, fivs)
     }

     // Initialize list of errors.
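
The sort-and-truncate step above can be seen in isolation with a toy type.
This is a sketch: fileInfo and dedupDeleteMarkers are stand-ins, not the
cmd package's FileInfo; it only demonstrates that plain version deletions
are ordered ahead of delete-marker requests and that everything after the
first marker is dropped, so at most one marker is written:

// Toy illustration of the sort-and-truncate step; fileInfo keeps only
// the Deleted flag from the real FileInfo.
package main

import (
    "fmt"
    "slices"
    "sort"
)

type fileInfo struct {
    VersionID string
    Deleted   bool // true when this entry asks for a delete marker
}

// dedupDeleteMarkers moves plain version deletions ahead of delete-marker
// requests, then keeps at most one delete marker at the end.
func dedupDeleteMarkers(versions []fileInfo) []fileInfo {
    // Stable sort: non-marker deletions first, markers last.
    sort.SliceStable(versions, func(i, j int) bool {
        return !versions[i].Deleted
    })
    // Truncate after the first delete marker so only one survives.
    if idx := slices.IndexFunc(versions, func(fi fileInfo) bool {
        return fi.Deleted
    }); idx > -1 {
        versions = versions[:idx+1]
    }
    return versions
}

func main() {
    in := []fileInfo{
        {VersionID: "", Deleted: true},    // delete-marker request
        {VersionID: "v1", Deleted: false}, // delete an existing version
        {VersionID: "", Deleted: true},    // second marker, gets dropped
        {VersionID: "v2", Deleted: false},
    }
    fmt.Println(dedupDeleteMarkers(in))
    // Version deletions come first; at most one delete marker is kept
    // at the end of the slice.
}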
@@ -1732,12 +1747,6 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
                     continue
                 }
                 for _, v := range dedupVersions[i].Versions {
-                    if err == errFileNotFound || err == errFileVersionNotFound {
-                        if !dobjects[v.Idx].DeleteMarker {
-                            // Not delete marker, if not found, ok.
-                            continue
-                        }
-                    }
                     delObjErrs[index][v.Idx] = err
                 }
             }
@@ -1757,6 +1766,13 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
             }
         }
         err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex])
+        if err == nil {
+            dobjects[objIndex].found = true
+        } else if isErrVersionNotFound(err) || isErrObjectNotFound(err) {
+            if !dobjects[objIndex].DeleteMarker {
+                err = nil
+            }
+        }
         if objects[objIndex].VersionID != "" {
             errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID)
         } else {
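
The added block above normalizes the reduced quorum error: success marks
the object as found, and a not-found error is swallowed unless the request
was supposed to add a delete marker, in which case the error is kept and
surfaced. A minimal sketch of that rule, using hypothetical sentinel errors
and a hypothetical normalizeDeleteErr helper rather than the real
isErrObjectNotFound/isErrVersionNotFound checks:

// Sketch of the error normalization; the sentinels mirror the names used
// in the real code but are plain errors here.
package main

import (
    "errors"
    "fmt"
)

var (
    errFileNotFound        = errors.New("file not found")
    errFileVersionNotFound = errors.New("file version not found")
)

// normalizeDeleteErr decides what a caller should see after a delete
// attempt: quorum success marks the object as found; a not-found error is
// silently accepted unless the request was adding a delete marker.
func normalizeDeleteErr(err error, wantDeleteMarker bool) (found bool, outErr error) {
    switch {
    case err == nil:
        return true, nil
    case errors.Is(err, errFileNotFound), errors.Is(err, errFileVersionNotFound):
        if !wantDeleteMarker {
            return false, nil // nothing to delete here: not an error
        }
        return false, err // marker request: keep the not-found error
    default:
        return false, err // real failure, e.g. write quorum lost
    }
}

func main() {
    fmt.Println(normalizeDeleteErr(nil, false))                    // true <nil>
    fmt.Println(normalizeDeleteErr(errFileVersionNotFound, false)) // false <nil>
    fmt.Println(normalizeDeleteErr(errFileVersionNotFound, true))  // false file version not found
}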


@@ -131,6 +131,75 @@ func TestErasureDeleteObjectBasic(t *testing.T) {
     removeRoots(fsDirs)
 }

+func TestDeleteObjectsVersionedTwoPools(t *testing.T) {
+    ctx, cancel := context.WithCancel(context.Background())
+    defer cancel()
+
+    obj, fsDirs, err := prepareErasurePools()
+    if err != nil {
+        t.Fatal("Unable to initialize 'Erasure' object layer.", err)
+    }
+    // Remove all dirs.
+    for _, dir := range fsDirs {
+        defer os.RemoveAll(dir)
+    }
+
+    bucketName := "bucket"
+    objectName := "myobject"
+    err = obj.MakeBucket(ctx, bucketName, MakeBucketOptions{
+        VersioningEnabled: true,
+    })
+    if err != nil {
+        t.Fatal(err)
+    }
+
+    z, ok := obj.(*erasureServerPools)
+    if !ok {
+        t.Fatal("unexpected object layer type")
+    }
+
+    versions := make([]string, 2)
+    for i := range z.serverPools {
+        objInfo, err := z.serverPools[i].PutObject(ctx, bucketName, objectName,
+            mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), ObjectOptions{
+                Versioned: true,
+            })
+        if err != nil {
+            t.Fatalf("Erasure Object upload failed: <ERROR> %s", err)
+        }
+        versions[i] = objInfo.VersionID
+    }
+
+    // Remove and check the version in the second pool, then
+    // remove and check the version in the first pool
+    for testIdx, vid := range []string{versions[1], versions[0]} {
+        names := []ObjectToDelete{
+            {
+                ObjectV: ObjectV{
+                    ObjectName: objectName,
+                    VersionID:  vid,
+                },
+            },
+        }
+        _, delErrs := obj.DeleteObjects(ctx, bucketName, names, ObjectOptions{
+            Versioned: true,
+        })
+        for i := range delErrs {
+            if delErrs[i] != nil {
+                t.Errorf("Test %d: Failed to remove object `%v` with the error: `%v`", testIdx, names[i], delErrs[i])
+            }
+            _, statErr := obj.GetObjectInfo(ctx, bucketName, objectName, ObjectOptions{
+                VersionID: names[i].ObjectV.VersionID,
+            })
+            switch statErr.(type) {
+            case VersionNotFound:
+            default:
+                t.Errorf("Test %d: Object %s is not removed", testIdx, objectName)
+            }
+        }
+    }
+}
+
 func TestDeleteObjectsVersioned(t *testing.T) {
     ctx, cancel := context.WithCancel(context.Background())
     defer cancel()


@@ -32,9 +32,9 @@ func prepareErasurePools() (ObjectLayer, []string, error) {
     pools := mustGetPoolEndpoints(0, fsDirs[:16]...)
     pools = append(pools, mustGetPoolEndpoints(1, fsDirs[16:]...)...)

-    // Everything is fine, should return nil
-    objLayer, err := newErasureServerPools(context.Background(), pools)
+    objLayer, _, err := initObjectLayer(context.Background(), pools)
     if err != nil {
+        removeRoots(fsDirs)
         return nil, nil, err
     }
     return objLayer, fsDirs, nil


@@ -1248,89 +1248,42 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
     ctx = lkctx.Context()
     defer multiDeleteLock.Unlock(lkctx)

-    // Fetch location of up to 10 objects concurrently.
-    poolObjIdxMap := map[int][]ObjectToDelete{}
-    origIndexMap := map[int][]int{}
-
-    // Always perform 1/10th of the number of objects per delete
-    concurrent := len(objects) / 10
-    if concurrent <= 10 {
-        // if we cannot get 1/10th then choose the number of
-        // objects as concurrent.
-        concurrent = len(objects)
-    }
-
-    var mu sync.Mutex
-    eg := errgroup.WithNErrs(len(objects)).WithConcurrency(concurrent)
-    for j, obj := range objects {
-        j := j
-        obj := obj
-        eg.Go(func() error {
-            pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
-                NoLock: true,
-            })
-            if err != nil {
-                if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
-                    derrs[j] = err
-                }
-                dobjects[j] = DeletedObject{
-                    ObjectName: decodeDirObject(obj.ObjectName),
-                    VersionID:  obj.VersionID,
-                }
-                return nil
-            }
-
-            // Delete marker already present we are not going to create new delete markers.
-            if pinfo.ObjInfo.DeleteMarker && obj.VersionID == "" {
-                dobjects[j] = DeletedObject{
-                    DeleteMarker:          pinfo.ObjInfo.DeleteMarker,
-                    DeleteMarkerVersionID: pinfo.ObjInfo.VersionID,
-                    DeleteMarkerMTime:     DeleteMarkerMTime{pinfo.ObjInfo.ModTime},
-                    ObjectName:            decodeDirObject(pinfo.ObjInfo.Name),
-                }
-                return nil
-            }
-
-            idx := pinfo.Index
-
-            mu.Lock()
-            defer mu.Unlock()
-            poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
-            origIndexMap[idx] = append(origIndexMap[idx], j)
-            return nil
-        }, j)
-    }
-    eg.Wait() // wait to check all the pools.
-
-    if len(poolObjIdxMap) > 0 {
-        // Delete concurrently in all server pools.
-        var wg sync.WaitGroup
-        wg.Add(len(z.serverPools))
-        for idx, pool := range z.serverPools {
-            go func(idx int, pool *erasureSets) {
-                defer wg.Done()
-                objs := poolObjIdxMap[idx]
-                if len(objs) == 0 {
-                    return
-                }
-                orgIndexes := origIndexMap[idx]
-                deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
-                mu.Lock()
-                for i, derr := range errs {
-                    if derr != nil {
-                        derrs[orgIndexes[i]] = derr
-                    }
-                    deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
-                    dobjects[orgIndexes[i]] = deletedObjects[i]
-                }
-                mu.Unlock()
-            }(idx, pool)
-        }
-        wg.Wait()
-    }
-
+    dObjectsByPool := make([][]DeletedObject, len(z.serverPools))
+    dErrsByPool := make([][]error, len(z.serverPools))
+
+    eg := errgroup.WithNErrs(len(z.serverPools)).WithConcurrency(len(z.serverPools))
+    for i, pool := range z.serverPools {
+        i := i
+        pool := pool
+        eg.Go(func() error {
+            dObjectsByPool[i], dErrsByPool[i] = pool.DeleteObjects(ctx, bucket, objects, opts)
+            return nil
+        }, i)
+    }
+    eg.Wait() // wait to check all the pools.
+
+    for i := range dobjects {
+        // Iterate over pools
+        for pool := range z.serverPools {
+            if dErrsByPool[pool][i] == nil && dObjectsByPool[pool][i].found {
+                // A fast exit when the object is found and removed
+                dobjects[i] = dObjectsByPool[pool][i]
+                derrs[i] = nil
+                break
+            }
+            if derrs[i] == nil {
+                // No error related to this object is found, assign this pool result
+                // whether it is nil because there is no object found or because of
+                // some other errors such erasure quorum errors.
+                dobjects[i] = dObjectsByPool[pool][i]
+                derrs[i] = dErrsByPool[pool][i]
+            }
+        }
+    }
+
+    for i := range dobjects {
+        dobjects[i].ObjectName = decodeDirObject(dobjects[i].ObjectName)
+    }
     return dobjects, derrs
 }
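
The per-object merge across pools boils down to a simple precedence rule,
sketched below with hypothetical poolResult and merge names (the real code
works directly on the DeletedObject and error slices): the first pool that
actually found and deleted the object wins; otherwise the first recorded
error, if any, is what the caller sees.

// Sketch of the per-object merge rule across pool results.
package main

import (
    "errors"
    "fmt"
)

// poolResult is one pool's answer for a single object.
type poolResult struct {
    found bool  // the pool held (and deleted) the object
    err   error // quorum or other deletion error, nil otherwise
}

// merge picks the final answer for one object across all pools.
func merge(results []poolResult) poolResult {
    var out poolResult
    for _, r := range results {
        if r.err == nil && r.found {
            return r // fast exit: object found and removed in this pool
        }
        if out.err == nil {
            // No error recorded yet: keep this pool's result, whether it is
            // a clean "not found" or a real error such as lost quorum.
            out = r
        }
    }
    return out
}

func main() {
    fmt.Println(merge([]poolResult{
        {found: false, err: nil},
        {found: true, err: nil},
    })) // {true <nil>}: the pool that found the version wins
    fmt.Println(merge([]poolResult{
        {found: false, err: errors.New("quorum lost")},
        {found: false, err: nil},
    })) // the quorum error is kept and reported
}

Keeping the "found" result ahead of any not-found answer is what lets a
version that exists in only one pool be reported as deleted even though
every other pool returns not found.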