From 079d64c8017e54a8075197375cb1b9995e957105 Mon Sep 17 00:00:00 2001
From: Anis Eleuch
Date: Tue, 28 Jan 2025 17:57:18 +0100
Subject: [PATCH] DeleteObjects: Send delete to all pools (#172) (#20821)

Currently, DeleteObjects() tries to find the object's pool before sending
a delete request. This does not work well when an object has versions
spread across pools, since the pool lookup does not consider the
version-id. When an S3 client wants to remove a version-id that exists in
pool 2, the delete request will be directed to pool 1 because that pool
holds another version of the same object.

This commit removes the pool-lookup logic and sends the delete request to
all pools in parallel. This should not cause a performance regression in
most cases, since an object will typically exist in only one pool, and in
that case the cost is similar to that of getPoolIndex().
---
 cmd/api-datatypes.go                  |   2 +
 cmd/erasure-object.go                 |  32 ++++++--
 cmd/erasure-object_test.go            |  69 ++++++++++++++++++
 cmd/erasure-server-pool-decom_test.go |   4 +-
 cmd/erasure-server-pool.go            | 101 +++++++------------------
 5 files changed, 124 insertions(+), 84 deletions(-)

diff --git a/cmd/api-datatypes.go b/cmd/api-datatypes.go
index 2d2831ca1..84a18986f 100644
--- a/cmd/api-datatypes.go
+++ b/cmd/api-datatypes.go
@@ -32,6 +32,8 @@ type DeletedObject struct {
 	DeleteMarkerMTime DeleteMarkerMTime `xml:"-"`
 	// MinIO extensions to support delete marker replication
 	ReplicationState ReplicationState `xml:"-"`
+
+	found bool // the object was found during deletion
 }
 
 // DeleteMarkerMTime is an embedded type containing time.Time for XML marshal
diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go
index 80c819680..4422a5d47 100644
--- a/cmd/erasure-object.go
+++ b/cmd/erasure-object.go
@@ -27,6 +27,8 @@ import (
 	"net/http"
 	"path"
 	"runtime"
+	"slices"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -1706,8 +1708,21 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
 	}
 
 	dedupVersions := make([]FileInfoVersions, 0, len(versionsMap))
-	for _, version := range versionsMap {
-		dedupVersions = append(dedupVersions, version)
+	for _, fivs := range versionsMap {
+		// Removal of existing versions and adding a delete marker in the same
+		// request is supported. At the same time, we cannot allow adding
+		// two delete markers on top of any object. To avoid this situation,
+		// we sort the deletions so that deletions of existing versions are
+		// executed first, then add at most one delete marker if requested.
+		sort.SliceStable(fivs.Versions, func(i, j int) bool {
+			return !fivs.Versions[i].Deleted
+		})
+		if idx := slices.IndexFunc(fivs.Versions, func(fi FileInfo) bool {
+			return fi.Deleted
+		}); idx > -1 {
+			fivs.Versions = fivs.Versions[:idx+1]
+		}
+		dedupVersions = append(dedupVersions, fivs)
 	}
 
 	// Initialize list of errors.
@@ -1732,12 +1747,6 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec
 			continue
 		}
 		for _, v := range dedupVersions[i].Versions {
-			if err == errFileNotFound || err == errFileVersionNotFound {
-				if !dobjects[v.Idx].DeleteMarker {
-					// Not delete marker, if not found, ok.
- continue - } - } delObjErrs[index][v.Idx] = err } } @@ -1757,6 +1766,13 @@ func (er erasureObjects) DeleteObjects(ctx context.Context, bucket string, objec } } err := reduceWriteQuorumErrs(ctx, diskErrs, objectOpIgnoredErrs, writeQuorums[objIndex]) + if err == nil { + dobjects[objIndex].found = true + } else if isErrVersionNotFound(err) || isErrObjectNotFound(err) { + if !dobjects[objIndex].DeleteMarker { + err = nil + } + } if objects[objIndex].VersionID != "" { errs[objIndex] = toObjectErr(err, bucket, objects[objIndex].ObjectName, objects[objIndex].VersionID) } else { diff --git a/cmd/erasure-object_test.go b/cmd/erasure-object_test.go index a30803f85..e3492f075 100644 --- a/cmd/erasure-object_test.go +++ b/cmd/erasure-object_test.go @@ -131,6 +131,75 @@ func TestErasureDeleteObjectBasic(t *testing.T) { removeRoots(fsDirs) } +func TestDeleteObjectsVersionedTwoPools(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + obj, fsDirs, err := prepareErasurePools() + if err != nil { + t.Fatal("Unable to initialize 'Erasure' object layer.", err) + } + // Remove all dirs. + for _, dir := range fsDirs { + defer os.RemoveAll(dir) + } + + bucketName := "bucket" + objectName := "myobject" + err = obj.MakeBucket(ctx, bucketName, MakeBucketOptions{ + VersioningEnabled: true, + }) + if err != nil { + t.Fatal(err) + } + + z, ok := obj.(*erasureServerPools) + if !ok { + t.Fatal("unexpected object layer type") + } + + versions := make([]string, 2) + for i := range z.serverPools { + objInfo, err := z.serverPools[i].PutObject(ctx, bucketName, objectName, + mustGetPutObjReader(t, bytes.NewReader([]byte("abcd")), int64(len("abcd")), "", ""), ObjectOptions{ + Versioned: true, + }) + if err != nil { + t.Fatalf("Erasure Object upload failed: %s", err) + } + versions[i] = objInfo.VersionID + } + + // Remove and check the version in the second pool, then + // remove and check the version in the first pool + for testIdx, vid := range []string{versions[1], versions[0]} { + names := []ObjectToDelete{ + { + ObjectV: ObjectV{ + ObjectName: objectName, + VersionID: vid, + }, + }, + } + _, delErrs := obj.DeleteObjects(ctx, bucketName, names, ObjectOptions{ + Versioned: true, + }) + for i := range delErrs { + if delErrs[i] != nil { + t.Errorf("Test %d: Failed to remove object `%v` with the error: `%v`", testIdx, names[i], delErrs[i]) + } + _, statErr := obj.GetObjectInfo(ctx, bucketName, objectName, ObjectOptions{ + VersionID: names[i].ObjectV.VersionID, + }) + switch statErr.(type) { + case VersionNotFound: + default: + t.Errorf("Test %d: Object %s is not removed", testIdx, objectName) + } + } + } +} + func TestDeleteObjectsVersioned(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/cmd/erasure-server-pool-decom_test.go b/cmd/erasure-server-pool-decom_test.go index 7e6d29c19..567e6b367 100644 --- a/cmd/erasure-server-pool-decom_test.go +++ b/cmd/erasure-server-pool-decom_test.go @@ -32,9 +32,9 @@ func prepareErasurePools() (ObjectLayer, []string, error) { pools := mustGetPoolEndpoints(0, fsDirs[:16]...) pools = append(pools, mustGetPoolEndpoints(1, fsDirs[16:]...)...) 
-	// Everything is fine, should return nil
-	objLayer, err := newErasureServerPools(context.Background(), pools)
+	objLayer, _, err := initObjectLayer(context.Background(), pools)
 	if err != nil {
+		removeRoots(fsDirs)
 		return nil, nil, err
 	}
 	return objLayer, fsDirs, nil
diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index f87cd4797..b626879f2 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -1248,89 +1248,42 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
 	ctx = lkctx.Context()
 	defer multiDeleteLock.Unlock(lkctx)
 
-	// Fetch location of up to 10 objects concurrently.
-	poolObjIdxMap := map[int][]ObjectToDelete{}
-	origIndexMap := map[int][]int{}
+	dObjectsByPool := make([][]DeletedObject, len(z.serverPools))
+	dErrsByPool := make([][]error, len(z.serverPools))
 
-	// Always perform 1/10th of the number of objects per delete
-	concurrent := len(objects) / 10
-	if concurrent <= 10 {
-		// if we cannot get 1/10th then choose the number of
-		// objects as concurrent.
-		concurrent = len(objects)
-	}
-
-	var mu sync.Mutex
-	eg := errgroup.WithNErrs(len(objects)).WithConcurrency(concurrent)
-	for j, obj := range objects {
-		j := j
-		obj := obj
+	eg := errgroup.WithNErrs(len(z.serverPools)).WithConcurrency(len(z.serverPools))
+	for i, pool := range z.serverPools {
+		i := i
+		pool := pool
 		eg.Go(func() error {
-			pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
-				NoLock: true,
-			})
-			if err != nil {
-				if !isErrObjectNotFound(err) && !isErrVersionNotFound(err) {
-					derrs[j] = err
-				}
-				dobjects[j] = DeletedObject{
-					ObjectName: decodeDirObject(obj.ObjectName),
-					VersionID:  obj.VersionID,
-				}
-				return nil
-			}
-
-			// Delete marker already present we are not going to create new delete markers.
-			if pinfo.ObjInfo.DeleteMarker && obj.VersionID == "" {
-				dobjects[j] = DeletedObject{
-					DeleteMarker:          pinfo.ObjInfo.DeleteMarker,
-					DeleteMarkerVersionID: pinfo.ObjInfo.VersionID,
-					DeleteMarkerMTime:     DeleteMarkerMTime{pinfo.ObjInfo.ModTime},
-					ObjectName:            decodeDirObject(pinfo.ObjInfo.Name),
-				}
-				return nil
-			}
-
-			idx := pinfo.Index
-
-			mu.Lock()
-			defer mu.Unlock()
-
-			poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
-			origIndexMap[idx] = append(origIndexMap[idx], j)
+			dObjectsByPool[i], dErrsByPool[i] = pool.DeleteObjects(ctx, bucket, objects, opts)
 			return nil
-		}, j)
+		}, i)
 	}
 
 	eg.Wait() // wait to check all the pools.
 
-	if len(poolObjIdxMap) > 0 {
-		// Delete concurrently in all server pools.
-		var wg sync.WaitGroup
-		wg.Add(len(z.serverPools))
-		for idx, pool := range z.serverPools {
-			go func(idx int, pool *erasureSets) {
-				defer wg.Done()
-				objs := poolObjIdxMap[idx]
-				if len(objs) == 0 {
-					return
-				}
-				orgIndexes := origIndexMap[idx]
-				deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
-				mu.Lock()
-				for i, derr := range errs {
-					if derr != nil {
-						derrs[orgIndexes[i]] = derr
-					}
-					deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
-					dobjects[orgIndexes[i]] = deletedObjects[i]
-				}
-				mu.Unlock()
-			}(idx, pool)
+	for i := range dobjects {
+		// Iterate over pools
+		for pool := range z.serverPools {
+			if dErrsByPool[pool][i] == nil && dObjectsByPool[pool][i].found {
+				// A fast exit when the object is found and removed
+				dobjects[i] = dObjectsByPool[pool][i]
+				derrs[i] = nil
+				break
+			}
+			if derrs[i] == nil {
+				// No error related to this object is found yet; assign this pool's
+				// result, whether it is nil because no object was found or because
+				// of some other error, such as an erasure quorum error.
+				dobjects[i] = dObjectsByPool[pool][i]
+				derrs[i] = dErrsByPool[pool][i]
+			}
 		}
-		wg.Wait()
 	}
 
+	for i := range dobjects {
+		dobjects[i].ObjectName = decodeDirObject(dobjects[i].ObjectName)
+	}
 	return dobjects, derrs
 }
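
Illustration of the new flow (not part of the patch): after this change,
DeleteObjects() fans the whole batch out to every pool and then merges the
per-pool answers object by object. The standalone Go sketch below shows only
that fan-out/merge shape; deleteResult, deleteInPool, and the owns predicate
are hypothetical stand-ins for DeletedObject, pool.DeleteObjects(), and the
pools' actual contents, not MinIO APIs.

package main

import (
	"fmt"
	"sync"
)

// deleteResult stands in for DeletedObject plus its per-object error.
type deleteResult struct {
	name  string
	found bool  // true when this pool actually removed a version
	err   error // nil, or a failure reported by this pool
}

// deleteInPool stands in for pool.DeleteObjects(): every pool answers the
// same batch of names and marks the ones it owned and removed.
func deleteInPool(pool int, names []string, owns func(pool int, name string) bool) []deleteResult {
	out := make([]deleteResult, len(names))
	for i, n := range names {
		out[i] = deleteResult{name: n, found: owns(pool, n)}
	}
	return out
}

func main() {
	names := []string{"a", "b", "c"}
	// Pretend "a" lives in pool 0 and "c" in pool 1; "b" exists nowhere.
	owns := func(pool int, name string) bool {
		return (pool == 0 && name == "a") || (pool == 1 && name == "c")
	}

	const pools = 2
	byPool := make([][]deleteResult, pools)

	// Fan out: the same delete batch goes to every pool in parallel,
	// mirroring the errgroup loop in erasure-server-pool.go.
	var wg sync.WaitGroup
	for p := 0; p < pools; p++ {
		wg.Add(1)
		go func(p int) {
			defer wg.Done()
			byPool[p] = deleteInPool(p, names, owns)
		}(p)
	}
	wg.Wait()

	// Merge per object: a pool that found and removed the object wins
	// immediately; otherwise keep overwriting while no error is recorded,
	// so a later pool's real error is not masked by a "not found" result.
	merged := make([]deleteResult, len(names))
	for i := range names {
		for p := 0; p < pools; p++ {
			r := byPool[p][i]
			if r.err == nil && r.found {
				merged[i] = r
				break
			}
			if merged[i].err == nil {
				merged[i] = r
			}
		}
	}

	for _, r := range merged {
		fmt.Printf("%s: found=%v err=%v\n", r.name, r.found, r.err)
	}
}

The merge mirrors the patch's loop: a pool reporting found short-circuits,
and until an error is recorded a later pool's result keeps overwriting the
slot, so a genuine quorum error from one pool is not hidden by another
pool's "not found".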