feat: add idempotent delete marker support (#15521)
The bottom line is that delete markers are a nuisance: most applications are not version aware, and delete markers have simply complicated version management. AWS S3's behavior adds unnecessary operational overhead for customers, who now have to manage these markers with ILM settings and clean them up on a regular basis. To make matters worse, in a replicated setup all of these delete markers get replicated as well, requiring ILM settings on both sites.

This PR addresses that by moving MinIO towards an idempotent delete marker implementation, i.e. MinIO will never create more than a single consecutive delete marker for an object. This significantly reduces operational overhead and keeps versioning useful for real data. It is a deliberate deviation from the S3 spec, made for pragmatic reasons.
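To make the new behavior concrete, here is a minimal, self-contained Go sketch of the rule this change enforces (hypothetical helper and types, not MinIO source): a DELETE without an explicit version ID only writes a new delete marker when the latest version is not already one; version-specific deletes are unaffected.

package main

import "fmt"

// objectVersion is a simplified stand-in for the latest version of an object.
type objectVersion struct {
	VersionID    string
	DeleteMarker bool
}

// shouldWriteDeleteMarker is a hypothetical helper illustrating the idempotent
// rule: version-specific deletes always proceed, but a plain DELETE never
// stacks a second consecutive delete marker on top of an existing one.
func shouldWriteDeleteMarker(latest objectVersion, requestedVersionID string) bool {
	if requestedVersionID != "" {
		// Deleting a specific version is unaffected by this change.
		return true
	}
	// Only create a new marker if the latest version is not already a marker.
	return !latest.DeleteMarker
}

func main() {
	latest := objectVersion{VersionID: "v1", DeleteMarker: false}
	fmt.Println(shouldWriteDeleteMarker(latest, "")) // true: first DELETE writes a marker

	latest = objectVersion{VersionID: "dm1", DeleteMarker: true}
	fmt.Println(shouldWriteDeleteMarker(latest, "")) // false: repeated DELETEs return the existing marker
}

In other words, issuing the same unversioned DELETE repeatedly leaves exactly one marker at the top of the version stack.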
@@ -380,19 +380,15 @@ func (z *erasureServerPools) getServerPoolsAvailableSpace(ctx context.Context, b
 	return serverPools
 }
 
-// poolObjInfo represents the state of an object per pool
-type poolObjInfo struct {
-	PoolIndex int
-	ObjInfo   ObjectInfo
-	Err       error
+// PoolObjInfo represents the state of current object version per pool
+type PoolObjInfo struct {
+	Index   int
+	ObjInfo ObjectInfo
+	Err     error
 }
 
-func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (idx int, err error) {
-	if z.SinglePool() {
-		return 0, nil
-	}
-
-	poolObjInfos := make([]poolObjInfo, len(z.serverPools))
+func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (PoolObjInfo, error) {
+	poolObjInfos := make([]PoolObjInfo, len(z.serverPools))
 	poolOpts := make([]ObjectOptions, len(z.serverPools))
 	for i := range z.serverPools {
 		poolOpts[i] = opts
@@ -404,8 +400,8 @@ func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, buc
 		go func(i int, pool *erasureSets, opts ObjectOptions) {
 			defer wg.Done()
 			// remember the pool index, we may sort the slice original index might be lost.
-			pinfo := poolObjInfo{
-				PoolIndex: i,
+			pinfo := PoolObjInfo{
+				Index: i,
 			}
 			// do not remove this check as it can lead to inconsistencies
 			// for all callers of bucket replication.
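For context, the function being edited here fans out to every pool concurrently and records one PoolObjInfo per pool. A rough, self-contained sketch of that fan-out pattern (hypothetical names and a simulated lookup, not the MinIO implementation):

package main

import (
	"fmt"
	"sync"
)

// poolResult is a hypothetical, simplified stand-in for PoolObjInfo.
type poolResult struct {
	Index int
	Found bool
	Err   error
}

// statAllPools queries every pool concurrently. Each goroutine writes only to
// its own slot of the pre-sized slice, so the results need no mutex.
func statAllPools(pools []string, object string) []poolResult {
	results := make([]poolResult, len(pools))
	var wg sync.WaitGroup
	wg.Add(len(pools))
	for i := range pools {
		go func(i int) {
			defer wg.Done()
			// Remember the pool index; if the slice is sorted later the
			// original position would otherwise be lost.
			results[i] = poolResult{
				Index: i,
				// Simulated lookup: pretend only the first pool holds the object.
				Found: i == 0 && object != "",
			}
		}(i)
	}
	wg.Wait()
	return results
}

func main() {
	fmt.Println(statAllPools([]string{"pool-0", "pool-1"}, "photos/cat.png"))
}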
@@ -429,19 +425,19 @@ func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, buc
 	for _, pinfo := range poolObjInfos {
 		// skip all objects from suspended pools if asked by the
 		// caller.
-		if z.IsSuspended(pinfo.PoolIndex) && opts.SkipDecommissioned {
+		if z.IsSuspended(pinfo.Index) && opts.SkipDecommissioned {
 			continue
 		}
 
 		if pinfo.Err != nil && !isErrObjectNotFound(pinfo.Err) {
-			return -1, pinfo.Err
+			return pinfo, pinfo.Err
 		}
 
 		if isErrObjectNotFound(pinfo.Err) {
 			// No object exists or its a delete marker,
 			// check objInfo to confirm.
 			if pinfo.ObjInfo.DeleteMarker && pinfo.ObjInfo.Name != "" {
-				return pinfo.PoolIndex, nil
+				return pinfo, nil
 			}
 
 			// objInfo is not valid, truly the object doesn't
@@ -449,10 +445,18 @@ func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, buc
 			continue
 		}
 
-		return pinfo.PoolIndex, nil
+		return pinfo, nil
 	}
 
-	return -1, toObjectErr(errFileNotFound, bucket, object)
+	return PoolObjInfo{}, toObjectErr(errFileNotFound, bucket, object)
 }
 
+func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (idx int, err error) {
+	pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, opts)
+	if err != nil {
+		return -1, err
+	}
+	return pinfo.Index, nil
+}
+
 // getPoolIdxExistingNoLock returns the (first) found object pool index containing an object.
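The loop above and the thin wrapper that preserves the old index-returning API boil down to: scan per-pool results in pool order, optionally skip suspended pools, treat "not found but the latest version is a delete marker" as a valid hit, and only report not-found when no pool matches. A self-contained sketch under those assumptions (hypothetical types, not the MinIO code):

package main

import (
	"errors"
	"fmt"
)

var errNotFound = errors.New("object not found")

// poolState is a hypothetical, simplified stand-in for PoolObjInfo.
type poolState struct {
	Index        int
	Suspended    bool // stand-in for z.IsSuspended(pinfo.Index)
	DeleteMarker bool
	Err          error
}

// pickPool walks per-pool results in pool order and returns the first usable one.
func pickPool(states []poolState, skipSuspended bool) (poolState, error) {
	for _, st := range states {
		if st.Suspended && skipSuspended {
			continue // ignore pools that are being decommissioned
		}
		if st.Err != nil && !errors.Is(st.Err, errNotFound) {
			return st, st.Err // real error: surface it immediately
		}
		if errors.Is(st.Err, errNotFound) {
			if st.DeleteMarker {
				// "Not found" because the latest version is a delete marker:
				// this pool still owns the object's version history.
				return st, nil
			}
			continue // truly absent in this pool, try the next one
		}
		return st, nil // object found in this pool
	}
	return poolState{}, errNotFound
}

func main() {
	states := []poolState{
		{Index: 0, Err: errNotFound},                     // nothing in pool 0
		{Index: 1, Err: errNotFound, DeleteMarker: true}, // pool 1 holds a delete marker
	}
	st, err := pickPool(states, true)
	fmt.Println(st.Index, err) // 1 <nil>
}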
@@ -999,16 +1003,36 @@ func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, ob
 	}
 
 	object = encodeDirObject(object)
-	if z.SinglePool() {
-		return z.serverPools[0].DeleteObject(ctx, bucket, object, opts)
-	}
-
-	idx, err := z.getPoolIdxExistingWithOpts(ctx, bucket, object, opts)
+	// Acquire a write lock before deleting the object.
+	lk := z.NewNSLock(bucket, object)
+	lkctx, err := lk.GetLock(ctx, globalDeleteOperationTimeout)
+	if err != nil {
+		return ObjectInfo{}, err
+	}
+	ctx = lkctx.Context()
+	defer lk.Unlock(lkctx.Cancel)
+
+	gopts := opts
+	gopts.NoLock = true
+	pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, gopts)
 	if err != nil {
+		switch err.(type) {
+		case InsufficientReadQuorum:
+			return objInfo, InsufficientWriteQuorum{}
+		}
 		return objInfo, err
 	}
 
-	return z.serverPools[idx].DeleteObject(ctx, bucket, object, opts)
+	// Delete marker already present we are not going to create new delete markers.
+	if pinfo.ObjInfo.DeleteMarker && opts.VersionID == "" {
+		pinfo.ObjInfo.Name = decodeDirObject(object)
+		return pinfo.ObjInfo, nil
+	}
+
+	objInfo, err = z.serverPools[pinfo.Index].DeleteObject(ctx, bucket, object, opts)
+	objInfo.Name = decodeDirObject(object)
+	return objInfo, err
 }
 
 func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
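From a client's perspective, the effect of the new DeleteObject path is that repeated unversioned deletes stop piling up markers. A hedged usage sketch using minio-go v7 (endpoint, bucket name, and credentials are placeholders; it assumes a versioned bucket and minio-go's versioned listing options):

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/minio/minio-go/v7"
	"github.com/minio/minio-go/v7/pkg/credentials"
)

func main() {
	ctx := context.Background()

	// Placeholder endpoint and credentials, for illustration only.
	client, err := minio.New("localhost:9000", &minio.Options{
		Creds: credentials.NewStaticV4("minioadmin", "minioadmin", ""),
	})
	if err != nil {
		log.Fatal(err)
	}

	// Issue the same unversioned delete twice against a versioned bucket.
	for i := 0; i < 2; i++ {
		if err := client.RemoveObject(ctx, "mybucket", "photos/cat.png", minio.RemoveObjectOptions{}); err != nil {
			log.Fatal(err)
		}
	}

	// List all versions: with this change MinIO keeps a single delete marker
	// at the top of the version stack instead of one marker per delete request.
	markers := 0
	for obj := range client.ListObjects(ctx, "mybucket", minio.ListObjectsOptions{
		Prefix:       "photos/cat.png",
		WithVersions: true,
	}) {
		if obj.Err != nil {
			log.Fatal(obj.Err)
		}
		if obj.IsDeleteMarker {
			markers++
		}
	}
	fmt.Println("delete markers:", markers) // expected: 1
}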
@@ -1034,14 +1058,6 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
 	ctx = lkctx.Context()
 	defer multiDeleteLock.Unlock(lkctx.Cancel)
 
-	if z.SinglePool() {
-		deleteObjects, dErrs := z.serverPools[0].DeleteObjects(ctx, bucket, objects, opts)
-		for i := range deleteObjects {
-			deleteObjects[i].ObjectName = decodeDirObject(deleteObjects[i].ObjectName)
-		}
-		return deleteObjects, dErrs
-	}
-
 	// Fetch location of up to 10 objects concurrently.
 	poolObjIdxMap := map[int][]ObjectToDelete{}
 	origIndexMap := map[int][]int{}
@@ -1060,46 +1076,66 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
 		j := j
 		obj := obj
 		eg.Go(func() error {
-			idx, err := z.getPoolIdxExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
+			pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
 				NoLock: true,
 			})
 			if err != nil {
 				derrs[j] = err
 				dobjects[j] = DeletedObject{
 					ObjectName: obj.ObjectName,
 				}
 				return nil
 			}
 
+			// Delete marker already present we are not going to create new delete markers.
+			if pinfo.ObjInfo.DeleteMarker && obj.VersionID == "" {
+				dobjects[j] = DeletedObject{
+					DeleteMarker:          pinfo.ObjInfo.DeleteMarker,
+					DeleteMarkerVersionID: pinfo.ObjInfo.VersionID,
+					DeleteMarkerMTime:     DeleteMarkerMTime{pinfo.ObjInfo.ModTime},
+					ObjectName:            pinfo.ObjInfo.Name,
+				}
+				return nil
+			}
+
+			idx := pinfo.Index
+
 			mu.Lock()
+			defer mu.Unlock()
+
 			poolObjIdxMap[idx] = append(poolObjIdxMap[idx], obj)
 			origIndexMap[idx] = append(origIndexMap[idx], j)
-			mu.Unlock()
 			return nil
 		}, j)
 	}
 
 	eg.Wait() // wait to check all the pools.
 
-	// Delete concurrently in all server pools.
-	var wg sync.WaitGroup
-	wg.Add(len(z.serverPools))
-	for idx, pool := range z.serverPools {
-		go func(idx int, pool *erasureSets) {
-			defer wg.Done()
-			objs := poolObjIdxMap[idx]
-			if len(objs) > 0 {
-				orgIndexes := origIndexMap[idx]
-				deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
-				mu.Lock()
-				for i, derr := range errs {
-					if derr != nil {
-						derrs[orgIndexes[i]] = derr
-					}
-					deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
-					dobjects[orgIndexes[i]] = deletedObjects[i]
-				}
-				mu.Unlock()
-			}
-		}(idx, pool)
-	}
-	wg.Wait()
+	if len(poolObjIdxMap) > 0 {
+		// Delete concurrently in all server pools.
+		var wg sync.WaitGroup
+		wg.Add(len(z.serverPools))
+		for idx, pool := range z.serverPools {
+			go func(idx int, pool *erasureSets) {
+				defer wg.Done()
+				objs := poolObjIdxMap[idx]
+				if len(objs) > 0 {
+					orgIndexes := origIndexMap[idx]
+					deletedObjects, errs := pool.DeleteObjects(ctx, bucket, objs, opts)
+					mu.Lock()
+					for i, derr := range errs {
+						if derr != nil {
+							derrs[orgIndexes[i]] = derr
+						}
+						deletedObjects[i].ObjectName = decodeDirObject(deletedObjects[i].ObjectName)
+						dobjects[orgIndexes[i]] = deletedObjects[i]
+					}
+					mu.Unlock()
+				}
+			}(idx, pool)
+		}
+		wg.Wait()
+	}
 
 	return dobjects, derrs
 }
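The DeleteObjects changes apply the same per-entry rule and keep the existing group-by-pool fan-out. A compact, self-contained sketch of that scatter/gather bookkeeping (hypothetical types and a simulated pool lookup, not the MinIO code): entries are bucketed by pool while remembering their original positions, each pool's batch is handled concurrently, and results are written back in the caller's original order.

package main

import (
	"fmt"
	"sync"
)

// deleteResult is a hypothetical per-entry outcome.
type deleteResult struct {
	Name string
	Err  error
}

// deleteAcrossPools groups entries by their owning pool, processes each group
// concurrently, and scatters the results back to the original positions.
func deleteAcrossPools(names []string, poolOf func(string) int, pools int) []deleteResult {
	results := make([]deleteResult, len(names))

	// Bucket entries per pool, remembering each entry's original index.
	perPool := map[int][]string{}
	origIdx := map[int][]int{}
	for j, name := range names {
		p := poolOf(name)
		perPool[p] = append(perPool[p], name)
		origIdx[p] = append(origIdx[p], j)
	}

	var (
		wg sync.WaitGroup
		mu sync.Mutex
	)
	wg.Add(pools)
	for p := 0; p < pools; p++ {
		go func(p int) {
			defer wg.Done()
			objs := perPool[p]
			if len(objs) == 0 {
				return
			}
			// A real implementation would issue one batch delete per pool here.
			mu.Lock()
			for i, name := range objs {
				results[origIdx[p][i]] = deleteResult{Name: name}
			}
			mu.Unlock()
		}(p)
	}
	wg.Wait()
	return results
}

func main() {
	names := []string{"a.txt", "b.txt", "c.txt"}
	byFirstByte := func(s string) int { return int(s[0]) % 2 }
	fmt.Println(deleteAcrossPools(names, byFirstByte, 2))
}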