fix: allow DeleteObject on unversioned objects with insufficient read quorum (#19581)

Since the object is being permanently deleted, the lack of read quorum should not
matter as long as enough disks are online to complete the deletion while still
meeting the parity requirements.
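To illustrate the gate this change introduces in the erasure delete path, here is a minimal, self-contained Go sketch; the helper name and parameters are illustrative stand-ins, not the actual MinIO API. It mirrors the check added in the diff below (countOnlineDisks(storageDisks) < len(storageDisks)/2+1): an unversioned delete may proceed despite a read-quorum failure only when a simple majority of drives, i.e. write quorum, is online.

package main

import "fmt"

// canDeleteDespiteReadQuorumLoss is an illustrative stand-in for the new gate in
// erasureObjects.DeleteObject: an unversioned delete may proceed even when read
// quorum is lost, provided a simple majority of drives (write quorum) is online.
// The parameters are simplified; the real code inspects ObjectOptions and storageDisks.
func canDeleteDespiteReadQuorumLoss(versioned, versionSuspended bool, onlineDisks, totalDisks int) bool {
	if versioned || versionSuspended {
		return false // versioned buckets keep the old behavior and still fail
	}
	// mirrors: countOnlineDisks(storageDisks) >= len(storageDisks)/2+1
	return onlineDisks >= totalDisks/2+1
}

func main() {
	fmt.Println(canDeleteDespiteReadQuorumLoss(false, false, 9, 16)) // true: write quorum holds
	fmt.Println(canDeleteDespiteReadQuorumLoss(false, false, 7, 16)) // false: below majority
	fmt.Println(canDeleteDespiteReadQuorumLoss(true, false, 16, 16)) // false: versioned bucket
}

When this gate is not satisfied, the object is queued for an MRF heal and the caller still receives InsufficientWriteQuorum, as before this change.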

If several pools hold the same object with insufficient read quorum, attempt to
delete the object from all the pools where it exists.
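The fan-out itself is implemented in deleteObjectFromAllPools further down: only pools that reported either no error or a read-quorum error of type RQInconsistentMeta take part, while pools lacking enough online drives are marked with InsufficientWriteQuorum and skipped. The following condensed Go sketch, using simplified stand-in types rather than the real erasureServerPools API, shows the shape of that concurrency.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// deleteFn stands in for a single pool's DeleteObject call in this sketch.
type deleteFn func(object string) error

// deleteFromAllPools condenses the approach of deleteObjectFromAllPools below:
// issue the delete to every candidate pool concurrently, wait for all of them,
// and surface the outcome of the first pool (the list is assumed to be sorted
// by object ModTime, newest first).
func deleteFromAllPools(object string, pools []deleteFn) error {
	if len(pools) == 0 {
		return errors.New("no pools hold the object")
	}
	results := make([]error, len(pools))
	var wg sync.WaitGroup
	for i, del := range pools {
		wg.Add(1)
		go func(i int, del deleteFn) {
			defer wg.Done()
			results[i] = del(object)
		}(i, del)
	}
	wg.Wait()
	// only the newest pool's result is reported; deletes on stale pools are best effort
	return results[0]
}

func main() {
	pools := []deleteFn{
		func(string) error { return nil },                            // pool with the latest copy
		func(string) error { return errors.New("read quorum lost") }, // stale pool, best effort
	}
	fmt.Println(deleteFromAllPools("bucket/object", pools)) // <nil>
}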
Author: Poorna, 2024-04-25 17:31:12 -07:00 (committed by GitHub)
parent c54ffde568
commit e7aa26dc29
7 changed files with 123 additions and 27 deletions


@@ -275,7 +275,7 @@ func (fi FileInfo) ObjectToPartOffset(ctx context.Context, offset int64) (partIn
 func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.Time, etag string, quorum int) (FileInfo, error) {
 	// with less quorum return error.
 	if quorum < 1 {
-		return FileInfo{}, errErasureReadQuorum
+		return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInsufficientOnlineDrives}
 	}
 	metaHashes := make([]string, len(metaArr))
 	h := sha256.New()
@@ -341,7 +341,7 @@ func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.
 	}

 	if maxCount < quorum {
-		return FileInfo{}, errErasureReadQuorum
+		return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInconsistentMeta}
 	}

 	// Find the successor mod time in quorum, otherwise leave the
@@ -377,7 +377,7 @@ func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time.
 		}
 		return candidate, nil
 	}
-	return FileInfo{}, errErasureReadQuorum
+	return FileInfo{}, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInconsistentMeta}
 }

 // pickValidFileInfo - picks one valid FileInfo content and returns from a
@@ -498,7 +498,7 @@ func objectQuorumFromMeta(ctx context.Context, partsMetaData []FileInfo, errs []
 	parities := listObjectParities(partsMetaData, errs)
 	parityBlocks := commonParity(parities, defaultParityCount)
 	if parityBlocks < 0 {
-		return -1, -1, errErasureReadQuorum
+		return -1, -1, InsufficientReadQuorum{Err: errErasureReadQuorum, Type: RQInsufficientOnlineDrives}
 	}
 	dataBlocks := len(partsMetaData) - parityBlocks


@@ -210,13 +210,13 @@ func TestFindFileInfoInQuorum(t *testing.T) {
 		{
 			fis: getNFInfo(16, 7, 1603863445, "36a21454-a2ca-11eb-bbaa-93a81c686f21", nil),
 			modTime: time.Unix(1603863445, 0),
-			expectedErr: errErasureReadQuorum,
+			expectedErr: InsufficientReadQuorum{},
 			expectedQuorum: 8,
 		},
 		{
 			fis: getNFInfo(16, 16, 1603863445, "36a21454-a2ca-11eb-bbaa-93a81c686f21", nil),
 			modTime: time.Unix(1603863445, 0),
-			expectedErr: errErasureReadQuorum,
+			expectedErr: InsufficientReadQuorum{},
 			expectedQuorum: 0,
 		},
 		{
@@ -241,7 +241,9 @@ func TestFindFileInfoInQuorum(t *testing.T) {
 		test := test
 		t.Run("", func(t *testing.T) {
 			fi, err := findFileInfoInQuorum(context.Background(), test.fis, test.modTime, "", test.expectedQuorum)
-			if err != test.expectedErr {
+			_, ok1 := err.(InsufficientReadQuorum)
+			_, ok2 := test.expectedErr.(InsufficientReadQuorum)
+			if ok1 != ok2 {
 				t.Errorf("Expected %s, got %s", test.expectedErr, err)
 			}
 			if test.succmodTimes != nil {


@@ -1912,23 +1912,26 @@ func (er erasureObjects) DeleteObject(ctx context.Context, bucket, object string
 	versionFound := true
 	objInfo = ObjectInfo{VersionID: opts.VersionID} // version id needed in Delete API response.
 	goi, _, gerr := er.getObjectInfoAndQuorum(ctx, bucket, object, opts)
+	tryDel := false
 	if gerr != nil && goi.Name == "" {
 		if _, ok := gerr.(InsufficientReadQuorum); ok {
-			// Add an MRF heal for next time.
-			er.addPartial(bucket, object, opts.VersionID)
-
-			return objInfo, InsufficientWriteQuorum{}
+			if opts.Versioned || opts.VersionSuspended || countOnlineDisks(storageDisks) < len(storageDisks)/2+1 {
+				// Add an MRF heal for next time.
+				er.addPartial(bucket, object, opts.VersionID)
+				return objInfo, InsufficientWriteQuorum{}
+			}
+			tryDel = true // only for unversioned objects if there is write quorum
 		}
 		// For delete marker replication, versionID being replicated will not exist on disk
 		if opts.DeleteMarker {
 			versionFound = false
-		} else {
+		} else if !tryDel {
 			return objInfo, gerr
 		}
 	}

 	if opts.EvalMetadataFn != nil {
-		dsc, err := opts.EvalMetadataFn(&goi, err)
+		dsc, err := opts.EvalMetadataFn(&goi, gerr)
 		if err != nil {
 			return ObjectInfo{}, err
 		}


@@ -458,7 +458,13 @@ type PoolObjInfo struct {
 	Err error
 }

-func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (PoolObjInfo, error) {
+type poolErrs struct {
+	Index int
+	Err   error
+}
+
+func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (PoolObjInfo, []poolErrs, error) {
+	var noReadQuorumPools []poolErrs
 	poolObjInfos := make([]PoolObjInfo, len(z.serverPools))
 	poolOpts := make([]ObjectOptions, len(z.serverPools))
 	for i := range z.serverPools {
@@ -508,8 +514,9 @@ func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bu
 		}
 		if pinfo.Err == nil {
 			// found a pool
-			return pinfo, nil
+			return pinfo, z.poolsWithObject(poolObjInfos, opts), nil
 		}
+
 		if isErrReadQuorum(pinfo.Err) && !opts.MetadataChg {
 			// read quorum is returned when the object is visibly
 			// present but its unreadable, we simply ask the writes to
@@ -518,30 +525,49 @@ func (z *erasureServerPools) getPoolInfoExistingWithOpts(ctx context.Context, bu
 			// with enough disks online but sufficiently inconsistent to
 			// break parity threshold, allow them to be overwritten
 			// or allow new versions to be added.
-			return pinfo, nil
+			return pinfo, z.poolsWithObject(poolObjInfos, opts), nil
 		}
 		defPool = pinfo
 		if !isErrObjectNotFound(pinfo.Err) {
-			return pinfo, pinfo.Err
+			return pinfo, noReadQuorumPools, pinfo.Err
 		}
 		// No object exists or its a delete marker,
 		// check objInfo to confirm.
 		if pinfo.ObjInfo.DeleteMarker && pinfo.ObjInfo.Name != "" {
-			return pinfo, nil
+			return pinfo, noReadQuorumPools, nil
 		}
 	}

 	if opts.ReplicationRequest && opts.DeleteMarker && defPool.Index >= 0 {
 		// If the request is a delete marker replication request, return a default pool
 		// in cases where the object does not exist.
 		// This is to ensure that the delete marker is replicated to the destination.
-		return defPool, nil
+		return defPool, noReadQuorumPools, nil
 	}
-	return PoolObjInfo{}, toObjectErr(errFileNotFound, bucket, object)
+	return PoolObjInfo{}, noReadQuorumPools, toObjectErr(errFileNotFound, bucket, object)
+}
+
+// return all pools with read quorum error or no error for an object with given opts.Note that this array is
+// returned in the order of recency of object ModTime.
+func (z *erasureServerPools) poolsWithObject(pools []PoolObjInfo, opts ObjectOptions) (errs []poolErrs) {
+	for _, pool := range pools {
+		if opts.SkipDecommissioned && z.IsSuspended(pool.Index) {
+			continue
+		}
+		// Skip object if it's from pools participating in a rebalance operation.
+		if opts.SkipRebalancing && z.IsPoolRebalancing(pool.Index) {
+			continue
+		}
+		if isErrReadQuorum(pool.Err) || pool.Err == nil {
+			errs = append(errs, poolErrs{Err: pool.Err, Index: pool.Index})
+		}
+	}
+	return errs
 }

 func (z *erasureServerPools) getPoolIdxExistingWithOpts(ctx context.Context, bucket, object string, opts ObjectOptions) (idx int, err error) {
-	pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, opts)
+	pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, opts)
 	if err != nil {
 		return -1, err
 	}
@@ -1082,7 +1108,8 @@ func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, ob
 	gopts := opts
 	gopts.NoLock = true

-	pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, gopts)
+	pinfo, noReadQuorumPools, err := z.getPoolInfoExistingWithOpts(ctx, bucket, object, gopts)
 	if err != nil {
 		if _, ok := err.(InsufficientReadQuorum); ok {
 			return objInfo, InsufficientWriteQuorum{}
@@ -1096,11 +1123,44 @@ func (z *erasureServerPools) DeleteObject(ctx context.Context, bucket string, ob
 		return pinfo.ObjInfo, nil
 	}

+	// Delete concurrently in all server pools with read quorum error for unversioned objects.
+	if len(noReadQuorumPools) > 0 && !opts.Versioned && !opts.VersionSuspended {
+		return z.deleteObjectFromAllPools(ctx, bucket, object, opts, noReadQuorumPools)
+	}
 	objInfo, err = z.serverPools[pinfo.Index].DeleteObject(ctx, bucket, object, opts)
 	objInfo.Name = decodeDirObject(object)
 	return objInfo, err
 }

+func (z *erasureServerPools) deleteObjectFromAllPools(ctx context.Context, bucket string, object string, opts ObjectOptions, poolIndices []poolErrs) (objInfo ObjectInfo, err error) {
+	derrs := make([]error, len(poolIndices))
+	dobjects := make([]ObjectInfo, len(poolIndices))
+
+	// Delete concurrently in all server pools that reported no error or read quorum error
+	// where the read quorum issue is from metadata inconsistency.
+	var wg sync.WaitGroup
+	for idx, pe := range poolIndices {
+		if v, ok := pe.Err.(InsufficientReadQuorum); ok && v.Type != RQInconsistentMeta {
+			derrs[idx] = InsufficientWriteQuorum{}
+			continue
+		}
+		wg.Add(1)
+		pool := z.serverPools[pe.Index]
+		go func(idx int, pool *erasureSets) {
+			defer wg.Done()
+			dobjects[idx], derrs[idx] = pool.DeleteObject(ctx, bucket, object, opts)
+		}(idx, pool)
+	}
+	wg.Wait()
+
+	// the poolIndices array is pre-sorted in order of latest ModTime, we care only about pool with latest object though
+	// the delete call tries to clean up other pools during DeleteObject call.
+	objInfo = dobjects[0]
+	objInfo.Name = decodeDirObject(object)
+	err = derrs[0]
+	return objInfo, err
+}
+
 func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, objects []ObjectToDelete, opts ObjectOptions) ([]DeletedObject, []error) {
 	derrs := make([]error, len(objects))
 	dobjects := make([]DeletedObject, len(objects))
@@ -1142,7 +1202,7 @@ func (z *erasureServerPools) DeleteObjects(ctx context.Context, bucket string, o
 			j := j
 			obj := obj
 			eg.Go(func() error {
-				pinfo, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
+				pinfo, _, err := z.getPoolInfoExistingWithOpts(ctx, bucket, obj.ObjectName, ObjectOptions{
 					NoLock: true,
 				})
 				if err != nil {


@@ -27,13 +27,13 @@ import (
 // Converts underlying storage error. Convenience function written to
 // handle all cases where we have known types of errors returned by
 // underlying storage layer.
-func toObjectErr(err error, params ...string) error {
-	if err == nil {
+func toObjectErr(oerr error, params ...string) error {
+	if oerr == nil {
 		return nil
 	}

 	// Unwarp the error first
-	err = unwrapAll(err)
+	err := unwrapAll(oerr)

 	if err == context.Canceled {
 		return context.Canceled
@@ -157,6 +157,9 @@ func toObjectErr(err error, params ...string) error {
 		if len(params) >= 2 {
 			apiErr.Object = decodeDirObject(params[1])
 		}
+		if v, ok := oerr.(InsufficientReadQuorum); ok {
+			apiErr.Type = v.Type
+		}
 		return apiErr
 	case errErasureWriteQuorum.Error():
 		apiErr := InsufficientWriteQuorum{}
@@ -201,8 +204,34 @@ func (e SlowDown) Error() string {
 	return "Please reduce your request rate"
 }

+// RQErrType reason for read quorum error.
+type RQErrType int
+
+const (
+	// RQInsufficientOnlineDrives - not enough online drives.
+	RQInsufficientOnlineDrives RQErrType = 1 << iota
+	// RQInconsistentMeta - inconsistent metadata.
+	RQInconsistentMeta
+)
+
+func (t RQErrType) String() string {
+	switch t {
+	case RQInsufficientOnlineDrives:
+		return "InsufficientOnlineDrives"
+	case RQInconsistentMeta:
+		return "InconsistentMeta"
+	default:
+		return "Unknown"
+	}
+}
+
 // InsufficientReadQuorum storage cannot satisfy quorum for read operation.
-type InsufficientReadQuorum GenericError
+type InsufficientReadQuorum struct {
+	Bucket string
+	Object string
+	Err    error
+	Type   RQErrType
+}

 func (e InsufficientReadQuorum) Error() string {
 	return "Storage resources are insufficient for the read operation " + e.Bucket + "/" + e.Object

cmd/testdata/undeleteable-object.tgz (new vendored binary test fixture; contents not shown)


@@ -80,6 +80,8 @@ export MC_HOST_sited=http://minio:minio123@127.0.0.1:9008

 ## Setup site replication
 ./mc admin replicate add sitea siteb --replicate-ilm-expiry
+
+sleep 10s

 ## Add warm tier
 ./mc ilm tier add minio sitea WARM-TIER --endpoint http://localhost:9006 --access-key minio --secret-key minio123 --bucket bucket