upon RenameData() quorum error delete any partial success (#18586)

There is potential for danglingWrites when write quorum fails and
only some drives took a successful write. Generally this is left
to the healing routine to pick up. However, it is better that
we delete them right away to avoid potential quorum issues on the
version signature when there are many versions of an object.
This commit is contained in:
Harshavardhana 2023-12-04 11:33:39 -08:00 committed by GitHub
parent e7c144eeac
commit 8fdfcfb562
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 1 deletions

View File

@ -1036,6 +1036,17 @@ func renameData(ctx context.Context, disks []StorageAPI, srcBucket, srcEntry str
var versionsDisparity bool var versionsDisparity bool
err := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum) err := reduceWriteQuorumErrs(ctx, errs, objectOpIgnoredErrs, writeQuorum)
if err != nil {
for index, nerr := range errs {
// We are about to return an error, so attempt to delete the write
// that succeeded on some of the drives. If we cannot, we do not have
// to notify the caller; this dangling object will then be scheduled
// for removal via active healing.
if nerr == nil {
disks[index].DeleteVersion(ctx, dstBucket, dstEntry, metadata[index], false)
}
}
}
if err == nil { if err == nil {
versions := reduceCommonVersions(diskVersions, writeQuorum) versions := reduceCommonVersions(diskVersions, writeQuorum)
for index, dversions := range diskVersions { for index, dversions := range diskVersions {

View File

@ -97,7 +97,7 @@ func (m *mrfState) healRoutine() {
// let recently failed networks to reconnect // let recently failed networks to reconnect
// making MRF wait for 1s before retrying, // making MRF wait for 1s before retrying,
// i.e 4 reconnect attempts. // i.e 4 reconnect attempts.
time.Sleep(1 * time.Second) time.Sleep(time.Second)
} }
// wait on timer per heal // wait on timer per heal