fix: make sure failed requests only to failed queue (#12196)

failed queue should be used for retried requests to
avoid cascading the failures into incoming queue, this
would allow for a more fair retry for failed replicas.

Additionally also avoid taking context in queue task
to avoid confusion, simplifies its usage.
This commit is contained in:
Harshavardhana 2021-04-29 18:20:39 -07:00 committed by GitHub
parent 90112b5644
commit 0faa4e6187
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 43 additions and 40 deletions

View File

@ -799,7 +799,7 @@ func replicateObject(ctx context.Context, ri ReplicateObjectInfo, objectAPI Obje
if replicationStatus != replication.Completed && ri.RetryCount < 1 {
ri.OpType = replication.HealReplicationType
ri.RetryCount++
globalReplicationPool.queueReplicaTask(ctx, ri)
globalReplicationPool.queueReplicaFailedTask(ri)
}
}
@ -843,7 +843,6 @@ type ReplicationPool struct {
ctx context.Context
mrfWorkerKillCh chan struct{}
workerKillCh chan struct{}
mrfReplicaDeleteCh chan DeletedObjectVersionInfo
replicaCh chan ReplicateObjectInfo
replicaDeleteCh chan DeletedObjectVersionInfo
mrfReplicaCh chan ReplicateObjectInfo
@ -858,10 +857,9 @@ type ReplicationPool struct {
// NewReplicationPool creates a pool of replication workers of specified size
func NewReplicationPool(ctx context.Context, o ObjectLayer, opts replicationPoolOpts) *ReplicationPool {
pool := &ReplicationPool{
replicaCh: make(chan ReplicateObjectInfo, 10000),
replicaDeleteCh: make(chan DeletedObjectVersionInfo, 10000),
replicaCh: make(chan ReplicateObjectInfo, 100000),
replicaDeleteCh: make(chan DeletedObjectVersionInfo, 100000),
mrfReplicaCh: make(chan ReplicateObjectInfo, 100000),
mrfReplicaDeleteCh: make(chan DeletedObjectVersionInfo, 100000),
ctx: ctx,
objLayer: o,
}
@ -882,11 +880,6 @@ func (p *ReplicationPool) AddMRFWorker() {
return
}
replicateObject(p.ctx, oi, p.objLayer)
case doi, ok := <-p.mrfReplicaDeleteCh:
if !ok {
return
}
replicateDelete(p.ctx, doi, p.objLayer)
}
}
}
@ -947,36 +940,46 @@ func (p *ReplicationPool) ResizeFailedWorkers(n int) {
}
}
func (p *ReplicationPool) queueReplicaTask(ctx context.Context, ri ReplicateObjectInfo) {
func (p *ReplicationPool) queueReplicaFailedTask(ri ReplicateObjectInfo) {
if p == nil {
return
}
select {
case <-ctx.Done():
case <-GlobalContext.Done():
p.once.Do(func() {
close(p.replicaCh)
close(p.mrfReplicaCh)
})
case p.mrfReplicaCh <- ri:
default:
}
}
func (p *ReplicationPool) queueReplicaTask(ri ReplicateObjectInfo) {
if p == nil {
return
}
select {
case <-GlobalContext.Done():
p.once.Do(func() {
close(p.replicaCh)
close(p.mrfReplicaCh)
})
case p.replicaCh <- ri:
case p.mrfReplicaCh <- ri:
// queue all overflows into the mrfReplicaCh to handle incoming pending/failed operations
default:
}
}
func (p *ReplicationPool) queueReplicaDeleteTask(ctx context.Context, doi DeletedObjectVersionInfo) {
func (p *ReplicationPool) queueReplicaDeleteTask(doi DeletedObjectVersionInfo) {
if p == nil {
return
}
select {
case <-ctx.Done():
case <-GlobalContext.Done():
p.once.Do(func() {
close(p.replicaDeleteCh)
close(p.mrfReplicaDeleteCh)
})
case p.replicaDeleteCh <- doi:
case p.mrfReplicaDeleteCh <- doi:
// queue all overflows into the mrfReplicaDeleteCh to handle incoming pending/failed operations
default:
}
}
@ -1134,7 +1137,7 @@ func scheduleReplication(ctx context.Context, objInfo ObjectInfo, o ObjectLayer,
if sync {
replicateObject(ctx, ReplicateObjectInfo{ObjectInfo: objInfo, OpType: opType}, o)
} else {
globalReplicationPool.queueReplicaTask(GlobalContext, ReplicateObjectInfo{ObjectInfo: objInfo, OpType: opType})
globalReplicationPool.queueReplicaTask(ReplicateObjectInfo{ObjectInfo: objInfo, OpType: opType})
}
if sz, err := objInfo.GetActualSize(); err == nil {
globalReplicationStats.Update(objInfo.Bucket, sz, objInfo.ReplicationStatus, replication.StatusType(""), opType)
@ -1142,6 +1145,6 @@ func scheduleReplication(ctx context.Context, objInfo ObjectInfo, o ObjectLayer,
}
func scheduleReplicationDelete(ctx context.Context, dv DeletedObjectVersionInfo, o ObjectLayer, sync bool) {
globalReplicationPool.queueReplicaDeleteTask(GlobalContext, dv)
globalReplicationPool.queueReplicaDeleteTask(dv)
globalReplicationStats.Update(dv.Bucket, 0, replication.Pending, replication.StatusType(""), replication.DeleteReplicationType)
}

View File

@ -1136,11 +1136,11 @@ func (i *scannerItem) healReplication(ctx context.Context, o ObjectLayer, oi Obj
case replication.Pending:
sizeS.pendingCount++
sizeS.pendingSize += oi.Size
globalReplicationPool.queueReplicaTask(ctx, ReplicateObjectInfo{ObjectInfo: oi, OpType: replication.HealReplicationType})
globalReplicationPool.queueReplicaTask(ReplicateObjectInfo{ObjectInfo: oi, OpType: replication.HealReplicationType})
case replication.Failed:
sizeS.failedSize += oi.Size
sizeS.failedCount++
globalReplicationPool.queueReplicaTask(ctx, ReplicateObjectInfo{ObjectInfo: oi, OpType: replication.HealReplicationType})
globalReplicationPool.queueReplicaTask(ReplicateObjectInfo{ObjectInfo: oi, OpType: replication.HealReplicationType})
case replication.Completed, "COMPLETE":
sizeS.replicatedSize += oi.Size
case replication.Replica:
@ -1159,7 +1159,7 @@ func (i *scannerItem) healReplicationDeletes(ctx context.Context, o ObjectLayer,
} else {
versionID = oi.VersionID
}
globalReplicationPool.queueReplicaDeleteTask(ctx, DeletedObjectVersionInfo{
globalReplicationPool.queueReplicaDeleteTask(DeletedObjectVersionInfo{
DeletedObject: DeletedObject{
ObjectName: oi.Name,
DeleteMarkerVersionID: dmVersionID,