Various improvements in replication (#11949)

- collect real time replication metrics for prometheus.
- add pending_count, failed_count metric for total pending/failed replication operations.

- add API to get replication metrics

- add MRF worker to handle spill-over replication operations

- multiple issues found with replication
- fixes an issue when client sends a bucket
 name with `/` at the end from SetRemoteTarget
 API call make sure to trim the bucket name to 
 avoid any extra `/`.

- hold write locks in GetObjectNInfo during replication
  to ensure that object version stack is not overwritten
  while reading the content.

- add additional protection during WriteMetadata() to
  ensure that we always write a valid FileInfo{} and avoid
  ever writing empty FileInfo{} to the lowest layers.

Co-authored-by: Poorna Krishnamoorthy <poorna@minio.io>
Co-authored-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
Poorna Krishnamoorthy
2021-04-03 09:03:42 -07:00
committed by GitHub
parent dca7cf7200
commit 47c09a1e6f
36 changed files with 1914 additions and 496 deletions

View File

@@ -134,7 +134,7 @@ func runDataScanner(ctx context.Context, objAPI ObjectLayer) {
}
// Wait before starting next cycle and wait on startup.
results := make(chan DataUsageInfo, 1)
results := make(chan madmin.DataUsageInfo, 1)
go storeDataUsageInBackend(ctx, objAPI, results)
bf, err := globalNotificationSys.updateBloomFilter(ctx, nextBloomCycle)
logger.LogIf(ctx, err)
@@ -790,6 +790,8 @@ type sizeSummary struct {
pendingSize int64
failedSize int64
replicaSize int64
pendingCount uint64
failedCount uint64
}
type getSizeFn func(item scannerItem) (sizeSummary, error)
@@ -1105,11 +1107,13 @@ func (i *scannerItem) healReplication(ctx context.Context, o ObjectLayer, oi Obj
}
switch oi.ReplicationStatus {
case replication.Pending:
sizeS.pendingCount++
sizeS.pendingSize += oi.Size
globalReplicationPool.queueReplicaTask(oi)
globalReplicationPool.queueReplicaTask(ctx, oi)
case replication.Failed:
sizeS.failedSize += oi.Size
globalReplicationPool.queueReplicaTask(oi)
sizeS.failedCount++
globalReplicationPool.queueReplicaTask(ctx, oi)
case replication.Completed, "COMPLETE":
sizeS.replicatedSize += oi.Size
case replication.Replica:
@@ -1128,7 +1132,7 @@ func (i *scannerItem) healReplicationDeletes(ctx context.Context, o ObjectLayer,
} else {
versionID = oi.VersionID
}
globalReplicationPool.queueReplicaDeleteTask(DeletedObjectVersionInfo{
globalReplicationPool.queueReplicaDeleteTask(ctx, DeletedObjectVersionInfo{
DeletedObject: DeletedObject{
ObjectName: oi.Name,
DeleteMarkerVersionID: dmVersionID,