Fixes to replication metrics (#13493)

Corrects how ReplicaSize is reported and how the initial
replication metrics are loaded.
This commit is contained in:
Poorna K 2021-10-21 21:52:55 -04:00 committed by GitHub
parent 52c5f6e152
commit e7f559c582
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 75 deletions

View File

@ -21,6 +21,7 @@ import (
"context" "context"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time"
"github.com/minio/minio/internal/bucket/replication" "github.com/minio/minio/internal/bucket/replication"
) )
@ -67,7 +68,7 @@ func (r *ReplicationStats) UpdateReplicaStat(bucket string, n int64) {
if !ok { if !ok {
bs = &BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)} bs = &BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
} }
atomic.StoreInt64(&bs.ReplicaSize, n) atomic.AddInt64(&bs.ReplicaSize, n)
r.Cache[bucket] = bs r.Cache[bucket] = bs
} }
@ -122,44 +123,13 @@ func (r *ReplicationStats) GetInitialUsage(bucket string) BucketReplicationStats
if r == nil { if r == nil {
return BucketReplicationStats{} return BucketReplicationStats{}
} }
r.ulock.RLock() r.ulock.RLock()
defer r.ulock.RUnlock()
brs := BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
st, ok := r.UsageCache[bucket] st, ok := r.UsageCache[bucket]
if ok { if ok {
return st.Clone() return st.Clone()
} }
r.ulock.RUnlock() return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
dataUsageInfo, err := loadDataUsageFromBackend(GlobalContext, newObjectLayerFn())
if err != nil {
return brs
}
// data usage has not captured any data yet.
if dataUsageInfo.LastUpdate.IsZero() {
return brs
}
usage, ok := dataUsageInfo.BucketsUsage[bucket]
if ok && usage.ReplicationInfo != nil {
brs.ReplicaSize = int64(usage.ReplicaSize)
for arn, uinfo := range usage.ReplicationInfo {
brs.Stats[arn] = &BucketReplicationStat{
FailedSize: int64(uinfo.ReplicationFailedSize),
ReplicatedSize: int64(uinfo.ReplicatedSize),
ReplicaSize: int64(uinfo.ReplicaSize),
FailedCount: int64(uinfo.ReplicationFailedCount),
}
}
if brs.hasReplicationUsage() {
r.ulock.Lock()
defer r.ulock.Unlock()
r.UsageCache[bucket] = &brs
}
}
return brs
} }
// Get replication metrics for a bucket from this node since this node came up. // Get replication metrics for a bucket from this node since this node came up.
@ -180,22 +150,31 @@ func (r *ReplicationStats) Get(bucket string) BucketReplicationStats {
// NewReplicationStats initialize in-memory replication statistics // NewReplicationStats initialize in-memory replication statistics
func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *ReplicationStats { func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *ReplicationStats {
st := &ReplicationStats{ return &ReplicationStats{
Cache: make(map[string]*BucketReplicationStats), Cache: make(map[string]*BucketReplicationStats),
UsageCache: make(map[string]*BucketReplicationStats), UsageCache: make(map[string]*BucketReplicationStats),
} }
}
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objectAPI) // load replication metrics at cluster start from initial data usage
func (r *ReplicationStats) loadInitialReplicationMetrics(ctx context.Context) {
rTimer := time.NewTimer(time.Minute * 1)
defer rTimer.Stop()
for {
select {
case <-ctx.Done():
return
case <-rTimer.C:
dui, err := loadDataUsageFromBackend(GlobalContext, newObjectLayerFn())
if err != nil { if err != nil {
return st continue
} }
// data usage has not captured any data yet. // data usage has not captured any data yet.
if dataUsageInfo.LastUpdate.IsZero() { if dui.LastUpdate.IsZero() {
return st continue
} }
m := make(map[string]*BucketReplicationStats)
for bucket, usage := range dataUsageInfo.BucketsUsage { for bucket, usage := range dui.BucketsUsage {
b := &BucketReplicationStats{ b := &BucketReplicationStats{
Stats: make(map[string]*BucketReplicationStat, len(usage.ReplicationInfo)), Stats: make(map[string]*BucketReplicationStat, len(usage.ReplicationInfo)),
} }
@ -209,9 +188,13 @@ func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *Replicatio
} }
b.ReplicaSize += int64(usage.ReplicaSize) b.ReplicaSize += int64(usage.ReplicaSize)
if b.hasReplicationUsage() { if b.hasReplicationUsage() {
st.UsageCache[bucket] = b m[bucket] = b
}
}
r.ulock.Lock()
defer r.ulock.Unlock()
r.UsageCache = m
return
} }
} }
return st
} }

View File

@ -1497,6 +1497,7 @@ func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) {
FailedWorkers: globalAPIConfig.getReplicationFailedWorkers(), FailedWorkers: globalAPIConfig.getReplicationFailedWorkers(),
}) })
globalReplicationStats = NewReplicationStats(ctx, objectAPI) globalReplicationStats = NewReplicationStats(ctx, objectAPI)
go globalReplicationStats.loadInitialReplicationMetrics(ctx)
} }
// get Reader from replication target if active-active replication is in place and // get Reader from replication target if active-active replication is in place and

View File

@ -442,8 +442,8 @@ func getLatestReplicationStats(bucket string, u BucketUsageInfo) (s BucketReplic
for _, bucketStat := range bucketStats { for _, bucketStat := range bucketStats {
totReplicaSize += bucketStat.ReplicationStats.ReplicaSize totReplicaSize += bucketStat.ReplicationStats.ReplicaSize
for arn, stat := range bucketStat.ReplicationStats.Stats { for arn, stat := range bucketStat.ReplicationStats.Stats {
oldst, ok := stats[arn] oldst := stats[arn]
if !ok { if oldst == nil {
oldst = &BucketReplicationStat{} oldst = &BucketReplicationStat{}
} }
stats[arn] = &BucketReplicationStat{ stats[arn] = &BucketReplicationStat{
@ -459,8 +459,8 @@ func getLatestReplicationStats(bucket string, u BucketUsageInfo) (s BucketReplic
if usageStat.Stats != nil { if usageStat.Stats != nil {
totReplicaSize += usageStat.ReplicaSize totReplicaSize += usageStat.ReplicaSize
for arn, stat := range usageStat.Stats { for arn, stat := range usageStat.Stats {
st, ok := stats[arn] st := stats[arn]
if !ok { if st == nil {
st = &BucketReplicationStat{ st = &BucketReplicationStat{
ReplicatedSize: stat.ReplicatedSize, ReplicatedSize: stat.ReplicatedSize,
FailedSize: stat.FailedSize, FailedSize: stat.FailedSize,
@ -484,13 +484,13 @@ func getLatestReplicationStats(bucket string, u BucketUsageInfo) (s BucketReplic
// normalize computed real time stats with latest usage stat // normalize computed real time stats with latest usage stat
for arn, tgtstat := range stats { for arn, tgtstat := range stats {
st := BucketReplicationStat{} st := BucketReplicationStat{}
bu, ok := usageStat.Stats[arn] bu, ok := u.ReplicationInfo[arn]
if !ok { if !ok {
bu = &BucketReplicationStat{} bu = BucketTargetUsageInfo{}
} }
// use in memory replication stats if it is ahead of usage info. // use in memory replication stats if it is ahead of usage info.
st.ReplicatedSize = bu.ReplicatedSize st.ReplicatedSize = int64(bu.ReplicatedSize)
if tgtstat.ReplicatedSize >= bu.ReplicatedSize { if tgtstat.ReplicatedSize >= int64(bu.ReplicatedSize) {
st.ReplicatedSize = tgtstat.ReplicatedSize st.ReplicatedSize = tgtstat.ReplicatedSize
} }
s.ReplicatedSize += st.ReplicatedSize s.ReplicatedSize += st.ReplicatedSize

View File

@ -3278,9 +3278,9 @@ func (api objectAPIHandlers) CompleteMultipartUploadHandler(w http.ResponseWrite
if dsc := mustReplicate(ctx, bucket, object, getMustReplicateOptions(objInfo, replication.ObjectReplicationType, opts)); dsc.ReplicateAny() { if dsc := mustReplicate(ctx, bucket, object, getMustReplicateOptions(objInfo, replication.ObjectReplicationType, opts)); dsc.ReplicateAny() {
scheduleReplication(ctx, objInfo.Clone(), objectAPI, dsc, replication.ObjectReplicationType) scheduleReplication(ctx, objInfo.Clone(), objectAPI, dsc, replication.ObjectReplicationType)
} }
if objInfo.ReplicationStatus == replication.Replica { if _, ok := r.Header[xhttp.MinIOSourceReplicationRequest]; ok {
actualSize, _ := objInfo.GetActualSize() actualSize, _ := objInfo.GetActualSize()
globalReplicationStats.UpdateReplicaStat(bucket, actualSize) defer globalReplicationStats.UpdateReplicaStat(bucket, actualSize)
} }
// Write success response. // Write success response.