mirror of https://github.com/minio/minio.git
fix: race in bucket replication stats (#13942)

- r.ulock was not locked when r.UsageCache was being modified.

Bonus:
- Simplify code by removing some unnecessary clone methods; this is safe
  because Go arrays are values (not pointers/references) and are
  automatically copied on assignment.
- Remove some unnecessary map allocation calls.
parent 13441ad0f8
commit 997e808088
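A note on the "Go arrays are values" point in the message above, since it is what makes dropping the array-copying Clone helpers safe (see the LastMinuteLatencies hunk at the end): assigning a struct that contains a fixed-size array copies every element, so no explicit clone is needed. A minimal, self-contained sketch with illustrative names (not code from this repository):

package main

import "fmt"

// acc mirrors the shape of an accumulator with a fixed-size array field.
type acc struct {
	Totals [3]int
}

func main() {
	a := acc{Totals: [3]int{1, 2, 3}}

	// Plain assignment copies the whole struct, including the array,
	// because Go arrays are values, not references.
	b := a
	b.Totals[0] = 100

	fmt.Println(a.Totals) // [1 2 3]  -- original is untouched
	fmt.Println(b.Totals) // [100 2 3]

	// A slice field would behave differently: slices share backing
	// storage, so a copy like this would still need an explicit clone.
}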
@@ -51,6 +51,9 @@ func (r *ReplicationStats) Delete(bucket string) {
 	r.Lock()
 	defer r.Unlock()
 	delete(r.Cache, bucket)
+
+	r.ulock.Lock()
+	defer r.ulock.Unlock()
 	delete(r.UsageCache, bucket)
 
 }
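For context on the race itself: UsageCache is guarded by its own mutex (r.ulock), and every write to that map must hold it. Below is a stripped-down sketch of the pattern the hunk above restores, using hypothetical names rather than the actual ReplicationStats type; without the second lock, `go run -race` flags the concurrent map writes.

package main

import "sync"

// stats keeps two caches, each guarded by its own mutex.
type stats struct {
	sync.RWMutex // guards cache
	cache        map[string]int

	ulock      sync.RWMutex // guards usageCache
	usageCache map[string]int
}

// drop removes a bucket from both caches, taking each lock in turn.
func (s *stats) drop(bucket string) {
	s.Lock()
	delete(s.cache, bucket)
	s.Unlock()

	// Without s.ulock here, concurrent writers to usageCache race.
	s.ulock.Lock()
	delete(s.usageCache, bucket)
	s.ulock.Unlock()
}

func main() {
	s := &stats{
		cache:      map[string]int{"b1": 1},
		usageCache: map[string]int{"b1": 1},
	}

	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.drop("b1")
		}()
	}
	wg.Wait()
}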
@@ -82,10 +85,12 @@ func (r *ReplicationStats) Update(bucket string, arn string, n int64, duration t
 	bs, ok := r.Cache[bucket]
 	if !ok {
 		bs = &BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
+		r.Cache[bucket] = bs
 	}
 	b, ok := bs.Stats[arn]
 	if !ok {
 		b = &BucketReplicationStat{}
+		bs.Stats[arn] = b
 	}
 	switch status {
 	case replication.Completed:
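Inserting `r.Cache[bucket] = bs` and `bs.Stats[arn] = b` right after creation works because both maps store pointers: later field updates made through `bs` and `b` are visible via the map without re-inserting, which is why the trailing re-assignments are dropped in the next hunk. A small illustration with made-up types:

package main

import "fmt"

type bucketStat struct {
	ReplicatedSize int64
}

func main() {
	cache := make(map[string]*bucketStat)

	b, ok := cache["mybucket"]
	if !ok {
		b = &bucketStat{}
		cache["mybucket"] = b // insert the pointer once, up front
	}

	// Mutating through the pointer is enough; there is no need to write
	// cache["mybucket"] = b again afterwards.
	b.ReplicatedSize += 1024

	fmt.Println(cache["mybucket"].ReplicatedSize) // 1024
}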
@@ -115,9 +120,6 @@ func (r *ReplicationStats) Update(bucket string, arn string, n int64, duration t
 			b.ReplicaSize += n
 		}
 	}
-
-	bs.Stats[arn] = b
-	r.Cache[bucket] = bs
 }
 
 // GetInitialUsage get replication metrics available at the time of cluster initialization
@@ -128,10 +130,10 @@ func (r *ReplicationStats) GetInitialUsage(bucket string) BucketReplicationStats
 	r.ulock.RLock()
 	defer r.ulock.RUnlock()
 	st, ok := r.UsageCache[bucket]
-	if ok {
-		return st.Clone()
+	if !ok {
+		return BucketReplicationStats{}
 	}
-	return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
+	return st.Clone()
 }
 
 // Get replication metrics for a bucket from this node since this node came up.
@@ -145,7 +147,7 @@ func (r *ReplicationStats) Get(bucket string) BucketReplicationStats {
 
 	st, ok := r.Cache[bucket]
 	if !ok {
-		return BucketReplicationStats{Stats: make(map[string]*BucketReplicationStat)}
+		return BucketReplicationStats{}
 	}
 	return st.Clone()
 }
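Returning `BucketReplicationStats{}` instead of allocating a fresh `Stats` map is safe for read-only callers: in Go, a nil map can be indexed (the lookup yields the zero value and ok=false) and ranged over (zero iterations); only writing to a nil map panics. Roughly, with illustrative types:

package main

import "fmt"

type stat struct{ Size int64 }

type bucketStats struct {
	Stats map[string]*stat
}

func main() {
	// Zero value: Stats is a nil map, no allocation performed.
	empty := bucketStats{}

	// Lookups on a nil map are fine and report "not found".
	s, ok := empty.Stats["some-arn"]
	fmt.Println(s, ok) // <nil> false

	// Ranging over a nil map simply does nothing.
	for arn := range empty.Stats {
		fmt.Println(arn) // never reached
	}

	// Only writing would panic:
	// empty.Stats["x"] = &stat{} // panic: assignment to entry in nil map
}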
@@ -162,41 +164,46 @@ func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *Replicatio
 func (r *ReplicationStats) loadInitialReplicationMetrics(ctx context.Context) {
 	rTimer := time.NewTimer(time.Minute * 1)
 	defer rTimer.Stop()
+	var (
+		dui DataUsageInfo
+		err error
+	)
+outer:
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-rTimer.C:
-			dui, err := loadDataUsageFromBackend(GlobalContext, newObjectLayerFn())
+			dui, err = loadDataUsageFromBackend(GlobalContext, newObjectLayerFn())
 			if err != nil {
 				continue
 			}
-			// data usage has not captured any data yet.
-			if dui.LastUpdate.IsZero() {
-				continue
+			// If LastUpdate is set, data usage is available.
+			if !dui.LastUpdate.IsZero() {
+				break outer
 			}
-			m := make(map[string]*BucketReplicationStats)
-			for bucket, usage := range dui.BucketsUsage {
-				b := &BucketReplicationStats{
-					Stats: make(map[string]*BucketReplicationStat, len(usage.ReplicationInfo)),
-				}
-				for arn, uinfo := range usage.ReplicationInfo {
-					b.Stats[arn] = &BucketReplicationStat{
-						FailedSize:     int64(uinfo.ReplicationFailedSize),
-						ReplicatedSize: int64(uinfo.ReplicatedSize),
-						ReplicaSize:    int64(uinfo.ReplicaSize),
-						FailedCount:    int64(uinfo.ReplicationFailedCount),
-					}
-				}
-				b.ReplicaSize += int64(usage.ReplicaSize)
-				if b.hasReplicationUsage() {
-					m[bucket] = b
-				}
-			}
-			r.ulock.Lock()
-			defer r.ulock.Unlock()
-			r.UsageCache = m
-			return
 		}
 	}
+
+	m := make(map[string]*BucketReplicationStats)
+	for bucket, usage := range dui.BucketsUsage {
+		b := &BucketReplicationStats{
+			Stats: make(map[string]*BucketReplicationStat, len(usage.ReplicationInfo)),
+		}
+		for arn, uinfo := range usage.ReplicationInfo {
+			b.Stats[arn] = &BucketReplicationStat{
+				FailedSize:     int64(uinfo.ReplicationFailedSize),
+				ReplicatedSize: int64(uinfo.ReplicatedSize),
+				ReplicaSize:    int64(uinfo.ReplicaSize),
+				FailedCount:    int64(uinfo.ReplicationFailedCount),
+			}
+		}
+		b.ReplicaSize += int64(usage.ReplicaSize)
+		if b.hasReplicationUsage() {
+			m[bucket] = b
+		}
+	}
+	r.ulock.Lock()
+	defer r.ulock.Unlock()
+	r.UsageCache = m
 }
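The rewritten loop above relies on a labeled break: inside a `select`, a bare `break` only leaves the `select`, so `break outer` is what exits the surrounding `for` once usable data-usage info arrives, letting the cache be built exactly once after the loop. A stripped-down sketch of that control flow (timings and names are placeholders, not the MinIO code):

package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	tick := time.NewTicker(100 * time.Millisecond)
	defer tick.Stop()

	var attempts int

outer:
	for {
		select {
		case <-ctx.Done():
			return
		case <-tick.C:
			attempts++
			if attempts < 3 {
				continue // data not ready yet, keep polling
			}
			// A bare "break" here would only exit the select;
			// the label is what exits the for loop as well.
			break outer
		}
	}

	// Runs exactly once, after polling succeeded.
	fmt.Println("loaded after", attempts, "attempts")
}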
@@ -1808,22 +1808,15 @@ func getLatestReplicationStats(bucket string, u BucketUsageInfo) (s BucketReplic
 	// add initial usage stat to cluster stats
 	usageStat := globalReplicationStats.GetInitialUsage(bucket)
 	totReplicaSize += usageStat.ReplicaSize
-	if usageStat.Stats != nil {
-		for arn, stat := range usageStat.Stats {
-			st := stats[arn]
-			if st == nil {
-				st = &BucketReplicationStat{
-					ReplicatedSize: stat.ReplicatedSize,
-					FailedSize:     stat.FailedSize,
-					FailedCount:    stat.FailedCount,
-				}
-			} else {
-				st.ReplicatedSize += stat.ReplicatedSize
-				st.FailedSize += stat.FailedSize
-				st.FailedCount += stat.FailedCount
-			}
+	for arn, stat := range usageStat.Stats {
+		st, ok := stats[arn]
+		if !ok {
+			st = &BucketReplicationStat{}
 			stats[arn] = st
 		}
+		st.ReplicatedSize += stat.ReplicatedSize
+		st.FailedSize += stat.FailedSize
+		st.FailedCount += stat.FailedCount
 	}
 	s = BucketReplicationStats{
 		Stats: make(map[string]*BucketReplicationStat, len(stats)),
@@ -18,7 +18,6 @@
 package cmd
 
 import (
-	"sync/atomic"
 	"time"
 )
 
@@ -52,13 +51,6 @@ func (rl *ReplicationLatency) update(size int64, duration time.Duration) {
 	rl.UploadHistogram.Add(size, duration)
 }
 
-// Clone replication latency
-func (rl ReplicationLatency) clone() ReplicationLatency {
-	return ReplicationLatency{
-		UploadHistogram: rl.UploadHistogram.Clone(),
-	}
-}
-
 // BucketStats bucket statistics
 type BucketStats struct {
 	ReplicationStats BucketReplicationStats
@@ -88,29 +80,18 @@ func (brs *BucketReplicationStats) Empty() bool {
 }
 
 // Clone creates a new BucketReplicationStats copy
-func (brs BucketReplicationStats) Clone() BucketReplicationStats {
-	c := BucketReplicationStats{
-		Stats: make(map[string]*BucketReplicationStat, len(brs.Stats)),
-	}
-	// This is called only by replicationStats cache and already holds a read lock before calling Clone()
+func (brs BucketReplicationStats) Clone() (c BucketReplicationStats) {
+	// This is called only by replicationStats cache and already holds a
+	// read lock before calling Clone()
+
+	c = brs
+	// We need to copy the map, so we do not reference the one in `brs`.
+	c.Stats = make(map[string]*BucketReplicationStat, len(brs.Stats))
 	for arn, st := range brs.Stats {
-		c.Stats[arn] = &BucketReplicationStat{
-			FailedSize:     atomic.LoadInt64(&st.FailedSize),
-			ReplicatedSize: atomic.LoadInt64(&st.ReplicatedSize),
-			ReplicaSize:    atomic.LoadInt64(&st.ReplicaSize),
-			FailedCount:    atomic.LoadInt64(&st.FailedCount),
-			PendingSize:    atomic.LoadInt64(&st.PendingSize),
-			PendingCount:   atomic.LoadInt64(&st.PendingCount),
-			Latency:        st.Latency.clone(),
-		}
+		// make a copy of `*st`
+		s := *st
+		c.Stats[arn] = &s
 	}
-	// update total counts across targets
-	c.FailedSize = atomic.LoadInt64(&brs.FailedSize)
-	c.FailedCount = atomic.LoadInt64(&brs.FailedCount)
-	c.PendingCount = atomic.LoadInt64(&brs.PendingCount)
-	c.PendingSize = atomic.LoadInt64(&brs.PendingSize)
-	c.ReplicaSize = atomic.LoadInt64(&brs.ReplicaSize)
-	c.ReplicatedSize = atomic.LoadInt64(&brs.ReplicatedSize)
 	return c
 }
 
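The rewritten Clone above leans on `c = brs` copying every scalar field in a single struct assignment; only the `Stats` map, being a reference type, has to be re-made and its entries copied so the clone does not alias the cached map. In miniature, with illustrative types:

package main

import "fmt"

type stat struct{ FailedCount int64 }

type bucketStats struct {
	ReplicatedSize int64
	Stats          map[string]*stat
}

// clone copies the struct by value, then deep-copies the map so the
// result shares nothing with the original.
func (b bucketStats) clone() bucketStats {
	c := b // copies ReplicatedSize and every other plain field
	c.Stats = make(map[string]*stat, len(b.Stats))
	for arn, st := range b.Stats {
		s := *st // copy the pointed-to value, not the pointer
		c.Stats[arn] = &s
	}
	return c
}

func main() {
	orig := bucketStats{
		ReplicatedSize: 42,
		Stats:          map[string]*stat{"arn1": {FailedCount: 1}},
	}

	cp := orig.clone()
	cp.Stats["arn1"].FailedCount = 99

	fmt.Println(orig.Stats["arn1"].FailedCount) // 1  -- original untouched
	fmt.Println(cp.Stats["arn1"].FailedCount)   // 99
}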
@@ -102,39 +102,21 @@ type LastMinuteLatencies struct {
 	LastSec int64
 }
 
-// Clone safely returns a copy for a LastMinuteLatencies structure
-func (l *LastMinuteLatencies) Clone() LastMinuteLatencies {
-	n := LastMinuteLatencies{}
-	n.LastSec = l.LastSec
-	for i := range l.Totals {
-		for j := range l.Totals[i] {
-			n.Totals[i][j] = AccElem{
-				Total: l.Totals[i][j].Total,
-				N:     l.Totals[i][j].N,
-			}
-		}
-	}
-	return n
-}
-
 // Merge safely merges two LastMinuteLatencies structures into one
 func (l LastMinuteLatencies) Merge(o LastMinuteLatencies) (merged LastMinuteLatencies) {
-	cl := l.Clone()
-	co := o.Clone()
-
-	if cl.LastSec > co.LastSec {
-		co.forwardTo(cl.LastSec)
-		merged.LastSec = cl.LastSec
+	if l.LastSec > o.LastSec {
+		o.forwardTo(l.LastSec)
+		merged.LastSec = l.LastSec
 	} else {
-		cl.forwardTo(co.LastSec)
-		merged.LastSec = co.LastSec
+		l.forwardTo(o.LastSec)
+		merged.LastSec = o.LastSec
 	}
 
-	for i := range cl.Totals {
-		for j := range cl.Totals[i] {
+	for i := range merged.Totals {
+		for j := range merged.Totals[i] {
 			merged.Totals[i][j] = AccElem{
-				Total: cl.Totals[i][j].Total + co.Totals[i][j].Total,
-				N:     cl.Totals[i][j].N + co.Totals[i][j].N,
+				Total: l.Totals[i][j].Total + o.Totals[i][j].Total,
+				N:     l.Totals[i][j].N + o.Totals[i][j].N,
 			}
 		}
 	}