Add additional info for replication metrics API (#17293)

to track the replication transfer rate across different nodes,
the number of active workers in use, and in-queue stats that give
an idea of the current workload.

This PR also adds replication metrics to the site replication
status API. For site replication, Prometheus metrics are
no longer reported at the bucket level but at the cluster level.

Add a Prometheus metric to track credential errors since uptime
Poorna
2023-08-30 01:00:59 -07:00
committed by GitHub
parent cce90cb2b7
commit b48bbe08b2
31 changed files with 8779 additions and 743 deletions


@@ -19,7 +19,10 @@ package cmd
import (
"fmt"
"math"
"time"
"github.com/minio/madmin-go/v3"
)
//go:generate msgp -file $GOFILE
@@ -52,6 +55,94 @@ func (rl *ReplicationLatency) update(size int64, duration time.Duration) {
rl.UploadHistogram.Add(size, duration)
}
// ReplicationLastMinute has last minute replication counters
type ReplicationLastMinute struct {
LastMinute lastMinuteLatency
}
func (rl ReplicationLastMinute) merge(other ReplicationLastMinute) (nl ReplicationLastMinute) {
nl = ReplicationLastMinute{rl.LastMinute.merge(other.LastMinute)}
return
}
func (rl *ReplicationLastMinute) addsize(n int64) {
t := time.Now().Unix()
rl.LastMinute.addAll(t-1, AccElem{Total: t - 1, Size: n, N: 1})
}
func (rl *ReplicationLastMinute) String() string {
t := rl.LastMinute.getTotal()
return fmt.Sprintf("ReplicationLastMinute sz= %d, n=%d , dur=%d", t.Size, t.N, t.Total)
}
func (rl *ReplicationLastMinute) getTotal() AccElem {
return rl.LastMinute.getTotal()
}
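For orientation, a minimal usage sketch of the last-minute counter, assuming access to these cmd-package internals (the object sizes are made up):

// Illustrative sketch: record two replicated object sizes and read back the
// rolled-up totals for the trailing minute.
func exampleLastMinute() {
	var rlm ReplicationLastMinute
	rlm.addsize(4 << 20)   // hypothetical 4 MiB object
	rlm.addsize(512 << 10) // hypothetical 512 KiB object
	tot := rlm.getTotal()  // AccElem accumulating Size and N over the window
	fmt.Printf("last minute: %d bytes across %d ops\n", tot.Size, tot.N)
}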
// ReplicationLastHour keeps track of replication counts over the last hour
type ReplicationLastHour struct {
Totals [60]AccElem
LastMin int64
}
// Merge data of two ReplicationLastHour structures
func (l ReplicationLastHour) merge(o ReplicationLastHour) (merged ReplicationLastHour) {
if l.LastMin > o.LastMin {
o.forwardTo(l.LastMin)
merged.LastMin = l.LastMin
} else {
l.forwardTo(o.LastMin)
merged.LastMin = o.LastMin
}
for i := range merged.Totals {
merged.Totals[i] = AccElem{
Total: l.Totals[i].Total + o.Totals[i].Total,
N: l.Totals[i].N + o.Totals[i].N,
Size: l.Totals[i].Size + o.Totals[i].Size,
}
}
return merged
}
// Add a new size data point
func (l *ReplicationLastHour) addsize(sz int64) {
min := time.Now().Unix() / 60
l.forwardTo(min)
winIdx := min % 60
l.Totals[winIdx].merge(AccElem{Total: min, Size: sz, N: 1})
l.LastMin = min
}
// Merge all recorded counts of last hour into one
func (l *ReplicationLastHour) getTotal() AccElem {
var res AccElem
min := time.Now().Unix() / 60
l.forwardTo(min)
for _, elem := range l.Totals[:] {
res.merge(elem)
}
return res
}
// forwardTo time t, clearing any entries in between.
func (l *ReplicationLastHour) forwardTo(t int64) {
// t is expressed in whole minutes (callers pass time.Now().Unix() / 60)
if l.LastMin >= t {
return
}
if t-l.LastMin >= 60 {
l.Totals = [60]AccElem{}
return
}
for l.LastMin != t {
// Clear next element.
idx := (l.LastMin + 1) % 60
l.Totals[idx] = AccElem{}
l.LastMin++
}
}
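The hour counter is a 60-slot ring keyed by wall-clock minute; forwardTo zeroes any slots that went stale since the last update so old data cannot leak into the current window. A self-contained toy sketch of the same windowing idea (minuteRing and its methods are illustrative, not part of the codebase):

package main

import (
	"fmt"
	"time"
)

// minuteRing is a toy version of the 60-slot, minute-granularity window used
// by ReplicationLastHour: each slot holds the bytes recorded in one wall-clock
// minute, and stale slots are cleared before they are reused.
type minuteRing struct {
	totals  [60]int64
	lastMin int64
}

// add records size in the slot for the current minute.
func (r *minuteRing) add(size int64) {
	min := time.Now().Unix() / 60 // wall clock in whole minutes
	r.forwardTo(min)
	r.totals[min%60] += size
	r.lastMin = min
}

// forwardTo clears every slot between lastMin and min so data older than an
// hour cannot leak into the current window.
func (r *minuteRing) forwardTo(min int64) {
	if r.lastMin >= min {
		return
	}
	if min-r.lastMin >= 60 {
		r.totals = [60]int64{}
		r.lastMin = min
		return
	}
	for r.lastMin != min {
		r.lastMin++
		r.totals[r.lastMin%60] = 0
	}
}

// total sums the still-valid slots, i.e. the trailing hour.
func (r *minuteRing) total() (sum int64) {
	r.forwardTo(time.Now().Unix() / 60)
	for _, v := range r.totals {
		sum += v
	}
	return sum
}

func main() {
	var r minuteRing
	r.add(1 << 20) // record 1 MiB in the current minute
	fmt.Println("last hour bytes:", r.total())
}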
// BucketStatsMap captures bucket statistics for all buckets
type BucketStatsMap struct {
Stats map[string]BucketStats
@@ -60,19 +151,32 @@ type BucketStatsMap struct {
// BucketStats bucket statistics
type BucketStats struct {
Uptime int64 `json:"uptime"`
ReplicationStats BucketReplicationStats `json:"currStats"` // current replication stats since cluster startup
QueueStats ReplicationQueueStats `json:"queueStats"` // replication queue stats
}
// BucketReplicationStats represents inline replication statistics
// such as pending, failed and completed bytes in total for a bucket
type BucketReplicationStats struct {
Stats map[string]*BucketReplicationStat `json:",omitempty"`
// Completed size in bytes
ReplicatedSize int64 `json:"completedReplicationSize"`
// Total Replica size in bytes
ReplicaSize int64 `json:"replicaSize"`
// Total failed operations including metadata updates for various time frames
Failed madmin.TimedErrStats `json:"failed"`
// Total number of completed operations
ReplicatedCount int64 `json:"replicationCount"`
// Total number of replica received
ReplicaCount int64 `json:"replicaCount"`
// in Queue stats for bucket - from qCache
QStat InQueueMetric `json:"queued"`
// Deprecated fields
// Pending size in bytes
PendingSize int64 `json:"pendingReplicationSize"`
// Failed size in bytes
FailedSize int64 `json:"failedReplicationSize"`
// Total number of pending operations including metadata updates
@@ -81,6 +185,12 @@ type BucketReplicationStats struct {
FailedCount int64 `json:"failedReplicationCount"`
}
func newBucketReplicationStats() *BucketReplicationStats {
return &BucketReplicationStats{
Stats: make(map[string]*BucketReplicationStat),
}
}
// Empty returns true if there are no target stats
func (brs *BucketReplicationStats) Empty() bool {
return len(brs.Stats) == 0 && brs.ReplicaSize == 0
@@ -96,7 +206,24 @@ func (brs BucketReplicationStats) Clone() (c BucketReplicationStats) {
c.Stats = make(map[string]*BucketReplicationStat, len(brs.Stats))
for arn, st := range brs.Stats {
// make a copy of `*st`
s := BucketReplicationStat{
ReplicatedSize: st.ReplicatedSize,
ReplicaSize: st.ReplicaSize,
Latency: st.Latency,
BandWidthLimitInBytesPerSecond: st.BandWidthLimitInBytesPerSecond,
CurrentBandwidthInBytesPerSecond: st.CurrentBandwidthInBytesPerSecond,
XferRateLrg: st.XferRateLrg.Clone(),
XferRateSml: st.XferRateSml.Clone(),
ReplicatedCount: st.ReplicatedCount,
Failed: st.Failed,
FailStats: st.FailStats,
}
// copy the error counts so the clone does not alias the live map
s.Failed.ErrCounts = make(map[string]int, len(st.Failed.ErrCounts))
for k, v := range st.Failed.ErrCounts {
s.Failed.ErrCounts[k] = v
}
c.Stats[arn] = &s
}
return c
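Clone copies each per-target stat field by field and gives the snapshot its own ErrCounts map, so reporting code never aliases the live counters. A tiny illustration of the same defensive-copy pattern with throwaway types (errStats and cloneErrStats are not part of the codebase):

// Illustrative only: the clone allocates its own map, so later mutations of
// the source do not show up in the snapshot.
type errStats struct {
	ErrCounts map[string]int
}

func cloneErrStats(src errStats) errStats {
	dst := errStats{ErrCounts: make(map[string]int, len(src.ErrCounts))}
	for k, v := range src.ErrCounts {
		dst.ErrCounts[k] = v
	}
	return dst
}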
@@ -107,38 +234,189 @@ func (brs BucketReplicationStats) Clone() (c BucketReplicationStats) {
// remote target
type BucketReplicationStat struct {
// Completed size in bytes
ReplicatedSize int64 `json:"completedReplicationSize"`
// Total Replica size in bytes
ReplicaSize int64 `json:"replicaSize"`
// Collect stats for failures
FailStats RTimedMetrics `json:"-"`
// Total number of failed operations including metadata updates in the last minute
Failed madmin.TimedErrStats `json:"failed"`
// Total number of completed operations
ReplicatedCount int64 `json:"replicationCount"`
// Replication latency information
Latency ReplicationLatency `json:"replicationLatency"`
// bandwidth limit for target
BandWidthLimitInBytesPerSecond int64 `json:"limitInBits"`
// current bandwidth reported
CurrentBandwidthInBytesPerSecond float64 `json:"currentBandwidth"`
// transfer rate for large uploads
XferRateLrg *XferStats `json:"-" msg:"lt"`
// transfer rate for small uploads
XferRateSml *XferStats `json:"-" msg:"st"`
// Deprecated fields
// Pending size in bytes
PendingSize int64 `json:"pendingReplicationSize"`
// Failed size in bytes
FailedSize int64 `json:"failedReplicationSize"`
// Total number of pending operations including metadata updates
PendingCount int64 `json:"pendingReplicationCount"`
// Total number of failed operations including metadata updates
FailedCount int64 `json:"failedReplicationCount"`
}
func (bs *BucketReplicationStat) hasReplicationUsage() bool {
return bs.FailStats.SinceUptime.Count > 0 ||
bs.ReplicatedSize > 0 ||
bs.ReplicaSize > 0
}
func (bs *BucketReplicationStat) updateXferRate(sz int64, duration time.Duration) {
if sz > minLargeObjSize {
bs.XferRateLrg.addSize(sz, duration)
} else {
bs.XferRateSml.addSize(sz, duration)
}
}
// RMetricName - name of replication metric
type RMetricName string
const (
// Large - objects larger than 128MiB
Large RMetricName = "Large"
// Small - objects smaller than 128MiB
Small RMetricName = "Small"
// Total - metric pertaining to totals
Total RMetricName = "Total"
)
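updateXferRate above splits transfers into the Large and Small series around minLargeObjSize, matching the 128MiB boundary these metric names describe; Total is then derived from both. A hedged sketch of the same classification (classifyTransfer and the local threshold constant are illustrative, not part of the codebase):

// classifyTransfer mirrors the large/small split used for the transfer-rate
// metrics: anything above the 128 MiB boundary counts as Large.
const illustrativeLargeObjSize = 128 << 20 // 128 MiB, per the Large/Small definitions above

func classifyTransfer(sz int64) RMetricName {
	if sz > illustrativeLargeObjSize {
		return Large
	}
	return Small
}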
// ReplQNodeStats holds queue stats for replication per node
type ReplQNodeStats struct {
NodeName string `json:"nodeName"`
Uptime int64 `json:"uptime"`
ActiveWorkers ActiveWorkerStat `json:"activeWorkers"`
XferStats map[RMetricName]XferStats `json:"transferSummary"`
TgtXferStats map[string]map[RMetricName]XferStats `json:"tgtTransferStats"`
QStats InQueueMetric `json:"queueStats"`
MRFStats ReplicationMRFStats `json:"mrfStats"`
}
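Each node reports one ReplQNodeStats entry; the json tags above define the wire format returned by the metrics API. A minimal sketch of serializing such an entry, assuming the standard library encoding/json and made-up field values:

// Illustrative only: marshal a node's queue stats using the json tags
// declared above; unset fields fall back to their zero values.
func exampleNodeStatsJSON() ([]byte, error) {
	qs := ReplQNodeStats{
		NodeName: "node-1", // hypothetical node name
		Uptime:   3600,     // seconds since process start
	}
	return json.Marshal(qs)
}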
// getNodeQueueStats returns replication operational stats at the node level
func (r *ReplicationStats) getNodeQueueStats(bucket string) (qs ReplQNodeStats) {
qs.NodeName = globalLocalNodeName
qs.Uptime = UTCNow().Unix() - globalBootTime.Unix()
qs.ActiveWorkers = globalReplicationStats.ActiveWorkers()
qs.XferStats = make(map[RMetricName]XferStats)
qs.QStats = r.qCache.getBucketStats(bucket)
qs.TgtXferStats = make(map[string]map[RMetricName]XferStats)
r.RLock()
defer r.RUnlock()
brs, ok := r.Cache[bucket]
if !ok {
return qs
}
for arn := range brs.Stats {
qs.TgtXferStats[arn] = make(map[RMetricName]XferStats)
}
count := 0
var totPeak float64
// calculate large, small transfers and total transfer rates per replication target at bucket level
for arn, v := range brs.Stats {
lcurrTgt := v.XferRateLrg.curr()
scurrTgt := v.XferRateSml.curr()
totPeak = math.Max(math.Max(v.XferRateLrg.Peak, v.XferRateSml.Peak), totPeak)
totPeak = math.Max(math.Max(lcurrTgt, scurrTgt), totPeak)
tcount := 0
if v.XferRateLrg.Peak > 0 {
tcount++
}
if v.XferRateSml.Peak > 0 {
tcount++
}
qs.TgtXferStats[arn][Large] = XferStats{
Avg: v.XferRateLrg.Avg,
Curr: lcurrTgt,
Peak: math.Max(v.XferRateLrg.Peak, lcurrTgt),
}
qs.TgtXferStats[arn][Small] = XferStats{
Avg: v.XferRateSml.Avg,
Curr: scurrTgt,
Peak: math.Max(v.XferRateSml.Peak, scurrTgt),
}
if tcount > 0 {
qs.TgtXferStats[arn][Total] = XferStats{
Avg: (v.XferRateLrg.Avg + v.XferRateSml.Avg) / float64(tcount),
Curr: (scurrTgt + lcurrTgt) / float64(tcount),
Peak: totPeak,
}
}
}
// calculate large, small and total transfer rates for a minio node
var lavg, lcurr, lpeak, savg, scurr, speak, totpeak float64
for _, v := range qs.TgtXferStats {
tot := v[Total]
lavg += v[Large].Avg
lcurr += v[Large].Curr
savg += v[Small].Avg
scurr += v[Small].Curr
totpeak = math.Max(math.Max(tot.Peak, totpeak), tot.Curr)
lpeak = math.Max(math.Max(v[Large].Peak, lpeak), v[Large].Curr)
speak = math.Max(math.Max(v[Small].Peak, speak), v[Small].Curr)
if lpeak > 0 || speak > 0 {
count++
}
}
if count > 0 {
lrg := XferStats{
Avg: lavg / float64(count),
Curr: lcurr / float64(count),
Peak: lpeak,
}
sml := XferStats{
Avg: savg / float64(count),
Curr: scurr / float64(count),
Peak: speak,
}
qs.XferStats[Large] = lrg
qs.XferStats[Small] = sml
qs.XferStats[Total] = XferStats{
Avg: (savg + lavg) / float64(count),
Curr: (lcurr + scurr) / float64(count),
Peak: totpeak,
}
}
return qs
}
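At the node level, averages and current rates are summed across targets and divided by the number of contributing targets, while peaks take the running maximum of each target's peak and current rate. A worked sketch with two hypothetical targets (the rates are invented):

// Illustrative roll-up of two targets' Large-object rates, in bytes/sec.
func exampleNodeRollup() XferStats {
	targets := []XferStats{
		{Avg: 10 << 20, Curr: 12 << 20, Peak: 20 << 20}, // target A (made up)
		{Avg: 30 << 20, Curr: 18 << 20, Peak: 25 << 20}, // target B (made up)
	}
	var avg, curr, peak float64
	for _, t := range targets {
		avg += t.Avg
		curr += t.Curr
		peak = math.Max(math.Max(t.Peak, peak), t.Curr)
	}
	n := float64(len(targets))
	// Avg 20 MiB/s, Curr 15 MiB/s, Peak 25 MiB/s for this example.
	return XferStats{Avg: avg / n, Curr: curr / n, Peak: peak}
}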
// populate queue totals for node and active workers in use for metrics
func (r *ReplicationStats) getNodeQueueStatsSummary() (qs ReplQNodeStats) {
qs.NodeName = globalLocalNodeName
qs.Uptime = UTCNow().Unix() - globalBootTime.Unix()
qs.ActiveWorkers = globalReplicationStats.ActiveWorkers()
qs.XferStats = make(map[RMetricName]XferStats)
qs.QStats = r.qCache.getSiteStats()
r.RLock()
defer r.RUnlock()
tx := newXferStats()
for _, brs := range r.Cache {
for _, v := range brs.Stats {
tx := tx.merge(*v.XferRateLrg)
tx = tx.merge(*v.XferRateSml)
}
}
qs.XferStats[Total] = *tx
return qs
}
// ReplicationQueueStats holds overall queue stats for replication
type ReplicationQueueStats struct {
Nodes []ReplQNodeStats `json:"nodes"`
Uptime int64 `json:"uptime"`
}
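ReplicationQueueStats is the cluster-wide roll-up returned by the new API: one ReplQNodeStats entry per node plus the reporting node's uptime. A hedged sketch of assembling it from the local summary only (peer collection is omitted; the helper name is illustrative):

// Illustrative only: in the real flow, stats gathered from peer nodes would
// be appended to Nodes before the API response is built.
func exampleQueueStats(r *ReplicationStats) ReplicationQueueStats {
	local := r.getNodeQueueStatsSummary()
	return ReplicationQueueStats{
		Nodes:  []ReplQNodeStats{local},
		Uptime: local.Uptime,
	}
}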