minio/cmd/site-replication-metrics.go

// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/minio/minio-go/v7"
)

//go:generate msgp -file $GOFILE

// RStat has replication error stats
type RStat struct {
	Count int64 `json:"count"`
	Bytes int64 `json:"bytes"`
}

// RTimedMetrics has replication error stats for various time windows
type RTimedMetrics struct {
	LastHour    ReplicationLastHour `json:"lastHour"`
	SinceUptime RStat               `json:"sinceUptime"`
	LastMinute  ReplicationLastMinute
	// Error counts
	ErrCounts map[string]int `json:"errCounts"` // Count of credential errors
}

func (rt *RTimedMetrics) String() string {
	s := rt.toMetric()
	return fmt.Sprintf("Errors in LastMinute: %v, LastHour: %v, SinceUptime: %v", s.LastMinute.Count, s.LastHour.Count, s.Totals.Count)
}

func (rt *RTimedMetrics) toMetric() madmin.TimedErrStats {
	if rt == nil {
		return madmin.TimedErrStats{}
	}
	errCounts := make(map[string]int)
	for k, v := range rt.ErrCounts {
		errCounts[k] = v
	}
	minuteTotals := rt.LastMinute.getTotal()
	hourTotals := rt.LastHour.getTotal()
	return madmin.TimedErrStats{
		LastMinute: madmin.RStat{
			Count: float64(minuteTotals.N),
			Bytes: minuteTotals.Size,
		},
		LastHour: madmin.RStat{
			Count: float64(hourTotals.N),
			Bytes: hourTotals.Size,
		},
		Totals: madmin.RStat{
			Count: float64(rt.SinceUptime.Count),
			Bytes: rt.SinceUptime.Bytes,
		},
		ErrCounts: errCounts,
	}
}

func (rt *RTimedMetrics) addsize(size int64, err error) {
	// failures seen since uptime
	atomic.AddInt64(&rt.SinceUptime.Bytes, size)
	atomic.AddInt64(&rt.SinceUptime.Count, 1)
	rt.LastMinute.addsize(size)
	rt.LastHour.addsize(size)
	if err != nil && minio.ToErrorResponse(err).Code == "AccessDenied" {
		if rt.ErrCounts == nil {
			rt.ErrCounts = make(map[string]int)
		}
		rt.ErrCounts["AccessDenied"]++
	}
}

func (rt *RTimedMetrics) merge(o RTimedMetrics) (n RTimedMetrics) {
	n.SinceUptime.Bytes = atomic.LoadInt64(&rt.SinceUptime.Bytes) + atomic.LoadInt64(&o.SinceUptime.Bytes)
	n.SinceUptime.Count = atomic.LoadInt64(&rt.SinceUptime.Count) + atomic.LoadInt64(&o.SinceUptime.Count)

	n.LastMinute = n.LastMinute.merge(rt.LastMinute)
	n.LastMinute = n.LastMinute.merge(o.LastMinute)
	n.LastHour = n.LastHour.merge(rt.LastHour)
	n.LastHour = n.LastHour.merge(o.LastHour)
	n.ErrCounts = make(map[string]int)
	for k, v := range rt.ErrCounts {
		n.ErrCounts[k] = v
	}
	for k, v := range o.ErrCounts {
		n.ErrCounts[k] += v
	}
	return n
}

// SRStats has replication stats at site level
type SRStats struct {
	// Total Replica size in bytes
	ReplicaSize int64 `json:"replicaSize"`
	// Total Replica received
	ReplicaCount int64                `json:"replicaCount"`
	M            map[string]*SRStatus `json:"srStatusMap"`

	movingAvgTicker *time.Ticker // Ticker for calculating moving averages
	lock            sync.RWMutex // mutex for srStats
}

// SRStatus has replication stats at deployment level
type SRStatus struct {
	ReplicatedSize int64 `json:"completedReplicationSize"`
	// Total number of failed operations including metadata updates in the last minute
	Failed RTimedMetrics `json:"failedReplication"`
	// Total number of completed operations
	ReplicatedCount int64 `json:"replicationCount"`
	// Replication latency information
	Latency ReplicationLatency `json:"replicationLatency"`
	// transfer rate for large uploads
	XferRateLrg *XferStats `json:"largeTransferRate" msg:"lt"`
	// transfer rate for small uploads
	XferRateSml *XferStats `json:"smallTransferRate" msg:"st"`
	// Endpoint is the replication target endpoint
	Endpoint string `json:"-"`
	// Secure is true if the replication target endpoint is secure
	Secure bool `json:"-"`
}

func (sr *SRStats) update(st replStat, dID string) {
	sr.lock.Lock()
	defer sr.lock.Unlock()
	srs, ok := sr.M[dID]
	if !ok {
		srs = &SRStatus{
			XferRateLrg: newXferStats(),
			XferRateSml: newXferStats(),
		}
	}
	srs.Endpoint = st.Endpoint
	srs.Secure = st.Secure
	switch {
	case st.Completed:
		srs.ReplicatedSize += st.TransferSize
		srs.ReplicatedCount++
		if st.TransferDuration > 0 {
			srs.Latency.update(st.TransferSize, st.TransferDuration)
			srs.updateXferRate(st.TransferSize, st.TransferDuration)
		}
	case st.Failed:
		srs.Failed.addsize(st.TransferSize, st.Err)
	case st.Pending:
	}
	sr.M[dID] = srs
}

func (sr *SRStats) get() map[string]SRMetric {
	epMap := globalBucketTargetSys.healthStats()

	sr.lock.RLock()
	defer sr.lock.RUnlock()
	m := make(map[string]SRMetric, len(sr.M))
	for dID, v := range sr.M {
		t := newXferStats()
		mx := make(map[RMetricName]XferStats)

		if v.XferRateLrg != nil {
			mx[Large] = *v.XferRateLrg.Clone()
			m := t.merge(*v.XferRateLrg)
			t = &m
		}
		if v.XferRateSml != nil {
			mx[Small] = *v.XferRateSml.Clone()
			m := t.merge(*v.XferRateSml)
			t = &m
		}

		mx[Total] = *t
		metric := SRMetric{
			ReplicatedSize:  v.ReplicatedSize,
			ReplicatedCount: v.ReplicatedCount,
			DeploymentID:    dID,
			Failed:          v.Failed.toMetric(),
			XferStats:       mx,
		}
		epHealth, ok := epMap[v.Endpoint]
		if ok {
			metric.Endpoint = epHealth.Endpoint
			metric.TotalDowntime = epHealth.offlineDuration
			metric.LastOnline = epHealth.lastOnline
			metric.Online = epHealth.Online
			metric.Latency = madmin.LatencyStat{
				Curr: epHealth.latency.curr,
				Avg:  epHealth.latency.avg,
				Max:  epHealth.latency.peak,
			}
		}
		m[dID] = metric
	}
	return m
}

func (srs *SRStatus) updateXferRate(sz int64, duration time.Duration) {
	if sz > minLargeObjSize {
		srs.XferRateLrg.addSize(sz, duration)
	} else {
		srs.XferRateSml.addSize(sz, duration)
	}
}

func newSRStats() *SRStats {
	s := SRStats{
		M:               make(map[string]*SRStatus),
		movingAvgTicker: time.NewTicker(time.Second * 2),
	}
	go s.trackEWMA()
	return &s
}

func (sr *SRStats) trackEWMA() {
	for {
		select {
		case <-sr.movingAvgTicker.C:
			sr.updateMovingAvg()
		case <-GlobalContext.Done():
			return
		}
	}
}

func (sr *SRStats) updateMovingAvg() {
	sr.lock.Lock()
	defer sr.lock.Unlock()
	for _, s := range sr.M {
		s.XferRateLrg.measure.updateExponentialMovingAverage(time.Now())
		s.XferRateSml.measure.updateExponentialMovingAverage(time.Now())
	}
}

// SRMetric captures replication metrics for a deployment
type SRMetric struct {
	DeploymentID  string             `json:"deploymentID"`
	Endpoint      string             `json:"endpoint"`
	TotalDowntime time.Duration      `json:"totalDowntime"`
	LastOnline    time.Time          `json:"lastOnline"`
	Online        bool               `json:"isOnline"`
	Latency       madmin.LatencyStat `json:"latency"`

	// replication metrics across buckets roll up
	ReplicatedSize int64 `json:"replicatedSize"`
	// Total number of completed operations
	ReplicatedCount int64 `json:"replicatedCount"`
	// Failed captures replication errors in various time windows

	Failed madmin.TimedErrStats `json:"failed,omitempty"`

	XferStats map[RMetricName]XferStats `json:"transferSummary"`
}

// SRMetricsSummary captures summary of replication counts across buckets on site
// along with op metrics rollup.
type SRMetricsSummary struct {
	// op metrics roll up
	ActiveWorkers ActiveWorkerStat `json:"activeWorkers"`

	// Total Replica size in bytes
	ReplicaSize int64 `json:"replicaSize"`

	// Total number of replica received
	ReplicaCount int64 `json:"replicaCount"`
	// Queued operations
	Queued InQueueMetric `json:"queued"`
	// replication metrics summary for each site replication peer
	Metrics map[string]SRMetric `json:"replMetrics"`
	// uptime of node being queried for site replication metrics
	Uptime int64 `json:"uptime"`
}