mirror of
https://github.com/minio/minio.git
synced 2025-01-12 23:43:22 -05:00
Drop Pending size and count from replication metrics (#12378)
Real-time metrics calculated in memory rely on the initial replication metrics saved with data usage. However, that saved snapshot can lag behind the actual state of the cluster at the time of a server restart, leading to inaccurate Pending sizes/counts being reported to Prometheus. The Pending metrics are therefore dropped, as pending replication can be monitored more reliably by applications using replication notifications. Signed-off-by: Poorna Krishnamoorthy <poorna@minio.io>
This commit is contained in:
parent
ab7410af11
commit
3690de0c6b
@ -26,11 +26,9 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func (b *BucketReplicationStats) hasReplicationUsage() bool {
|
func (b *BucketReplicationStats) hasReplicationUsage() bool {
|
||||||
return b.PendingSize > 0 ||
|
return b.FailedSize > 0 ||
|
||||||
b.FailedSize > 0 ||
|
|
||||||
b.ReplicatedSize > 0 ||
|
b.ReplicatedSize > 0 ||
|
||||||
b.ReplicaSize > 0 ||
|
b.ReplicaSize > 0 ||
|
||||||
b.PendingCount > 0 ||
|
|
||||||
b.FailedCount > 0
|
b.FailedCount > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -67,38 +65,23 @@ func (r *ReplicationStats) Update(bucket string, n int64, status, prevStatus rep
|
|||||||
}
|
}
|
||||||
r.RUnlock()
|
r.RUnlock()
|
||||||
switch status {
|
switch status {
|
||||||
case replication.Pending:
|
|
||||||
if opType == replication.ObjectReplicationType {
|
|
||||||
atomic.AddUint64(&b.PendingSize, uint64(n))
|
|
||||||
}
|
|
||||||
atomic.AddUint64(&b.PendingCount, 1)
|
|
||||||
case replication.Completed:
|
case replication.Completed:
|
||||||
switch prevStatus { // adjust counters based on previous state
|
switch prevStatus { // adjust counters based on previous state
|
||||||
case replication.Pending:
|
|
||||||
atomic.AddUint64(&b.PendingCount, ^uint64(0))
|
|
||||||
case replication.Failed:
|
case replication.Failed:
|
||||||
atomic.AddUint64(&b.FailedCount, ^uint64(0))
|
atomic.AddUint64(&b.FailedCount, ^uint64(0))
|
||||||
}
|
}
|
||||||
if opType == replication.ObjectReplicationType {
|
if opType == replication.ObjectReplicationType {
|
||||||
atomic.AddUint64(&b.ReplicatedSize, uint64(n))
|
atomic.AddUint64(&b.ReplicatedSize, uint64(n))
|
||||||
switch prevStatus {
|
switch prevStatus {
|
||||||
case replication.Pending:
|
|
||||||
atomic.AddUint64(&b.PendingSize, ^uint64(n-1))
|
|
||||||
case replication.Failed:
|
case replication.Failed:
|
||||||
atomic.AddUint64(&b.FailedSize, ^uint64(n-1))
|
atomic.AddUint64(&b.FailedSize, ^uint64(n-1))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case replication.Failed:
|
case replication.Failed:
|
||||||
// count failures only once - not on every retry
|
|
||||||
switch prevStatus { // adjust counters based on previous state
|
|
||||||
case replication.Pending:
|
|
||||||
atomic.AddUint64(&b.PendingCount, ^uint64(0))
|
|
||||||
}
|
|
||||||
if opType == replication.ObjectReplicationType {
|
if opType == replication.ObjectReplicationType {
|
||||||
if prevStatus == replication.Pending {
|
if prevStatus == replication.Pending {
|
||||||
atomic.AddUint64(&b.FailedSize, uint64(n))
|
atomic.AddUint64(&b.FailedSize, uint64(n))
|
||||||
atomic.AddUint64(&b.FailedCount, 1)
|
atomic.AddUint64(&b.FailedCount, 1)
|
||||||
atomic.AddUint64(&b.PendingSize, ^uint64(n-1))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case replication.Replica:
|
case replication.Replica:
|
||||||
@ -125,11 +108,9 @@ func (r *ReplicationStats) GetInitialUsage(bucket string) BucketReplicationStats
|
|||||||
return BucketReplicationStats{}
|
return BucketReplicationStats{}
|
||||||
}
|
}
|
||||||
return BucketReplicationStats{
|
return BucketReplicationStats{
|
||||||
PendingSize: atomic.LoadUint64(&st.PendingSize),
|
|
||||||
FailedSize: atomic.LoadUint64(&st.FailedSize),
|
FailedSize: atomic.LoadUint64(&st.FailedSize),
|
||||||
ReplicatedSize: atomic.LoadUint64(&st.ReplicatedSize),
|
ReplicatedSize: atomic.LoadUint64(&st.ReplicatedSize),
|
||||||
ReplicaSize: atomic.LoadUint64(&st.ReplicaSize),
|
ReplicaSize: atomic.LoadUint64(&st.ReplicaSize),
|
||||||
PendingCount: atomic.LoadUint64(&st.PendingCount),
|
|
||||||
FailedCount: atomic.LoadUint64(&st.FailedCount),
|
FailedCount: atomic.LoadUint64(&st.FailedCount),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -149,11 +130,9 @@ func (r *ReplicationStats) Get(bucket string) BucketReplicationStats {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return BucketReplicationStats{
|
return BucketReplicationStats{
|
||||||
PendingSize: atomic.LoadUint64(&st.PendingSize),
|
|
||||||
FailedSize: atomic.LoadUint64(&st.FailedSize),
|
FailedSize: atomic.LoadUint64(&st.FailedSize),
|
||||||
ReplicatedSize: atomic.LoadUint64(&st.ReplicatedSize),
|
ReplicatedSize: atomic.LoadUint64(&st.ReplicatedSize),
|
||||||
ReplicaSize: atomic.LoadUint64(&st.ReplicaSize),
|
ReplicaSize: atomic.LoadUint64(&st.ReplicaSize),
|
||||||
PendingCount: atomic.LoadUint64(&st.PendingCount),
|
|
||||||
FailedCount: atomic.LoadUint64(&st.FailedCount),
|
FailedCount: atomic.LoadUint64(&st.FailedCount),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -177,11 +156,9 @@ func NewReplicationStats(ctx context.Context, objectAPI ObjectLayer) *Replicatio
|
|||||||
|
|
||||||
for bucket, usage := range dataUsageInfo.BucketsUsage {
|
for bucket, usage := range dataUsageInfo.BucketsUsage {
|
||||||
b := &BucketReplicationStats{
|
b := &BucketReplicationStats{
|
||||||
PendingSize: usage.ReplicationPendingSize,
|
|
||||||
FailedSize: usage.ReplicationFailedSize,
|
FailedSize: usage.ReplicationFailedSize,
|
||||||
ReplicatedSize: usage.ReplicatedSize,
|
ReplicatedSize: usage.ReplicatedSize,
|
||||||
ReplicaSize: usage.ReplicaSize,
|
ReplicaSize: usage.ReplicaSize,
|
||||||
PendingCount: usage.ReplicationPendingCount,
|
|
||||||
FailedCount: usage.ReplicationFailedCount,
|
FailedCount: usage.ReplicationFailedCount,
|
||||||
}
|
}
|
||||||
if b.hasReplicationUsage() {
|
if b.hasReplicationUsage() {
|
||||||
|
@ -98,8 +98,6 @@ const (
|
|||||||
failedCount MetricName = "failed_count"
|
failedCount MetricName = "failed_count"
|
||||||
failedBytes MetricName = "failed_bytes"
|
failedBytes MetricName = "failed_bytes"
|
||||||
freeBytes MetricName = "free_bytes"
|
freeBytes MetricName = "free_bytes"
|
||||||
pendingBytes MetricName = "pending_bytes"
|
|
||||||
pendingCount MetricName = "pending_count"
|
|
||||||
readBytes MetricName = "read_bytes"
|
readBytes MetricName = "read_bytes"
|
||||||
rcharBytes MetricName = "rchar_bytes"
|
rcharBytes MetricName = "rchar_bytes"
|
||||||
receivedBytes MetricName = "received_bytes"
|
receivedBytes MetricName = "received_bytes"
|
||||||
@ -400,15 +398,7 @@ func getBucketUsageObjectsTotalMD() MetricDescription {
|
|||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func getBucketRepPendingBytesMD() MetricDescription {
|
|
||||||
return MetricDescription{
|
|
||||||
Namespace: bucketMetricNamespace,
|
|
||||||
Subsystem: replicationSubsystem,
|
|
||||||
Name: pendingBytes,
|
|
||||||
Help: "Total bytes pending to replicate.",
|
|
||||||
Type: gaugeMetric,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func getBucketRepFailedBytesMD() MetricDescription {
|
func getBucketRepFailedBytesMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: bucketMetricNamespace,
|
Namespace: bucketMetricNamespace,
|
||||||
@ -436,15 +426,7 @@ func getBucketRepReceivedBytesMD() MetricDescription {
|
|||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func getBucketRepPendingOperationsMD() MetricDescription {
|
|
||||||
return MetricDescription{
|
|
||||||
Namespace: bucketMetricNamespace,
|
|
||||||
Subsystem: replicationSubsystem,
|
|
||||||
Name: pendingCount,
|
|
||||||
Help: "Total number of objects pending replication",
|
|
||||||
Type: gaugeMetric,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func getBucketRepFailedOperationsMD() MetricDescription {
|
func getBucketRepFailedOperationsMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: bucketMetricNamespace,
|
Namespace: bucketMetricNamespace,
|
||||||
@ -1318,11 +1300,6 @@ func getBucketUsageMetrics() MetricsGroup {
|
|||||||
})
|
})
|
||||||
|
|
||||||
if stat.hasReplicationUsage() {
|
if stat.hasReplicationUsage() {
|
||||||
metrics = append(metrics, Metric{
|
|
||||||
Description: getBucketRepPendingBytesMD(),
|
|
||||||
Value: float64(stat.PendingSize),
|
|
||||||
VariableLabels: map[string]string{"bucket": bucket},
|
|
||||||
})
|
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getBucketRepFailedBytesMD(),
|
Description: getBucketRepFailedBytesMD(),
|
||||||
Value: float64(stat.FailedSize),
|
Value: float64(stat.FailedSize),
|
||||||
@ -1338,11 +1315,6 @@ func getBucketUsageMetrics() MetricsGroup {
|
|||||||
Value: float64(stat.ReplicaSize),
|
Value: float64(stat.ReplicaSize),
|
||||||
VariableLabels: map[string]string{"bucket": bucket},
|
VariableLabels: map[string]string{"bucket": bucket},
|
||||||
})
|
})
|
||||||
metrics = append(metrics, Metric{
|
|
||||||
Description: getBucketRepPendingOperationsMD(),
|
|
||||||
Value: float64(stat.PendingCount),
|
|
||||||
VariableLabels: map[string]string{"bucket": bucket},
|
|
||||||
})
|
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getBucketRepFailedOperationsMD(),
|
Description: getBucketRepFailedOperationsMD(),
|
||||||
Value: float64(stat.FailedCount),
|
Value: float64(stat.FailedCount),
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"math"
|
||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
@ -441,56 +442,23 @@ func getLatestReplicationStats(bucket string, u madmin.BucketUsageInfo) (s Bucke
|
|||||||
for _, bucketStat := range bucketStats {
|
for _, bucketStat := range bucketStats {
|
||||||
replStats.FailedCount += bucketStat.ReplicationStats.FailedCount
|
replStats.FailedCount += bucketStat.ReplicationStats.FailedCount
|
||||||
replStats.FailedSize += bucketStat.ReplicationStats.FailedSize
|
replStats.FailedSize += bucketStat.ReplicationStats.FailedSize
|
||||||
replStats.PendingCount += bucketStat.ReplicationStats.PendingCount
|
|
||||||
replStats.PendingSize += bucketStat.ReplicationStats.PendingSize
|
|
||||||
replStats.ReplicaSize += bucketStat.ReplicationStats.ReplicaSize
|
replStats.ReplicaSize += bucketStat.ReplicationStats.ReplicaSize
|
||||||
replStats.ReplicatedSize += bucketStat.ReplicationStats.ReplicatedSize
|
replStats.ReplicatedSize += bucketStat.ReplicationStats.ReplicatedSize
|
||||||
}
|
}
|
||||||
usageStat := globalReplicationStats.GetInitialUsage(bucket)
|
usageStat := globalReplicationStats.GetInitialUsage(bucket)
|
||||||
replStats.FailedCount += usageStat.FailedCount
|
|
||||||
replStats.FailedSize += usageStat.FailedSize
|
|
||||||
replStats.PendingCount += usageStat.PendingCount
|
|
||||||
replStats.PendingSize += usageStat.PendingSize
|
|
||||||
replStats.ReplicaSize += usageStat.ReplicaSize
|
replStats.ReplicaSize += usageStat.ReplicaSize
|
||||||
replStats.ReplicatedSize += usageStat.ReplicatedSize
|
replStats.ReplicatedSize += usageStat.ReplicatedSize
|
||||||
|
|
||||||
// use in memory replication stats if it is ahead of usage info.
|
// use in memory replication stats if it is ahead of usage info.
|
||||||
|
s.ReplicatedSize = u.ReplicatedSize
|
||||||
if replStats.ReplicatedSize >= u.ReplicatedSize {
|
if replStats.ReplicatedSize >= u.ReplicatedSize {
|
||||||
s.ReplicatedSize = replStats.ReplicatedSize
|
s.ReplicatedSize = replStats.ReplicatedSize
|
||||||
} else {
|
|
||||||
s.ReplicatedSize = u.ReplicatedSize
|
|
||||||
}
|
}
|
||||||
|
// Reset FailedSize and FailedCount to 0 for negative overflows which can
|
||||||
if replStats.PendingSize > u.ReplicationPendingSize {
|
// happen since data usage picture can lag behind actual usage state at the time of cluster start
|
||||||
s.PendingSize = replStats.PendingSize
|
s.FailedSize = uint64(math.Max(float64(replStats.FailedSize), 0))
|
||||||
} else {
|
s.FailedCount = uint64(math.Max(float64(replStats.FailedCount), 0))
|
||||||
s.PendingSize = u.ReplicationPendingSize
|
s.ReplicaSize = uint64(math.Max(float64(replStats.ReplicaSize), float64(u.ReplicaSize)))
|
||||||
}
|
|
||||||
|
|
||||||
if replStats.FailedSize > u.ReplicationFailedSize {
|
|
||||||
s.FailedSize = replStats.FailedSize
|
|
||||||
} else {
|
|
||||||
s.FailedSize = u.ReplicationFailedSize
|
|
||||||
}
|
|
||||||
|
|
||||||
if replStats.ReplicaSize > u.ReplicaSize {
|
|
||||||
s.ReplicaSize = replStats.ReplicaSize
|
|
||||||
} else {
|
|
||||||
s.ReplicaSize = u.ReplicaSize
|
|
||||||
}
|
|
||||||
|
|
||||||
if replStats.PendingCount > u.ReplicationPendingCount {
|
|
||||||
s.PendingCount = replStats.PendingCount
|
|
||||||
} else {
|
|
||||||
s.PendingCount = u.ReplicationPendingCount
|
|
||||||
}
|
|
||||||
|
|
||||||
if replStats.FailedCount > u.ReplicationFailedCount {
|
|
||||||
s.FailedCount = replStats.FailedCount
|
|
||||||
} else {
|
|
||||||
s.FailedCount = u.ReplicationFailedCount
|
|
||||||
}
|
|
||||||
|
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -537,15 +505,6 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
float64(usageInfo.ObjectsCount),
|
float64(usageInfo.ObjectsCount),
|
||||||
bucket,
|
bucket,
|
||||||
)
|
)
|
||||||
ch <- prometheus.MustNewConstMetric(
|
|
||||||
prometheus.NewDesc(
|
|
||||||
prometheus.BuildFQName("bucket", "replication", "pending_size"),
|
|
||||||
"Total capacity pending to be replicated",
|
|
||||||
[]string{"bucket"}, nil),
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
float64(stat.PendingSize),
|
|
||||||
bucket,
|
|
||||||
)
|
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName("bucket", "replication", "failed_size"),
|
prometheus.BuildFQName("bucket", "replication", "failed_size"),
|
||||||
@ -573,15 +532,6 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
float64(stat.ReplicaSize),
|
float64(stat.ReplicaSize),
|
||||||
bucket,
|
bucket,
|
||||||
)
|
)
|
||||||
ch <- prometheus.MustNewConstMetric(
|
|
||||||
prometheus.NewDesc(
|
|
||||||
prometheus.BuildFQName("bucket", "replication", "pending_count"),
|
|
||||||
"Total replication operations pending",
|
|
||||||
[]string{"bucket"}, nil),
|
|
||||||
prometheus.GaugeValue,
|
|
||||||
float64(stat.PendingCount),
|
|
||||||
bucket,
|
|
||||||
)
|
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName("bucket", "replication", "failed_count"),
|
prometheus.BuildFQName("bucket", "replication", "failed_count"),
|
||||||
|
@ -9,10 +9,8 @@ These metrics can be from any MinIO server once per collection.
|
|||||||
|:---------------------------------------------|:--------------------------------------------------------------------------------------------------------------------|
|
|:---------------------------------------------|:--------------------------------------------------------------------------------------------------------------------|
|
||||||
| `minio_bucket_objects_size_distribution` | Distribution of object sizes in the bucket, includes label for the bucket name. |
|
| `minio_bucket_objects_size_distribution` | Distribution of object sizes in the bucket, includes label for the bucket name. |
|
||||||
| `minio_bucket_replication_failed_bytes` | Total number of bytes failed at least once to replicate. |
|
| `minio_bucket_replication_failed_bytes` | Total number of bytes failed at least once to replicate. |
|
||||||
| `minio_bucket_replication_pending_bytes` | Total bytes pending to replicate. |
|
|
||||||
| `minio_bucket_replication_received_bytes` | Total number of bytes replicated to this bucket from another source bucket. |
|
| `minio_bucket_replication_received_bytes` | Total number of bytes replicated to this bucket from another source bucket. |
|
||||||
| `minio_bucket_replication_sent_bytes` | Total number of bytes replicated to the target bucket. |
|
| `minio_bucket_replication_sent_bytes` | Total number of bytes replicated to the target bucket. |
|
||||||
| `minio_bucket_replication_pending_count` | Total number of replication operations pending for this bucket. |
|
|
||||||
| `minio_bucket_replication_failed_count` | Total number of replication operations failed for this bucket. |
|
| `minio_bucket_replication_failed_count` | Total number of replication operations failed for this bucket. |
|
||||||
| `minio_bucket_usage_object_total` | Total number of objects |
|
| `minio_bucket_usage_object_total` | Total number of objects |
|
||||||
| `minio_bucket_usage_total_bytes` | Total bucket size in bytes |
|
| `minio_bucket_usage_total_bytes` | Total bucket size in bytes |
|
||||||
|
Loading…
Reference in New Issue
Block a user