diff --git a/cmd/metrics-v3-api.go b/cmd/metrics-v3-api.go
index b12fe3a5f..548d72172 100644
--- a/cmd/metrics-v3-api.go
+++ b/cmd/metrics-v3-api.go
@@ -144,33 +144,33 @@ func loadAPIRequestsNetworkMetrics(ctx context.Context, m MetricValues, _ *metri
// Metric Descriptions for bucket level S3 metrics.
var (
- apiBucketTrafficSentBytesMD = NewCounterMD(apiTrafficSentBytes,
- "Total number of bytes received for a bucket", "bucket", "type")
+ bucketAPITrafficSentBytesMD = NewCounterMD(apiTrafficSentBytes,
+ "Total number of bytes sent for a bucket", "bucket", "type")
- apiBucketTrafficRecvBytesMD = NewCounterMD(apiTrafficRecvBytes,
- "Total number of bytes sent for a bucket", "bucket", "type")
+ bucketAPITrafficRecvBytesMD = NewCounterMD(apiTrafficRecvBytes,
+ "Total number of bytes received for a bucket", "bucket", "type")
- apiBucketRequestsInFlightMD = NewGaugeMD(apiRequestsInFlightTotal,
+ bucketAPIRequestsInFlightMD = NewGaugeMD(apiRequestsInFlightTotal,
"Total number of requests currently in flight for a bucket", "bucket", "name", "type")
- apiBucketRequestsTotalMD = NewCounterMD(apiRequestsTotal,
+ bucketAPIRequestsTotalMD = NewCounterMD(apiRequestsTotal,
"Total number of requests for a bucket", "bucket", "name", "type")
- apiBucketRequestsCanceledMD = NewCounterMD(apiRequestsCanceledTotal,
+ bucketAPIRequestsCanceledMD = NewCounterMD(apiRequestsCanceledTotal,
"Total number of requests canceled by the client for a bucket", "bucket", "name", "type")
- apiBucketRequests4xxErrorsMD = NewCounterMD(apiRequests4xxErrorsTotal,
+ bucketAPIRequests4xxErrorsMD = NewCounterMD(apiRequests4xxErrorsTotal,
"Total number of requests with 4xx errors for a bucket", "bucket", "name", "type")
- apiBucketRequests5xxErrorsMD = NewCounterMD(apiRequests5xxErrorsTotal,
+ bucketAPIRequests5xxErrorsMD = NewCounterMD(apiRequests5xxErrorsTotal,
"Total number of requests with 5xx errors for a bucket", "bucket", "name", "type")
- apiBucketRequestsTTFBSecondsDistributionMD = NewCounterMD(apiRequestsTTFBSecondsDistribution,
+ bucketAPIRequestsTTFBSecondsDistributionMD = NewCounterMD(apiRequestsTTFBSecondsDistribution,
"Distribution of time to first byte across API calls for a bucket",
"bucket", "name", "le", "type")
)
-// loadAPIBucketHTTPMetrics - loads bucket level S3 HTTP metrics.
+// loadBucketAPIHTTPMetrics - loads bucket level S3 HTTP metrics.
//
// This is a `MetricsLoaderFn`.
//
// This includes bucket level S3 HTTP metrics and S3 network in/out metrics.
-func loadAPIBucketHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error {
+func loadBucketAPIHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error {
if len(buckets) == 0 {
return nil
}
@@ -209,10 +209,10 @@ func loadAPIBucketHTTPMetrics(ctx context.Context, m MetricValues, _ *metricsCac
return nil
}
-// loadAPIBucketTTFBMetrics - loads bucket S3 TTFB metrics.
+// loadBucketAPITTFBMetrics - loads bucket S3 TTFB metrics.
//
// This is a `MetricsLoaderFn`.
-func loadAPIBucketTTFBMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error {
+func loadBucketAPITTFBMetrics(ctx context.Context, m MetricValues, _ *metricsCache, buckets []string) error {
renameLabels := map[string]string{"api": "name"}
m.SetHistogram(apiRequestsTTFBSecondsDistribution, bucketHTTPRequestsDuration, renameLabels,
buckets, "type", "s3")
diff --git a/cmd/metrics-v3-bucket-replication.go b/cmd/metrics-v3-bucket-replication.go
new file mode 100644
index 000000000..64f65e832
--- /dev/null
+++ b/cmd/metrics-v3-bucket-replication.go
@@ -0,0 +1,155 @@
+// Copyright (c) 2015-2024 MinIO, Inc.
+//
+// This file is part of MinIO Object Storage stack
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package cmd
+
+import (
+ "context"
+)
+
+const ( // bucket replication metric names, followed by the label keys used with them
+ bucketReplLastHrFailedBytes = "last_hour_failed_bytes"
+ bucketReplLastHrFailedCount = "last_hour_failed_count"
+ bucketReplLastMinFailedBytes = "last_minute_failed_bytes"
+ bucketReplLastMinFailedCount = "last_minute_failed_count"
+ bucketReplLatencyMs = "latency_ms"
+ bucketReplProxiedDeleteTaggingRequestsTotal = "proxied_delete_tagging_requests_total"
+ bucketReplProxiedGetRequestsFailures = "proxied_get_requests_failures"
+ bucketReplProxiedGetRequestsTotal = "proxied_get_requests_total"
+ bucketReplProxiedGetTaggingRequestsFailures = "proxied_get_tagging_requests_failures"
+ bucketReplProxiedGetTaggingRequestsTotal = "proxied_get_tagging_requests_total"
+ bucketReplProxiedHeadRequestsFailures = "proxied_head_requests_failures"
+ bucketReplProxiedHeadRequestsTotal = "proxied_head_requests_total"
+ bucketReplProxiedPutTaggingRequestsFailures = "proxied_put_tagging_requests_failures"
+ bucketReplProxiedPutTaggingRequestsTotal = "proxied_put_tagging_requests_total"
+ bucketReplSentBytes = "sent_bytes"
+ bucketReplSentCount = "sent_count"
+ bucketReplTotalFailedBytes = "total_failed_bytes"
+ bucketReplTotalFailedCount = "total_failed_count"
+ bucketReplProxiedDeleteTaggingRequestsFailures = "proxied_delete_tagging_requests_failures"
+ bucketL = "bucket" // label key: bucket name
+ operationL = "operation" // label key: replication operation (the loader passes "upload")
+ targetArnL = "targetArn" // label key: key of the per-target replication stats map
+)
+
+var ( // descriptors (kind, help text, label keys) for the bucket replication metric names
+ bucketReplLastHrFailedBytesMD = NewGaugeMD(bucketReplLastHrFailedBytes,
+ "Total number of bytes failed at least once to replicate in the last hour on a bucket",
+ bucketL)
+ bucketReplLastHrFailedCountMD = NewGaugeMD(bucketReplLastHrFailedCount,
+ "Total number of objects which failed replication in the last hour on a bucket",
+ bucketL)
+ bucketReplLastMinFailedBytesMD = NewGaugeMD(bucketReplLastMinFailedBytes,
+ "Total number of bytes failed at least once to replicate in the last full minute on a bucket",
+ bucketL)
+ bucketReplLastMinFailedCountMD = NewGaugeMD(bucketReplLastMinFailedCount,
+ "Total number of objects which failed replication in the last full minute on a bucket",
+ bucketL)
+ bucketReplLatencyMsMD = NewGaugeMD(bucketReplLatencyMs,
+ "Replication latency on a bucket in milliseconds",
+ bucketL, operationL, rangeL, targetArnL) // the only descriptor in this block with more than the bucket label
+ bucketReplProxiedDeleteTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedDeleteTaggingRequestsTotal,
+ "Number of DELETE tagging requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedGetRequestsFailuresMD = NewCounterMD(bucketReplProxiedGetRequestsFailures,
+ "Number of failures in GET requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedGetRequestsTotalMD = NewCounterMD(bucketReplProxiedGetRequestsTotal,
+ "Number of GET requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedGetTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedGetTaggingRequestsFailures,
+ "Number of failures in GET tagging requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedGetTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedGetTaggingRequestsTotal,
+ "Number of GET tagging requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedHeadRequestsFailuresMD = NewCounterMD(bucketReplProxiedHeadRequestsFailures,
+ "Number of failures in HEAD requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedHeadRequestsTotalMD = NewCounterMD(bucketReplProxiedHeadRequestsTotal,
+ "Number of HEAD requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedPutTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedPutTaggingRequestsFailures,
+ "Number of failures in PUT tagging requests proxied to replication target",
+ bucketL)
+ bucketReplProxiedPutTaggingRequestsTotalMD = NewCounterMD(bucketReplProxiedPutTaggingRequestsTotal,
+ "Number of PUT tagging requests proxied to replication target",
+ bucketL)
+ bucketReplSentBytesMD = NewCounterMD(bucketReplSentBytes,
+ "Total number of bytes replicated to the target",
+ bucketL)
+ bucketReplSentCountMD = NewCounterMD(bucketReplSentCount,
+ "Total number of objects replicated to the target",
+ bucketL)
+ bucketReplTotalFailedBytesMD = NewCounterMD(bucketReplTotalFailedBytes,
+ "Total number of bytes failed at least once to replicate since server start",
+ bucketL)
+ bucketReplTotalFailedCountMD = NewCounterMD(bucketReplTotalFailedCount,
+ "Total number of objects which failed replication since server start",
+ bucketL)
+ bucketReplProxiedDeleteTaggingRequestsFailuresMD = NewCounterMD(bucketReplProxiedDeleteTaggingRequestsFailures,
+ "Number of failures in DELETE tagging requests proxied to replication target",
+ bucketL)
+)
+
+// loadBucketReplicationMetrics is the `BucketMetricsLoaderFn` that publishes replication
+// failure windows, proxied-request counters and upload latency for each requested bucket.
+func loadBucketReplicationMetrics(ctx context.Context, m MetricValues, c *metricsCache, buckets []string) error {
+	if globalSiteReplicationSys.isEnabled() {
+		return nil // nothing is reported while site replication is enabled
+	}
+
+	usage, err := c.dataUsageInfo.Get()
+	if err != nil {
+		metricsLogIf(ctx, err) // logged only; the loader itself still returns nil
+		return nil
+	}
+
+	latest := globalReplicationStats.getAllLatest(usage.BucketsUsage)
+	for _, bucket := range buckets {
+		bs, found := latest[bucket]
+		if !found || !bs.ReplicationStats.hasReplicationUsage() {
+			continue
+		}
+		labels := []string{bucketL, bucket}
+		set := func(name MetricName, v float64) { m.Set(name, v, labels...) }
+		// NOTE(review): the bucket-only series below are written once per target ARN - confirm that is intended.
+		for arn, ts := range bs.ReplicationStats.Stats {
+			set(bucketReplLastHrFailedBytes, float64(ts.Failed.LastHour.Bytes))
+			set(bucketReplLastHrFailedCount, float64(ts.Failed.LastHour.Count))
+			set(bucketReplLastMinFailedBytes, float64(ts.Failed.LastMinute.Bytes))
+			set(bucketReplLastMinFailedCount, float64(ts.Failed.LastMinute.Count))
+			set(bucketReplProxiedDeleteTaggingRequestsTotal, float64(bs.ProxyStats.RmvTagTotal))
+			set(bucketReplProxiedGetRequestsFailures, float64(bs.ProxyStats.GetFailedTotal))
+			set(bucketReplProxiedGetRequestsTotal, float64(bs.ProxyStats.GetTotal))
+			set(bucketReplProxiedGetTaggingRequestsFailures, float64(bs.ProxyStats.GetTagFailedTotal))
+			set(bucketReplProxiedGetTaggingRequestsTotal, float64(bs.ProxyStats.GetTagTotal))
+			set(bucketReplProxiedHeadRequestsFailures, float64(bs.ProxyStats.HeadFailedTotal))
+			set(bucketReplProxiedHeadRequestsTotal, float64(bs.ProxyStats.HeadTotal))
+			set(bucketReplProxiedPutTaggingRequestsFailures, float64(bs.ProxyStats.PutTagFailedTotal))
+			set(bucketReplProxiedPutTaggingRequestsTotal, float64(bs.ProxyStats.PutTagTotal))
+			set(bucketReplSentCount, float64(ts.ReplicatedCount))
+			set(bucketReplTotalFailedBytes, float64(ts.Failed.Totals.Bytes))
+			set(bucketReplTotalFailedCount, float64(ts.Failed.Totals.Count))
+			set(bucketReplProxiedDeleteTaggingRequestsFailures, float64(bs.ProxyStats.RmvTagFailedTotal))
+			set(bucketReplSentBytes, float64(ts.ReplicatedSize))
+			SetHistogramValues(m, bucketReplLatencyMs, ts.Latency.getUploadLatency(), bucketL, bucket, operationL, "upload", targetArnL, arn)
+		}
+	}
+
+	return nil
+}
diff --git a/cmd/metrics-v3-replication.go b/cmd/metrics-v3-replication.go
new file mode 100644
index 000000000..1961c3304
--- /dev/null
+++ b/cmd/metrics-v3-replication.go
@@ -0,0 +1,96 @@
+// Copyright (c) 2015-2024 MinIO, Inc.
+//
+// This file is part of MinIO Object Storage stack
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package cmd
+
+import (
+ "context"
+)
+
+const ( // metric names set by loadClusterReplicationMetrics
+ replicationAverageActiveWorkers = "average_active_workers"
+ replicationAverageQueuedBytes = "average_queued_bytes"
+ replicationAverageQueuedCount = "average_queued_count"
+ replicationAverageDataTransferRate = "average_data_transfer_rate"
+ replicationCurrentActiveWorkers = "current_active_workers"
+ replicationCurrentDataTransferRate = "current_data_transfer_rate"
+ replicationLastMinuteQueuedBytes = "last_minute_queued_bytes" // fed from the queue summary's Curr.Bytes
+ replicationLastMinuteQueuedCount = "last_minute_queued_count" // fed from the queue summary's Curr.Count
+ replicationMaxActiveWorkers = "max_active_workers"
+ replicationMaxQueuedBytes = "max_queued_bytes"
+ replicationMaxQueuedCount = "max_queued_count"
+ replicationMaxDataTransferRate = "max_data_transfer_rate" // fed from the transfer summary's Peak
+)
+
+var ( // gauge descriptors for the replication metric names; none declares extra label keys
+ replicationAverageActiveWorkersMD = NewGaugeMD(replicationAverageActiveWorkers,
+ "Average number of active replication workers")
+ replicationAverageQueuedBytesMD = NewGaugeMD(replicationAverageQueuedBytes,
+ "Average number of bytes queued for replication since server start")
+ replicationAverageQueuedCountMD = NewGaugeMD(replicationAverageQueuedCount,
+ "Average number of objects queued for replication since server start")
+ replicationAverageDataTransferRateMD = NewGaugeMD(replicationAverageDataTransferRate,
+ "Average replication data transfer rate in bytes/sec")
+ replicationCurrentActiveWorkersMD = NewGaugeMD(replicationCurrentActiveWorkers,
+ "Total number of active replication workers")
+ replicationCurrentDataTransferRateMD = NewGaugeMD(replicationCurrentDataTransferRate,
+ "Current replication data transfer rate in bytes/sec")
+ replicationLastMinuteQueuedBytesMD = NewGaugeMD(replicationLastMinuteQueuedBytes,
+ "Number of bytes queued for replication in the last full minute")
+ replicationLastMinuteQueuedCountMD = NewGaugeMD(replicationLastMinuteQueuedCount,
+ "Number of objects queued for replication in the last full minute")
+ replicationMaxActiveWorkersMD = NewGaugeMD(replicationMaxActiveWorkers,
+ "Maximum number of active replication workers seen since server start")
+ replicationMaxQueuedBytesMD = NewGaugeMD(replicationMaxQueuedBytes,
+ "Maximum number of bytes queued for replication since server start")
+ replicationMaxQueuedCountMD = NewGaugeMD(replicationMaxQueuedCount,
+ "Maximum number of objects queued for replication since server start")
+ replicationMaxDataTransferRateMD = NewGaugeMD(replicationMaxDataTransferRate,
+ "Maximum replication data transfer rate in bytes/sec seen since server start")
+)
+
+// loadClusterReplicationMetrics is the `MetricsLoaderFn` that publishes replication queue,
+// active-worker and data-transfer-rate figures taken from the queue stats summary.
+func loadClusterReplicationMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
+	if globalReplicationStats == nil {
+		return nil // no replication stats object, so nothing is reported
+	}
+
+	summary := globalReplicationStats.getNodeQueueStatsSummary()
+
+	// Queued work: average, maximum and Curr figures (Curr feeds the last_minute_* names).
+	m.Set(replicationAverageQueuedBytes, float64(summary.QStats.Avg.Bytes))
+	m.Set(replicationAverageQueuedCount, float64(summary.QStats.Avg.Count))
+	m.Set(replicationMaxQueuedBytes, float64(summary.QStats.Max.Bytes))
+	m.Set(replicationMaxQueuedCount, float64(summary.QStats.Max.Count))
+	m.Set(replicationLastMinuteQueuedBytes, float64(summary.QStats.Curr.Bytes))
+	m.Set(replicationLastMinuteQueuedCount, float64(summary.QStats.Curr.Count))
+
+	// Active replication workers.
+	m.Set(replicationAverageActiveWorkers, float64(summary.ActiveWorkers.Avg))
+	m.Set(replicationCurrentActiveWorkers, float64(summary.ActiveWorkers.Curr))
+	m.Set(replicationMaxActiveWorkers, float64(summary.ActiveWorkers.Max))
+
+	if len(summary.XferStats) == 0 {
+		return nil // transfer rates are only reported when XferStats has entries
+	}
+	total := summary.XferStats[Total] // entry stored under the Total key
+	m.Set(replicationAverageDataTransferRate, total.Avg)
+	m.Set(replicationCurrentDataTransferRate, total.Curr)
+	m.Set(replicationMaxDataTransferRate, total.Peak)
+	return nil
+}
diff --git a/cmd/metrics-v3-types.go b/cmd/metrics-v3-types.go
index 4fd5e265b..07bc1d616 100644
--- a/cmd/metrics-v3-types.go
+++ b/cmd/metrics-v3-types.go
@@ -72,6 +72,8 @@ const (
GaugeMT
// HistogramMT - represents a histogram metric.
HistogramMT
+ // rangeL - represents a range label.
+ rangeL = "range"
)
func (mt MetricType) String() string {
@@ -225,7 +227,7 @@ func (m *MetricValues) Set(name MetricName, value float64, labels ...string) {
}
if len(labels)/2 != len(validLabels) {
- panic(fmt.Sprintf("not all labels were given values"))
+ panic("not all labels were given values")
}
v, ok := m.values[name]
@@ -284,6 +286,14 @@ func (m *MetricValues) SetHistogram(name MetricName, hist *prometheus.HistogramV
}
}
+// SetHistogramValues - calls m.Set once per entry of values, passing the map key as the value of the
+// `range` label. labels is capped before append so the caller's backing array is never written to.
+func SetHistogramValues[V uint64 | int64 | float64](m MetricValues, name MetricName, values map[string]V, labels ...string) {
+	for rng, val := range values {
+		m.Set(name, float64(val), append(labels[:len(labels):len(labels)], rangeL, rng)...)
+	}
+}
+
// MetricsLoaderFn - represents a function to load metrics from the
// metricsCache.
//
diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go
index 79b432b9b..9dc36e4fc 100644
--- a/cmd/metrics-v3.go
+++ b/cmd/metrics-v3.go
@@ -35,7 +35,9 @@ import (
// for the bucket "mybucket" would be /minio/metrics/v3/bucket/api/mybucket
const (
apiRequestsCollectorPath collectorPath = "/api/requests"
- apiBucketCollectorPath collectorPath = "/bucket/api"
+
+ bucketAPICollectorPath collectorPath = "/bucket/api"
+ bucketReplicationCollectorPath collectorPath = "/bucket/replication"
systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
systemDriveCollectorPath collectorPath = "/system/drive"
@@ -54,6 +56,7 @@ const (
auditCollectorPath collectorPath = "/audit"
loggerWebhookCollectorPath collectorPath = "/logger/webhook"
+ replicationCollectorPath collectorPath = "/replication"
)
const (
@@ -97,20 +100,45 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
loadAPIRequestsNetworkMetrics),
)
- apiBucketMG := NewBucketMetricsGroup(apiBucketCollectorPath,
+ bucketAPIMG := NewBucketMetricsGroup(bucketAPICollectorPath,
[]MetricDescriptor{
- apiBucketTrafficRecvBytesMD,
- apiBucketTrafficSentBytesMD,
+ bucketAPITrafficRecvBytesMD,
+ bucketAPITrafficSentBytesMD,
- apiBucketRequestsInFlightMD,
- apiBucketRequestsTotalMD,
- apiBucketRequestsCanceledMD,
- apiBucketRequests4xxErrorsMD,
- apiBucketRequests5xxErrorsMD,
+ bucketAPIRequestsInFlightMD,
+ bucketAPIRequestsTotalMD,
+ bucketAPIRequestsCanceledMD,
+ bucketAPIRequests4xxErrorsMD,
+ bucketAPIRequests5xxErrorsMD,
- apiBucketRequestsTTFBSecondsDistributionMD,
+ bucketAPIRequestsTTFBSecondsDistributionMD,
},
- JoinBucketLoaders(loadAPIBucketHTTPMetrics, loadAPIBucketTTFBMetrics),
+ JoinBucketLoaders(loadBucketAPIHTTPMetrics, loadBucketAPITTFBMetrics),
+ )
+
+ bucketReplicationMG := NewBucketMetricsGroup(bucketReplicationCollectorPath,
+ []MetricDescriptor{
+ bucketReplLastHrFailedBytesMD,
+ bucketReplLastHrFailedCountMD,
+ bucketReplLastMinFailedBytesMD,
+ bucketReplLastMinFailedCountMD,
+ bucketReplLatencyMsMD,
+ bucketReplProxiedDeleteTaggingRequestsTotalMD,
+ bucketReplProxiedGetRequestsFailuresMD,
+ bucketReplProxiedGetRequestsTotalMD,
+ bucketReplProxiedGetTaggingRequestsFailuresMD,
+ bucketReplProxiedGetTaggingRequestsTotalMD,
+ bucketReplProxiedHeadRequestsFailuresMD,
+ bucketReplProxiedHeadRequestsTotalMD,
+ bucketReplProxiedPutTaggingRequestsFailuresMD,
+ bucketReplProxiedPutTaggingRequestsTotalMD,
+ bucketReplSentBytesMD,
+ bucketReplSentCountMD,
+ bucketReplTotalFailedBytesMD,
+ bucketReplTotalFailedCountMD,
+ bucketReplProxiedDeleteTaggingRequestsFailuresMD,
+ },
+ loadBucketReplicationMetrics,
)
systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath,
@@ -296,6 +324,24 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
loadClusterIAMMetrics,
)
+ clusterReplicationMG := NewMetricsGroup(replicationCollectorPath,
+ []MetricDescriptor{
+ replicationAverageActiveWorkersMD,
+ replicationAverageQueuedBytesMD,
+ replicationAverageQueuedCountMD,
+ replicationAverageDataTransferRateMD,
+ replicationCurrentActiveWorkersMD,
+ replicationCurrentDataTransferRateMD,
+ replicationLastMinuteQueuedBytesMD,
+ replicationLastMinuteQueuedCountMD,
+ replicationMaxActiveWorkersMD,
+ replicationMaxQueuedBytesMD,
+ replicationMaxQueuedCountMD,
+ replicationMaxDataTransferRateMD,
+ },
+ loadClusterReplicationMetrics,
+ )
+
loggerWebhookMG := NewMetricsGroup(loggerWebhookCollectorPath,
[]MetricDescriptor{
webhookFailedMessagesMD,
@@ -316,7 +362,8 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
allMetricGroups := []*MetricsGroup{
apiRequestsMG,
- apiBucketMG,
+ bucketAPIMG,
+ bucketReplicationMG,
systemNetworkInternodeMG,
systemDriveMG,
@@ -330,6 +377,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
clusterErasureSetMG,
clusterNotificationMG,
clusterIAMMG,
+ clusterReplicationMG,
auditMG,
loggerWebhookMG,
diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md
index d09348d0c..ffbeea9d8 100644
--- a/docs/metrics/v3.md
+++ b/docs/metrics/v3.md
@@ -31,7 +31,7 @@ These are metrics about requests served by the (current) node.
| Path | Description |
|-----------------|--------------------------------------------------|
| `/api/requests` | Metrics over all requests |
-| `/api/bucket` | Metrics over all requests split by bucket labels |
+| `/bucket/api` | Metrics over all requests for a given bucket |
| | |
### Audit metrics
@@ -122,6 +122,30 @@ The standard metrics group for GoCollector is not shown below.
| `minio_bucket_api_5xx_errors_total` | `counter` | Total number of requests with 5xx errors for a bucket | `bucket,name,type,server,pool_index` |
| `minio_bucket_api_ttfb_seconds_distribution` | `counter` | Distribution of time to first byte across API calls for a bucket | `bucket,name,le,type,server,pool_index` |
+### `/bucket/replication`
+
+| Name | Type | Help | Labels |
+|---------------------------------------------------------------------|-----------|---------------------------------------------------------------------------------------------|-------------------------------------------|
+| `minio_bucket_replication_last_hour_failed_bytes` | `gauge` | Total number of bytes failed at least once to replicate in the last hour on a bucket | `bucket,server` |
+| `minio_bucket_replication_last_hour_failed_count` | `gauge` | Total number of objects which failed replication in the last hour on a bucket | `bucket,server` |
+| `minio_bucket_replication_last_minute_failed_bytes` | `gauge` | Total number of bytes failed at least once to replicate in the last full minute on a bucket | `bucket,server` |
+| `minio_bucket_replication_last_minute_failed_count` | `gauge` | Total number of objects which failed replication in the last full minute on a bucket | `bucket,server` |
+| `minio_bucket_replication_latency_ms` | `gauge` | Replication latency on a bucket in milliseconds | `bucket,operation,range,targetArn,server` |
+| `minio_bucket_replication_proxied_delete_tagging_requests_total` | `counter` | Number of DELETE tagging requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_get_requests_failures` | `counter` | Number of failures in GET requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_get_requests_total` | `counter` | Number of GET requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_get_tagging_requests_failures` | `counter` | Number of failures in GET tagging requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_get_tagging_requests_total` | `counter` | Number of GET tagging requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_head_requests_failures` | `counter` | Number of failures in HEAD requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_head_requests_total` | `counter` | Number of HEAD requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_put_tagging_requests_failures` | `counter` | Number of failures in PUT tagging requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_proxied_put_tagging_requests_total` | `counter` | Number of PUT tagging requests proxied to replication target | `bucket,server` |
+| `minio_bucket_replication_sent_bytes` | `counter` | Total number of bytes replicated to the target | `bucket,server` |
+| `minio_bucket_replication_sent_count` | `counter` | Total number of objects replicated to the target | `bucket,server` |
+| `minio_bucket_replication_total_failed_bytes` | `counter` | Total number of bytes failed at least once to replicate since server start | `bucket,server` |
+| `minio_bucket_replication_total_failed_count` | `counter` | Total number of objects which failed replication since server start | `bucket,server` |
+| `minio_bucket_replication_proxied_delete_tagging_requests_failures` | `counter` | Number of failures in DELETE tagging requests proxied to replication target | `bucket,server` |
+
### `/audit`
| Name | Type | Help | Labels |
@@ -195,25 +219,25 @@ The standard metrics group for GoCollector is not shown below.
### `/system/process`
-| Name | Type | Help | Labels |
-|-------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------|
-| `locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` |
-| `locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` |
-| `cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` |
-| `go_routine_total` | `gauge` | Total number of go routines running | `server` |
-| `io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` |
-| `io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` |
-| `io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` |
-| `io_write_bytes` | `counter` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` |
-| `start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoc | `server` |
-| `uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | `server` |
-| `file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` |
-| `file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` |
-| `syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` |
-| `syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` |
-| `resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` |
-| `virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` |
-| `virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` |
+| Name | Type | Help | Labels |
+|----------------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------|
+| `minio_system_process_locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` |
+| `minio_system_process_locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` |
+| `minio_system_process_cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` |
+| `minio_system_process_go_routine_total` | `gauge` | Total number of go routines running | `server` |
+| `minio_system_process_io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` |
+| `minio_system_process_io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` |
+| `minio_system_process_io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` |
+| `minio_system_process_io_write_bytes` | `counter` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` |
+| `minio_system_process_start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoch | `server` |
+| `minio_system_process_uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | `server` |
+| `minio_system_process_file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` |
+| `minio_system_process_file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` |
+| `minio_system_process_syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` |
+| `minio_system_process_syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` |
+| `minio_system_process_resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` |
+| `minio_system_process_virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` |
+| `minio_system_process_virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` |
### `/cluster/health`
@@ -302,3 +326,20 @@ The standard metrics group for GoCollector is not shown below.
| `minio_logger_webhook_failed_messages` | `counter` | Number of messages that failed to send | `server,name,endpoint` |
| `minio_logger_webhook_queue_length` | `gauge` | Webhook queue length | `server,name,endpoint` |
| `minio_logger_webhook_total_message` | `counter` | Total number of messages sent to this target | `server,name,endpoint` |
+
+### `/replication`
+
+| Name | Type | Help | Labels |
+|---------------------------------------------------|---------|-----------------------------------------------------------------------------|----------|
+| `minio_replication_average_active_workers` | `gauge` | Average number of active replication workers | `server` |
+| `minio_replication_average_queued_bytes` | `gauge` | Average number of bytes queued for replication since server start | `server` |
+| `minio_replication_average_queued_count` | `gauge` | Average number of objects queued for replication since server start | `server` |
+| `minio_replication_average_data_transfer_rate` | `gauge` | Average replication data transfer rate in bytes/sec | `server` |
+| `minio_replication_current_active_workers` | `gauge` | Total number of active replication workers | `server` |
+| `minio_replication_current_data_transfer_rate` | `gauge` | Current replication data transfer rate in bytes/sec | `server` |
+| `minio_replication_last_minute_queued_bytes` | `gauge` | Number of bytes queued for replication in the last full minute | `server` |
+| `minio_replication_last_minute_queued_count` | `gauge` | Number of objects queued for replication in the last full minute | `server` |
+| `minio_replication_max_active_workers` | `gauge` | Maximum number of active replication workers seen since server start | `server` |
+| `minio_replication_max_queued_bytes` | `gauge` | Maximum number of bytes queued for replication since server start | `server` |
+| `minio_replication_max_queued_count` | `gauge` | Maximum number of objects queued for replication since server start | `server` |
+| `minio_replication_max_data_transfer_rate` | `gauge` | Maximum replication data transfer rate in bytes/sec seen since server start | `server` |