mirror of
https://github.com/minio/minio.git
synced 2025-11-08 21:24:55 -05:00
Various improvements in replication (#11949)
- collect real time replication metrics for prometheus.
- add pending_count, failed_count metric for total pending/failed replication operations.
- add API to get replication metrics
- add MRF worker to handle spill-over replication operations
- multiple issues found with replication
- fixes an issue where a client sends a bucket
name with a trailing `/` in the SetRemoteTarget
API call; the bucket name is now trimmed to
avoid any extra `/`.
- hold write locks in GetObjectNInfo during replication
to ensure that object version stack is not overwritten
while reading the content.
- add additional protection during WriteMetadata() to
ensure that we always write a valid FileInfo{} and avoid
ever writing empty FileInfo{} to the lowest layers.
Co-authored-by: Poorna Krishnamoorthy <poorna@minio.io>
Co-authored-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
committed by
GitHub
parent
dca7cf7200
commit
47c09a1e6f
@@ -44,7 +44,7 @@ const (
|
||||
healMetricNamespace MetricNamespace = "minio_heal"
|
||||
interNodeMetricNamespace MetricNamespace = "minio_inter_node"
|
||||
nodeMetricNamespace MetricNamespace = "minio_node"
|
||||
minIOMetricNamespace MetricNamespace = "minio"
|
||||
minioMetricNamespace MetricNamespace = "minio"
|
||||
s3MetricNamespace MetricNamespace = "minio_s3"
|
||||
)
|
||||
|
||||
@@ -93,9 +93,11 @@ const (
|
||||
writeTotal MetricName = "write_total"
|
||||
total MetricName = "total"
|
||||
|
||||
failedCount MetricName = "failed_count"
|
||||
failedBytes MetricName = "failed_bytes"
|
||||
freeBytes MetricName = "free_bytes"
|
||||
pendingBytes MetricName = "pending_bytes"
|
||||
pendingCount MetricName = "pending_count"
|
||||
readBytes MetricName = "read_bytes"
|
||||
rcharBytes MetricName = "rchar_bytes"
|
||||
receivedBytes MetricName = "received_bytes"
|
||||
@@ -356,6 +358,16 @@ func getNodeDiskTotalBytesMD() MetricDescription {
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
// getUsageLastScanActivityMD returns the MetricDescription for the
// last-scan-activity gauge, composed of minioMetricNamespace,
// usageSubsystem and lastActivityTime. Per its help text, the metric
// reports the time elapsed in nanoseconds since the last scan activity.
func getUsageLastScanActivityMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: minioMetricNamespace,
|
||||
Subsystem: usageSubsystem,
|
||||
Name: lastActivityTime,
|
||||
Help: "Time elapsed (in nano seconds) since last scan activity. This is set to 0 until first scan cycle",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getBucketUsageTotalBytesMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: bucketMetricNamespace,
|
||||
@@ -410,6 +422,24 @@ func getBucketRepReceivedBytesMD() MetricDescription {
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
// getBucketRepPendingOperationsMD returns the MetricDescription for the
// pendingCount gauge in the bucket namespace / replication subsystem.
// Per its help text, it counts the objects pending replication.
func getBucketRepPendingOperationsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: bucketMetricNamespace,
|
||||
Subsystem: replicationSubsystem,
|
||||
Name: pendingCount,
|
||||
Help: "Total number of objects pending replication",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
// getBucketRepFailedOperationsMD returns the MetricDescription for the
// failedCount gauge in the bucket namespace / replication subsystem.
// Per its help text, it counts the objects that failed replication.
func getBucketRepFailedOperationsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: bucketMetricNamespace,
|
||||
Subsystem: replicationSubsystem,
|
||||
Name: failedCount,
|
||||
Help: "Total number of objects which failed replication",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
func getBucketObjectDistributionMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: bucketMetricNamespace,
|
||||
@@ -666,7 +696,7 @@ func getNodeOfflineTotalMD() MetricDescription {
|
||||
}
|
||||
func getMinIOVersionMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: minIOMetricNamespace,
|
||||
Namespace: minioMetricNamespace,
|
||||
Subsystem: softwareSubsystem,
|
||||
Name: versionInfo,
|
||||
Help: "MinIO Release tag for the server",
|
||||
@@ -675,7 +705,7 @@ func getMinIOVersionMD() MetricDescription {
|
||||
}
|
||||
func getMinIOCommitMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: minIOMetricNamespace,
|
||||
Namespace: minioMetricNamespace,
|
||||
Subsystem: softwareSubsystem,
|
||||
Name: commitInfo,
|
||||
Help: "Git commit hash for the MinIO release.",
|
||||
@@ -996,13 +1026,14 @@ func getMinioHealingMetrics() MetricsGroup {
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
var dur time.Duration
|
||||
if !bgSeq.lastHealActivity.IsZero() {
|
||||
dur = time.Since(bgSeq.lastHealActivity)
|
||||
|
||||
if bgSeq.lastHealActivity.IsZero() {
|
||||
return
|
||||
}
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getHealLastActivityTimeMD(),
|
||||
Value: float64(dur),
|
||||
Value: float64(time.Since(bgSeq.lastHealActivity)),
|
||||
})
|
||||
metrics = append(metrics, getObjectsScanned(bgSeq)...)
|
||||
metrics = append(metrics, getScannedItems(bgSeq)...)
|
||||
@@ -1224,7 +1255,14 @@ func getBucketUsageMetrics() MetricsGroup {
|
||||
return
|
||||
}
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getUsageLastScanActivityMD(),
|
||||
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
|
||||
})
|
||||
|
||||
for bucket, usage := range dataUsageInfo.BucketsUsage {
|
||||
stat := getLatestReplicationStats(bucket, usage)
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketUsageTotalBytesMD(),
|
||||
Value: float64(usage.Size),
|
||||
@@ -1237,25 +1275,35 @@ func getBucketUsageMetrics() MetricsGroup {
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
|
||||
if usage.hasReplicationUsage() {
|
||||
if stat.hasReplicationUsage() {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepPendingBytesMD(),
|
||||
Value: float64(usage.ReplicationPendingSize),
|
||||
Value: float64(stat.PendingSize),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepFailedBytesMD(),
|
||||
Value: float64(usage.ReplicationFailedSize),
|
||||
Value: float64(stat.FailedSize),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepSentBytesMD(),
|
||||
Value: float64(usage.ReplicatedSize),
|
||||
Value: float64(stat.ReplicatedSize),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepReceivedBytesMD(),
|
||||
Value: float64(usage.ReplicaSize),
|
||||
Value: float64(stat.ReplicaSize),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepPendingOperationsMD(),
|
||||
Value: float64(stat.PendingCount),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getBucketRepFailedOperationsMD(),
|
||||
Value: float64(stat.FailedCount),
|
||||
VariableLabels: map[string]string{"bucket": bucket},
|
||||
})
|
||||
}
|
||||
@@ -1372,13 +1420,6 @@ func getClusterStorageMetrics() MetricsGroup {
|
||||
}
|
||||
}
|
||||
|
||||
// hasReplicationUsage reports whether any of the bucket's replication
// size counters (pending, failed, replicated, or replica) is greater
// than zero.
func (b *BucketUsageInfo) hasReplicationUsage() bool {
|
||||
return b.ReplicationPendingSize > 0 ||
|
||||
b.ReplicationFailedSize > 0 ||
|
||||
b.ReplicatedSize > 0 ||
|
||||
b.ReplicaSize > 0
|
||||
}
|
||||
|
||||
// minioClusterCollector holds a single Prometheus descriptor.
// NOTE(review): presumably implements prometheus.Collector for
// cluster-level metrics; its methods are not visible here, so confirm.
type minioClusterCollector struct {
|
||||
// desc is the Prometheus descriptor kept by this collector.
desc *prometheus.Desc
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user