Various improvements in replication (#11949)

- collect real time replication metrics for prometheus.
- add pending_count, failed_count metric for total pending/failed replication operations.

- add API to get replication metrics

- add MRF worker to handle spill-over replication operations

- multiple issues found with replication
- fixes an issue when client sends a bucket
 name with `/` at the end from SetRemoteTarget
 API call make sure to trim the bucket name to 
 avoid any extra `/`.

- hold write locks in GetObjectNInfo during replication
  to ensure that object version stack is not overwritten
  while reading the content.

- add additional protection during WriteMetadata() to
  ensure that we always write a valid FileInfo{} and avoid
  ever writing empty FileInfo{} to the lowest layers.

Co-authored-by: Poorna Krishnamoorthy <poorna@minio.io>
Co-authored-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
Poorna Krishnamoorthy
2021-04-03 09:03:42 -07:00
committed by GitHub
parent dca7cf7200
commit 47c09a1e6f
36 changed files with 1914 additions and 496 deletions

View File

@@ -44,7 +44,7 @@ const (
healMetricNamespace MetricNamespace = "minio_heal"
interNodeMetricNamespace MetricNamespace = "minio_inter_node"
nodeMetricNamespace MetricNamespace = "minio_node"
minIOMetricNamespace MetricNamespace = "minio"
minioMetricNamespace MetricNamespace = "minio"
s3MetricNamespace MetricNamespace = "minio_s3"
)
@@ -93,9 +93,11 @@ const (
writeTotal MetricName = "write_total"
total MetricName = "total"
failedCount MetricName = "failed_count"
failedBytes MetricName = "failed_bytes"
freeBytes MetricName = "free_bytes"
pendingBytes MetricName = "pending_bytes"
pendingCount MetricName = "pending_count"
readBytes MetricName = "read_bytes"
rcharBytes MetricName = "rchar_bytes"
receivedBytes MetricName = "received_bytes"
@@ -356,6 +358,16 @@ func getNodeDiskTotalBytesMD() MetricDescription {
Type: gaugeMetric,
}
}
func getUsageLastScanActivityMD() MetricDescription {
return MetricDescription{
Namespace: minioMetricNamespace,
Subsystem: usageSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last scan activity. This is set to 0 until first scan cycle",
Type: gaugeMetric,
}
}
func getBucketUsageTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
@@ -410,6 +422,24 @@ func getBucketRepReceivedBytesMD() MetricDescription {
Type: gaugeMetric,
}
}
func getBucketRepPendingOperationsMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: pendingCount,
Help: "Total number of objects pending replication",
Type: gaugeMetric,
}
}
func getBucketRepFailedOperationsMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: failedCount,
Help: "Total number of objects which failed replication",
Type: gaugeMetric,
}
}
func getBucketObjectDistributionMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
@@ -666,7 +696,7 @@ func getNodeOfflineTotalMD() MetricDescription {
}
func getMinIOVersionMD() MetricDescription {
return MetricDescription{
Namespace: minIOMetricNamespace,
Namespace: minioMetricNamespace,
Subsystem: softwareSubsystem,
Name: versionInfo,
Help: "MinIO Release tag for the server",
@@ -675,7 +705,7 @@ func getMinIOVersionMD() MetricDescription {
}
func getMinIOCommitMD() MetricDescription {
return MetricDescription{
Namespace: minIOMetricNamespace,
Namespace: minioMetricNamespace,
Subsystem: softwareSubsystem,
Name: commitInfo,
Help: "Git commit hash for the MinIO release.",
@@ -996,13 +1026,14 @@ func getMinioHealingMetrics() MetricsGroup {
if !exists {
return
}
var dur time.Duration
if !bgSeq.lastHealActivity.IsZero() {
dur = time.Since(bgSeq.lastHealActivity)
if bgSeq.lastHealActivity.IsZero() {
return
}
metrics = append(metrics, Metric{
Description: getHealLastActivityTimeMD(),
Value: float64(dur),
Value: float64(time.Since(bgSeq.lastHealActivity)),
})
metrics = append(metrics, getObjectsScanned(bgSeq)...)
metrics = append(metrics, getScannedItems(bgSeq)...)
@@ -1224,7 +1255,14 @@ func getBucketUsageMetrics() MetricsGroup {
return
}
metrics = append(metrics, Metric{
Description: getUsageLastScanActivityMD(),
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
})
for bucket, usage := range dataUsageInfo.BucketsUsage {
stat := getLatestReplicationStats(bucket, usage)
metrics = append(metrics, Metric{
Description: getBucketUsageTotalBytesMD(),
Value: float64(usage.Size),
@@ -1237,25 +1275,35 @@ func getBucketUsageMetrics() MetricsGroup {
VariableLabels: map[string]string{"bucket": bucket},
})
if usage.hasReplicationUsage() {
if stat.hasReplicationUsage() {
metrics = append(metrics, Metric{
Description: getBucketRepPendingBytesMD(),
Value: float64(usage.ReplicationPendingSize),
Value: float64(stat.PendingSize),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepFailedBytesMD(),
Value: float64(usage.ReplicationFailedSize),
Value: float64(stat.FailedSize),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepSentBytesMD(),
Value: float64(usage.ReplicatedSize),
Value: float64(stat.ReplicatedSize),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepReceivedBytesMD(),
Value: float64(usage.ReplicaSize),
Value: float64(stat.ReplicaSize),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepPendingOperationsMD(),
Value: float64(stat.PendingCount),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepFailedOperationsMD(),
Value: float64(stat.FailedCount),
VariableLabels: map[string]string{"bucket": bucket},
})
}
@@ -1372,13 +1420,6 @@ func getClusterStorageMetrics() MetricsGroup {
}
}
func (b *BucketUsageInfo) hasReplicationUsage() bool {
return b.ReplicationPendingSize > 0 ||
b.ReplicationFailedSize > 0 ||
b.ReplicatedSize > 0 ||
b.ReplicaSize > 0
}
type minioClusterCollector struct {
desc *prometheus.Desc
}