metrics: fix panic in replication stats reporting (#17979)

This commit is contained in:
Poorna 2023-09-05 10:26:18 -07:00 committed by GitHub
parent 19f70dbfbf
commit 812f5a02d7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -2085,99 +2085,101 @@ func getReplicationClusterMetrics() *MetricsGroup {
) )
mg.RegisterRead(func(_ context.Context) []Metric { mg.RegisterRead(func(_ context.Context) []Metric {
var ml []Metric
// common operational metrics for bucket replication and site replication - published // common operational metrics for bucket replication and site replication - published
// at cluster level // at cluster level
qs := globalReplicationStats.getNodeQueueStatsSummary() if globalReplicationStats != nil {
activeWorkersCount := Metric{ qs := globalReplicationStats.getNodeQueueStatsSummary()
Description: getClusterReplActiveWorkersCountMD(), activeWorkersCount := Metric{
VariableLabels: map[string]string{serverName: qs.NodeName}, Description: getClusterReplActiveWorkersCountMD(),
} VariableLabels: map[string]string{serverName: qs.NodeName},
avgActiveWorkersCount := Metric{ }
Description: getClusterReplAvgActiveWorkersCountMD(), avgActiveWorkersCount := Metric{
VariableLabels: map[string]string{serverName: qs.NodeName}, Description: getClusterReplAvgActiveWorkersCountMD(),
} VariableLabels: map[string]string{serverName: qs.NodeName},
maxActiveWorkersCount := Metric{ }
Description: getClusterReplMaxActiveWorkersCountMD(), maxActiveWorkersCount := Metric{
VariableLabels: map[string]string{serverName: qs.NodeName}, Description: getClusterReplMaxActiveWorkersCountMD(),
} VariableLabels: map[string]string{serverName: qs.NodeName},
currInQueueCount := Metric{ }
Description: getClusterReplCurrQueuedOperationsMD(), currInQueueCount := Metric{
VariableLabels: map[string]string{serverName: qs.NodeName}, Description: getClusterReplCurrQueuedOperationsMD(),
} VariableLabels: map[string]string{serverName: qs.NodeName},
currInQueueBytes := Metric{ }
Description: getClusterReplCurrQueuedBytesMD(), currInQueueBytes := Metric{
VariableLabels: map[string]string{serverName: qs.NodeName}, Description: getClusterReplCurrQueuedBytesMD(),
} VariableLabels: map[string]string{serverName: qs.NodeName},
}
currTransferRate := Metric{ currTransferRate := Metric{
Description: getClusterReplCurrentTransferRateMD(), Description: getClusterReplCurrentTransferRateMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
avgQueueCount := Metric{ avgQueueCount := Metric{
Description: getClusterReplAvgQueuedOperationsMD(), Description: getClusterReplAvgQueuedOperationsMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
avgQueueBytes := Metric{ avgQueueBytes := Metric{
Description: getClusterReplAvgQueuedBytesMD(), Description: getClusterReplAvgQueuedBytesMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
maxQueueCount := Metric{ maxQueueCount := Metric{
Description: getClusterReplMaxQueuedOperationsMD(), Description: getClusterReplMaxQueuedOperationsMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
maxQueueBytes := Metric{ maxQueueBytes := Metric{
Description: getClusterReplMaxQueuedBytesMD(), Description: getClusterReplMaxQueuedBytesMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
avgTransferRate := Metric{ avgTransferRate := Metric{
Description: getClusterReplAvgTransferRateMD(), Description: getClusterReplAvgTransferRateMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
maxTransferRate := Metric{ maxTransferRate := Metric{
Description: getClusterReplMaxTransferRateMD(), Description: getClusterReplMaxTransferRateMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
} }
mrfCount := Metric{ mrfCount := Metric{
Description: getClusterReplMRFFailedOperationsMD(), Description: getClusterReplMRFFailedOperationsMD(),
VariableLabels: map[string]string{serverName: qs.NodeName}, VariableLabels: map[string]string{serverName: qs.NodeName},
Value: float64(qs.MRFStats.LastFailedCount), Value: float64(qs.MRFStats.LastFailedCount),
} }
if qs.QStats.Avg.Count > 0 || qs.QStats.Curr.Count > 0 { if qs.QStats.Avg.Count > 0 || qs.QStats.Curr.Count > 0 {
qt := qs.QStats qt := qs.QStats
currInQueueBytes.Value = qt.Curr.Bytes currInQueueBytes.Value = qt.Curr.Bytes
currInQueueCount.Value = qt.Curr.Count currInQueueCount.Value = qt.Curr.Count
avgQueueBytes.Value = qt.Avg.Bytes avgQueueBytes.Value = qt.Avg.Bytes
avgQueueCount.Value = qt.Avg.Count avgQueueCount.Value = qt.Avg.Count
maxQueueBytes.Value = qt.Max.Bytes maxQueueBytes.Value = qt.Max.Bytes
maxQueueCount.Value = qt.Max.Count maxQueueCount.Value = qt.Max.Count
} }
activeWorkersCount.Value = float64(qs.ActiveWorkers.Curr) activeWorkersCount.Value = float64(qs.ActiveWorkers.Curr)
avgActiveWorkersCount.Value = float64(qs.ActiveWorkers.Avg) avgActiveWorkersCount.Value = float64(qs.ActiveWorkers.Avg)
maxActiveWorkersCount.Value = float64(qs.ActiveWorkers.Max) maxActiveWorkersCount.Value = float64(qs.ActiveWorkers.Max)
if len(qs.XferStats) > 0 { if len(qs.XferStats) > 0 {
tots := qs.XferStats[Total] tots := qs.XferStats[Total]
currTransferRate.Value = tots.Curr currTransferRate.Value = tots.Curr
avgTransferRate.Value = tots.Avg avgTransferRate.Value = tots.Avg
maxTransferRate.Value = tots.Peak maxTransferRate.Value = tots.Peak
}
ml = []Metric{
activeWorkersCount,
avgActiveWorkersCount,
maxActiveWorkersCount,
currInQueueCount,
currInQueueBytes,
avgQueueCount,
avgQueueBytes,
maxQueueCount,
maxQueueBytes,
currTransferRate,
avgTransferRate,
maxTransferRate,
mrfCount,
}
} }
ml := []Metric{
activeWorkersCount,
avgActiveWorkersCount,
maxActiveWorkersCount,
currInQueueCount,
currInQueueBytes,
avgQueueCount,
avgQueueBytes,
maxQueueCount,
maxQueueBytes,
currTransferRate,
avgTransferRate,
maxTransferRate,
mrfCount,
}
for ep, health := range globalBucketTargetSys.healthStats() { for ep, health := range globalBucketTargetSys.healthStats() {
// link latency current // link latency current
m := Metric{ m := Metric{
@@ -3019,17 +3021,21 @@ func getBucketUsageMetrics() *MetricsGroup {
}) })
} }
if !globalSiteReplicationSys.isEnabled() { if !globalSiteReplicationSys.isEnabled() {
stats := bucketReplStats[bucket].ReplicationStats var stats BucketReplicationStats
metrics = append(metrics, Metric{ s, ok := bucketReplStats[bucket]
Description: getRepReceivedBytesMD(bucketMetricNamespace), if ok {
Value: float64(stats.ReplicaSize), stats = s.ReplicationStats
VariableLabels: map[string]string{"bucket": bucket}, metrics = append(metrics, Metric{
}) Description: getRepReceivedBytesMD(bucketMetricNamespace),
metrics = append(metrics, Metric{ Value: float64(stats.ReplicaSize),
Description: getRepReceivedOperationsMD(bucketMetricNamespace), VariableLabels: map[string]string{"bucket": bucket},
Value: float64(stats.ReplicaCount), })
VariableLabels: map[string]string{"bucket": bucket}, metrics = append(metrics, Metric{
}) Description: getRepReceivedOperationsMD(bucketMetricNamespace),
Value: float64(stats.ReplicaCount),
VariableLabels: map[string]string{"bucket": bucket},
})
}
if stats.hasReplicationUsage() { if stats.hasReplicationUsage() {
for arn, stat := range stats.Stats { for arn, stat := range stats.Stats {
metrics = append(metrics, Metric{ metrics = append(metrics, Metric{