Add RPC TCP dial timeouts/errors and average dial duration to Prometheus (#15747)

This commit is contained in:
Anis Elleuch
2022-09-26 17:04:26 +01:00
committed by GitHub
parent 1480340830
commit 048a46ec2a
4 changed files with 122 additions and 28 deletions

View File

@@ -31,6 +31,7 @@ import (
"github.com/minio/madmin-go"
"github.com/minio/minio/internal/bucket/lifecycle"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/rest"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
@@ -167,7 +168,8 @@ const (
writeBytes MetricName = "write_bytes"
wcharBytes MetricName = "wchar_bytes"
apiLatencyMicroSec MetricName = "latency_us"
latencyMicroSec MetricName = "latency_us"
latencyNanoSec MetricName = "latency_ns"
usagePercent MetricName = "update_percent"
@@ -331,7 +333,7 @@ func getNodeDiskAPILatencyMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: apiLatencyMicroSec,
Name: latencyMicroSec,
Help: "Average last minute latency in µs for drive API storage operations",
Type: gaugeMetric,
}
@@ -537,6 +539,26 @@ func getInternodeFailedRequests() MetricDescription {
}
}
// getInternodeTCPDialTimeout returns the description of the internode
// "dial_errors" counter metric, which reports the total number of TCP dial
// timeouts and errors. Despite the function name, the metric covers both
// timeouts and other dial errors, as stated in its Help text.
func getInternodeTCPDialTimeout() MetricDescription {
	var desc MetricDescription
	desc.Namespace = interNodeMetricNamespace
	desc.Subsystem = trafficSubsystem
	desc.Name = "dial_errors"
	desc.Help = "Total number of internode TCP dial timeouts and errors"
	desc.Type = counterMetric
	return desc
}
// getInternodeTCPAvgDuration returns the description of the internode
// "dial_avg_time" gauge metric, which reports the average time taken by
// TCP dial calls between nodes.
//
// NOTE(review): the Help text does not state a unit; confirm the unit of
// the reported value against the RPC stats source and consider exposing it.
func getInternodeTCPAvgDuration() MetricDescription {
	var desc MetricDescription
	desc.Namespace = interNodeMetricNamespace
	desc.Subsystem = trafficSubsystem
	desc.Name = "dial_avg_time"
	desc.Help = "Average time of internodes TCP dial calls"
	desc.Type = gaugeMetric
	return desc
}
func getInterNodeSentBytesMD() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
@@ -1607,10 +1629,19 @@ func getNetworkMetrics() *MetricsGroup {
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
metrics = make([]Metric, 0, 10)
connStats := globalConnStats.toServerConnStats()
rpcStats := rest.GetRPCStats()
if globalIsDistErasure {
metrics = append(metrics, Metric{
Description: getInternodeFailedRequests(),
Value: float64(loadAndResetRPCNetworkErrsCounter()),
Value: float64(rpcStats.Errs),
})
metrics = append(metrics, Metric{
Description: getInternodeTCPDialTimeout(),
Value: float64(rpcStats.DialErrs),
})
metrics = append(metrics, Metric{
Description: getInternodeTCPAvgDuration(),
Value: float64(rpcStats.DialAvgDuration),
})
metrics = append(metrics, Metric{
Description: getInterNodeSentBytesMD(),

View File

@@ -56,7 +56,6 @@ import (
ioutilx "github.com/minio/minio/internal/ioutil"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/logger/message/audit"
"github.com/minio/minio/internal/rest"
"github.com/minio/pkg/certs"
"github.com/minio/pkg/env"
"golang.org/x/oauth2"
@@ -1016,13 +1015,6 @@ func decodeDirObject(object string) string {
return object
}
// loadAndResetRPCNetworkErrsCounter reports the number of failed internode
// RPC calls recorded so far and then resets that counter. It is used by the
// metrics code.
//
// NOTE(review): the read and the reset are two separate calls into the rest
// package; whether errors recorded between them can be lost depends on that
// package's implementation — confirm.
func loadAndResetRPCNetworkErrsCounter() uint64 {
	// Deferred so the reset happens only after the current value is read.
	defer rest.ResetNetworkErrsCounter()

	failed := rest.GetNetworkErrsCounter()
	return failed
}
// Helper method to return total number of nodes in cluster
func totalNodeCount() uint64 {
peers, _ := globalEndpoints.peers()