diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go
index a38c0eeca..a71e324a6 100644
--- a/cmd/metrics-v2.go
+++ b/cmd/metrics-v2.go
@@ -346,6 +346,18 @@ func getBucketObjectDistributionMD() MetricDescription {
 		Type:      histogramMetric,
 	}
 }
+
+// getInternodeFailedRequests describes the failed internode calls counter.
+func getInternodeFailedRequests() MetricDescription {
+	return MetricDescription{
+		Namespace: interNodeMetricNamespace,
+		Subsystem: trafficSubsystem,
+		Name:      errorsTotal,
+		Help:      "Total number of failed internode calls.",
+		Type:      counterMetric,
+	}
+}
+
 func getInterNodeSentBytesMD() MetricDescription {
 	return MetricDescription{
 		Namespace: interNodeMetricNamespace,
@@ -982,6 +994,10 @@ func getNetworkMetrics() MetricsGroup {
 	return MetricsGroup{
 		Metrics: []Metric{},
 		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getInternodeFailedRequests(),
+				Value:       float64(loadAndResetRPCNetworkErrsCounter()),
+			})
 			connStats := globalConnStats.toServerConnStats()
 			metrics.Metrics = append(metrics.Metrics, Metric{
 				Description: getInterNodeSentBytesMD(),
diff --git a/cmd/rest/client.go b/cmd/rest/client.go
index 79629d442..ccac83ce8 100644
--- a/cmd/rest/client.go
+++ b/cmd/rest/client.go
@@ -42,6 +42,27 @@ const (
 	closed
 )
 
+// networkErrsCounter holds the number of RPC calls that failed due to
+// networking errors. It must only be accessed through sync/atomic.
+var networkErrsCounter uint64
+
+// GetNetworkErrsCounter returns the number of failed RPC requests.
+func GetNetworkErrsCounter() uint64 {
+	return atomic.LoadUint64(&networkErrsCounter)
+}
+
+// ResetNetworkErrsCounter resets the number of failed RPC requests.
+func ResetNetworkErrsCounter() {
+	atomic.StoreUint64(&networkErrsCounter, 0)
+}
+
+// LoadAndResetNetworkErrsCounter returns the number of failed RPC requests
+// and resets the counter to zero in a single atomic operation, so failures
+// recorded concurrently are never dropped between the read and the reset.
+func LoadAndResetNetworkErrsCounter() uint64 {
+	return atomic.SwapUint64(&networkErrsCounter, 0)
+}
+
 // NetworkError - error type in case of errors related to http/transport
 // for ex. connection refused, connection reset, dns resolution failure etc.
 // All errors returned by storage-rest-server (ex errFileNotFound, errDiskNotFound) are not considered to be network errors.
@@ -120,6 +141,7 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
 	resp, err := c.httpClient.Do(req)
 	if err != nil {
 		if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
+			atomic.AddUint64(&networkErrsCounter, 1)
 			if c.MarkOffline() {
 				logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err))
 			}
diff --git a/cmd/utils.go b/cmd/utils.go
index 3e392d156..8a0fa6897 100644
--- a/cmd/utils.go
+++ b/cmd/utils.go
@@ -43,6 +43,7 @@ import (
 	"github.com/gorilla/mux"
 	xhttp "github.com/minio/minio/cmd/http"
 	"github.com/minio/minio/cmd/logger"
+	"github.com/minio/minio/cmd/rest"
 	"github.com/minio/minio/pkg/certs"
 	"github.com/minio/minio/pkg/handlers"
 	"github.com/minio/minio/pkg/madmin"
@@ -883,3 +884,11 @@ func decodeDirObject(object string) string {
 	}
 	return object
 }
+
+// loadAndResetRPCNetworkErrsCounter returns the number of internode RPC
+// calls that failed with a network error since the previous call and
+// resets the counter to zero. The read and the reset happen in a single
+// atomic swap so concurrent failures are not lost. Used by metrics.
+func loadAndResetRPCNetworkErrsCounter() uint64 {
+	return rest.LoadAndResetNetworkErrsCounter()
+}