mirror of
https://github.com/minio/minio.git
synced 2024-12-25 14:45:54 -05:00
Add RPC tcp timeout/errs and AVG duration to prometheus (#15747)
This commit is contained in:
parent
1480340830
commit
048a46ec2a
@ -31,6 +31,7 @@ import (
|
|||||||
"github.com/minio/madmin-go"
|
"github.com/minio/madmin-go"
|
||||||
"github.com/minio/minio/internal/bucket/lifecycle"
|
"github.com/minio/minio/internal/bucket/lifecycle"
|
||||||
"github.com/minio/minio/internal/logger"
|
"github.com/minio/minio/internal/logger"
|
||||||
|
"github.com/minio/minio/internal/rest"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
dto "github.com/prometheus/client_model/go"
|
dto "github.com/prometheus/client_model/go"
|
||||||
"github.com/prometheus/common/expfmt"
|
"github.com/prometheus/common/expfmt"
|
||||||
@ -167,7 +168,8 @@ const (
|
|||||||
writeBytes MetricName = "write_bytes"
|
writeBytes MetricName = "write_bytes"
|
||||||
wcharBytes MetricName = "wchar_bytes"
|
wcharBytes MetricName = "wchar_bytes"
|
||||||
|
|
||||||
apiLatencyMicroSec MetricName = "latency_us"
|
latencyMicroSec MetricName = "latency_us"
|
||||||
|
latencyNanoSec MetricName = "latency_ns"
|
||||||
|
|
||||||
usagePercent MetricName = "update_percent"
|
usagePercent MetricName = "update_percent"
|
||||||
|
|
||||||
@ -331,7 +333,7 @@ func getNodeDiskAPILatencyMD() MetricDescription {
|
|||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
Subsystem: diskSubsystem,
|
Subsystem: diskSubsystem,
|
||||||
Name: apiLatencyMicroSec,
|
Name: latencyMicroSec,
|
||||||
Help: "Average last minute latency in µs for drive API storage operations",
|
Help: "Average last minute latency in µs for drive API storage operations",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
@ -537,6 +539,26 @@ func getInternodeFailedRequests() MetricDescription {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getInternodeTCPDialTimeout() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: interNodeMetricNamespace,
|
||||||
|
Subsystem: trafficSubsystem,
|
||||||
|
Name: "dial_errors",
|
||||||
|
Help: "Total number of internode TCP dial timeouts and errors",
|
||||||
|
Type: counterMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getInternodeTCPAvgDuration() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: interNodeMetricNamespace,
|
||||||
|
Subsystem: trafficSubsystem,
|
||||||
|
Name: "dial_avg_time",
|
||||||
|
Help: "Average time of internodes TCP dial calls",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getInterNodeSentBytesMD() MetricDescription {
|
func getInterNodeSentBytesMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: interNodeMetricNamespace,
|
Namespace: interNodeMetricNamespace,
|
||||||
@ -1607,10 +1629,19 @@ func getNetworkMetrics() *MetricsGroup {
|
|||||||
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
||||||
metrics = make([]Metric, 0, 10)
|
metrics = make([]Metric, 0, 10)
|
||||||
connStats := globalConnStats.toServerConnStats()
|
connStats := globalConnStats.toServerConnStats()
|
||||||
|
rpcStats := rest.GetRPCStats()
|
||||||
if globalIsDistErasure {
|
if globalIsDistErasure {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getInternodeFailedRequests(),
|
Description: getInternodeFailedRequests(),
|
||||||
Value: float64(loadAndResetRPCNetworkErrsCounter()),
|
Value: float64(rpcStats.Errs),
|
||||||
|
})
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getInternodeTCPDialTimeout(),
|
||||||
|
Value: float64(rpcStats.DialErrs),
|
||||||
|
})
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getInternodeTCPAvgDuration(),
|
||||||
|
Value: float64(rpcStats.DialAvgDuration),
|
||||||
})
|
})
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getInterNodeSentBytesMD(),
|
Description: getInterNodeSentBytesMD(),
|
||||||
|
@ -56,7 +56,6 @@ import (
|
|||||||
ioutilx "github.com/minio/minio/internal/ioutil"
|
ioutilx "github.com/minio/minio/internal/ioutil"
|
||||||
"github.com/minio/minio/internal/logger"
|
"github.com/minio/minio/internal/logger"
|
||||||
"github.com/minio/minio/internal/logger/message/audit"
|
"github.com/minio/minio/internal/logger/message/audit"
|
||||||
"github.com/minio/minio/internal/rest"
|
|
||||||
"github.com/minio/pkg/certs"
|
"github.com/minio/pkg/certs"
|
||||||
"github.com/minio/pkg/env"
|
"github.com/minio/pkg/env"
|
||||||
"golang.org/x/oauth2"
|
"golang.org/x/oauth2"
|
||||||
@ -1016,13 +1015,6 @@ func decodeDirObject(object string) string {
|
|||||||
return object
|
return object
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is used by metrics to show the number of failed RPC calls
|
|
||||||
// between internodes
|
|
||||||
func loadAndResetRPCNetworkErrsCounter() uint64 {
|
|
||||||
defer rest.ResetNetworkErrsCounter()
|
|
||||||
return rest.GetNetworkErrsCounter()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Helper method to return total number of nodes in cluster
|
// Helper method to return total number of nodes in cluster
|
||||||
func totalNodeCount() uint64 {
|
func totalNodeCount() uint64 {
|
||||||
peers, _ := globalEndpoints.peers()
|
peers, _ := globalEndpoints.peers()
|
||||||
|
@ -46,19 +46,6 @@ const (
|
|||||||
closed
|
closed
|
||||||
)
|
)
|
||||||
|
|
||||||
// Hold the number of failed RPC calls due to networking errors
|
|
||||||
var networkErrsCounter uint64
|
|
||||||
|
|
||||||
// GetNetworkErrsCounter returns the number of failed RPC requests
|
|
||||||
func GetNetworkErrsCounter() uint64 {
|
|
||||||
return atomic.LoadUint64(&networkErrsCounter)
|
|
||||||
}
|
|
||||||
|
|
||||||
// ResetNetworkErrsCounter resets the number of failed RPC requests
|
|
||||||
func ResetNetworkErrsCounter() {
|
|
||||||
atomic.StoreUint64(&networkErrsCounter, 0)
|
|
||||||
}
|
|
||||||
|
|
||||||
// NetworkError - error type in case of errors related to http/transport
|
// NetworkError - error type in case of errors related to http/transport
|
||||||
// for ex. connection refused, connection reset, dns resolution failure etc.
|
// for ex. connection refused, connection reset, dns resolution failure etc.
|
||||||
// All errors returned by storage-rest-server (ex errFileNotFound, errDiskNotFound) are not considered to be network errors.
|
// All errors returned by storage-rest-server (ex errFileNotFound, errDiskNotFound) are not considered to be network errors.
|
||||||
@ -217,7 +204,7 @@ type respBodyMonitor struct {
|
|||||||
func (r respBodyMonitor) Read(p []byte) (n int, err error) {
|
func (r respBodyMonitor) Read(p []byte) (n int, err error) {
|
||||||
n, err = r.ReadCloser.Read(p)
|
n, err = r.ReadCloser.Read(p)
|
||||||
if err != nil && err != io.EOF {
|
if err != nil && err != io.EOF {
|
||||||
atomic.AddUint64(&networkErrsCounter, 1)
|
atomic.AddUint64(&globalStats.errs, 1)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -225,7 +212,7 @@ func (r respBodyMonitor) Read(p []byte) (n int, err error) {
|
|||||||
func (r respBodyMonitor) Close() (err error) {
|
func (r respBodyMonitor) Close() (err error) {
|
||||||
err = r.ReadCloser.Close()
|
err = r.ReadCloser.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
atomic.AddUint64(&networkErrsCounter, 1)
|
atomic.AddUint64(&globalStats.errs, 1)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -252,11 +239,15 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
|
|||||||
if length > 0 {
|
if length > 0 {
|
||||||
req.ContentLength = length
|
req.ContentLength = length
|
||||||
}
|
}
|
||||||
|
|
||||||
|
req, update := setupReqStatsUpdate(req)
|
||||||
|
defer update()
|
||||||
|
|
||||||
resp, err := c.httpClient.Do(req)
|
resp, err := c.httpClient.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
|
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
|
||||||
if !c.NoMetrics {
|
if !c.NoMetrics {
|
||||||
atomic.AddUint64(&networkErrsCounter, 1)
|
atomic.AddUint64(&globalStats.errs, 1)
|
||||||
}
|
}
|
||||||
if c.MarkOffline(err) {
|
if c.MarkOffline(err) {
|
||||||
logger.LogOnceIf(ctx, fmt.Errorf("Marking %s offline temporarily; caused by %w", c.url.Host, err), c.url.Host)
|
logger.LogOnceIf(ctx, fmt.Errorf("Marking %s offline temporarily; caused by %w", c.url.Host, err), c.url.Host)
|
||||||
@ -292,7 +283,7 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
|
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
|
||||||
if !c.NoMetrics {
|
if !c.NoMetrics {
|
||||||
atomic.AddUint64(&networkErrsCounter, 1)
|
atomic.AddUint64(&globalStats.errs, 1)
|
||||||
}
|
}
|
||||||
if c.MarkOffline(err) {
|
if c.MarkOffline(err) {
|
||||||
logger.LogOnceIf(ctx, fmt.Errorf("Marking %s offline temporarily; caused by %w", c.url.Host, err), c.url.Host)
|
logger.LogOnceIf(ctx, fmt.Errorf("Marking %s offline temporarily; caused by %w", c.url.Host, err), c.url.Host)
|
||||||
|
80
internal/rest/rpc-stats.go
Normal file
80
internal/rest/rpc-stats.go
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
// Copyright (c) 2015-2022 MinIO, Inc.
|
||||||
|
//
|
||||||
|
// This file is part of MinIO Object Storage stack
|
||||||
|
//
|
||||||
|
// This program is free software: you can redistribute it and/or modify
|
||||||
|
// it under the terms of the GNU Affero General Public License as published by
|
||||||
|
// the Free Software Foundation, either version 3 of the License, or
|
||||||
|
// (at your option) any later version.
|
||||||
|
//
|
||||||
|
// This program is distributed in the hope that it will be useful
|
||||||
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
// GNU Affero General Public License for more details.
|
||||||
|
//
|
||||||
|
// You should have received a copy of the GNU Affero General Public License
|
||||||
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
package rest
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"net/http/httptrace"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var globalStats = struct {
|
||||||
|
errs uint64
|
||||||
|
|
||||||
|
tcpDialErrs uint64
|
||||||
|
tcpDialCount uint64
|
||||||
|
tcpDialTotalDur uint64
|
||||||
|
}{}
|
||||||
|
|
||||||
|
// RPCStats holds information about the DHCP/TCP metrics and errors
|
||||||
|
type RPCStats struct {
|
||||||
|
Errs uint64
|
||||||
|
|
||||||
|
DialAvgDuration uint64
|
||||||
|
DialErrs uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetRPCStats returns RPC stats, include calls errors and dhcp/tcp metrics
|
||||||
|
func GetRPCStats() RPCStats {
|
||||||
|
s := RPCStats{
|
||||||
|
Errs: atomic.LoadUint64(&globalStats.errs),
|
||||||
|
DialErrs: atomic.LoadUint64(&globalStats.tcpDialErrs),
|
||||||
|
}
|
||||||
|
if v := atomic.LoadUint64(&globalStats.tcpDialCount); v > 0 {
|
||||||
|
s.DialAvgDuration = atomic.LoadUint64(&globalStats.tcpDialTotalDur) / v
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return a function which update the global stats related to tcp connections
|
||||||
|
func setupReqStatsUpdate(req *http.Request) (*http.Request, func()) {
|
||||||
|
var dialStart, dialEnd time.Time
|
||||||
|
|
||||||
|
trace := &httptrace.ClientTrace{
|
||||||
|
ConnectStart: func(network, addr string) {
|
||||||
|
dialStart = time.Now()
|
||||||
|
},
|
||||||
|
ConnectDone: func(network, addr string, err error) {
|
||||||
|
if err == nil {
|
||||||
|
dialEnd = time.Now()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return req.WithContext(httptrace.WithClientTrace(req.Context(), trace)), func() {
|
||||||
|
if !dialStart.IsZero() {
|
||||||
|
if dialEnd.IsZero() {
|
||||||
|
atomic.AddUint64(&globalStats.tcpDialErrs, 1)
|
||||||
|
} else {
|
||||||
|
atomic.AddUint64(&globalStats.tcpDialCount, 1)
|
||||||
|
atomic.AddUint64(&globalStats.tcpDialTotalDur, uint64(dialEnd.Sub(dialStart)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user