mirror of
https://github.com/minio/minio.git
synced 2025-07-25 08:20:09 -04:00
prometheus: Add S3 4xx and 5xx S3 monitoring (#15052)
Currently minio_s3_requests_errors_total covers both 4xx and 5xx S3 responses, which can be confusing when S3 applications send a lot of HEAD requests with obvious 404 responses or when replication is enabled. Add minio_s3_requests_4xx_errors_total and minio_s3_requests_5xx_errors_total to help users monitor 4xx and 5xx HTTP status codes separately.
This commit is contained in:
parent
2420f6c000
commit
5fb420c703
@ -291,6 +291,8 @@ type ServerHTTPStats struct {
|
|||||||
CurrentS3Requests ServerHTTPAPIStats `json:"currentS3Requests"`
|
CurrentS3Requests ServerHTTPAPIStats `json:"currentS3Requests"`
|
||||||
TotalS3Requests ServerHTTPAPIStats `json:"totalS3Requests"`
|
TotalS3Requests ServerHTTPAPIStats `json:"totalS3Requests"`
|
||||||
TotalS3Errors ServerHTTPAPIStats `json:"totalS3Errors"`
|
TotalS3Errors ServerHTTPAPIStats `json:"totalS3Errors"`
|
||||||
|
TotalS35xxErrors ServerHTTPAPIStats `json:"totalS35xxErrors"`
|
||||||
|
TotalS34xxErrors ServerHTTPAPIStats `json:"totalS34xxErrors"`
|
||||||
TotalS3Canceled ServerHTTPAPIStats `json:"totalS3Canceled"`
|
TotalS3Canceled ServerHTTPAPIStats `json:"totalS3Canceled"`
|
||||||
TotalS3RejectedAuth uint64 `json:"totalS3RejectedAuth"`
|
TotalS3RejectedAuth uint64 `json:"totalS3RejectedAuth"`
|
||||||
TotalS3RejectedTime uint64 `json:"totalS3RejectedTime"`
|
TotalS3RejectedTime uint64 `json:"totalS3RejectedTime"`
|
||||||
|
@ -115,9 +115,11 @@ func setRequestLimitHandler(h http.Handler) http.Handler {
|
|||||||
|
|
||||||
// Reserved bucket.
|
// Reserved bucket.
|
||||||
const (
|
const (
|
||||||
minioReservedBucket = "minio"
|
minioReservedBucket = "minio"
|
||||||
minioReservedBucketPath = SlashSeparator + minioReservedBucket
|
minioReservedBucketPath = SlashSeparator + minioReservedBucket
|
||||||
loginPathPrefix = SlashSeparator + "login"
|
minioReservedBucketPathWithSlash = SlashSeparator + minioReservedBucket + SlashSeparator
|
||||||
|
|
||||||
|
loginPathPrefix = SlashSeparator + "login"
|
||||||
)
|
)
|
||||||
|
|
||||||
func guessIsBrowserReq(r *http.Request) bool {
|
func guessIsBrowserReq(r *http.Request) bool {
|
||||||
|
@ -148,6 +148,8 @@ type HTTPStats struct {
|
|||||||
currentS3Requests HTTPAPIStats
|
currentS3Requests HTTPAPIStats
|
||||||
totalS3Requests HTTPAPIStats
|
totalS3Requests HTTPAPIStats
|
||||||
totalS3Errors HTTPAPIStats
|
totalS3Errors HTTPAPIStats
|
||||||
|
totalS34xxErrors HTTPAPIStats
|
||||||
|
totalS35xxErrors HTTPAPIStats
|
||||||
totalS3Canceled HTTPAPIStats
|
totalS3Canceled HTTPAPIStats
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,6 +180,12 @@ func (st *HTTPStats) toServerHTTPStats() ServerHTTPStats {
|
|||||||
serverStats.TotalS3Errors = ServerHTTPAPIStats{
|
serverStats.TotalS3Errors = ServerHTTPAPIStats{
|
||||||
APIStats: st.totalS3Errors.Load(),
|
APIStats: st.totalS3Errors.Load(),
|
||||||
}
|
}
|
||||||
|
serverStats.TotalS34xxErrors = ServerHTTPAPIStats{
|
||||||
|
APIStats: st.totalS34xxErrors.Load(),
|
||||||
|
}
|
||||||
|
serverStats.TotalS35xxErrors = ServerHTTPAPIStats{
|
||||||
|
APIStats: st.totalS35xxErrors.Load(),
|
||||||
|
}
|
||||||
serverStats.TotalS3Canceled = ServerHTTPAPIStats{
|
serverStats.TotalS3Canceled = ServerHTTPAPIStats{
|
||||||
APIStats: st.totalS3Canceled.Load(),
|
APIStats: st.totalS3Canceled.Load(),
|
||||||
}
|
}
|
||||||
@ -186,27 +194,29 @@ func (st *HTTPStats) toServerHTTPStats() ServerHTTPStats {
|
|||||||
|
|
||||||
// Update statistics from http request and response data
|
// Update statistics from http request and response data
|
||||||
func (st *HTTPStats) updateStats(api string, r *http.Request, w *logger.ResponseWriter) {
|
func (st *HTTPStats) updateStats(api string, r *http.Request, w *logger.ResponseWriter) {
|
||||||
// A successful request has a 2xx response code or < 4xx response
|
// Ignore non S3 requests
|
||||||
successReq := w.StatusCode >= 200 && w.StatusCode < 400
|
if strings.HasSuffix(r.URL.Path, minioReservedBucketPathWithSlash) {
|
||||||
|
return
|
||||||
if !strings.HasSuffix(r.URL.Path, prometheusMetricsPathLegacy) ||
|
|
||||||
!strings.HasSuffix(r.URL.Path, prometheusMetricsV2ClusterPath) ||
|
|
||||||
!strings.HasSuffix(r.URL.Path, prometheusMetricsV2NodePath) {
|
|
||||||
st.totalS3Requests.Inc(api)
|
|
||||||
if !successReq {
|
|
||||||
switch w.StatusCode {
|
|
||||||
case 0:
|
|
||||||
case 499:
|
|
||||||
// 499 is a good error, shall be counted as canceled.
|
|
||||||
st.totalS3Canceled.Inc(api)
|
|
||||||
default:
|
|
||||||
st.totalS3Errors.Inc(api)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increment the prometheus http request response histogram with appropriate label
|
// Increment the prometheus http request response histogram with appropriate label
|
||||||
httpRequestsDuration.With(prometheus.Labels{"api": api}).Observe(w.TimeToFirstByte.Seconds())
|
httpRequestsDuration.With(prometheus.Labels{"api": api}).Observe(w.TimeToFirstByte.Seconds())
|
||||||
|
|
||||||
|
code := w.StatusCode
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case code == 0:
|
||||||
|
case code == 499:
|
||||||
|
// 499 is a good error, shall be counted as canceled.
|
||||||
|
st.totalS3Canceled.Inc(api)
|
||||||
|
case code >= http.StatusBadRequest:
|
||||||
|
st.totalS3Errors.Inc(api)
|
||||||
|
if code >= http.StatusInternalServerError {
|
||||||
|
st.totalS35xxErrors.Inc(api)
|
||||||
|
} else {
|
||||||
|
st.totalS34xxErrors.Inc(api)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prepare new HTTPStats structure
|
// Prepare new HTTPStats structure
|
||||||
|
@ -603,7 +603,27 @@ func getS3RequestsErrorsMD() MetricDescription {
|
|||||||
Namespace: s3MetricNamespace,
|
Namespace: s3MetricNamespace,
|
||||||
Subsystem: requestsSubsystem,
|
Subsystem: requestsSubsystem,
|
||||||
Name: errorsTotal,
|
Name: errorsTotal,
|
||||||
Help: "Total number S3 requests with errors",
|
Help: "Total number S3 requests with (4xx and 5xx) errors",
|
||||||
|
Type: counterMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getS3Requests4xxErrorsMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: s3MetricNamespace,
|
||||||
|
Subsystem: requestsSubsystem,
|
||||||
|
Name: "4xx_" + errorsTotal,
|
||||||
|
Help: "Total number S3 requests with (4xx) errors",
|
||||||
|
Type: counterMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getS3Requests5xxErrorsMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: s3MetricNamespace,
|
||||||
|
Subsystem: requestsSubsystem,
|
||||||
|
Name: "5xx_" + errorsTotal,
|
||||||
|
Help: "Total number S3 requests with (5xx) errors",
|
||||||
Type: counterMetric,
|
Type: counterMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1488,7 +1508,9 @@ func getHTTPMetrics() *MetricsGroup {
|
|||||||
metrics = make([]Metric, 0, 3+
|
metrics = make([]Metric, 0, 3+
|
||||||
len(httpStats.CurrentS3Requests.APIStats)+
|
len(httpStats.CurrentS3Requests.APIStats)+
|
||||||
len(httpStats.TotalS3Requests.APIStats)+
|
len(httpStats.TotalS3Requests.APIStats)+
|
||||||
len(httpStats.TotalS3Errors.APIStats))
|
len(httpStats.TotalS3Errors.APIStats)+
|
||||||
|
len(httpStats.TotalS35xxErrors.APIStats)+
|
||||||
|
len(httpStats.TotalS34xxErrors.APIStats))
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RejectedAuthRequestsTotalMD(),
|
Description: getS3RejectedAuthRequestsTotalMD(),
|
||||||
Value: float64(httpStats.TotalS3RejectedAuth),
|
Value: float64(httpStats.TotalS3RejectedAuth),
|
||||||
@ -1535,6 +1557,20 @@ func getHTTPMetrics() *MetricsGroup {
|
|||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
for api, value := range httpStats.TotalS35xxErrors.APIStats {
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getS3Requests5xxErrorsMD(),
|
||||||
|
Value: float64(value),
|
||||||
|
VariableLabels: map[string]string{"api": api},
|
||||||
|
})
|
||||||
|
}
|
||||||
|
for api, value := range httpStats.TotalS34xxErrors.APIStats {
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getS3Requests4xxErrorsMD(),
|
||||||
|
Value: float64(value),
|
||||||
|
VariableLabels: map[string]string{"api": api},
|
||||||
|
})
|
||||||
|
}
|
||||||
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RequestsCanceledMD(),
|
Description: getS3RequestsCanceledMD(),
|
||||||
|
@ -52,7 +52,9 @@ These metrics can be from any MinIO server once per collection.
|
|||||||
| `minio_node_process_uptime_seconds` | Uptime for MinIO process per node in seconds. |
|
| `minio_node_process_uptime_seconds` | Uptime for MinIO process per node in seconds. |
|
||||||
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr |
|
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr |
|
||||||
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw |
|
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw |
|
||||||
| `minio_s3_requests_error_total` | Total number S3 requests with errors |
|
| `minio_s3_requests_errors_total` | Total number of S3 requests with 4xx and 5xx errors |
|
||||||
|
| `minio_s3_requests_4xx_errors_total` | Total number of S3 requests with 4xx errors |
|
||||||
|
| `minio_s3_requests_5xx_errors_total` | Total number of S3 requests with 5xx errors |
|
||||||
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight |
|
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight |
|
||||||
| `minio_s3_requests_total` | Total number S3 requests |
|
| `minio_s3_requests_total` | Total number S3 requests |
|
||||||
| `minio_s3_time_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
|
| `minio_s3_time_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
|
||||||
|
Loading…
x
Reference in New Issue
Block a user