From b80271476be6e388f9022fb8baad2dd40d2cc7aa Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 12 Dec 2024 14:27:21 +0100 Subject: [PATCH] Enforce a bucket limit to v2 metrics calls Enforce a bucket count limit on metrics for v2 calls. If people hit this limit, they should move to v3, as certain calls explode with high bucket count. Reviewers: This *should* only affect v2 calls, but the complexity is rather overwhelming. --- cmd/metrics-v2.go | 44 ++++++++++++++++++++++++++++++++++------- cmd/metrics-v3-types.go | 2 +- cmd/tier.go | 2 +- cmd/utils.go | 17 ++++++++++++++++ 4 files changed, 56 insertions(+), 9 deletions(-) diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index bcf974240..8b8554052 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -53,6 +53,10 @@ var ( bucketPeerMetricsGroups []*MetricsGroupV2 ) +// v2MetricsMaxBuckets enforces a bucket count limit on metrics for v2 calls. +// If people hit this limit, they should move to v3, as certain calls explode with high bucket count. +const v2MetricsMaxBuckets = 100 + func init() { clusterMetricsGroups := []*MetricsGroupV2{ getNodeHealthMetrics(MetricsGroupOpts{dependGlobalNotificationSys: true}), @@ -1834,7 +1838,7 @@ func getGoMetrics() *MetricsGroupV2 { // // The last parameter is added for compatibility - if true it lowercases the // `api` label values. 
-func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription, toLowerAPILabels bool) []MetricV2 { +func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription, toLowerAPILabels, limitBuckets bool) []MetricV2 { ch := make(chan prometheus.Metric) go func() { defer xioutil.SafeClose(ch) @@ -1844,6 +1848,7 @@ func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription, // Converts metrics received into internal []Metric type var metrics []MetricV2 + buckets := make(map[string][]MetricV2, v2MetricsMaxBuckets) for promMetric := range ch { dtoMetric := &dto.Metric{} err := promMetric.Write(dtoMetric) @@ -1870,7 +1875,11 @@ func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription, VariableLabels: labels, Value: float64(b.GetCumulativeCount()), } - metrics = append(metrics, metric) + if limitBuckets && labels["bucket"] != "" { + buckets[labels["bucket"]] = append(buckets[labels["bucket"]], metric) + } else { + metrics = append(metrics, metric) + } } // add metrics with +Inf label labels1 := make(map[string]string) @@ -1882,11 +1891,26 @@ func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription, } } labels1["le"] = fmt.Sprintf("%.3f", math.Inf(+1)) - metrics = append(metrics, MetricV2{ + + metric := MetricV2{ Description: desc, VariableLabels: labels1, Value: float64(dtoMetric.Histogram.GetSampleCount()), - }) + } + if limitBuckets && labels1["bucket"] != "" { + buckets[labels1["bucket"]] = append(buckets[labels1["bucket"]], metric) + } else { + metrics = append(metrics, metric) + } + } + + // Limit bucket metrics... + if limitBuckets { + bucketNames := mapKeysSorted(buckets) + bucketNames = bucketNames[:min(len(buckets), v2MetricsMaxBuckets)] + for _, b := range bucketNames { + metrics = append(metrics, buckets[b]...) 
+ } } return metrics } @@ -1897,7 +1921,7 @@ func getBucketTTFBMetric() *MetricsGroupV2 { } mg.RegisterRead(func(ctx context.Context) []MetricV2 { return getHistogramMetrics(bucketHTTPRequestsDuration, - getBucketTTFBDistributionMD(), true) + getBucketTTFBDistributionMD(), true, true) }) return mg } @@ -1908,7 +1932,7 @@ func getS3TTFBMetric() *MetricsGroupV2 { } mg.RegisterRead(func(ctx context.Context) []MetricV2 { return getHistogramMetrics(httpRequestsDuration, - getS3TTFBDistributionMD(), true) + getS3TTFBDistributionMD(), true, true) }) return mg } @@ -3008,7 +3032,13 @@ func getHTTPMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { return } - for bucket, inOut := range globalBucketConnStats.getS3InOutBytes() { + // If we have too many, limit them + bConnStats := globalBucketConnStats.getS3InOutBytes() + buckets := mapKeysSorted(bConnStats) + buckets = buckets[:min(v2MetricsMaxBuckets, len(buckets))] + + for _, bucket := range buckets { + inOut := bConnStats[bucket] recvBytes := inOut.In if recvBytes > 0 { metrics = append(metrics, MetricV2{ diff --git a/cmd/metrics-v3-types.go b/cmd/metrics-v3-types.go index 0aa04efa6..92004c961 100644 --- a/cmd/metrics-v3-types.go +++ b/cmd/metrics-v3-types.go @@ -267,7 +267,7 @@ func (m *MetricValues) SetHistogram(name MetricName, hist *prometheus.HistogramV panic(fmt.Sprintf("metric has no description: %s", name)) } dummyDesc := MetricDescription{} - metricsV2 := getHistogramMetrics(hist, dummyDesc, false) + metricsV2 := getHistogramMetrics(hist, dummyDesc, false, false) mainLoop: for _, metric := range metricsV2 { for label, allowedValues := range filterByLabels { diff --git a/cmd/tier.go b/cmd/tier.go index 953a8d100..317dd5061 100644 --- a/cmd/tier.go +++ b/cmd/tier.go @@ -165,7 +165,7 @@ var ( ) func (t *tierMetrics) Report() []MetricV2 { - metrics := getHistogramMetrics(t.histogram, tierTTLBMD, true) + metrics := getHistogramMetrics(t.histogram, tierTTLBMD, true, true) t.RLock() defer t.RUnlock() for tier, stat := 
range t.requestsCount { diff --git a/cmd/utils.go b/cmd/utils.go index 87926b93d..5f5208cc5 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -34,6 +34,7 @@ import ( "runtime" "runtime/pprof" "runtime/trace" + "sort" "strings" "sync" "time" @@ -1151,3 +1152,19 @@ func filterStorageClass(ctx context.Context, s string) string { } return s } + +type ordered interface { + ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64 | string +} + +// mapKeysSorted returns the map keys as a sorted slice. +func mapKeysSorted[Map ~map[K]V, K ordered, V any](m Map) []K { + res := make([]K, 0, len(m)) + for k := range m { + res = append(res, k) + } + sort.Slice(res, func(i, j int) bool { + return res[i] < res[j] + }) + return res +}