From 56b7045c203a71ffd28176827f6ad5f4237dae47 Mon Sep 17 00:00:00 2001 From: Krishnan Parthasarathi Date: Wed, 20 Dec 2023 20:13:40 -0800 Subject: [PATCH] Export tier metrics (#18678) minio_node_tier_ttlb_seconds - Distribution of time to last byte for streaming objects from warm tier minio_node_tier_requests_success - Number of requests to download object from warm tier that were successful minio_node_tier_requests_failure - Number of requests to download object from warm tier that failed --- cmd/bucket-lifecycle.go | 6 ++- cmd/metrics-v2.go | 16 ++++++ cmd/tier.go | 94 ++++++++++++++++++++++++++++++++- cmd/tier_test.go | 52 ++++++++++++++++++ docs/metrics/prometheus/list.md | 8 +++ 5 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 cmd/tier_test.go diff --git a/cmd/bucket-lifecycle.go b/cmd/bucket-lifecycle.go index a0daa1c2a..da94d5c96 100644 --- a/cmd/bucket-lifecycle.go +++ b/cmd/bucket-lifecycle.go @@ -507,9 +507,13 @@ func auditTierActions(ctx context.Context, tier string, bytes int64) func(err er } if err == nil { - op.TimeToResponseNS = time.Since(startTime).Nanoseconds() + since := time.Since(startTime) + op.TimeToResponseNS = since.Nanoseconds() + globalTierMetrics.Observe(tier, since) + globalTierMetrics.logSuccess(tier) } else { op.Error = err.Error() + globalTierMetrics.logFailure(tier) } logger.GetReqInfo(ctx).AppendTags("tierStats", op) diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 9d7579008..ccbd074ea 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -90,6 +90,7 @@ func init() { getNetworkMetrics(), getMinioVersionMetrics(), getS3TTFBMetric(), + getTierMetrics(), getNotificationMetrics(), getDistLockMetrics(), getIAMNodeMetrics(), @@ -155,6 +156,7 @@ const ( usageSubsystem MetricSubsystem = "usage" quotaSubsystem MetricSubsystem = "quota" ilmSubsystem MetricSubsystem = "ilm" + tierSubsystem MetricSubsystem = "tier" scannerSubsystem MetricSubsystem = "scanner" iamSubsystem MetricSubsystem = "iam" kmsSubsystem 
MetricSubsystem = "kms" @@ -246,6 +248,7 @@ const ( sizeDistribution = "size_distribution" versionDistribution = "version_distribution" ttfbDistribution = "seconds_distribution" + ttlbDistribution = "ttlb_seconds_distribution" lastActivityTime = "last_activity_nano_seconds" startTime = "starttime_seconds" @@ -262,6 +265,9 @@ const ( transitionedObjects MetricName = "transitioned_objects" transitionedVersions MetricName = "transitioned_versions" + tierRequestsSuccess MetricName = "requests_success" + tierRequestsFailure MetricName = "requests_failure" + kmsOnline = "online" kmsRequestsSuccess = "request_success" kmsRequestsError = "request_error" @@ -1658,6 +1664,16 @@ func getS3TTFBMetric() *MetricsGroup { return mg } +func getTierMetrics() *MetricsGroup { + mg := &MetricsGroup{ + cacheInterval: 10 * time.Second, + } + mg.RegisterRead(func(ctx context.Context) []Metric { + return globalTierMetrics.Report() + }) + return mg +} + func getTransitionPendingTasksMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, diff --git a/cmd/tier.go b/cmd/tier.go index 0ab8f3775..66b0f1c81 100644 --- a/cmd/tier.go +++ b/cmd/tier.go @@ -1,4 +1,4 @@ -// Copyright (c) 2015-2021 MinIO, Inc. +// Copyright (c) 2015-2023 MinIO, Inc. 
// // This file is part of MinIO Object Storage stack // @@ -27,11 +27,13 @@ import ( "path" "strings" "sync" + "time" "github.com/minio/madmin-go/v3" "github.com/minio/minio/internal/crypto" "github.com/minio/minio/internal/hash" "github.com/minio/minio/internal/kms" + "github.com/prometheus/client_golang/prometheus" ) //go:generate msgp -file $GOFILE @@ -80,6 +82,96 @@ type TierConfigMgr struct { Tiers map[string]madmin.TierConfig `json:"tiers"` } +type tierMetrics struct { + sync.RWMutex // protects requestsCount only + requestsCount map[string]struct { + success int64 + failure int64 + } + histogram *prometheus.HistogramVec +} + +var globalTierMetrics = tierMetrics{ + requestsCount: make(map[string]struct { + success int64 + failure int64 + }), + histogram: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "tier_ttlb_seconds", + Help: "Time taken by requests served by warm tier", + Buckets: []float64{0.01, 0.1, 1, 2, 5, 10, 60, 5 * 60, 15 * 60, 30 * 60}, + }, []string{"tier"}), +} + +func (t *tierMetrics) Observe(tier string, dur time.Duration) { + t.histogram.With(prometheus.Labels{"tier": tier}).Observe(dur.Seconds()) +} + +func (t *tierMetrics) logSuccess(tier string) { + t.Lock() + defer t.Unlock() + + stat := t.requestsCount[tier] + stat.success++ + t.requestsCount[tier] = stat +} + +func (t *tierMetrics) logFailure(tier string) { + t.Lock() + defer t.Unlock() + + stat := t.requestsCount[tier] + stat.failure++ + t.requestsCount[tier] = stat +} + +var ( + // {minio_node}_{tier}_{ttlb_seconds_distribution} + tierTTLBMD = MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: tierSubsystem, + Name: ttlbDistribution, + Help: "Distribution of time to last byte for objects downloaded from warm tier", + Type: gaugeMetric, + } + + // {minio_node}_{tier}_{requests_success} + tierRequestsSuccessMD = MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: tierSubsystem, + Name: tierRequestsSuccess, + Help: "Number of requests to 
download object from warm tier that were successful", + Type: counterMetric, + } + // {minio_node}_{tier}_{requests_failure} + tierRequestsFailureMD = MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: tierSubsystem, + Name: tierRequestsFailure, + Help: "Number of requests to download object from warm tier that failed", + Type: counterMetric, + } +) + +func (t *tierMetrics) Report() []Metric { + metrics := getHistogramMetrics(t.histogram, tierTTLBMD) + t.RLock() + defer t.RUnlock() + for tier, stat := range t.requestsCount { + metrics = append(metrics, Metric{ + Description: tierRequestsSuccessMD, + Value: float64(stat.success), + VariableLabels: map[string]string{"tier": tier}, + }) + metrics = append(metrics, Metric{ + Description: tierRequestsFailureMD, + Value: float64(stat.failure), + VariableLabels: map[string]string{"tier": tier}, + }) + } + return metrics +} + // IsTierValid returns true if there exists a remote tier by name tierName, // otherwise returns false. func (config *TierConfigMgr) IsTierValid(tierName string) bool { diff --git a/cmd/tier_test.go b/cmd/tier_test.go new file mode 100644 index 000000000..9cf62b8d9 --- /dev/null +++ b/cmd/tier_test.go @@ -0,0 +1,52 @@ +// Copyright (c) 2015-2023 MinIO, Inc. +// +// This file is part of MinIO Object Storage stack +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>.
+ +package cmd + +import ( + "testing" + "time" +) + +func TestTierMetrics(t *testing.T) { + tier := "WARM-1" + globalTierMetrics.Observe(tier, 200*time.Millisecond) + expSuccess := 10 + expFailure := 5 + for i := 0; i < expSuccess; i++ { + globalTierMetrics.logSuccess(tier) + } + for i := 0; i < expFailure; i++ { + globalTierMetrics.logFailure(tier) + } + metrics := globalTierMetrics.Report() + var succ, fail float64 + for _, metric := range metrics { + switch metric.Description.Name { + case tierRequestsSuccess: + succ += metric.Value + case tierRequestsFailure: + fail += metric.Value + } + } + if int(succ) != expSuccess { + t.Fatalf("Expected %d successes but got %f", expSuccess, succ) + } + if int(fail) != expFailure { + t.Fatalf("Expected %d failures but got %f", expFailure, fail) + } +} diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md index dc40fd898..af029ab0e 100644 --- a/docs/metrics/prometheus/list.md +++ b/docs/metrics/prometheus/list.md @@ -200,6 +200,14 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc | `minio_node_ilm_transition_missed_immediate_tasks` | Number of missed immediate ILM transition tasks. | | `minio_node_ilm_versions_scanned` | Total number of object versions checked for ilm actions since server start. | +## Tier Metrics + +| Name | Description | +|:---------------------------------------------------|:----------------------------------------------------------------------------| +| `minio_node_tier_ttlb_seconds_distribution` | Distribution of time to last byte for objects downloaded from warm tier | +| `minio_node_tier_requests_success` | Number of requests to download object from warm tier that were successful | +| `minio_node_tier_requests_failure` | Number of requests to download object from warm tier that failed | + ## System Metrics | Name | Description |