From e11d851aee0f729daf5c58087fb58364f4b3684b Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 19 Jan 2024 14:51:36 -0800 Subject: [PATCH] add new drive I/O waiting/tokens metric (#18836) Bonus: add virtual memory used as well part of the system resource metrics. --- cmd/erasure.go | 2 ++ cmd/metrics-v2.go | 60 +++++++++++++++++++++++++++++++-- docs/metrics/prometheus/list.md | 25 ++++++++------ go.mod | 2 +- go.sum | 4 +-- 5 files changed, 77 insertions(+), 16 deletions(-) diff --git a/cmd/erasure.go b/cmd/erasure.go index 9642d8a9d..2160fb85a 100644 --- a/cmd/erasure.go +++ b/cmd/erasure.go @@ -203,6 +203,8 @@ func getDisksInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) (disks APICalls: make(map[string]uint64, len(info.Metrics.APICalls)), TotalErrorsAvailability: info.Metrics.TotalErrorsAvailability, TotalErrorsTimeout: info.Metrics.TotalErrorsTimeout, + TotalTokens: info.Metrics.TotalTokens, + TotalWaiting: info.Metrics.TotalWaiting, } for k, v := range info.Metrics.LastMinute { if v.N > 0 { diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 1022bfe8c..8388f3301 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -1,4 +1,4 @@ -// Copyright (c) 2015-2023 MinIO, Inc. +// Copyright (c) 2015-2024 MinIO, Inc. 
// // This file is part of MinIO Object Storage stack // @@ -256,6 +256,7 @@ const ( startTime = "starttime_seconds" upTime = "uptime_seconds" memory = "resident_memory_bytes" + vmemory = "virtual_memory_bytes" cpu = "cpu_total_seconds" expiryPendingTasks MetricName = "expiry_pending_tasks" @@ -519,7 +520,7 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription { Namespace: nodeMetricNamespace, Subsystem: driveSubsystem, Name: "errors_timeout", - Help: "Total number of timeout errors since server start", + Help: "Total number of drive timeout errors since server start", Type: counterMetric, } } @@ -529,7 +530,27 @@ func getNodeDriveAvailablityErrorsMD() MetricDescription { Namespace: nodeMetricNamespace, Subsystem: driveSubsystem, Name: "errors_availability", - Help: "Total number of I/O errors, permission denied and timeouts since server start", + Help: "Total number of drive I/O errors, permission denied and timeouts since server start", + Type: counterMetric, + } +} + +func getNodeDriveWaitingIOMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: driveSubsystem, + Name: "io_waiting", + Help: "Total number I/O operations waiting on drive", + Type: counterMetric, + } +} + +func getNodeDriveTokensIOMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: driveSubsystem, + Name: "io_tokens", + Help: "Total number concurrent I/O operations configured on drive", Type: counterMetric, } } @@ -1532,6 +1553,16 @@ func getMinIOProcessResidentMemory() MetricDescription { } } +func getMinIOProcessVirtualMemory() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: processSubsystem, + Name: vmemory, + Help: "Virtual memory size in bytes", + Type: gaugeMetric, + } +} + func getMinIOProcessCPUTime() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, @@ -1654,6 +1685,14 @@ func getMinioProcMetrics() *MetricsGroup { }) } + 
if stat.VirtualMemory() > 0 { + metrics = append(metrics, + Metric{ + Description: getMinIOProcessVirtualMemory(), + Value: float64(stat.VirtualMemory()), + }) + } + if stat.CPUTime() > 0 { metrics = append(metrics, Metric{ @@ -2900,6 +2939,9 @@ func getClusterUsageMetrics(opts MetricsGroupOpts) *MetricsGroup { } mg.RegisterRead(func(ctx context.Context) (metrics []Metric) { objLayer := newObjectLayerFn() + if objLayer == nil { + return + } metrics = make([]Metric, 0, 50) dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer) @@ -3260,6 +3302,18 @@ func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroup { VariableLabels: map[string]string{"drive": disk.DrivePath}, }) + metrics = append(metrics, Metric{ + Description: getNodeDriveWaitingIOMD(), + Value: float64(disk.Metrics.TotalWaiting), + VariableLabels: map[string]string{"drive": disk.DrivePath}, + }) + + metrics = append(metrics, Metric{ + Description: getNodeDriveTokensIOMD(), + Value: float64(disk.Metrics.TotalTokens), + VariableLabels: map[string]string{"drive": disk.DrivePath}, + }) + for apiName, latency := range disk.Metrics.LastMinute { metrics = append(metrics, Metric{ Description: getNodeDriveAPILatencyMD(), diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md index 65b447d95..0ae9baba3 100644 --- a/docs/metrics/prometheus/list.md +++ b/docs/metrics/prometheus/list.md @@ -170,16 +170,20 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc ## Drive Metrics -| Name | Description | -|:---------------------------------|:--------------------------------------------------------------------| -| `minio_node_drive_free_bytes` | Total storage available on a drive. | -| `minio_node_drive_free_inodes` | Total free inodes. | -| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. | -| `minio_node_drive_offline_total` | Total drives offline in this node. 
| -| `minio_node_drive_online_total` | Total drives online in this node. | -| `minio_node_drive_total` | Total drives in this node. | -| `minio_node_drive_total_bytes` | Total storage on a drive. | -| `minio_node_drive_used_bytes` | Total storage used on a drive. | +| Name | Description | +|:---------------------------------------|:------------------------------------------------------------------------------------| +| `minio_node_drive_free_bytes` | Total storage available on a drive. | +| `minio_node_drive_free_inodes` | Total free inodes. | +| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. | +| `minio_node_drive_offline_total` | Total drives offline in this node. | +| `minio_node_drive_online_total` | Total drives online in this node. | +| `minio_node_drive_total` | Total drives in this node. | +| `minio_node_drive_total_bytes` | Total storage on a drive. | +| `minio_node_drive_used_bytes` | Total storage used on a drive. | +| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start. | +| `minio_node_drive_errors_availability` | Total number of drive I/O errors, permission denied and timeouts since server start. | +| `minio_node_drive_io_waiting` | Total number of I/O operations waiting on drive. | +| `minio_node_drive_io_tokens` | Total number of concurrent I/O operations configured on drive. | ## Identity and Access Management (IAM) Metrics @@ -228,6 +232,7 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc | `minio_node_io_write_bytes` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes. | | `minio_node_process_cpu_total_seconds` | Total user and system CPU time spent in seconds. | | `minio_node_process_resident_memory_bytes` | Resident memory size in bytes. | +| `minio_node_process_virtual_memory_bytes` | Virtual memory size in bytes. 
| | `minio_node_process_starttime_seconds` | Start time for MinIO process per node, time in seconds since Unix epoc. | | `minio_node_process_uptime_seconds` | Uptime for MinIO process per node in seconds. | diff --git a/go.mod b/go.mod index e3c45baf6..13e03d00d 100644 --- a/go.mod +++ b/go.mod @@ -51,7 +51,7 @@ require ( github.com/minio/dperf v0.5.3 github.com/minio/highwayhash v1.0.2 github.com/minio/kes-go v0.2.0 - github.com/minio/madmin-go/v3 v3.0.38 + github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f github.com/minio/minio-go/v7 v7.0.66 github.com/minio/mux v1.9.0 github.com/minio/pkg/v2 v2.0.8 diff --git a/go.sum b/go.sum index 53f1a5f15..c9a8e09fb 100644 --- a/go.sum +++ b/go.sum @@ -443,8 +443,8 @@ github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY= github.com/minio/kes-go v0.2.0 h1:HA33arq9s3MErbsj3PAXFVfFo4U4yw7lTKQ5kWFrpCA= github.com/minio/kes-go v0.2.0/go.mod h1:VorHLaIYis9/MxAHAtXN4d8PUMNKhIxTIlvFt0hBOEo= -github.com/minio/madmin-go/v3 v3.0.38 h1:hgyQg43IkTq40ymFWoJwZyoqjYoT2GkiPlc1e7Bu+dY= -github.com/minio/madmin-go/v3 v3.0.38/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8= +github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f h1:clgtVs6KUJTtKb4Xghq35gyJM/m10IwEmgfb4Do6BuY= +github.com/minio/madmin-go/v3 v3.0.40-0.20240119195114-66fab65f959f/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8= github.com/minio/mc v0.0.0-20240111054932-d4305a5bf95e h1:vKnv5aBTcAAnDGYeJW/SPieXCerp/7MIYxuEUYt7iOE= github.com/minio/mc v0.0.0-20240111054932-d4305a5bf95e/go.mod h1:wFVJTmLJniMFDkcvPP0h/KvCxK+MiA2rc6q7KUefN28= github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=