From f7b665347e01335ab961bc8a7422f7a5fddbb880 Mon Sep 17 00:00:00 2001
From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com>
Date: Wed, 24 Apr 2024 05:26:12 +0530
Subject: [PATCH] Add system CPU metrics to metrics-v3 (#19560)

endpoint: /minio/metrics/v3/system/cpu

metrics:

- minio_system_cpu_avg_idle
- minio_system_cpu_avg_iowait
- minio_system_cpu_load
- minio_system_cpu_load_perc
- minio_system_cpu_nice
- minio_system_cpu_steal
- minio_system_cpu_system
- minio_system_cpu_user
---
 cmd/metrics-resource.go      | 22 ++++++----
 cmd/metrics-v3-cache.go      | 27 ++++++++++++
 cmd/metrics-v3-system-cpu.go | 96 ++++++++++++++++++++++++++++++++++++
 cmd/metrics-v3.go            | 16 +++++++
 docs/metrics/v3.md           | 12 ++++++
 5 files changed, 164 insertions(+), 9 deletions(-)
 create mode 100644 cmd/metrics-v3-system-cpu.go

diff --git a/cmd/metrics-resource.go b/cmd/metrics-resource.go
index eaa07ba5c..bda5c3899 100644
--- a/cmd/metrics-resource.go
+++ b/cmd/metrics-resource.go
@@ -162,14 +162,7 @@ func init() {
 	resourceCollector = newMinioResourceCollector(resourceMetricsGroups)
 }
 
-func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
-	resourceMetricsMapMu.Lock()
-	defer resourceMetricsMapMu.Unlock()
-	subsysMetrics, found := resourceMetricsMap[subSys]
-	if !found {
-		subsysMetrics = ResourceMetrics{}
-	}
-
+func getResourceKey(name MetricName, labels map[string]string) string {
 	// labels are used to uniquely identify a metric
 	// e.g. reads_per_sec_{drive} inside the map
 	sfx := ""
@@ -180,7 +173,18 @@ func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64,
 		sfx += v
 	}
 
-	key := string(name) + "_" + sfx
+	return string(name) + "_" + sfx
+}
+
+func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
+	resourceMetricsMapMu.Lock()
+	defer resourceMetricsMapMu.Unlock()
+	subsysMetrics, found := resourceMetricsMap[subSys]
+	if !found {
+		subsysMetrics = ResourceMetrics{}
+	}
+
+	key := getResourceKey(name, labels)
 	metric, found := subsysMetrics[key]
 	if !found {
 		metric = ResourceMetric{
diff --git a/cmd/metrics-v3-cache.go b/cmd/metrics-v3-cache.go
index 3f178f8e9..ac40681a8 100644
--- a/cmd/metrics-v3-cache.go
+++ b/cmd/metrics-v3-cache.go
@@ -35,6 +35,7 @@ type metricsCache struct {
 	esetHealthResult    *cachevalue.Cache[HealthResult]
 	driveMetrics        *cachevalue.Cache[storageMetrics]
 	memoryMetrics       *cachevalue.Cache[madmin.MemInfo]
+	cpuMetrics          *cachevalue.Cache[madmin.CPUMetrics]
 	clusterDriveMetrics *cachevalue.Cache[storageMetrics]
 	nodesUpDown         *cachevalue.Cache[nodesOnline]
 }
@@ -45,6 +46,7 @@ func newMetricsCache() *metricsCache {
 		esetHealthResult:    newESetHealthResultCache(),
 		driveMetrics:        newDriveMetricsCache(),
 		memoryMetrics:       newMemoryMetricsCache(),
+		cpuMetrics:          newCPUMetricsCache(),
 		clusterDriveMetrics: newClusterStorageInfoCache(),
 		nodesUpDown:         newNodesUpDownCache(),
 	}
@@ -200,6 +202,31 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
 		loadDriveMetrics)
 }
 
+func newCPUMetricsCache() *cachevalue.Cache[madmin.CPUMetrics] {
+	loadCPUMetrics := func() (v madmin.CPUMetrics, err error) {
+		var types madmin.MetricType = madmin.MetricsCPU
+
+		m := collectLocalMetrics(types, collectMetricsOpts{
+			hosts: map[string]struct{}{
+				globalLocalNodeName: {},
+			},
+		})
+
+		for _, hm := range m.ByHost {
+			if hm.CPU != nil {
+				v = *hm.CPU
+				break
+			}
+		}
+
+		return
+	}
+
+	return cachevalue.NewFromFunc(1*time.Minute,
+		cachevalue.Opts{ReturnLastGood: true},
+		loadCPUMetrics)
+}
+
 func newMemoryMetricsCache() *cachevalue.Cache[madmin.MemInfo] {
 	loadMemoryMetrics := func() (v madmin.MemInfo, err error) {
 		var types madmin.MetricType = madmin.MetricsMem
diff --git a/cmd/metrics-v3-system-cpu.go b/cmd/metrics-v3-system-cpu.go
new file mode 100644
index 000000000..d6526b0fd
--- /dev/null
+++ b/cmd/metrics-v3-system-cpu.go
@@ -0,0 +1,96 @@
+// Copyright (c) 2015-2024 MinIO, Inc.
+//
+// # This file is part of MinIO Object Storage stack
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+package cmd
+
+import (
+	"context"
+	"math"
+)
+
+const (
+	sysCPUAvgIdle   = "avg_idle"
+	sysCPUAvgIOWait = "avg_iowait"
+	sysCPULoad      = "load"
+	sysCPULoadPerc  = "load_perc"
+	sysCPUNice      = "nice"
+	sysCPUSteal     = "steal"
+	sysCPUSystem    = "system"
+	sysCPUUser      = "user"
+)
+
+var (
+	sysCPUAvgIdleMD   = NewGaugeMD(sysCPUAvgIdle, "Average CPU idle time")
+	sysCPUAvgIOWaitMD = NewGaugeMD(sysCPUAvgIOWait, "Average CPU IOWait time")
+	sysCPULoadMD      = NewGaugeMD(sysCPULoad, "CPU load average 1min")
+	sysCPULoadPercMD  = NewGaugeMD(sysCPULoadPerc, "CPU load average 1min (percentage)")
+	sysCPUNiceMD      = NewGaugeMD(sysCPUNice, "CPU nice time")
+	sysCPUStealMD     = NewGaugeMD(sysCPUSteal, "CPU steal time")
+	sysCPUSystemMD    = NewGaugeMD(sysCPUSystem, "CPU system time")
+	sysCPUUserMD      = NewGaugeMD(sysCPUUser, "CPU user time")
+)
+
+// loadCPUMetrics - `MetricsLoaderFn` for system CPU metrics.
+//
+// Load figures and per-mode CPU time percentages are taken from the cached
+// CPU metrics; the average idle and IOWait values are taken from
+// resourceMetricsMap. A metric whose input is unavailable is left unset
+// rather than being reported as NaN/Inf.
+func loadCPUMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
+	// The loader built in newCPUMetricsCache never returns a non-nil error, so
+	// the error is ignored; a zero-valued result is handled by the checks below.
+	cpuMetrics, _ := c.cpuMetrics.Get()
+
+	if cpuMetrics.LoadStat != nil {
+		m.Set(sysCPULoad, cpuMetrics.LoadStat.Load1)
+		// A zero CPU count would make the percentage +Inf or NaN.
+		if cpuMetrics.CPUCount > 0 {
+			perc := cpuMetrics.LoadStat.Load1 * 100 / float64(cpuMetrics.CPUCount)
+			m.Set(sysCPULoadPerc, math.Round(perc*100)/100)
+		}
+	}
+
+	// TimesStat is nil when the cache holds no CPU sample (zero-valued result).
+	if ts := cpuMetrics.TimesStat; ts != nil {
+		tot := ts.User + ts.System + ts.Idle + ts.Iowait + ts.Nice + ts.Steal
+		// A zero total would turn every percentage below into NaN.
+		if tot > 0 {
+			m.Set(sysCPUUser, math.Round(ts.User/tot*100*100)/100)
+			m.Set(sysCPUSystem, math.Round(ts.System/tot*100*100)/100)
+			m.Set(sysCPUNice, math.Round(ts.Nice/tot*100*100)/100)
+			m.Set(sysCPUSteal, math.Round(ts.Steal/tot*100*100)/100)
+		}
+	}
+
+	// metrics-resource.go runs a job to collect resource metrics including their Avg values and
+	// stores them in resourceMetricsMap. We can use it to get the Avg values of CPU idle and IOWait.
+	// updateResourceMetrics writes the map under resourceMetricsMapMu, so read it under the same lock.
+	resourceMetricsMapMu.Lock()
+	defer resourceMetricsMapMu.Unlock()
+	cpuResourceMetrics, found := resourceMetricsMap[cpuSubsystem]
+	if found {
+		if cpuIdleMetric, ok := cpuResourceMetrics[getResourceKey(cpuIdle, nil)]; ok {
+			avgVal := math.Round(cpuIdleMetric.Avg*100) / 100
+			m.Set(sysCPUAvgIdle, avgVal)
+		}
+		if cpuIOWaitMetric, ok := cpuResourceMetrics[getResourceKey(cpuIOWait, nil)]; ok {
+			avgVal := math.Round(cpuIOWaitMetric.Avg*100) / 100
+			m.Set(sysCPUAvgIOWait, avgVal)
+		}
+	}
+	return nil
+}
diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go
index 00604c858..22242b11d 100644
--- a/cmd/metrics-v3.go
+++ b/cmd/metrics-v3.go
@@ -36,6 +36,7 @@ const (
 	systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
 	systemDriveCollectorPath            collectorPath = "/system/drive"
 	systemMemoryCollectorPath           collectorPath = "/system/memory"
+	systemCPUCollectorPath              collectorPath = "/system/cpu"
 	systemProcessCollectorPath          collectorPath = "/system/process"
 	systemGoCollectorPath               collectorPath = "/system/go"
 
@@ -128,6 +129,20 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
 		loadMemoryMetrics,
 	)
 
+	systemCPUMG := NewMetricsGroup(systemCPUCollectorPath,
+		[]MetricDescriptor{
+			sysCPUAvgIdleMD,
+			sysCPUAvgIOWaitMD,
+			sysCPULoadMD,
+			sysCPULoadPercMD,
+			sysCPUNiceMD,
+			sysCPUStealMD,
+			sysCPUSystemMD,
+			sysCPUUserMD,
+		},
+		loadCPUMetrics,
+	)
+
 	systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
 		[]MetricDescriptor{
 			driveUsedBytesMD,
@@ -235,6 +250,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
 		systemNetworkInternodeMG,
 		systemDriveMG,
 		systemMemoryMG,
+		systemCPUMG,
 
 		clusterHealthMG,
 		clusterUsageObjectsMG,
diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md
index 66d0047e5..56ff26be0 100644
--- a/docs/metrics/v3.md
+++ b/docs/metrics/v3.md
@@ -139,6 +139,18 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
 | `minio_system_memory_shared` | `gauge` | Shared memory on the node | `server` |
 | `minio_system_memory_available` | `gauge` | Available memory on the node | `server` |
 
+### `/system/cpu`
+
+| Name                          | Type    | Help                               | Labels   |
+|-------------------------------|---------|------------------------------------|----------|
+| `minio_system_cpu_avg_idle`   | `gauge` | Average CPU idle time              | `server` |
+| `minio_system_cpu_avg_iowait` | `gauge` | Average CPU IOWait time            | `server` |
+| `minio_system_cpu_load`       | `gauge` | CPU load average 1min              | `server` |
+| `minio_system_cpu_load_perc`  | `gauge` | CPU load average 1min (percentage) | `server` |
+| `minio_system_cpu_nice`       | `gauge` | CPU nice time                      | `server` |
+| `minio_system_cpu_steal`      | `gauge` | CPU steal time                     | `server` |
+| `minio_system_cpu_system`     | `gauge` | CPU system time                    | `server` |
+| `minio_system_cpu_user`       | `gauge` | CPU user time                      | `server` |
 ### `/system/network/internode`