From 4caa3422bdfcb3d3d145b69c56d4873391f43938 Mon Sep 17 00:00:00 2001 From: Shireesh Anjal <355479+anjalshireesh@users.noreply.github.com> Date: Fri, 26 Apr 2024 21:37:23 +0530 Subject: [PATCH] Add process metrics in `metrics-v3` (#19612) endpoint: /minio/metrics/v3/system/process metrics: - locks_read_total - locks_write_total - cpu_total_seconds - go_routine_total - io_rchar_bytes - io_read_bytes - io_wchar_bytes - io_write_bytes - start_time_seconds - uptime_seconds - file_descriptor_limit_total - file_descriptor_open_total - syscall_read_total - syscall_write_total - resident_memory_bytes - virtual_memory_bytes - virtual_memory_max_bytes Since the standard process collector implements only a subset of these metrics, remove it and implement our own custom process collector that captures all the process metrics we need. --- cmd/metrics-v3-system-process.go | 172 +++++++++++++++++++++++++++++++ cmd/metrics-v3.go | 29 +++++- docs/metrics/v3.md | 24 ++++- 3 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 cmd/metrics-v3-system-process.go diff --git a/cmd/metrics-v3-system-process.go b/cmd/metrics-v3-system-process.go new file mode 100644 index 000000000..0db6140bc --- /dev/null +++ b/cmd/metrics-v3-system-process.go @@ -0,0 +1,172 @@ +// Copyright (c) 2015-2024 MinIO, Inc. +// +// # This file is part of MinIO Object Storage stack +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. 
+// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package cmd + +import ( + "context" + "runtime" + "time" + + "github.com/prometheus/procfs" +) + +const ( + processLocksReadTotal = "locks_read_total" + processLocksWriteTotal = "locks_write_total" + processCPUTotalSeconds = "cpu_total_seconds" + processGoRoutineTotal = "go_routine_total" + processIORCharBytes = "io_rchar_bytes" + processIOReadBytes = "io_read_bytes" + processIOWCharBytes = "io_wchar_bytes" + processIOWriteBytes = "io_write_bytes" + processStartTimeSeconds = "start_time_seconds" + processUptimeSeconds = "uptime_seconds" + processFileDescriptorLimitTotal = "file_descriptor_limit_total" + processFileDescriptorOpenTotal = "file_descriptor_open_total" + processSyscallReadTotal = "syscall_read_total" + processSyscallWriteTotal = "syscall_write_total" + processResidentMemoryBytes = "resident_memory_bytes" + processVirtualMemoryBytes = "virtual_memory_bytes" + processVirtualMemoryMaxBytes = "virtual_memory_max_bytes" +) + +var ( + processLocksReadTotalMD = NewGaugeMD(processLocksReadTotal, "Number of current READ locks on this peer") + processLocksWriteTotalMD = NewGaugeMD(processLocksWriteTotal, "Number of current WRITE locks on this peer") + processCPUTotalSecondsMD = NewCounterMD(processCPUTotalSeconds, "Total user and system CPU time spent in seconds") + processGoRoutineTotalMD = NewGaugeMD(processGoRoutineTotal, "Total number of go routines running") + processIORCharBytesMD = NewCounterMD(processIORCharBytes, "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar") + processIOReadBytesMD = NewCounterMD(processIOReadBytes, "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes") + processIOWCharBytesMD = NewCounterMD(processIOWCharBytes, "Total bytes written by the process to the underlying storage system including page cache, 
/proc/[pid]/io wchar") + processIOWriteBytesMD = NewCounterMD(processIOWriteBytes, "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes") + processStarttimeSecondsMD = NewGaugeMD(processStartTimeSeconds, "Start time for MinIO process in seconds since Unix epoc") + processUptimeSecondsMD = NewGaugeMD(processUptimeSeconds, "Uptime for MinIO process in seconds") + processFileDescriptorLimitTotalMD = NewGaugeMD(processFileDescriptorLimitTotal, "Limit on total number of open file descriptors for the MinIO Server process") + processFileDescriptorOpenTotalMD = NewGaugeMD(processFileDescriptorOpenTotal, "Total number of open file descriptors by the MinIO Server process") + processSyscallReadTotalMD = NewCounterMD(processSyscallReadTotal, "Total read SysCalls to the kernel. /proc/[pid]/io syscr") + processSyscallWriteTotalMD = NewCounterMD(processSyscallWriteTotal, "Total write SysCalls to the kernel. /proc/[pid]/io syscw") + processResidentMemoryBytesMD = NewGaugeMD(processResidentMemoryBytes, "Resident memory size in bytes") + processVirtualMemoryBytesMD = NewGaugeMD(processVirtualMemoryBytes, "Virtual memory size in bytes") + processVirtualMemoryMaxBytesMD = NewGaugeMD(processVirtualMemoryMaxBytes, "Maximum virtual memory size in bytes") +) + +func loadProcStatMetrics(ctx context.Context, stat procfs.ProcStat, m MetricValues) { + if stat.CPUTime() > 0 { + m.Set(processCPUTotalSeconds, float64(stat.CPUTime())) + } + + if stat.ResidentMemory() > 0 { + m.Set(processResidentMemoryBytes, float64(stat.ResidentMemory())) + } + + if stat.VirtualMemory() > 0 { + m.Set(processVirtualMemoryBytes, float64(stat.VirtualMemory())) + } + + startTime, err := stat.StartTime() + if err != nil { + metricsLogIf(ctx, err) + } else if startTime > 0 { + m.Set(processStartTimeSeconds, float64(startTime)) + } +} + +func loadProcIOMetrics(ctx context.Context, io procfs.ProcIO, m MetricValues) { + if io.RChar > 0 { + m.Set(processIORCharBytes, 
float64(io.RChar)) + } + + if io.ReadBytes > 0 { + m.Set(processIOReadBytes, float64(io.ReadBytes)) + } + + if io.WChar > 0 { + m.Set(processIOWCharBytes, float64(io.WChar)) + } + + if io.WriteBytes > 0 { + m.Set(processIOWriteBytes, float64(io.WriteBytes)) + } + + if io.SyscR > 0 { + m.Set(processSyscallReadTotal, float64(io.SyscR)) + } + + if io.SyscW > 0 { + m.Set(processSyscallWriteTotal, float64(io.SyscW)) + } +} + +func loadProcFSMetrics(ctx context.Context, p procfs.Proc, m MetricValues) { + stat, err := p.Stat() + if err != nil { + metricsLogIf(ctx, err) + } else { + loadProcStatMetrics(ctx, stat, m) + } + + io, err := p.IO() + if err != nil { + metricsLogIf(ctx, err) + } else { + loadProcIOMetrics(ctx, io, m) + } + + l, err := p.Limits() + if err != nil { + metricsLogIf(ctx, err) + } else { + if l.OpenFiles > 0 { + m.Set(processFileDescriptorLimitTotal, float64(l.OpenFiles)) + } + + if l.AddressSpace > 0 { + m.Set(processVirtualMemoryMaxBytes, float64(l.AddressSpace)) + } + } + + openFDs, err := p.FileDescriptorsLen() + if err != nil { + metricsLogIf(ctx, err) + } else if openFDs > 0 { + m.Set(processFileDescriptorOpenTotal, float64(openFDs)) + } +} + +// loadProcessMetrics - `MetricsLoaderFn` for process metrics +func loadProcessMetrics(ctx context.Context, m MetricValues, c *metricsCache) error { + m.Set(processGoRoutineTotal, float64(runtime.NumGoroutine())) + + if !globalBootTime.IsZero() { + m.Set(processUptimeSeconds, time.Since(globalBootTime).Seconds()) + } + + p, err := procfs.Self() + if err != nil { + metricsLogIf(ctx, err) + } else { + loadProcFSMetrics(ctx, p, m) + } + + if globalIsDistErasure && globalLockServer != nil { + st := globalLockServer.stats() + m.Set(processLocksReadTotal, float64(st.Reads)) + m.Set(processLocksWriteTotal, float64(st.Writes)) + } + return nil +} diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index c0c2e19ed..38c30dd21 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -144,6 +144,29 @@ func 
newMetricGroups(r *prometheus.Registry) *metricsV3Collection { loadCPUMetrics, ) + systemProcessMG := NewMetricsGroup(systemProcessCollectorPath, + []MetricDescriptor{ + processLocksReadTotalMD, + processLocksWriteTotalMD, + processCPUTotalSecondsMD, + processGoRoutineTotalMD, + processIORCharBytesMD, + processIOReadBytesMD, + processIOWCharBytesMD, + processIOWriteBytesMD, + processStarttimeSecondsMD, + processUptimeSecondsMD, + processFileDescriptorLimitTotalMD, + processFileDescriptorOpenTotalMD, + processSyscallReadTotalMD, + processSyscallWriteTotalMD, + processResidentMemoryBytesMD, + processVirtualMemoryBytesMD, + processVirtualMemoryMaxBytesMD, + }, + loadProcessMetrics, + ) + systemDriveMG := NewMetricsGroup(systemDriveCollectorPath, []MetricDescriptor{ driveUsedBytesMD, @@ -263,6 +286,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { systemDriveMG, systemMemoryMG, systemCPUMG, + systemProcessMG, clusterHealthMG, clusterUsageObjectsMG, @@ -299,13 +323,10 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { } // Prepare to register the collectors. Other than `MetricGroup` collectors, - // we also have standard collectors like `ProcessCollector` and `GoCollector`. + // we also have standard collectors like `GoCollector`. // Create all Non-`MetricGroup` collectors here. collectors := map[collectorPath]prometheus.Collector{ - systemProcessCollectorPath: collectors.NewProcessCollector(collectors.ProcessCollectorOpts{ - ReportErrors: true, - }), systemGoCollectorPath: collectors.NewGoCollector(), } diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index 38713704d..e10bc947c 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -64,7 +64,7 @@ These present metrics about the whole MinIO cluster. Each of the following sub-sections list metrics returned by each of the endpoints. -The standard metrics groups for ProcessCollector and GoCollector are not shown below. 
+The standard metrics group for GoCollector is not shown below. ### `/api/requests` @@ -163,6 +163,28 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_network_internode_sent_bytes_total` | `counter` | Total number of bytes sent to other peer nodes | `server,pool_index` | | `minio_system_network_internode_recv_bytes_total` | `counter` | Total number of bytes received from other peer nodes | `server,pool_index` | +### `/system/process` + +| Name | Type | Help | Labels | +|----------------------------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------| +| `minio_system_process_locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` | +| `minio_system_process_locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` | +| `minio_system_process_cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` | +| `minio_system_process_go_routine_total` | `gauge` | Total number of go routines running | `server` | +| `minio_system_process_io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` | +| `minio_system_process_io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` | +| `minio_system_process_io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` | +| `minio_system_process_io_write_bytes` | `counter` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` | +| `minio_system_process_start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoch | `server` | +| `minio_system_process_uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | `server` | +| `minio_system_process_file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` | +| 
`minio_system_process_file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` | +| `minio_system_process_syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` | +| `minio_system_process_syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` | +| `minio_system_process_resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` | +| `minio_system_process_virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` | +| `minio_system_process_virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` | + ### `/cluster/health` | Name | Type | Help | Labels |