Add support for resource metrics (#18057)
Add a new endpoint for "resource" metrics, `/v2/metrics/resource`. It returns system metrics related to drives, network, CPU, and memory. Except for drive metrics, each metric also has corresponding "avg" and "max" variants. The existing real-time metrics feature is reused to capture the required data, with CPU and memory metrics added to it. The data is collected every minute; the average and maximum values are updated on each cycle, and the latest values are returned when the API is called.
This commit is contained in:
parent
c50627ee3e
commit
6d20ec3bea
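As a quick illustration of the new API (not part of the commit): a minimal Go sketch that scrapes the resource endpoint. It assumes a local server on `localhost:9000` with `MINIO_PROMETHEUS_AUTH_TYPE=public`, so no bearer token is needed; the path combines the reserved `/minio` prefix with the new `prometheusMetricsV2ResourcePath` constant introduced below.

```go
// Minimal sketch: fetch the new resource metrics endpoint and print the
// Prometheus text exposition returned by the server.
package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// "/minio" is minioReservedBucketPath; "/v2/metrics/resource" is the
	// prometheusMetricsV2ResourcePath constant registered by this commit.
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/resource")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body))
}
```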
@@ -231,7 +231,8 @@ func guessIsMetricsReq(req *http.Request) bool {
 		req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
 		req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
 		req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath ||
-		req.URL.Path == minioReservedBucketPath+prometheusMetricsV2BucketPath
+		req.URL.Path == minioReservedBucketPath+prometheusMetricsV2BucketPath ||
+		req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ResourcePath
 }

 // guessIsRPCReq - returns true if the request is for an RPC endpoint.
@@ -19,12 +19,15 @@ package cmd

 import (
 	"context"
+	"fmt"
 	"net/http"
 	"time"

 	"github.com/minio/madmin-go/v3"
 	"github.com/minio/minio/internal/disk"
 	"github.com/minio/minio/internal/net"
+	c "github.com/shirou/gopsutil/v3/cpu"
+	"github.com/shirou/gopsutil/v3/load"
 )

 type collectMetricsOpts struct {
@@ -89,6 +92,34 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma
 			m.Aggregated.Net.NetStats = netStats
 		}
 	}
+	if types.Contains(madmin.MetricsMem) {
+		m.Aggregated.Mem = &madmin.MemMetrics{
+			CollectedAt: UTCNow(),
+		}
+		m.Aggregated.Mem.Info = madmin.GetMemInfo(GlobalContext, globalMinioAddr)
+	}
+	if types.Contains(madmin.MetricsCPU) {
+		m.Aggregated.CPU = &madmin.CPUMetrics{
+			CollectedAt: UTCNow(),
+		}
+		cm, err := c.Times(false)
+		if err != nil {
+			m.Errors = append(m.Errors, err.Error())
+		} else {
+			// not collecting per-cpu stats, so there will be only one element
+			if len(cm) == 1 {
+				m.Aggregated.CPU.TimesStat = &cm[0]
+			} else {
+				m.Errors = append(m.Errors, fmt.Sprintf("Expected one CPU stat, got %d", len(cm)))
+			}
+		}
+		loadStat, err := load.Avg()
+		if err != nil {
+			m.Errors = append(m.Errors, err.Error())
+		} else {
+			m.Aggregated.CPU.LoadStat = loadStat
+		}
+	}
 	// Add types...

 	// ByHost is a shallow reference, so careful about sharing.
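For reference, a standalone sketch of the two gopsutil calls introduced above, outside of MinIO; it only assumes the `github.com/shirou/gopsutil/v3` module is available. `Times(false)` returns a single aggregate `cpu.TimesStat`, and `load.Avg()` returns the 1/5/15-minute load averages that feed the new CPU metrics.

```go
// Sketch: inspect the raw values gopsutil reports for CPU times and load.
package main

import (
	"fmt"

	c "github.com/shirou/gopsutil/v3/cpu"
	"github.com/shirou/gopsutil/v3/load"
)

func main() {
	// percpu=false -> one aggregate entry for the whole machine.
	cm, err := c.Times(false)
	if err == nil && len(cm) == 1 {
		fmt.Printf("user=%.1fs system=%.1fs idle=%.1fs iowait=%.1fs\n",
			cm[0].User, cm[0].System, cm[0].Idle, cm[0].Iowait)
	}

	// 1, 5 and 15 minute load averages.
	if ls, err := load.Avg(); err == nil {
		fmt.Printf("load1=%.2f load5=%.2f load15=%.2f\n", ls.Load1, ls.Load5, ls.Load15)
	}
}
```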
cmd/metrics-resource.go (new file, 453 lines)
@@ -0,0 +1,453 @@
// Copyright (c) 2015-2023 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"fmt"
	"math"
	"net/http"
	"sync"
	"time"

	"github.com/minio/madmin-go/v3"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/shirou/gopsutil/v3/host"
)

const (
	resourceMetricsCollectionInterval = time.Minute
	resourceMetricsCacheInterval      = time.Minute

	// drive stats
	totalInodes    MetricName = "total_inodes"
	readsPerSec    MetricName = "reads_per_sec"
	writesPerSec   MetricName = "writes_per_sec"
	readsKBPerSec  MetricName = "reads_kb_per_sec"
	writesKBPerSec MetricName = "writes_kb_per_sec"
	readsAwait     MetricName = "reads_await"
	writesAwait    MetricName = "writes_await"
	percUtil       MetricName = "perc_util"
	usedInodes     MetricName = "used_inodes"

	// network stats
	interfaceRxBytes  MetricName = "rx_bytes"
	interfaceRxErrors MetricName = "rx_errors"
	interfaceTxBytes  MetricName = "tx_bytes"
	interfaceTxErrors MetricName = "tx_errors"

	// memory stats
	memUsed      MetricName = "used"
	memFree      MetricName = "free"
	memShared    MetricName = "shared"
	memBuffers   MetricName = "buffers"
	memCache     MetricName = "cache"
	memAvailable MetricName = "available"

	// cpu stats
	cpuUser   MetricName = "user"
	cpuSystem MetricName = "system"
	cpuIOWait MetricName = "iowait"
	cpuIdle   MetricName = "idle"
	cpuNice   MetricName = "nice"
	cpuSteal  MetricName = "steal"
	cpuLoad1  MetricName = "load1"
	cpuLoad5  MetricName = "load5"
	cpuLoad15 MetricName = "load15"
)

var (
	resourceCollector *minioResourceCollector
	// resourceMetricsMap is a map of subsystem to its metrics
	resourceMetricsMap map[MetricSubsystem]ResourceMetrics
	// resourceMetricsHelpMap maps metric name to its help string
	resourceMetricsHelpMap map[MetricName]string

	resourceMetricsGroups []*MetricsGroup
)

// PeerResourceMetrics represents the resource metrics
// retrieved from a peer, along with errors if any
type PeerResourceMetrics struct {
	Metrics map[MetricSubsystem]ResourceMetrics
	Errors  []string
}

// ResourceMetrics is a map of unique key identifying
// a resource metric (e.g. reads_per_sec_{node}_{drive})
// to its data
type ResourceMetrics map[string]ResourceMetric

// ResourceMetric represents a single resource metric
// The metrics are collected from all servers periodically
// and stored in the resource metrics map.
// It also maintains the count of number of times this metric
// was collected since the server started, and the sum,
// average and max values across the same.
type ResourceMetric struct {
	Name   MetricName
	Labels map[string]string

	// value captured in current cycle
	Current float64

	// Used when system provides cumulative (since uptime) values
	// helps in calculating the current value by comparing the new
	// cumulative value with previous one
	Cumulative float64

	Max   float64
	Avg   float64
	Sum   float64
	Count uint64
}

func init() {
	interval := fmt.Sprintf("%ds", int(resourceMetricsCollectionInterval.Seconds()))
	resourceMetricsHelpMap = map[MetricName]string{
		interfaceRxBytes:  "Bytes received on the interface in " + interval,
		interfaceRxErrors: "Receive errors in " + interval,
		interfaceTxBytes:  "Bytes transmitted in " + interval,
		interfaceTxErrors: "Transmit errors in " + interval,
		total:             "Total memory on the node",
		memUsed:           "Used memory on the node",
		memFree:           "Free memory on the node",
		memShared:         "Shared memory on the node",
		memBuffers:        "Buffers memory on the node",
		memCache:          "Cache memory on the node",
		memAvailable:      "Available memory on the node",
		readsPerSec:       "Reads per second on a drive",
		writesPerSec:      "Writes per second on a drive",
		readsKBPerSec:     "Kilobytes read per second on a drive",
		writesKBPerSec:    "Kilobytes written per second on a drive",
		readsAwait:        "Average time for read requests to be served on a drive",
		writesAwait:       "Average time for write requests to be served on a drive",
		percUtil:          "Percentage of time the disk was busy since uptime",
		usedBytes:         "Used bytes on a drive",
		totalBytes:        "Total bytes on a drive",
		usedInodes:        "Total inodes used on a drive",
		totalInodes:       "Total inodes on a drive",
		cpuUser:           "CPU user time",
		cpuSystem:         "CPU system time",
		cpuIdle:           "CPU idle time",
		cpuIOWait:         "CPU ioWait time",
		cpuSteal:          "CPU steal time",
		cpuNice:           "CPU nice time",
		cpuLoad1:          "CPU load average 1min",
		cpuLoad5:          "CPU load average 5min",
		cpuLoad15:         "CPU load average 15min",
	}
	resourceMetricsGroups = []*MetricsGroup{
		getResourceMetrics(),
	}

	resourceCollector = newMinioResourceCollector(resourceMetricsGroups)
}

func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
	subsysMetrics, found := resourceMetricsMap[subSys]
	if !found {
		subsysMetrics = ResourceMetrics{}
	}

	// labels are used to uniquely identify a metric
	// e.g. reads_per_sec_{drive} inside the map
	sfx := ""
	for _, v := range labels {
		if len(sfx) > 0 {
			sfx += "_"
		}
		sfx += v
	}

	key := string(name) + "_" + sfx
	metric, found := subsysMetrics[key]
	if !found {
		metric = ResourceMetric{
			Name:   name,
			Labels: labels,
		}
	}

	if isCumulative {
		metric.Current = val - metric.Cumulative
		metric.Cumulative = val
	} else {
		metric.Current = val
	}

	if metric.Current > metric.Max {
		metric.Max = val
	}

	metric.Sum += metric.Current
	metric.Count++

	metric.Avg = metric.Sum / float64(metric.Count)
	subsysMetrics[key] = metric

	resourceMetricsMap[subSys] = subsysMetrics
}

func collectDriveMetrics(m madmin.RealtimeMetrics) {
	upt, _ := host.Uptime()
	kib := 1 << 10
	sectorSize := uint64(512)

	for d, dm := range m.ByDisk {
		stats := dm.IOStats
		labels := map[string]string{"drive": d}
		updateResourceMetrics(driveSubsystem, readsPerSec, float64(stats.ReadIOs)/float64(upt), labels, false)

		readBytes := stats.ReadSectors * sectorSize
		readKib := float64(readBytes) / float64(kib)
		readKibPerSec := readKib / float64(upt)
		updateResourceMetrics(driveSubsystem, readsKBPerSec, readKibPerSec, labels, false)

		updateResourceMetrics(driveSubsystem, writesPerSec, float64(stats.WriteIOs)/float64(upt), labels, false)

		writeBytes := stats.WriteSectors * sectorSize
		writeKib := float64(writeBytes) / float64(kib)
		writeKibPerSec := writeKib / float64(upt)
		updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKibPerSec, labels, false)

		rdAwait := 0.0
		if stats.ReadIOs > 0 {
			rdAwait = float64(stats.ReadTicks) / float64(stats.ReadIOs)
		}
		updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)

		wrAwait := 0.0
		if stats.WriteIOs > 0 {
			wrAwait = float64(stats.WriteTicks) / float64(stats.WriteIOs)
		}
		updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)

		updateResourceMetrics(driveSubsystem, percUtil, float64(stats.TotalTicks)/float64(upt*10), labels, false)
	}

	globalLocalDrivesMu.RLock()
	gld := globalLocalDrives
	globalLocalDrivesMu.RUnlock()

	for _, d := range gld {
		labels := map[string]string{"drive": d.Endpoint().RawPath}
		di, err := d.DiskInfo(GlobalContext, false)
		if err == nil {
			updateResourceMetrics(driveSubsystem, usedBytes, float64(di.Used), labels, false)
			updateResourceMetrics(driveSubsystem, totalBytes, float64(di.Total), labels, false)
			updateResourceMetrics(driveSubsystem, usedInodes, float64(di.UsedInodes), labels, false)
			updateResourceMetrics(driveSubsystem, totalInodes, float64(di.FreeInodes+di.UsedInodes), labels, false)
		}
	}
}

func collectLocalResourceMetrics() {
	var types madmin.MetricType = madmin.MetricsDisk | madmin.MetricNet | madmin.MetricsMem | madmin.MetricsCPU

	m := collectLocalMetrics(types, collectMetricsOpts{
		hosts: map[string]struct{}{
			globalLocalNodeName: {},
		},
	})

	for host, hm := range m.ByHost {
		if len(host) > 0 {
			if hm.Net != nil && len(hm.Net.NetStats.Name) > 0 {
				stats := hm.Net.NetStats
				labels := map[string]string{"interface": stats.Name}
				updateResourceMetrics(interfaceSubsystem, interfaceRxBytes, float64(stats.RxBytes), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceRxErrors, float64(stats.RxErrors), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceTxBytes, float64(stats.TxBytes), labels, true)
				updateResourceMetrics(interfaceSubsystem, interfaceTxErrors, float64(stats.TxErrors), labels, true)
			}
			if hm.Mem != nil && len(hm.Mem.Info.Addr) > 0 {
				labels := map[string]string{}
				stats := hm.Mem.Info
				updateResourceMetrics(memSubsystem, total, float64(stats.Total), labels, false)
				updateResourceMetrics(memSubsystem, memUsed, float64(stats.Used), labels, false)
				updateResourceMetrics(memSubsystem, memFree, float64(stats.Free), labels, false)
				updateResourceMetrics(memSubsystem, memShared, float64(stats.Shared), labels, false)
				updateResourceMetrics(memSubsystem, memBuffers, float64(stats.Buffers), labels, false)
				updateResourceMetrics(memSubsystem, memAvailable, float64(stats.Available), labels, false)
				updateResourceMetrics(memSubsystem, memCache, float64(stats.Cache), labels, false)
			}
			if hm.CPU != nil {
				labels := map[string]string{}
				ts := hm.CPU.TimesStat
				if ts != nil {
					tot := ts.User + ts.System + ts.Idle + ts.Iowait + ts.Nice + ts.Steal
					cpuUserVal := math.Round(ts.User/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuUser, cpuUserVal, labels, false)
					cpuSystemVal := math.Round(ts.System/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuSystem, cpuSystemVal, labels, false)
					cpuIdleVal := math.Round(ts.Idle/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuIdle, cpuIdleVal, labels, false)
					cpuIOWaitVal := math.Round(ts.Iowait/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuIOWait, cpuIOWaitVal, labels, false)
					cpuNiceVal := math.Round(ts.Nice/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuNice, cpuNiceVal, labels, false)
					cpuStealVal := math.Round(ts.Steal/tot*100*100) / 100
					updateResourceMetrics(cpuSubsystem, cpuSteal, cpuStealVal, labels, false)
				}
				ls := hm.CPU.LoadStat
				if ls != nil {
					updateResourceMetrics(cpuSubsystem, cpuLoad1, ls.Load1, labels, false)
					updateResourceMetrics(cpuSubsystem, cpuLoad5, ls.Load5, labels, false)
					updateResourceMetrics(cpuSubsystem, cpuLoad15, ls.Load15, labels, false)
				}
			}
			break // only one host expected
		}
	}

	collectDriveMetrics(m)
}

// startResourceMetricsCollection - starts the job for collecting resource metrics
func startResourceMetricsCollection() {
	resourceMetricsMap = map[MetricSubsystem]ResourceMetrics{}
	metricsTimer := time.NewTimer(resourceMetricsCollectionInterval)
	defer metricsTimer.Stop()

	collectLocalResourceMetrics()

	for {
		select {
		case <-GlobalContext.Done():
			return
		case <-metricsTimer.C:
			collectLocalResourceMetrics()

			// Reset the timer for next cycle.
			metricsTimer.Reset(resourceMetricsCollectionInterval)
		}
	}
}

// minioResourceCollector is the Collector for resource metrics
type minioResourceCollector struct {
	metricsGroups []*MetricsGroup
	desc          *prometheus.Desc
}

// Describe sends the super-set of all possible descriptors of metrics
func (c *minioResourceCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.desc
}

// Collect is called by the Prometheus registry when collecting metrics.
func (c *minioResourceCollector) Collect(out chan<- prometheus.Metric) {
	var wg sync.WaitGroup
	publish := func(in <-chan Metric) {
		defer wg.Done()
		for metric := range in {
			labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
			collectMetric(metric, labels, values, "resource", out)
		}
	}

	// Call peer api to fetch metrics
	wg.Add(2)
	go publish(ReportMetrics(GlobalContext, c.metricsGroups))
	go publish(globalNotificationSys.GetResourceMetrics(GlobalContext))
	wg.Wait()
}

// newMinioResourceCollector describes the collector
// and returns reference of minio resource Collector
// It creates the Prometheus Description which is used
// to define Metric and help string
func newMinioResourceCollector(metricsGroups []*MetricsGroup) *minioResourceCollector {
	return &minioResourceCollector{
		metricsGroups: metricsGroups,
		desc:          prometheus.NewDesc("minio_resource_stats", "Resource statistics exposed by MinIO server", nil, nil),
	}
}

func prepareResourceMetrics(rm ResourceMetric, subSys MetricSubsystem, requireAvgMax bool) []Metric {
	help := resourceMetricsHelpMap[rm.Name]
	name := rm.Name

	metrics := []Metric{}
	metrics = append(metrics, Metric{
		Description:    getResourceMetricDescription(subSys, name, help),
		Value:          rm.Current,
		VariableLabels: rm.Labels,
	})

	if requireAvgMax {
		avgName := MetricName(fmt.Sprintf("%s_avg", name))
		avgHelp := fmt.Sprintf("%s (avg)", help)
		metrics = append(metrics, Metric{
			Description:    getResourceMetricDescription(subSys, avgName, avgHelp),
			Value:          math.Round(rm.Avg*100) / 100,
			VariableLabels: rm.Labels,
		})

		maxName := MetricName(fmt.Sprintf("%s_max", name))
		maxHelp := fmt.Sprintf("%s (max)", help)
		metrics = append(metrics, Metric{
			Description:    getResourceMetricDescription(subSys, maxName, maxHelp),
			Value:          rm.Max,
			VariableLabels: rm.Labels,
		})
	}

	return metrics
}

func getResourceMetricDescription(subSys MetricSubsystem, name MetricName, help string) MetricDescription {
	return MetricDescription{
		Namespace: nodeMetricNamespace,
		Subsystem: subSys,
		Name:      name,
		Help:      help,
		Type:      gaugeMetric,
	}
}

func getResourceMetrics() *MetricsGroup {
	mg := &MetricsGroup{
		cacheInterval: resourceMetricsCacheInterval,
	}
	mg.RegisterRead(func(ctx context.Context) []Metric {
		metrics := []Metric{}

		subSystems := []MetricSubsystem{interfaceSubsystem, memSubsystem, driveSubsystem, cpuSubsystem}
		for _, subSys := range subSystems {
			stats, found := resourceMetricsMap[subSys]
			if found {
				requireAvgMax := true
				if subSys == driveSubsystem {
					requireAvgMax = false
				}
				for _, m := range stats {
					metrics = append(metrics, prepareResourceMetrics(m, subSys, requireAvgMax)...)
				}
			}
		}

		return metrics
	})
	return mg
}

// metricsResourceHandler is the prometheus handler for resource metrics
func metricsResourceHandler() http.Handler {
	return metricsHTTPHandler(resourceCollector, "handler.MetricsResource")
}
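A standalone sketch (not MinIO code) of the bookkeeping `updateResourceMetrics` performs for a cumulative counter such as `rx_bytes`: each cycle the delta from the previous cumulative reading becomes `Current`, which is folded into `Sum`, `Count`, and `Avg`. Note that, mirroring the commit, the raw cumulative reading (`val`) is what gets stored in `Max` when a new peak delta is observed.

```go
// Sketch of the rolling current/avg/max bookkeeping for a cumulative metric.
package main

import "fmt"

type rolling struct {
	current, cumulative, max, avg, sum float64
	count                              uint64
}

func (r *rolling) updateCumulative(val float64) {
	r.current = val - r.cumulative // delta since the previous collection
	r.cumulative = val
	if r.current > r.max {
		r.max = val // mirrors the commit: the raw reading is stored at a new peak delta
	}
	r.sum += r.current
	r.count++
	r.avg = r.sum / float64(r.count)
}

func main() {
	var rx rolling
	// Three successive cumulative rx_bytes samples, one per collection cycle.
	for _, reading := range []float64{1000, 2500, 2700} {
		rx.updateCumulative(reading)
	}
	// Deltas were 1000, 1500, 200 -> avg 900; max holds 2500, the reading at the peak delta.
	fmt.Printf("current=%.0f avg=%.0f max=%.0f\n", rx.current, rx.avg, rx.max)
}
```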
@@ -25,10 +25,11 @@ import (
 )

 const (
 	prometheusMetricsPathLegacy    = "/prometheus/metrics"
 	prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
 	prometheusMetricsV2BucketPath  = "/v2/metrics/bucket"
 	prometheusMetricsV2NodePath    = "/v2/metrics/node"
+	prometheusMetricsV2ResourcePath = "/v2/metrics/resource"
 )

 // Standard env prometheus auth type
@@ -57,4 +58,5 @@ func registerMetricsRouter(router *mux.Router) {
 	metricsRouter.Handle(prometheusMetricsV2ClusterPath, auth(metricsServerHandler()))
 	metricsRouter.Handle(prometheusMetricsV2BucketPath, auth(metricsBucketHandler()))
 	metricsRouter.Handle(prometheusMetricsV2NodePath, auth(metricsNodeHandler()))
+	metricsRouter.Handle(prometheusMetricsV2ResourcePath, auth(metricsResourceHandler()))
 }
@@ -132,6 +132,9 @@ const (
 	capacityRawSubsystem    MetricSubsystem = "capacity_raw"
 	capacityUsableSubsystem MetricSubsystem = "capacity_usable"
 	driveSubsystem          MetricSubsystem = "drive"
+	interfaceSubsystem      MetricSubsystem = "if"
+	memSubsystem            MetricSubsystem = "mem"
+	cpuSubsystem            MetricSubsystem = "cpu_avg"
 	storageClassSubsystem   MetricSubsystem = "storage_class"
 	fileDescriptorSubsystem MetricSubsystem = "file_descriptor"
 	goRoutines              MetricSubsystem = "go_routine"
@@ -539,12 +542,12 @@ func getNodeRRSParityMD() MetricDescription {
 	}
 }

-func getNodeDrivesFreeInodes() MetricDescription {
+func getNodeDrivesFreeInodesMD() MetricDescription {
 	return MetricDescription{
 		Namespace: nodeMetricNamespace,
 		Subsystem: driveSubsystem,
 		Name:      freeInodes,
-		Help:      "Total free inodes",
+		Help:      "Free inodes on a drive",
 		Type:      gaugeMetric,
 	}
 }
@@ -3207,7 +3210,7 @@ func getLocalStorageMetrics() *MetricsGroup {
 			})

 			metrics = append(metrics, Metric{
-				Description:    getNodeDrivesFreeInodes(),
+				Description:    getNodeDrivesFreeInodesMD(),
 				Value:          float64(disk.FreeInodes),
 				VariableLabels: map[string]string{"drive": disk.DrivePath},
 			})
@@ -3546,6 +3549,61 @@ func getKMSMetrics() *MetricsGroup {
 	return mg
 }

+func collectMetric(metric Metric, labels []string, values []string, metricName string, out chan<- prometheus.Metric) {
+	if metric.Description.Type == histogramMetric {
+		if metric.Histogram == nil {
+			return
+		}
+		for k, v := range metric.Histogram {
+			pmetric, err := prometheus.NewConstMetric(
+				prometheus.NewDesc(
+					prometheus.BuildFQName(string(metric.Description.Namespace),
+						string(metric.Description.Subsystem),
+						string(metric.Description.Name)),
+					metric.Description.Help,
+					append(labels, metric.HistogramBucketLabel),
+					metric.StaticLabels,
+				),
+				prometheus.GaugeValue,
+				float64(v),
+				append(values, k)...)
+			if err != nil {
+				// Enable for debugging
+				if serverDebugLog {
+					logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v+%v", err, values, metric.Histogram), metricName+"-metrics-histogram")
+				}
+			} else {
+				out <- pmetric
+			}
+		}
+		return
+	}
+	metricType := prometheus.GaugeValue
+	if metric.Description.Type == counterMetric {
+		metricType = prometheus.CounterValue
+	}
+	pmetric, err := prometheus.NewConstMetric(
+		prometheus.NewDesc(
+			prometheus.BuildFQName(string(metric.Description.Namespace),
+				string(metric.Description.Subsystem),
+				string(metric.Description.Name)),
+			metric.Description.Help,
+			labels,
+			metric.StaticLabels,
+		),
+		metricType,
+		metric.Value,
+		values...)
+	if err != nil {
+		// Enable for debugging
+		if serverDebugLog {
+			logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v", err, values), metricName+"-metrics")
+		}
+	} else {
+		out <- pmetric
+	}
+}
+
 type minioBucketCollector struct {
 	metricsGroups []*MetricsGroup
 	desc          *prometheus.Desc
@@ -3570,52 +3628,7 @@ func (c *minioBucketCollector) Collect(out chan<- prometheus.Metric) {
 		defer wg.Done()
 		for metric := range in {
 			labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
-			if metric.Description.Type == histogramMetric {
-				if metric.Histogram == nil {
-					continue
-				}
-				for k, v := range metric.Histogram {
-					pmetric, err := prometheus.NewConstMetric(
-						prometheus.NewDesc(
-							prometheus.BuildFQName(string(metric.Description.Namespace),
-								string(metric.Description.Subsystem),
-								string(metric.Description.Name)),
-							metric.Description.Help,
-							append(labels, metric.HistogramBucketLabel),
-							metric.StaticLabels,
-						),
-						prometheus.GaugeValue,
-						float64(v),
-						append(values, k)...)
-					if err != nil {
-						logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v+%v", err, values, metric.Histogram), "bucket-metrics-histogram")
-					} else {
-						out <- pmetric
-					}
-				}
-				continue
-			}
-			metricType := prometheus.GaugeValue
-			if metric.Description.Type == counterMetric {
-				metricType = prometheus.CounterValue
-			}
-			pmetric, err := prometheus.NewConstMetric(
-				prometheus.NewDesc(
-					prometheus.BuildFQName(string(metric.Description.Namespace),
-						string(metric.Description.Subsystem),
-						string(metric.Description.Name)),
-					metric.Description.Help,
-					labels,
-					metric.StaticLabels,
-				),
-				metricType,
-				metric.Value,
-				values...)
-			if err != nil {
-				logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v", err, values), "bucket-metrics")
-			} else {
-				out <- pmetric
-			}
+			collectMetric(metric, labels, values, "bucket", out)
 		}
 	}

@@ -3650,59 +3663,7 @@ func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
 		defer wg.Done()
 		for metric := range in {
 			labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
-			if metric.Description.Type == histogramMetric {
-				if metric.Histogram == nil {
-					continue
-				}
-				for k, v := range metric.Histogram {
-					pmetric, err := prometheus.NewConstMetric(
-						prometheus.NewDesc(
-							prometheus.BuildFQName(string(metric.Description.Namespace),
-								string(metric.Description.Subsystem),
-								string(metric.Description.Name)),
-							metric.Description.Help,
-							append(labels, metric.HistogramBucketLabel),
-							metric.StaticLabels,
-						),
-						prometheus.GaugeValue,
-						float64(v),
-						append(values, k)...)
-
-					if err != nil {
-						// Enable for debugging
-						if serverDebugLog {
-							logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v:%v", err, values, metric.Histogram), "cluster-metrics-histogram")
-						}
-					} else {
-						out <- pmetric
-					}
-				}
-				continue
-			}
-			metricType := prometheus.GaugeValue
-			if metric.Description.Type == counterMetric {
-				metricType = prometheus.CounterValue
-			}
-			pmetric, err := prometheus.NewConstMetric(
-				prometheus.NewDesc(
-					prometheus.BuildFQName(string(metric.Description.Namespace),
-						string(metric.Description.Subsystem),
-						string(metric.Description.Name)),
-					metric.Description.Help,
-					labels,
-					metric.StaticLabels,
-				),
-				metricType,
-				metric.Value,
-				values...)
-			if err != nil {
-				// Enable for debugging
-				if serverDebugLog {
-					logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v", err, values), "cluster-metrics")
-				}
-			} else {
-				out <- pmetric
-			}
+			collectMetric(metric, labels, values, "cluster", out)
 		}
 	}

@@ -3835,11 +3796,11 @@ func newMinioCollectorNode(metricsGroups []*MetricsGroup) *minioNodeCollector {
 	}
 }

-func metricsBucketHandler() http.Handler {
+func metricsHTTPHandler(c prometheus.Collector, funcName string) http.Handler {
 	registry := prometheus.NewRegistry()

 	// Report all other metrics
-	logger.CriticalIf(GlobalContext, registry.Register(bucketCollector))
+	logger.CriticalIf(GlobalContext, registry.Register(c))

 	// DefaultGatherers include golang metrics and process metrics.
 	gatherers := prometheus.Gatherers{
@@ -3849,16 +3810,14 @@ func metricsBucketHandler() http.Handler {
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
 		if ok {
-			tc.FuncName = "handler.MetricsBucket"
+			tc.FuncName = funcName
 			tc.ResponseRecorder.LogErrBody = true
 		}

 		mfs, err := gatherers.Gather()
-		if err != nil {
-			if len(mfs) == 0 {
-				writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
-				return
-			}
-		}
+		if err != nil && len(mfs) == 0 {
+			writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
+			return
+		}

 		contentType := expfmt.Negotiate(r.Header)
@@ -3878,6 +3837,10 @@ func metricsBucketHandler() http.Handler {
 	})
 }

+func metricsBucketHandler() http.Handler {
+	return metricsHTTPHandler(bucketCollector, "handler.MetricsBucket")
+}
+
 func metricsServerHandler() http.Handler {
 	registry := prometheus.NewRegistry()

@@ -783,6 +783,27 @@ func (sys *NotificationSys) GetMetrics(ctx context.Context, t madmin.MetricType,
 	return reply
 }

+// GetResourceMetrics - gets the resource metrics from all nodes excluding self.
+func (sys *NotificationSys) GetResourceMetrics(ctx context.Context) <-chan Metric {
+	if sys == nil {
+		return nil
+	}
+	g := errgroup.WithNErrs(len(sys.peerClients))
+	peerChannels := make([]<-chan Metric, len(sys.peerClients))
+	for index := range sys.peerClients {
+		index := index
+		g.Go(func() error {
+			if sys.peerClients[index] == nil {
+				return errPeerNotReachable
+			}
+			var err error
+			peerChannels[index], err = sys.peerClients[index].GetResourceMetrics(ctx)
+			return err
+		}, index)
+	}
+	return sys.collectPeerMetrics(ctx, peerChannels, g)
+}
+
 // GetSysConfig - Get information about system config
 // (only the config that are of concern to minio)
 func (sys *NotificationSys) GetSysConfig(ctx context.Context) []madmin.SysConfig {
@@ -1122,25 +1143,7 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
 	return consolidatedReport
 }

-// GetBucketMetrics - gets the cluster level bucket metrics from all nodes excluding self.
-func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric {
-	if sys == nil {
-		return nil
-	}
-	g := errgroup.WithNErrs(len(sys.peerClients))
-	peerChannels := make([]<-chan Metric, len(sys.peerClients))
-	for index := range sys.peerClients {
-		index := index
-		g.Go(func() error {
-			if sys.peerClients[index] == nil {
-				return errPeerNotReachable
-			}
-			var err error
-			peerChannels[index], err = sys.peerClients[index].GetPeerBucketMetrics(ctx)
-			return err
-		}, index)
-	}
-
+func (sys *NotificationSys) collectPeerMetrics(ctx context.Context, peerChannels []<-chan Metric, g *errgroup.Group) <-chan Metric {
 	ch := make(chan Metric)
 	var wg sync.WaitGroup
 	for index, err := range g.Wait() {
@@ -1181,6 +1184,27 @@ func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric
 	return ch
 }

+// GetBucketMetrics - gets the cluster level bucket metrics from all nodes excluding self.
+func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric {
+	if sys == nil {
+		return nil
+	}
+	g := errgroup.WithNErrs(len(sys.peerClients))
+	peerChannels := make([]<-chan Metric, len(sys.peerClients))
+	for index := range sys.peerClients {
+		index := index
+		g.Go(func() error {
+			if sys.peerClients[index] == nil {
+				return errPeerNotReachable
+			}
+			var err error
+			peerChannels[index], err = sys.peerClients[index].GetPeerBucketMetrics(ctx)
+			return err
+		}, index)
+	}
+	return sys.collectPeerMetrics(ctx, peerChannels, g)
+}
+
 // GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
 func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
 	if sys == nil {
@@ -1199,45 +1223,7 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric
 			return err
 		}, index)
 	}
-
-	ch := make(chan Metric)
-	var wg sync.WaitGroup
-	for index, err := range g.Wait() {
-		if err != nil {
-			if sys.peerClients[index] != nil {
-				reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
-					sys.peerClients[index].host.String())
-				logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String())
-			} else {
-				logger.LogOnceIf(ctx, err, "peer-offline")
-			}
-			continue
-		}
-		wg.Add(1)
-		go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
-			defer wg.Done()
-			for {
-				select {
-				case m, ok := <-peerChannel:
-					if !ok {
-						return
-					}
-					select {
-					case ch <- m:
-					case <-ctx.Done():
-						return
-					}
-				case <-ctx.Done():
-					return
-				}
-			}
-		}(ctx, peerChannels[index], &wg)
-	}
-	go func(wg *sync.WaitGroup, ch chan Metric) {
-		wg.Wait()
-		close(ch)
-	}(&wg, ch)
-	return ch
+	return sys.collectPeerMetrics(ctx, peerChannels, g)
 }

 // ServiceFreeze freezes all S3 API calls when 'freeze' is true,
@@ -229,6 +229,33 @@ func (client *peerRESTClient) GetMetrics(ctx context.Context, t madmin.MetricTyp
 	return info, err
 }

+func (client *peerRESTClient) GetResourceMetrics(ctx context.Context) (<-chan Metric, error) {
+	respBody, err := client.callWithContext(ctx, peerRESTMethodResourceMetrics, nil, nil, -1)
+	if err != nil {
+		return nil, err
+	}
+	dec := gob.NewDecoder(respBody)
+	ch := make(chan Metric)
+	go func(ch chan<- Metric) {
+		defer func() {
+			xhttp.DrainBody(respBody)
+			close(ch)
+		}()
+		for {
+			var metric Metric
+			if err := dec.Decode(&metric); err != nil {
+				return
+			}
+			select {
+			case <-ctx.Done():
+				return
+			case ch <- metric:
+			}
+		}
+	}(ch)
+	return ch, nil
+}
+
 // GetProcInfo - fetch MinIO process information for a remote node.
 func (client *peerRESTClient) GetProcInfo(ctx context.Context) (info madmin.ProcInfo, err error) {
 	respBody, err := client.callWithContext(ctx, peerRESTMethodProcInfo, nil, nil, -1)
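The peer client above pairs with the server handler further down: the handler gob-encodes one `Metric` after another onto the response body, and the client decodes until the stream ends. A standalone sketch of that streaming pattern (not MinIO code, using a hypothetical `metric` struct over an in-process pipe):

```go
// Sketch: stream a sequence of gob-encoded values and decode until EOF.
package main

import (
	"encoding/gob"
	"fmt"
	"io"
)

type metric struct {
	Name  string
	Value float64
}

func main() {
	pr, pw := io.Pipe()

	// "Server" side: encode values one by one, then close the pipe to end the stream.
	go func() {
		enc := gob.NewEncoder(pw)
		for _, m := range []metric{{"rx_bytes", 1024}, {"used", 42.5}} {
			if err := enc.Encode(m); err != nil {
				pw.CloseWithError(err)
				return
			}
		}
		pw.Close()
	}()

	// "Client" side: decode in a loop; io.EOF marks the end of the stream.
	dec := gob.NewDecoder(pr)
	for {
		var m metric
		if err := dec.Decode(&m); err != nil {
			if err != io.EOF {
				fmt.Println("decode error:", err)
			}
			break
		}
		fmt.Printf("%s = %v\n", m.Name, m.Value)
	}
}
```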
@@ -77,6 +77,7 @@ const (
 	peerRESTMethodDevNull           = "/devnull"
 	peerRESTMethodNetperf           = "/netperf"
 	peerRESTMethodMetrics           = "/metrics"
+	peerRESTMethodResourceMetrics   = "/resourcemetrics"
 	peerRESTMethodGetReplicationMRF = "/getreplicationmrf"
 	peerRESTMethodGetSRMetrics      = "/getsrmetrics"
 )
@@ -474,6 +474,22 @@ func (s *peerRESTServer) GetMetricsHandler(w http.ResponseWriter, r *http.Reques
 	logger.LogIf(ctx, gob.NewEncoder(w).Encode(info))
 }

+func (s *peerRESTServer) GetResourceMetrics(w http.ResponseWriter, r *http.Request) {
+	if !s.IsValid(w, r) {
+		s.writeErrorResponse(w, errors.New("invalid request"))
+		return
+	}
+
+	enc := gob.NewEncoder(w)
+
+	for m := range ReportMetrics(r.Context(), resourceMetricsGroups) {
+		if err := enc.Encode(m); err != nil {
+			s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
+			return
+		}
+	}
+}
+
 // GetSysConfigHandler - returns system config information.
 // (only the config that are of concern to minio)
 func (s *peerRESTServer) GetSysConfigHandler(w http.ResponseWriter, r *http.Request) {
@@ -1438,6 +1454,7 @@ func registerPeerRESTHandlers(router *mux.Router) {
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodProcInfo).HandlerFunc(h(server.GetProcInfoHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodMemInfo).HandlerFunc(h(server.GetMemInfoHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodMetrics).HandlerFunc(h(server.GetMetricsHandler)).Queries(restQueries(peerRESTMetricsTypes)...)
+	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodResourceMetrics).HandlerFunc(h(server.GetResourceMetrics))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodSysErrors).HandlerFunc(h(server.GetSysErrorsHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodSysServices).HandlerFunc(h(server.GetSysServicesHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodSysConfig).HandlerFunc(h(server.GetSysConfigHandler))
@@ -928,6 +928,10 @@ func serverMain(ctx *cli.Context) {
 		logger.FatalIf(err, "Unable to initialize MinIO client")
 	})

+	go bootstrapTrace("startResourceMetricsCollection", func() {
+		startResourceMetricsCollection()
+	})
+
 	// Add User-Agent to differentiate the requests.
 	globalMinioClient.SetAppInfo("minio-perf-test", ReleaseTag)

@@ -303,3 +303,109 @@ For deployments with [Site Replication](https://min.io/docs/minio/linux/operatio
| `minio_bucket_requests_total` | Total number of S3 requests on a bucket. |
| `minio_bucket_requests_canceled_total` | Total number of S3 requests canceled by the client. |
| `minio_bucket_requests_ttfb_seconds_distribution` | Distribution of time to first byte across API calls per bucket. |

# Resource Metrics

MinIO collects the following resource metrics at the node level.
Each metric includes the `server` label to identify the corresponding node.
Metrics may include one or more additional labels, such as the drive path, interface name, etc.

These metrics can be obtained from any MinIO server once per collection by scraping the following URL:

```shell
https://HOSTNAME:PORT/minio/v2/metrics/resource
```

Replace `HOSTNAME:PORT` with the hostname of your MinIO deployment.
For deployments behind a load balancer, use the load balancer hostname instead of a single node hostname.
## Drive Resource Metrics

| Name                                  | Description                                               |
| :------------------------------------ | :-------------------------------------------------------- |
| `minio_node_drive_total_bytes`        | Total bytes on a drive.                                    |
| `minio_node_drive_used_bytes`         | Used bytes on a drive.                                     |
| `minio_node_drive_total_inodes`       | Total inodes on a drive.                                   |
| `minio_node_drive_used_inodes`        | Total inodes used on a drive.                              |
| `minio_node_drive_reads_per_sec`      | Reads per second on a drive.                               |
| `minio_node_drive_reads_kb_per_sec`   | Kilobytes read per second on a drive.                      |
| `minio_node_drive_reads_await`        | Average time for read requests to be served on a drive.    |
| `minio_node_drive_writes_per_sec`     | Writes per second on a drive.                              |
| `minio_node_drive_writes_kb_per_sec`  | Kilobytes written per second on a drive.                   |
| `minio_node_drive_writes_await`       | Average time for write requests to be served on a drive.   |
| `minio_node_drive_perc_util`          | Percentage of time the disk was busy since uptime.         |

## Network Interface Metrics

| Name                           | Description                                    |
| :----------------------------- | :--------------------------------------------- |
| `minio_node_if_rx_bytes`       | Bytes received on the interface in 60s.         |
| `minio_node_if_rx_bytes_avg`   | Bytes received on the interface in 60s (avg).   |
| `minio_node_if_rx_bytes_max`   | Bytes received on the interface in 60s (max).   |
| `minio_node_if_rx_errors`      | Receive errors in 60s.                          |
| `minio_node_if_rx_errors_avg`  | Receive errors in 60s (avg).                    |
| `minio_node_if_rx_errors_max`  | Receive errors in 60s (max).                    |
| `minio_node_if_tx_bytes`       | Bytes transmitted in 60s.                       |
| `minio_node_if_tx_bytes_avg`   | Bytes transmitted in 60s (avg).                 |
| `minio_node_if_tx_bytes_max`   | Bytes transmitted in 60s (max).                 |
| `minio_node_if_tx_errors`      | Transmit errors in 60s.                         |
| `minio_node_if_tx_errors_avg`  | Transmit errors in 60s (avg).                   |
| `minio_node_if_tx_errors_max`  | Transmit errors in 60s (max).                   |

## CPU Metrics

| Name                             | Description                    |
| :------------------------------- | :----------------------------- |
| `minio_node_cpu_avg_user`        | CPU user time.                 |
| `minio_node_cpu_avg_user_avg`    | CPU user time (avg).           |
| `minio_node_cpu_avg_user_max`    | CPU user time (max).           |
| `minio_node_cpu_avg_system`      | CPU system time.               |
| `minio_node_cpu_avg_system_avg`  | CPU system time (avg).         |
| `minio_node_cpu_avg_system_max`  | CPU system time (max).         |
| `minio_node_cpu_avg_idle`        | CPU idle time.                 |
| `minio_node_cpu_avg_idle_avg`    | CPU idle time (avg).           |
| `minio_node_cpu_avg_idle_max`    | CPU idle time (max).           |
| `minio_node_cpu_avg_iowait`      | CPU ioWait time.               |
| `minio_node_cpu_avg_iowait_avg`  | CPU ioWait time (avg).         |
| `minio_node_cpu_avg_iowait_max`  | CPU ioWait time (max).         |
| `minio_node_cpu_avg_nice`        | CPU nice time.                 |
| `minio_node_cpu_avg_nice_avg`    | CPU nice time (avg).           |
| `minio_node_cpu_avg_nice_max`    | CPU nice time (max).           |
| `minio_node_cpu_avg_steal`       | CPU steal time.                |
| `minio_node_cpu_avg_steal_avg`   | CPU steal time (avg).          |
| `minio_node_cpu_avg_steal_max`   | CPU steal time (max).          |
| `minio_node_cpu_avg_load1`       | CPU load average 1min.         |
| `minio_node_cpu_avg_load1_avg`   | CPU load average 1min (avg).   |
| `minio_node_cpu_avg_load1_max`   | CPU load average 1min (max).   |
| `minio_node_cpu_avg_load5`       | CPU load average 5min.         |
| `minio_node_cpu_avg_load5_avg`   | CPU load average 5min (avg).   |
| `minio_node_cpu_avg_load5_max`   | CPU load average 5min (max).   |
| `minio_node_cpu_avg_load15`      | CPU load average 15min.        |
| `minio_node_cpu_avg_load15_avg`  | CPU load average 15min (avg).  |
| `minio_node_cpu_avg_load15_max`  | CPU load average 15min (max).  |

## Memory Metrics

| Name                            | Description                          |
| :------------------------------ | :----------------------------------- |
| `minio_node_mem_available`      | Available memory on the node.        |
| `minio_node_mem_available_avg`  | Available memory on the node (avg).  |
| `minio_node_mem_available_max`  | Available memory on the node (max).  |
| `minio_node_mem_buffers`        | Buffers memory on the node.          |
| `minio_node_mem_buffers_avg`    | Buffers memory on the node (avg).    |
| `minio_node_mem_buffers_max`    | Buffers memory on the node (max).    |
| `minio_node_mem_cache`          | Cache memory on the node.            |
| `minio_node_mem_cache_avg`      | Cache memory on the node (avg).      |
| `minio_node_mem_cache_max`      | Cache memory on the node (max).      |
| `minio_node_mem_free`           | Free memory on the node.             |
| `minio_node_mem_free_avg`       | Free memory on the node (avg).       |
| `minio_node_mem_free_max`       | Free memory on the node (max).       |
| `minio_node_mem_shared`         | Shared memory on the node.           |
| `minio_node_mem_shared_avg`     | Shared memory on the node (avg).     |
| `minio_node_mem_shared_max`     | Shared memory on the node (max).     |
| `minio_node_mem_total`          | Total memory on the node.            |
| `minio_node_mem_total_avg`      | Total memory on the node (avg).      |
| `minio_node_mem_total_max`      | Total memory on the node (max).      |
| `minio_node_mem_used`           | Used memory on the node.             |
| `minio_node_mem_used_avg`       | Used memory on the node (avg).       |
| `minio_node_mem_used_max`       | Used memory on the node (max).       |
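The series names in the tables above can be consumed with any Prometheus-compatible tooling. As one hedged illustration (not part of the commit), a Go sketch that scrapes the resource endpoint and parses it with the Prometheus `expfmt` text parser, printing per-drive utilization; it assumes an unauthenticated local endpoint on `localhost:9000` and that `github.com/prometheus/common` is available.

```go
// Sketch: parse a scrape of the resource endpoint and print drive utilization.
package main

import (
	"fmt"
	"net/http"

	"github.com/prometheus/common/expfmt"
)

func main() {
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/resource")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(resp.Body)
	if err != nil {
		panic(err)
	}

	// minio_node_drive_perc_util carries one sample per drive, labelled by path.
	if mf, ok := families["minio_node_drive_perc_util"]; ok {
		for _, m := range mf.GetMetric() {
			drive := ""
			for _, lp := range m.GetLabel() {
				if lp.GetName() == "drive" {
					drive = lp.GetValue()
				}
			}
			fmt.Printf("drive=%s util=%.2f%%\n", drive, m.GetGauge().GetValue())
		}
	}
}
```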