Capture percentage of cpu load and memory used (#18596)

By default, the CPU load is the cumulative load across all cores. Capture
the percentage load (load * 100 / cpu-count).

Also capture the percentage of memory used (used * 100 / total).
This commit is contained in:
Shireesh Anjal
2023-12-07 02:49:59 +05:30
committed by GitHub
parent 5cc2c62c66
commit 7350a29fec
5 changed files with 102 additions and 65 deletions

View File

@@ -91,7 +91,7 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma
}
cm, err := c.Times(false)
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cputimes)", globalMinioAddr, err.Error()))
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuTimes)", globalMinioAddr, err.Error()))
} else {
// not collecting per-cpu stats, so there will be only one element
if len(cm) == 1 {
@@ -100,6 +100,13 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma
m.Errors = append(m.Errors, fmt.Sprintf("%s: Expected one CPU stat, got %d", globalMinioAddr, len(cm)))
}
}
cpuCount, err := c.Counts(true)
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuCount)", globalMinioAddr, err.Error()))
} else {
m.Aggregated.CPU.CPUCount = cpuCount
}
loadStat, err := load.Avg()
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (loadStat)", globalMinioAddr, err.Error()))

View File

@@ -53,6 +53,7 @@ const (
// memory stats
memUsed MetricName = "used"
memUsedPerc MetricName = "used_perc"
memFree MetricName = "free"
memShared MetricName = "shared"
memBuffers MetricName = "buffers"
@@ -60,15 +61,18 @@ const (
memAvailable MetricName = "available"
// cpu stats
cpuUser MetricName = "user"
cpuSystem MetricName = "system"
cpuIOWait MetricName = "iowait"
cpuIdle MetricName = "idle"
cpuNice MetricName = "nice"
cpuSteal MetricName = "steal"
cpuLoad1 MetricName = "load1"
cpuLoad5 MetricName = "load5"
cpuLoad15 MetricName = "load15"
cpuUser MetricName = "user"
cpuSystem MetricName = "system"
cpuIOWait MetricName = "iowait"
cpuIdle MetricName = "idle"
cpuNice MetricName = "nice"
cpuSteal MetricName = "steal"
cpuLoad1 MetricName = "load1"
cpuLoad5 MetricName = "load5"
cpuLoad15 MetricName = "load15"
cpuLoad1Perc MetricName = "load1_perc"
cpuLoad5Perc MetricName = "load5_perc"
cpuLoad15Perc MetricName = "load15_perc"
)
var (
@@ -126,6 +130,7 @@ func init() {
interfaceTxErrors: "Transmit errors in " + interval,
total: "Total memory on the node",
memUsed: "Used memory on the node",
memUsedPerc: "Used memory percentage on the node",
memFree: "Free memory on the node",
memShared: "Shared memory on the node",
memBuffers: "Buffers memory on the node",
@@ -151,6 +156,9 @@ func init() {
cpuLoad1: "CPU load average 1min",
cpuLoad5: "CPU load average 5min",
cpuLoad15: "CPU load average 15min",
cpuLoad1Perc: "CPU load average 1min (perentage)",
cpuLoad5Perc: "CPU load average 5min (percentage)",
cpuLoad15Perc: "CPU load average 15min (percentage)",
}
resourceMetricsGroups = []*MetricsGroup{
getResourceMetrics(),
@@ -283,6 +291,8 @@ func collectLocalResourceMetrics() {
stats := hm.Mem.Info
updateResourceMetrics(memSubsystem, total, float64(stats.Total), labels, false)
updateResourceMetrics(memSubsystem, memUsed, float64(stats.Used), labels, false)
perc := math.Round(float64(stats.Used*100*100)/float64(stats.Total)) / 100
updateResourceMetrics(memSubsystem, memUsedPerc, perc, labels, false)
updateResourceMetrics(memSubsystem, memFree, float64(stats.Free), labels, false)
updateResourceMetrics(memSubsystem, memShared, float64(stats.Shared), labels, false)
updateResourceMetrics(memSubsystem, memBuffers, float64(stats.Buffers), labels, false)
@@ -312,6 +322,14 @@ func collectLocalResourceMetrics() {
updateResourceMetrics(cpuSubsystem, cpuLoad1, ls.Load1, labels, false)
updateResourceMetrics(cpuSubsystem, cpuLoad5, ls.Load5, labels, false)
updateResourceMetrics(cpuSubsystem, cpuLoad15, ls.Load15, labels, false)
if hm.CPU.CPUCount > 0 {
perc := math.Round(ls.Load1*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad1Perc, perc, labels, false)
perc = math.Round(ls.Load5*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad5Perc, perc, labels, false)
perc = math.Round(ls.Load15*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad15Perc, perc, labels, false)
}
}
}
break // only one host expected