mirror of
https://github.com/minio/minio.git
synced 2025-01-11 23:13:23 -05:00
fix incorrect disk io stats in k8s environment (#19016)
The previous logic of calculating per second values for disk io stats divides the stats by the host uptime. This doesn't work in k8s environment as the uptime is of the pod, but the stats (from /proc/diskstats) are from the host. Fix this by storing the initial values of uptime and the stats at the timme of server startup, and using the difference between current and initial values when calculating the per second values.
This commit is contained in:
parent
ac8e9ce04f
commit
7b9f9e0628
@ -83,6 +83,11 @@ var (
|
||||
// resourceMetricsHelpMap maps metric name to its help string
|
||||
resourceMetricsHelpMap map[MetricName]string
|
||||
resourceMetricsGroups []*MetricsGroup
|
||||
// initial values for drives (at the time of server startup)
|
||||
// used for calculating avg values for drive metrics
|
||||
initialDriveStats map[string]madmin.DiskIOStats
|
||||
initialDriveStatsMu sync.RWMutex
|
||||
initialUptime uint64
|
||||
)
|
||||
|
||||
// PeerResourceMetrics represents the resource metrics
|
||||
@ -214,41 +219,58 @@ func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64,
|
||||
resourceMetricsMap[subSys] = subsysMetrics
|
||||
}
|
||||
|
||||
func collectDriveMetrics(m madmin.RealtimeMetrics) {
|
||||
upt, _ := host.Uptime()
|
||||
kib := 1 << 10
|
||||
// updateDriveIOStats - Updates the drive IO stats by calculating the difference between the current
|
||||
// and initial values. We cannot rely on host.Uptime here as it will not work in k8s environments, where
|
||||
// it will return the pod's uptime but the disk metrics are always from the host (/proc/diskstats)
|
||||
func updateDriveIOStats(currentStats madmin.DiskIOStats, initialStats madmin.DiskIOStats, labels map[string]string) {
|
||||
sectorSize := uint64(512)
|
||||
kib := float64(1 << 10)
|
||||
uptime, _ := host.Uptime()
|
||||
uptimeDiff := float64(uptime - initialUptime)
|
||||
if uptimeDiff == 0 {
|
||||
// too soon to update the stats
|
||||
return
|
||||
}
|
||||
diffStats := madmin.DiskIOStats{
|
||||
ReadIOs: currentStats.ReadIOs - initialStats.ReadIOs,
|
||||
WriteIOs: currentStats.WriteIOs - initialStats.WriteIOs,
|
||||
ReadTicks: currentStats.ReadTicks - initialStats.ReadTicks,
|
||||
WriteTicks: currentStats.WriteTicks - initialStats.WriteTicks,
|
||||
TotalTicks: currentStats.TotalTicks - initialStats.TotalTicks,
|
||||
ReadSectors: currentStats.ReadSectors - initialStats.ReadSectors,
|
||||
WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors,
|
||||
}
|
||||
|
||||
updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/uptimeDiff, labels, false)
|
||||
readKib := float64(diffStats.ReadSectors*sectorSize) / kib
|
||||
updateResourceMetrics(driveSubsystem, readsKBPerSec, readKib/uptimeDiff, labels, false)
|
||||
|
||||
updateResourceMetrics(driveSubsystem, writesPerSec, float64(diffStats.WriteIOs)/uptimeDiff, labels, false)
|
||||
writeKib := float64(diffStats.WriteSectors*sectorSize) / kib
|
||||
updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKib/uptimeDiff, labels, false)
|
||||
|
||||
rdAwait := 0.0
|
||||
if diffStats.ReadIOs > 0 {
|
||||
rdAwait = float64(diffStats.ReadTicks) / float64(diffStats.ReadIOs)
|
||||
}
|
||||
updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)
|
||||
|
||||
wrAwait := 0.0
|
||||
if diffStats.WriteIOs > 0 {
|
||||
wrAwait = float64(diffStats.WriteTicks) / float64(diffStats.WriteIOs)
|
||||
}
|
||||
updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)
|
||||
updateResourceMetrics(driveSubsystem, percUtil, float64(diffStats.TotalTicks)/(uptimeDiff*10), labels, false)
|
||||
}
|
||||
|
||||
func collectDriveMetrics(m madmin.RealtimeMetrics) {
|
||||
for d, dm := range m.ByDisk {
|
||||
stats := dm.IOStats
|
||||
labels := map[string]string{"drive": d}
|
||||
updateResourceMetrics(driveSubsystem, readsPerSec, float64(stats.ReadIOs)/float64(upt), labels, false)
|
||||
|
||||
readBytes := stats.ReadSectors * sectorSize
|
||||
readKib := float64(readBytes) / float64(kib)
|
||||
readKibPerSec := readKib / float64(upt)
|
||||
updateResourceMetrics(driveSubsystem, readsKBPerSec, readKibPerSec, labels, false)
|
||||
|
||||
updateResourceMetrics(driveSubsystem, writesPerSec, float64(stats.WriteIOs)/float64(upt), labels, false)
|
||||
|
||||
writeBytes := stats.WriteSectors * sectorSize
|
||||
writeKib := float64(writeBytes) / float64(kib)
|
||||
writeKibPerSec := writeKib / float64(upt)
|
||||
updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKibPerSec, labels, false)
|
||||
|
||||
rdAwait := 0.0
|
||||
if stats.ReadIOs > 0 {
|
||||
rdAwait = float64(stats.ReadTicks) / float64(stats.ReadIOs)
|
||||
initialStats, ok := initialDriveStats[d]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)
|
||||
|
||||
wrAwait := 0.0
|
||||
if stats.WriteIOs > 0 {
|
||||
wrAwait = float64(stats.WriteTicks) / float64(stats.WriteIOs)
|
||||
}
|
||||
updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)
|
||||
|
||||
updateResourceMetrics(driveSubsystem, percUtil, float64(stats.TotalTicks)/float64(upt*10), labels, false)
|
||||
updateDriveIOStats(dm.IOStats, initialStats, labels)
|
||||
}
|
||||
|
||||
globalLocalDrivesMu.RLock()
|
||||
@ -256,8 +278,8 @@ func collectDriveMetrics(m madmin.RealtimeMetrics) {
|
||||
globalLocalDrivesMu.RUnlock()
|
||||
|
||||
for _, d := range localDrives {
|
||||
labels := map[string]string{"drive": d.Endpoint().RawPath}
|
||||
di, err := d.DiskInfo(GlobalContext, DiskInfoOptions{})
|
||||
labels := map[string]string{"drive": di.Endpoint}
|
||||
if err == nil {
|
||||
updateResourceMetrics(driveSubsystem, usedBytes, float64(di.Used), labels, false)
|
||||
updateResourceMetrics(driveSubsystem, totalBytes, float64(di.Total), labels, false)
|
||||
@ -339,8 +361,30 @@ func collectLocalResourceMetrics() {
|
||||
collectDriveMetrics(m)
|
||||
}
|
||||
|
||||
// populateInitialValues - populates the initial values
|
||||
// for drive stats and host uptime
|
||||
func populateInitialValues() {
|
||||
initialDriveStatsMu.Lock()
|
||||
|
||||
m := collectLocalMetrics(madmin.MetricsDisk, collectMetricsOpts{
|
||||
hosts: map[string]struct{}{
|
||||
globalLocalNodeName: {},
|
||||
},
|
||||
})
|
||||
|
||||
initialDriveStats = map[string]madmin.DiskIOStats{}
|
||||
for d, dm := range m.ByDisk {
|
||||
initialDriveStats[d] = dm.IOStats
|
||||
}
|
||||
|
||||
initialUptime, _ = host.Uptime()
|
||||
initialDriveStatsMu.Unlock()
|
||||
}
|
||||
|
||||
// startResourceMetricsCollection - starts the job for collecting resource metrics
|
||||
func startResourceMetricsCollection() {
|
||||
populateInitialValues()
|
||||
|
||||
resourceMetricsMapMu.Lock()
|
||||
resourceMetricsMap = map[MetricSubsystem]ResourceMetrics{}
|
||||
resourceMetricsMapMu.Unlock()
|
||||
|
Loading…
Reference in New Issue
Block a user