fix incorrect disk io stats in k8s environment (#19016)

The previous logic for calculating per-second values for disk IO stats
divides the stats by the host uptime. This doesn't work in k8s
environments, where the uptime is that of the pod, but the stats (from
/proc/diskstats) are from the host.

Fix this by storing the initial values of uptime and the stats at the
time of server startup, and using the difference between the current and
initial values when calculating the per-second values.
Shireesh Anjal 2024-02-13 21:05:11 +05:30 committed by GitHub
parent ac8e9ce04f
commit 7b9f9e0628

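The core idea, as a minimal standalone sketch (hypothetical types and numbers, not the code from this commit): capture a baseline of the cumulative counters once at startup, then derive per-second rates from the difference between a later sample and that baseline, rather than dividing the absolute counters by uptime.

package main

import (
	"fmt"
	"time"
)

// ioCounters stands in for the cumulative counters parsed from /proc/diskstats.
type ioCounters struct {
	ReadIOs  uint64
	WriteIOs uint64
}

func main() {
	// Baseline captured once at startup (hypothetical values).
	baseline := ioCounters{ReadIOs: 1000, WriteIOs: 500}
	start := time.Now()

	time.Sleep(2 * time.Second)

	// A later sample of the same cumulative counters.
	current := ioCounters{ReadIOs: 1400, WriteIOs: 700}
	elapsed := time.Since(start).Seconds()
	if elapsed == 0 {
		return // too soon to compute a rate
	}

	// Rates come from the delta; dividing the absolute counters by
	// host uptime is what breaks inside a pod.
	fmt.Printf("reads/s:  %.1f\n", float64(current.ReadIOs-baseline.ReadIOs)/elapsed)
	fmt.Printf("writes/s: %.1f\n", float64(current.WriteIOs-baseline.WriteIOs)/elapsed)
}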

@@ -83,6 +83,11 @@ var (
 	// resourceMetricsHelpMap maps metric name to its help string
 	resourceMetricsHelpMap map[MetricName]string
 	resourceMetricsGroups  []*MetricsGroup
+	// initial values for drives (at the time of server startup)
+	// used for calculating avg values for drive metrics
+	initialDriveStats   map[string]madmin.DiskIOStats
+	initialDriveStatsMu sync.RWMutex
+	initialUptime       uint64
 )
 
 // PeerResourceMetrics represents the resource metrics
@@ -214,41 +219,58 @@ func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64,
 	resourceMetricsMap[subSys] = subsysMetrics
 }
 
-func collectDriveMetrics(m madmin.RealtimeMetrics) {
-	upt, _ := host.Uptime()
-	kib := 1 << 10
+// updateDriveIOStats - Updates the drive IO stats by calculating the difference between the current
+// and initial values. We cannot rely on host.Uptime here as it will not work in k8s environments, where
+// it will return the pod's uptime but the disk metrics are always from the host (/proc/diskstats)
+func updateDriveIOStats(currentStats madmin.DiskIOStats, initialStats madmin.DiskIOStats, labels map[string]string) {
 	sectorSize := uint64(512)
+	kib := float64(1 << 10)
+
+	uptime, _ := host.Uptime()
+	uptimeDiff := float64(uptime - initialUptime)
+	if uptimeDiff == 0 {
+		// too soon to update the stats
+		return
+	}
+
+	diffStats := madmin.DiskIOStats{
+		ReadIOs:      currentStats.ReadIOs - initialStats.ReadIOs,
+		WriteIOs:     currentStats.WriteIOs - initialStats.WriteIOs,
+		ReadTicks:    currentStats.ReadTicks - initialStats.ReadTicks,
+		WriteTicks:   currentStats.WriteTicks - initialStats.WriteTicks,
+		TotalTicks:   currentStats.TotalTicks - initialStats.TotalTicks,
+		ReadSectors:  currentStats.ReadSectors - initialStats.ReadSectors,
+		WriteSectors: currentStats.WriteSectors - initialStats.WriteSectors,
+	}
+
+	updateResourceMetrics(driveSubsystem, readsPerSec, float64(diffStats.ReadIOs)/uptimeDiff, labels, false)
+	readKib := float64(diffStats.ReadSectors*sectorSize) / kib
+	updateResourceMetrics(driveSubsystem, readsKBPerSec, readKib/uptimeDiff, labels, false)
+
+	updateResourceMetrics(driveSubsystem, writesPerSec, float64(diffStats.WriteIOs)/uptimeDiff, labels, false)
+	writeKib := float64(diffStats.WriteSectors*sectorSize) / kib
+	updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKib/uptimeDiff, labels, false)
+
+	rdAwait := 0.0
+	if diffStats.ReadIOs > 0 {
+		rdAwait = float64(diffStats.ReadTicks) / float64(diffStats.ReadIOs)
+	}
+	updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)
+
+	wrAwait := 0.0
+	if diffStats.WriteIOs > 0 {
+		wrAwait = float64(diffStats.WriteTicks) / float64(diffStats.WriteIOs)
+	}
+	updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)
+	updateResourceMetrics(driveSubsystem, percUtil, float64(diffStats.TotalTicks)/(uptimeDiff*10), labels, false)
+}
+
+func collectDriveMetrics(m madmin.RealtimeMetrics) {
 	for d, dm := range m.ByDisk {
-		stats := dm.IOStats
 		labels := map[string]string{"drive": d}
-		updateResourceMetrics(driveSubsystem, readsPerSec, float64(stats.ReadIOs)/float64(upt), labels, false)
-		readBytes := stats.ReadSectors * sectorSize
-		readKib := float64(readBytes) / float64(kib)
-		readKibPerSec := readKib / float64(upt)
-		updateResourceMetrics(driveSubsystem, readsKBPerSec, readKibPerSec, labels, false)
-
-		updateResourceMetrics(driveSubsystem, writesPerSec, float64(stats.WriteIOs)/float64(upt), labels, false)
-		writeBytes := stats.WriteSectors * sectorSize
-		writeKib := float64(writeBytes) / float64(kib)
-		writeKibPerSec := writeKib / float64(upt)
-		updateResourceMetrics(driveSubsystem, writesKBPerSec, writeKibPerSec, labels, false)
-
-		rdAwait := 0.0
-		if stats.ReadIOs > 0 {
-			rdAwait = float64(stats.ReadTicks) / float64(stats.ReadIOs)
+		initialStats, ok := initialDriveStats[d]
+		if !ok {
+			continue
 		}
-		updateResourceMetrics(driveSubsystem, readsAwait, rdAwait, labels, false)
-
-		wrAwait := 0.0
-		if stats.WriteIOs > 0 {
-			wrAwait = float64(stats.WriteTicks) / float64(stats.WriteIOs)
-		}
-		updateResourceMetrics(driveSubsystem, writesAwait, wrAwait, labels, false)
-		updateResourceMetrics(driveSubsystem, percUtil, float64(stats.TotalTicks)/float64(upt*10), labels, false)
+		updateDriveIOStats(dm.IOStats, initialStats, labels)
 	}
 
 	globalLocalDrivesMu.RLock()
@@ -256,8 +278,8 @@ func collectDriveMetrics(m madmin.RealtimeMetrics) {
 	globalLocalDrivesMu.RUnlock()
 
 	for _, d := range localDrives {
-		labels := map[string]string{"drive": d.Endpoint().RawPath}
 		di, err := d.DiskInfo(GlobalContext, DiskInfoOptions{})
+		labels := map[string]string{"drive": di.Endpoint}
 		if err == nil {
 			updateResourceMetrics(driveSubsystem, usedBytes, float64(di.Used), labels, false)
 			updateResourceMetrics(driveSubsystem, totalBytes, float64(di.Total), labels, false)
@@ -339,8 +361,30 @@ func collectLocalResourceMetrics() {
 	collectDriveMetrics(m)
 }
 
+// populateInitialValues - populates the initial values
+// for drive stats and host uptime
+func populateInitialValues() {
+	initialDriveStatsMu.Lock()
+
+	m := collectLocalMetrics(madmin.MetricsDisk, collectMetricsOpts{
+		hosts: map[string]struct{}{
+			globalLocalNodeName: {},
+		},
+	})
+
+	initialDriveStats = map[string]madmin.DiskIOStats{}
+	for d, dm := range m.ByDisk {
+		initialDriveStats[d] = dm.IOStats
+	}
+
+	initialUptime, _ = host.Uptime()
+	initialDriveStatsMu.Unlock()
+}
+
 // startResourceMetricsCollection - starts the job for collecting resource metrics
 func startResourceMetricsCollection() {
+	populateInitialValues()
+
 	resourceMetricsMapMu.Lock()
 	resourceMetricsMap = map[MetricSubsystem]ResourceMetrics{}
 	resourceMetricsMapMu.Unlock()
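A note on the units in updateDriveIOStats above: /proc/diskstats ticks are milliseconds, so dividing the TotalTicks delta by uptimeDiff*10 is shorthand for ms busy / (seconds elapsed × 1000) × 100, and the await values are simply ticks per completed I/O. A quick sanity check with hypothetical numbers:

package main

import "fmt"

func main() {
	// Deltas since the baseline (hypothetical numbers).
	totalTicks := 30000.0 // ms the device spent doing I/O
	readTicks := 12000.0  // ms spent on reads
	readIOs := 4000.0     // completed reads
	uptimeDiff := 60.0    // elapsed wall-clock seconds

	// 30,000 ms busy out of 60,000 ms elapsed = 50% utilization.
	fmt.Printf("%%util:   %.1f\n", totalTicks/(uptimeDiff*10)) // 50.0

	// Average latency: 12,000 ms over 4,000 reads = 3 ms per read.
	fmt.Printf("r_await: %.1f ms\n", readTicks/readIOs) // 3.0
}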