fix: make metrics endpoint responsive by reducing the chatter (#15055)

peerOnlineCounter was making NxN calls to many peers; this
can take a very long time if there are random servers
that are going down.

Instead, we should calculate online peers from the point of
view of "self", and report peers as online or offline accordingly
by performing a health check.
This commit is contained in:
Harshavardhana
2022-06-08 02:43:13 -07:00
committed by GitHub
parent b0d7332a0c
commit 2420f6c000
4 changed files with 42 additions and 31 deletions

View File

@@ -1347,7 +1347,7 @@ func getNodeHealthMetrics() *MetricsGroup {
return
}
metrics = make([]Metric, 0, 16)
nodesUp, nodesDown := GetPeerOnlineCount()
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
metrics = append(metrics, Metric{
Description: getNodeOnlineTotalMD(),
Value: float64(nodesUp),
@@ -1932,11 +1932,9 @@ func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
}
// Call peer api to fetch metrics
peerCh := globalNotificationSys.GetClusterMetrics(GlobalContext)
selfCh := ReportMetrics(GlobalContext, c.metricsGroups)
wg.Add(2)
go publish(peerCh)
go publish(selfCh)
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
go publish(globalNotificationSys.GetClusterMetrics(GlobalContext))
wg.Wait()
}