fix: make metrics endpoint responsive by reducing the chatter (#15055)

peerOnlineCounter was making NxN calls to many peers, this
can be really long and tedious if there are random servers
that are going down.

Instead we should calculate online peers from the point of
view of "self" and return those online and offline appropriately
by performing a healthcheck.
This commit is contained in:
Harshavardhana 2022-06-08 02:43:13 -07:00 committed by GitHub
parent b0d7332a0c
commit 2420f6c000
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 31 deletions

View File

@ -1347,7 +1347,7 @@ func getNodeHealthMetrics() *MetricsGroup {
return
}
metrics = make([]Metric, 0, 16)
nodesUp, nodesDown := GetPeerOnlineCount()
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
metrics = append(metrics, Metric{
Description: getNodeOnlineTotalMD(),
Value: float64(nodesUp),
@ -1932,11 +1932,9 @@ func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
}
// Call peer api to fetch metrics
peerCh := globalNotificationSys.GetClusterMetrics(GlobalContext)
selfCh := ReportMetrics(GlobalContext, c.metricsGroups)
wg.Add(2)
go publish(peerCh)
go publish(selfCh)
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
go publish(globalNotificationSys.GetClusterMetrics(GlobalContext))
wg.Wait()
}

View File

@ -110,7 +110,7 @@ func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
return
}
nodesUp, nodesDown := GetPeerOnlineCount()
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(minioNamespace, "nodes", "online"),

View File

@ -1344,6 +1344,35 @@ func (sys *NotificationSys) restClientFromHash(s string) (client *peerRESTClient
return peerClients[idx]
}
// GetPeerOnlineCount gets the count of online and offline nodes.
func (sys *NotificationSys) GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
nodesOnline = 1 // Self is always online.
nodesOffline = 0
nodesOnlineIndex := make([]bool, len(sys.peerClients))
var wg sync.WaitGroup
for idx, client := range sys.peerClients {
if client == nil {
continue
}
wg.Add(1)
go func(idx int, client *peerRESTClient) {
defer wg.Done()
nodesOnlineIndex[idx] = client.restClient.HealthCheckFn()
}(idx, client)
}
wg.Wait()
for _, online := range nodesOnlineIndex {
if online {
nodesOnline++
} else {
nodesOffline++
}
}
return
}
// NewNotificationSys - creates new notification system object.
func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
// targetList/bucketRulesMap/bucketRemoteTargetRulesMap are populated by NotificationSys.Init()
@ -1358,21 +1387,6 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
}
}
// GetPeerOnlineCount gets the count of online and offline nodes.
func GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
nodesOnline = 1 // Self is always online.
nodesOffline = 0
servers := globalNotificationSys.ServerInfo()
for _, s := range servers {
if s.State == string(madmin.ItemOnline) {
nodesOnline++
continue
}
nodesOffline++
}
return
}
type eventArgs struct {
EventName event.Name
BucketName string
@ -1524,7 +1538,7 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
}
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
if sys == nil {
return nil
}
@ -1545,11 +1559,14 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
ch := make(chan Metric)
var wg sync.WaitGroup
for index, err := range g.Wait() {
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
sys.peerClients[index].host.String())
ctx := logger.SetReqInfo(ctx, reqInfo)
if err != nil {
logger.LogOnceIf(ctx, err, sys.peerClients[index].host.String())
if sys.peerClients[index] != nil {
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
sys.peerClients[index].host.String())
logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String())
} else {
logger.LogOnceIf(ctx, err, "peer-offline")
}
continue
}
wg.Add(1)

View File

@ -1174,13 +1174,9 @@ func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request)
s.writeErrorResponse(w, errors.New("invalid request"))
}
doneCh := make(chan struct{})
defer close(doneCh)
enc := gob.NewEncoder(w)
ch := ReportMetrics(r.Context(), peerMetricsGroups)
for m := range ch {
for m := range ReportMetrics(r.Context(), peerMetricsGroups) {
if err := enc.Encode(m); err != nil {
s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
return