mirror of
https://github.com/minio/minio.git
synced 2025-04-27 05:15:01 -04:00
fix: make metrics endpoint responsive by reducing the chatter (#15055)
peerOnlineCounter was making NxN calls to many peers, this can be really long and tedious if there are random servers that are going down. Instead we should calculate online peers from the point of view of "self" and return those online and offline appropriately by performing a healthcheck.
This commit is contained in:
parent
b0d7332a0c
commit
2420f6c000
@ -1347,7 +1347,7 @@ func getNodeHealthMetrics() *MetricsGroup {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
metrics = make([]Metric, 0, 16)
|
metrics = make([]Metric, 0, 16)
|
||||||
nodesUp, nodesDown := GetPeerOnlineCount()
|
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getNodeOnlineTotalMD(),
|
Description: getNodeOnlineTotalMD(),
|
||||||
Value: float64(nodesUp),
|
Value: float64(nodesUp),
|
||||||
@ -1932,11 +1932,9 @@ func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Call peer api to fetch metrics
|
// Call peer api to fetch metrics
|
||||||
peerCh := globalNotificationSys.GetClusterMetrics(GlobalContext)
|
|
||||||
selfCh := ReportMetrics(GlobalContext, c.metricsGroups)
|
|
||||||
wg.Add(2)
|
wg.Add(2)
|
||||||
go publish(peerCh)
|
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
|
||||||
go publish(selfCh)
|
go publish(globalNotificationSys.GetClusterMetrics(GlobalContext))
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,7 +110,7 @@ func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
nodesUp, nodesDown := GetPeerOnlineCount()
|
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
|
||||||
ch <- prometheus.MustNewConstMetric(
|
ch <- prometheus.MustNewConstMetric(
|
||||||
prometheus.NewDesc(
|
prometheus.NewDesc(
|
||||||
prometheus.BuildFQName(minioNamespace, "nodes", "online"),
|
prometheus.BuildFQName(minioNamespace, "nodes", "online"),
|
||||||
|
@ -1344,6 +1344,35 @@ func (sys *NotificationSys) restClientFromHash(s string) (client *peerRESTClient
|
|||||||
return peerClients[idx]
|
return peerClients[idx]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetPeerOnlineCount gets the count of online and offline nodes.
|
||||||
|
func (sys *NotificationSys) GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
|
||||||
|
nodesOnline = 1 // Self is always online.
|
||||||
|
nodesOffline = 0
|
||||||
|
nodesOnlineIndex := make([]bool, len(sys.peerClients))
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
for idx, client := range sys.peerClients {
|
||||||
|
if client == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
wg.Add(1)
|
||||||
|
go func(idx int, client *peerRESTClient) {
|
||||||
|
defer wg.Done()
|
||||||
|
nodesOnlineIndex[idx] = client.restClient.HealthCheckFn()
|
||||||
|
}(idx, client)
|
||||||
|
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
for _, online := range nodesOnlineIndex {
|
||||||
|
if online {
|
||||||
|
nodesOnline++
|
||||||
|
} else {
|
||||||
|
nodesOffline++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// NewNotificationSys - creates new notification system object.
|
// NewNotificationSys - creates new notification system object.
|
||||||
func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
|
func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
|
||||||
// targetList/bucketRulesMap/bucketRemoteTargetRulesMap are populated by NotificationSys.Init()
|
// targetList/bucketRulesMap/bucketRemoteTargetRulesMap are populated by NotificationSys.Init()
|
||||||
@ -1358,21 +1387,6 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetPeerOnlineCount gets the count of online and offline nodes.
|
|
||||||
func GetPeerOnlineCount() (nodesOnline, nodesOffline int) {
|
|
||||||
nodesOnline = 1 // Self is always online.
|
|
||||||
nodesOffline = 0
|
|
||||||
servers := globalNotificationSys.ServerInfo()
|
|
||||||
for _, s := range servers {
|
|
||||||
if s.State == string(madmin.ItemOnline) {
|
|
||||||
nodesOnline++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
nodesOffline++
|
|
||||||
}
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
type eventArgs struct {
|
type eventArgs struct {
|
||||||
EventName event.Name
|
EventName event.Name
|
||||||
BucketName string
|
BucketName string
|
||||||
@ -1524,7 +1538,7 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
|
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
|
||||||
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
|
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
|
||||||
if sys == nil {
|
if sys == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -1545,11 +1559,14 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
|
|||||||
ch := make(chan Metric)
|
ch := make(chan Metric)
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
for index, err := range g.Wait() {
|
for index, err := range g.Wait() {
|
||||||
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
|
|
||||||
sys.peerClients[index].host.String())
|
|
||||||
ctx := logger.SetReqInfo(ctx, reqInfo)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.LogOnceIf(ctx, err, sys.peerClients[index].host.String())
|
if sys.peerClients[index] != nil {
|
||||||
|
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
|
||||||
|
sys.peerClients[index].host.String())
|
||||||
|
logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String())
|
||||||
|
} else {
|
||||||
|
logger.LogOnceIf(ctx, err, "peer-offline")
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
|
@ -1174,13 +1174,9 @@ func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request)
|
|||||||
s.writeErrorResponse(w, errors.New("invalid request"))
|
s.writeErrorResponse(w, errors.New("invalid request"))
|
||||||
}
|
}
|
||||||
|
|
||||||
doneCh := make(chan struct{})
|
|
||||||
defer close(doneCh)
|
|
||||||
|
|
||||||
enc := gob.NewEncoder(w)
|
enc := gob.NewEncoder(w)
|
||||||
|
|
||||||
ch := ReportMetrics(r.Context(), peerMetricsGroups)
|
for m := range ReportMetrics(r.Context(), peerMetricsGroups) {
|
||||||
for m := range ch {
|
|
||||||
if err := enc.Encode(m); err != nil {
|
if err := enc.Encode(m); err != nil {
|
||||||
s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
|
s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
|
||||||
return
|
return
|
||||||
|
Loading…
x
Reference in New Issue
Block a user