Add support for resource metrics (#18057)

Add a new endpoint for "resource" metrics `/v2/metrics/resource`

This should return system metrics related to drives, network, CPU and
memory. Except for drives, other metrics should have corresponding "avg"
and "max" values also.

Reuse the real-time feature to capture the required data,
introducing CPU and memory metrics in it.

Collect the data every minute and keep updating the average and max values
accordingly, returning the latest values when the API is called.
This commit is contained in:
Shireesh Anjal
2023-10-01 02:10:20 +05:30
committed by GitHub
parent c50627ee3e
commit 6d20ec3bea
11 changed files with 764 additions and 173 deletions

View File

@@ -783,6 +783,27 @@ func (sys *NotificationSys) GetMetrics(ctx context.Context, t madmin.MetricType,
return reply
}
// GetResourceMetrics - gets the resource metrics from all nodes excluding self.
func (sys *NotificationSys) GetResourceMetrics(ctx context.Context) <-chan Metric {
if sys == nil {
return nil
}
g := errgroup.WithNErrs(len(sys.peerClients))
peerChannels := make([]<-chan Metric, len(sys.peerClients))
for index := range sys.peerClients {
index := index
g.Go(func() error {
if sys.peerClients[index] == nil {
return errPeerNotReachable
}
var err error
peerChannels[index], err = sys.peerClients[index].GetResourceMetrics(ctx)
return err
}, index)
}
return sys.collectPeerMetrics(ctx, peerChannels, g)
}
// GetSysConfig - Get information about system config
// (only the config that are of concern to minio)
func (sys *NotificationSys) GetSysConfig(ctx context.Context) []madmin.SysConfig {
@@ -1122,25 +1143,7 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
return consolidatedReport
}
// GetBucketMetrics - gets the cluster level bucket metrics from all nodes excluding self.
func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric {
if sys == nil {
return nil
}
g := errgroup.WithNErrs(len(sys.peerClients))
peerChannels := make([]<-chan Metric, len(sys.peerClients))
for index := range sys.peerClients {
index := index
g.Go(func() error {
if sys.peerClients[index] == nil {
return errPeerNotReachable
}
var err error
peerChannels[index], err = sys.peerClients[index].GetPeerBucketMetrics(ctx)
return err
}, index)
}
func (sys *NotificationSys) collectPeerMetrics(ctx context.Context, peerChannels []<-chan Metric, g *errgroup.Group) <-chan Metric {
ch := make(chan Metric)
var wg sync.WaitGroup
for index, err := range g.Wait() {
@@ -1181,6 +1184,27 @@ func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric
return ch
}
// GetBucketMetrics - gets the cluster level bucket metrics from all nodes excluding self.
func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric {
if sys == nil {
return nil
}
g := errgroup.WithNErrs(len(sys.peerClients))
peerChannels := make([]<-chan Metric, len(sys.peerClients))
for index := range sys.peerClients {
index := index
g.Go(func() error {
if sys.peerClients[index] == nil {
return errPeerNotReachable
}
var err error
peerChannels[index], err = sys.peerClients[index].GetPeerBucketMetrics(ctx)
return err
}, index)
}
return sys.collectPeerMetrics(ctx, peerChannels, g)
}
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
if sys == nil {
@@ -1199,45 +1223,7 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric
return err
}, index)
}
ch := make(chan Metric)
var wg sync.WaitGroup
for index, err := range g.Wait() {
if err != nil {
if sys.peerClients[index] != nil {
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
sys.peerClients[index].host.String())
logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String())
} else {
logger.LogOnceIf(ctx, err, "peer-offline")
}
continue
}
wg.Add(1)
go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
defer wg.Done()
for {
select {
case m, ok := <-peerChannel:
if !ok {
return
}
select {
case ch <- m:
case <-ctx.Done():
return
}
case <-ctx.Done():
return
}
}
}(ctx, peerChannels[index], &wg)
}
go func(wg *sync.WaitGroup, ch chan Metric) {
wg.Wait()
close(ch)
}(&wg, ch)
return ch
return sys.collectPeerMetrics(ctx, peerChannels, g)
}
// ServiceFreeze freezes all S3 API calls when 'freeze' is true,