From b4add82bb6233a3ce97cbc605d91efb8405f4582 Mon Sep 17 00:00:00 2001 From: Ritesh H Shukla Date: Mon, 18 Jan 2021 20:35:38 -0800 Subject: [PATCH] Updated Prometheus metrics (#11141) * Add metrics for nodes online and offline * Add cluster capacity metrics * Introduce v2 metrics --- cmd/admin-server-info.go | 32 + cmd/disk-cache-stats.go | 8 + cmd/erasure-server-pool.go | 4 +- cmd/fs-v1.go | 4 +- cmd/gateway-common.go | 2 +- cmd/gateway-metrics.go | 26 +- cmd/gateway-unsupported.go | 4 +- cmd/gateway/azure/gateway-azure.go | 4 +- cmd/gateway/gcs/gateway-gcs.go | 4 +- cmd/gateway/s3/gateway-s3.go | 4 +- cmd/generic-handlers.go | 4 +- cmd/http-stats.go | 14 +- cmd/metrics-router.go | 13 +- cmd/metrics-v2.go | 1187 ++++++++++++++++++++++++++++ cmd/metrics.go | 127 ++- cmd/notification-summary.go | 54 ++ cmd/notification.go | 68 +- cmd/object-api-interface.go | 9 +- cmd/peer-rest-client.go | 23 +- cmd/peer-rest-common.go | 1 + cmd/peer-rest-server.go | 32 +- docs/metrics/README.md | 11 +- docs/metrics/prometheus/README.md | 202 +---- docs/metrics/prometheus/list.md | 47 ++ go.mod | 3 + go.sum | 6 + pkg/madmin/health.go | 28 +- 27 files changed, 1669 insertions(+), 252 deletions(-) create mode 100644 cmd/metrics-v2.go create mode 100644 cmd/notification-summary.go create mode 100644 docs/metrics/prometheus/list.md diff --git a/cmd/admin-server-info.go b/cmd/admin-server-info.go index a2f20f831..c6321a032 100644 --- a/cmd/admin-server-info.go +++ b/cmd/admin-server-info.go @@ -69,3 +69,35 @@ func getLocalServerProperty(endpointServerPools EndpointServerPools, r *http.Req Disks: storageInfo.Disks, } } + +func getLocalDisks(endpointServerPools EndpointServerPools) []madmin.Disk { + var localEndpoints Endpoints + network := make(map[string]string) + + for _, ep := range endpointServerPools { + for _, endpoint := range ep.Endpoints { + nodeName := endpoint.Host + if nodeName == "" { + nodeName = "localhost" + } + if endpoint.IsLocal { + // Only proceed for local endpoints + 
network[nodeName] = "online" + localEndpoints = append(localEndpoints, endpoint) + continue + } + _, present := network[nodeName] + if !present { + if err := isServerResolvable(endpoint); err == nil { + network[nodeName] = "online" + } else { + network[nodeName] = "offline" + } + } + } + } + localDisks, _ := initStorageDisksWithErrors(localEndpoints) + defer closeStorageDisks(localDisks) + storageInfo, _ := getStorageInfo(localDisks, localEndpoints.GetAllStrings()) + return storageInfo.Disks +} diff --git a/cmd/disk-cache-stats.go b/cmd/disk-cache-stats.go index ff831e13b..eae32889a 100644 --- a/cmd/disk-cache-stats.go +++ b/cmd/disk-cache-stats.go @@ -34,6 +34,14 @@ type CacheDiskStats struct { Dir string } +// GetUsageLevelString gets the string representation for the usage level. +func (c *CacheDiskStats) GetUsageLevelString() (u string) { + if atomic.LoadInt32(&c.UsageState) == 0 { + return "low" + } + return "high" +} + // CacheStats - represents bytes served from cache, // cache hits and cache misses. 
type CacheStats struct { diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go index 6625a0179..710685410 100644 --- a/cmd/erasure-server-pool.go +++ b/cmd/erasure-server-pool.go @@ -1377,9 +1377,9 @@ func (z *erasureServerPools) HealObject(ctx context.Context, bucket, object, ver } // GetMetrics - no op -func (z *erasureServerPools) GetMetrics(ctx context.Context) (*Metrics, error) { +func (z *erasureServerPools) GetMetrics(ctx context.Context) (*BackendMetrics, error) { logger.LogIf(ctx, NotImplemented{}) - return &Metrics{}, NotImplemented{} + return &BackendMetrics{}, NotImplemented{} } func (z *erasureServerPools) getZoneAndSet(id string) (int, int, error) { diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index 4517f9374..9e478c13e 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -1554,9 +1554,9 @@ func (fs *FSObjects) HealObjects(ctx context.Context, bucket, prefix string, opt } // GetMetrics - no op -func (fs *FSObjects) GetMetrics(ctx context.Context) (*Metrics, error) { +func (fs *FSObjects) GetMetrics(ctx context.Context) (*BackendMetrics, error) { logger.LogIf(ctx, NotImplemented{}) - return &Metrics{}, NotImplemented{} + return &BackendMetrics{}, NotImplemented{} } // ListObjectsV2 lists all blobs in bucket filtered by prefix diff --git a/cmd/gateway-common.go b/cmd/gateway-common.go index 93f6a2caa..19e506827 100644 --- a/cmd/gateway-common.go +++ b/cmd/gateway-common.go @@ -389,7 +389,7 @@ func shouldMeterRequest(req *http.Request) bool { // MetricsTransport is a custom wrapper around Transport to track metrics type MetricsTransport struct { Transport *http.Transport - Metrics *Metrics + Metrics *BackendMetrics } // RoundTrip implements the RoundTrip method for MetricsTransport diff --git a/cmd/gateway-metrics.go b/cmd/gateway-metrics.go index 3f9746fa3..fa53815c1 100644 --- a/cmd/gateway-metrics.go +++ b/cmd/gateway-metrics.go @@ -29,36 +29,28 @@ type RequestStats struct { Post uint64 `json:"Post"` } -// Metrics - represents bytes served from 
backend -// only implemented for S3 Gateway -type Metrics struct { - bytesReceived uint64 - bytesSent uint64 - requestStats RequestStats -} - // IncBytesReceived - Increase total bytes received from gateway backend -func (s *Metrics) IncBytesReceived(n uint64) { +func (s *BackendMetrics) IncBytesReceived(n uint64) { atomic.AddUint64(&s.bytesReceived, n) } // GetBytesReceived - Get total bytes received from gateway backend -func (s *Metrics) GetBytesReceived() uint64 { +func (s *BackendMetrics) GetBytesReceived() uint64 { return atomic.LoadUint64(&s.bytesReceived) } // IncBytesSent - Increase total bytes sent to gateway backend -func (s *Metrics) IncBytesSent(n uint64) { +func (s *BackendMetrics) IncBytesSent(n uint64) { atomic.AddUint64(&s.bytesSent, n) } // GetBytesSent - Get total bytes received from gateway backend -func (s *Metrics) GetBytesSent() uint64 { +func (s *BackendMetrics) GetBytesSent() uint64 { return atomic.LoadUint64(&s.bytesSent) } // IncRequests - Increase request count sent to gateway backend by 1 -func (s *Metrics) IncRequests(method string) { +func (s *BackendMetrics) IncRequests(method string) { // Only increment for Head & Get requests, else no op if method == http.MethodGet { atomic.AddUint64(&s.requestStats.Get, 1) @@ -72,11 +64,11 @@ func (s *Metrics) IncRequests(method string) { } // GetRequests - Get total number of Get & Headrequests sent to gateway backend -func (s *Metrics) GetRequests() RequestStats { +func (s *BackendMetrics) GetRequests() RequestStats { return s.requestStats } -// NewMetrics - Prepare new Metrics structure -func NewMetrics() *Metrics { - return &Metrics{} +// NewMetrics - Prepare new BackendMetrics structure +func NewMetrics() *BackendMetrics { + return &BackendMetrics{} } diff --git a/cmd/gateway-unsupported.go b/cmd/gateway-unsupported.go index af755caaf..98cd3f6ce 100644 --- a/cmd/gateway-unsupported.go +++ b/cmd/gateway-unsupported.go @@ -202,9 +202,9 @@ func (a GatewayUnsupported) CopyObject(ctx 
context.Context, srcBucket string, sr } // GetMetrics - no op -func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*Metrics, error) { +func (a GatewayUnsupported) GetMetrics(ctx context.Context) (*BackendMetrics, error) { logger.LogIf(ctx, NotImplemented{}) - return &Metrics{}, NotImplemented{} + return &BackendMetrics{}, NotImplemented{} } // PutObjectTags - not implemented. diff --git a/cmd/gateway/azure/gateway-azure.go b/cmd/gateway/azure/gateway-azure.go index d9f652caf..df9edfa03 100644 --- a/cmd/gateway/azure/gateway-azure.go +++ b/cmd/gateway/azure/gateway-azure.go @@ -419,7 +419,7 @@ type azureObjects struct { minio.GatewayUnsupported endpoint *url.URL httpClient *http.Client - metrics *minio.Metrics + metrics *minio.BackendMetrics client azblob.ServiceURL // Azure sdk client } @@ -533,7 +533,7 @@ func parseAzurePart(metaPartFileName, prefix string) (partID int, err error) { } // GetMetrics returns this gateway's metrics -func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.Metrics, error) { +func (a *azureObjects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) { return a.metrics, nil } diff --git a/cmd/gateway/gcs/gateway-gcs.go b/cmd/gateway/gcs/gateway-gcs.go index 70a2fe5e3..20813aaf4 100644 --- a/cmd/gateway/gcs/gateway-gcs.go +++ b/cmd/gateway/gcs/gateway-gcs.go @@ -341,7 +341,7 @@ type gcsGateway struct { minio.GatewayUnsupported client *storage.Client httpClient *http.Client - metrics *minio.Metrics + metrics *minio.BackendMetrics projectID string } @@ -359,7 +359,7 @@ func gcsParseProjectID(credsFile string) (projectID string, err error) { } // GetMetrics returns this gateway's metrics -func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.Metrics, error) { +func (l *gcsGateway) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) { return l.metrics, nil } diff --git a/cmd/gateway/s3/gateway-s3.go b/cmd/gateway/s3/gateway-s3.go index 4fb0054bf..8495f28b2 100644 --- a/cmd/gateway/s3/gateway-s3.go 
+++ b/cmd/gateway/s3/gateway-s3.go
@@ -259,11 +259,11 @@ type s3Objects struct {
 	minio.GatewayUnsupported
 	Client     *miniogo.Core
 	HTTPClient *http.Client
-	Metrics    *minio.Metrics
+	Metrics    *minio.BackendMetrics
 }
 
 // GetMetrics returns this gateway's metrics
-func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.Metrics, error) {
+func (l *s3Objects) GetMetrics(ctx context.Context) (*minio.BackendMetrics, error) {
 	return l.Metrics, nil
 }
 
diff --git a/cmd/generic-handlers.go b/cmd/generic-handlers.go
index 03abfc07d..46e9ab4c5 100644
--- a/cmd/generic-handlers.go
+++ b/cmd/generic-handlers.go
@@ -228,7 +228,11 @@ func guessIsMetricsReq(req *http.Request) bool {
 	}
 	aType := getRequestAuthType(req)
+	// && binds tighter than ||, so the path alternatives are grouped to keep
+	// the auth-type check in force for every metrics endpoint, not just the legacy one.
 	return (aType == authTypeAnonymous || aType == authTypeJWT) &&
-		req.URL.Path == minioReservedBucketPath+prometheusMetricsPath
+		(req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
+			req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
+			req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath)
 }
 
 // guessIsRPCReq - returns true if the request is for an RPC endpoint.
diff --git a/cmd/http-stats.go b/cmd/http-stats.go
index d4535c1ed..4058acc7b 100644
--- a/cmd/http-stats.go
+++ b/cmd/http-stats.go
@@ -79,10 +79,10 @@ func (s *ConnStats) getS3OutputBytes() uint64 {
 // Return connection stats (total input/output bytes and total s3 input/output bytes)
 func (s *ConnStats) toServerConnStats() ServerConnStats {
 	return ServerConnStats{
-		TotalInputBytes:  s.getTotalInputBytes(),
-		TotalOutputBytes: s.getTotalOutputBytes(),
-		S3InputBytes:     s.getS3InputBytes(),
-		S3OutputBytes:    s.getS3OutputBytes(),
+		TotalInputBytes:  s.getTotalInputBytes(),  // Traffic including reserved bucket
+		TotalOutputBytes: s.getTotalOutputBytes(), // Traffic including reserved bucket
+		S3InputBytes:     s.getS3InputBytes(),     // Traffic for client buckets
+		S3OutputBytes:    s.getS3OutputBytes(),    // Traffic for client buckets
 	}
 }
 
@@ -163,9 +163,13 @@ func (st *HTTPStats) toServerHTTPStats() ServerHTTPStats {
 // Update statistics from http request and response data
 func (st *HTTPStats) updateStats(api string, r *http.Request, w *logger.ResponseWriter) {
 	// A successful request has a 2xx response code
-	successReq := (w.StatusCode >= 200 && w.StatusCode < 300)
+	successReq := w.StatusCode >= 200 && w.StatusCode < 300
 
-	if !strings.HasSuffix(r.URL.Path, prometheusMetricsPath) {
+	// Count the request only when the path matches none of the metrics endpoints;
+	// the checks are and-ed because a path can never end in all three suffixes at once.
+	if !strings.HasSuffix(r.URL.Path, prometheusMetricsPathLegacy) &&
+		!strings.HasSuffix(r.URL.Path, prometheusMetricsV2ClusterPath) &&
+		!strings.HasSuffix(r.URL.Path, prometheusMetricsV2NodePath) {
 		st.totalS3Requests.Inc(api)
 		if !successReq && w.StatusCode != 0 {
 			st.totalS3Errors.Inc(api)
diff --git a/cmd/metrics-router.go b/cmd/metrics-router.go
index 3e26fe7d7..230ca079e 100644
--- a/cmd/metrics-router.go
+++ b/cmd/metrics-router.go
@@ -24,7 +24,9 @@ import (
 )
 
 const (
-	prometheusMetricsPath = "/prometheus/metrics"
+	prometheusMetricsPathLegacy    = "/prometheus/metrics"
+	prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
+	prometheusMetricsV2NodePath    = "/v2/metrics/node"
 )
 
 // Standard env 
prometheus auth type @@ -43,14 +45,17 @@ const ( func registerMetricsRouter(router *mux.Router) { // metrics router metricsRouter := router.NewRoute().PathPrefix(minioReservedBucketPath).Subrouter() - authType := strings.ToLower(os.Getenv(EnvPrometheusAuthType)) switch prometheusAuthType(authType) { case prometheusPublic: - metricsRouter.Handle(prometheusMetricsPath, metricsHandler()) + metricsRouter.Handle(prometheusMetricsPathLegacy, metricsHandler()) + metricsRouter.Handle(prometheusMetricsV2ClusterPath, metricsServerHandler()) + metricsRouter.Handle(prometheusMetricsV2NodePath, metricsNodeHandler()) case prometheusJWT: fallthrough default: - metricsRouter.Handle(prometheusMetricsPath, AuthMiddleware(metricsHandler())) + metricsRouter.Handle(prometheusMetricsPathLegacy, AuthMiddleware(metricsHandler())) + metricsRouter.Handle(prometheusMetricsV2ClusterPath, AuthMiddleware(metricsServerHandler())) + metricsRouter.Handle(prometheusMetricsV2NodePath, AuthMiddleware(metricsNodeHandler())) } } diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go new file mode 100644 index 000000000..b6af16328 --- /dev/null +++ b/cmd/metrics-v2.go @@ -0,0 +1,1187 @@ +/* + * MinIO Cloud Storage, (C) 2018-2020 MinIO, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package cmd + +import ( + "context" + "fmt" + "net/http" + "strings" + "sync" + "time" + + "github.com/minio/minio/cmd/logger" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + dto "github.com/prometheus/client_model/go" +) + +// MetricNamespace is top level grouping of metrics to create the metric name. +type MetricNamespace string + +// MetricSubsystem is the sub grouping for metrics within a namespace. +type MetricSubsystem string + +const ( + bucketMetricNamespace MetricNamespace = "minio_bucket" + clusterMetricNamespace MetricNamespace = "minio_cluster" + healMetricNamespace MetricNamespace = "minio_heal" + interNodeMetricNamespace MetricNamespace = "minio_inter_node" + nodeMetricNamespace MetricNamespace = "minio_node" + minIOMetricNamespace MetricNamespace = "minio" + s3MetricNamespace MetricNamespace = "minio_s3" +) + +const ( + cacheSubsystem MetricSubsystem = "cache" + capacityRawSubsystem MetricSubsystem = "capacity_raw" + capacityUsableSubsystem MetricSubsystem = "capacity_usable" + diskSubsystem MetricSubsystem = "disk" + nodesSubsystem MetricSubsystem = "nodes" + objectsSubsystem MetricSubsystem = "objects" + replicationSubsystem MetricSubsystem = "replication" + requestsSubsystem MetricSubsystem = "requests" + timeSubsystem MetricSubsystem = "time" + trafficSubsystem MetricSubsystem = "traffic" + usageSubsystem MetricSubsystem = "usage" + softwareSubsystem MetricSubsystem = "software" +) + +// MetricNames are the individual names for the metric. 
+type MetricNames string
+
+const (
+	errorsTotal   MetricNames = "error_total"
+	healTotal     MetricNames = "heal_total"
+	hitsTotal     MetricNames = "hits_total"
+	inflightTotal MetricNames = "inflight_total"
+	missedTotal   MetricNames = "missed_total"
+	objectTotal   MetricNames = "object_total"
+	offlineTotal  MetricNames = "offline_total"
+	onlineTotal   MetricNames = "online_total"
+	total         MetricNames = "total"
+
+	failedBytes   MetricNames = "failed_bytes"
+	freeBytes     MetricNames = "free_bytes"
+	pendingBytes  MetricNames = "pending_bytes"
+	receivedBytes MetricNames = "received_bytes"
+	sentBytes     MetricNames = "sent_bytes"
+	totalBytes    MetricNames = "total_bytes"
+	usedBytes     MetricNames = "used_bytes"
+
+	usagePercent MetricNames = "usage_percent"
+
+	commitInfo  MetricNames = "commit_info"
+	usageInfo   MetricNames = "usage_info"
+	versionInfo MetricNames = "version_info"
+
+	sizeDistribution = "size_distribution"
+	ttfbDistribution = "ttfb_seconds_distribution"
+
+	lastActivityTime = "last_activity_nano_seconds"
+)
+
+const (
+	serverName = "server"
+)
+
+// GaugeMetricType for the types of metrics supported
+type GaugeMetricType string
+
+const (
+	gaugeMetric     = "gaugeMetric"
+	counterMetric   = "counterMetric"
+	histogramMetric = "histogramMetric"
+)
+
+// MetricDescription describes the metric
+type MetricDescription struct {
+	Namespace MetricNamespace `json:"MetricNamespace"`
+	Subsystem MetricSubsystem `json:"Subsystem"`
+	Name      MetricNames     `json:"MetricNames"`
+	Help      string          `json:"Help"`
+	Type      GaugeMetricType `json:"Type"`
+}
+
+// Metric captures the details for a metric
+type Metric struct {
+	Description          MetricDescription `json:"Description"`
+	StaticLabels         map[string]string `json:"StaticLabels"`
+	Value                float64           `json:"Value"`
+	VariableLabels       map[string]string `json:"VariableLabels"`
+	HistogramBucketLabel string            `json:"HistogramBucketLabel"`
+	Histogram            map[string]uint64 `json:"Histogram"`
+}
+
+// MetricsGroup are a group of metrics that are initialized together. 
+type MetricsGroup struct { + Metrics []Metric + initialize func(ctx context.Context, m *MetricsGroup) +} + +// MetricsGenerator are functions that generate metric groups. +type MetricsGenerator func() MetricsGroup + +// GetGlobalGenerators gets all the generators the report global metrics pre calculated. +func GetGlobalGenerators() []MetricsGenerator { + g := []MetricsGenerator{ + getBucketUsageMetrics, + getMinioHealingMetrics, + getNodeHealthMetrics, + getClusterStorageMetrics, + } + return g +} + +// GetAllGenerators gets all the metric generators. +func GetAllGenerators() []MetricsGenerator { + g := GetGlobalGenerators() + g = append(g, GetGeneratorsForPeer()...) + return g +} + +// GetGeneratorsForPeer - gets the generators to report to peer. +func GetGeneratorsForPeer() []MetricsGenerator { + g := []MetricsGenerator{ + getLocalStorageMetrics, + getMinioVersionMetrics, + getHTTPMetrics, + getNetworkMetrics, + getS3TTFBMetric, + getCacheMetrics, + } + return g +} + +// GetSingleNodeGenerators gets the metrics that are local +func GetSingleNodeGenerators() []MetricsGenerator { + g := []MetricsGenerator{ + getNodeHealthMetrics, + getCacheMetrics, + getHTTPMetrics, + getNetworkMetrics, + getMinioVersionMetrics, + getS3TTFBMetric, + } + return g +} + +func getClusterCapacityTotalBytesMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: capacityRawSubsystem, + Name: totalBytes, + Help: "Total capacity online in the cluster.", + Type: gaugeMetric, + } +} +func getClusterCapacityFreeBytesMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: capacityRawSubsystem, + Name: freeBytes, + Help: "Total free capacity online in the cluster.", + Type: gaugeMetric, + } +} +func getClusterCapacityUsageBytesMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: capacityUsableSubsystem, + Name: totalBytes, + Help: "Total usable capacity 
online in the cluster.", + Type: gaugeMetric, + } +} +func getClusterCapacityUsageFreeBytesMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: capacityUsableSubsystem, + Name: freeBytes, + Help: "Total free usable capacity online in the cluster.", + Type: gaugeMetric, + } +} + +func getNodeDiskUsedBytesMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: diskSubsystem, + Name: usedBytes, + Help: "Total storage used on a disk.", + Type: gaugeMetric, + } +} +func getNodeDiskFreeBytesMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: diskSubsystem, + Name: freeBytes, + Help: "Total storage available on a disk.", + Type: gaugeMetric, + } +} +func getClusterDiskOfflineTotalMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: diskSubsystem, + Name: offlineTotal, + Help: "Total disks offline.", + Type: gaugeMetric, + } +} + +func getClusterDiskOnlineTotalMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: diskSubsystem, + Name: onlineTotal, + Help: "Total disks online.", + Type: gaugeMetric, + } +} + +func getNodeDiskTotalBytesMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: diskSubsystem, + Name: totalBytes, + Help: "Total storage on a disk.", + Type: gaugeMetric, + } +} +func getBucketUsageTotalBytesMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: usageSubsystem, + Name: totalBytes, + Help: "Total bucket size in bytes", + Type: gaugeMetric, + } +} +func getBucketUsageObjectsTotalMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: usageSubsystem, + Name: objectTotal, + Help: "Total number of objects", + Type: gaugeMetric, + } +} +func getBucketRepPendingBytesMD() 
MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: replicationSubsystem, + Name: pendingBytes, + Help: "Total bytes pending to replicate.", + Type: gaugeMetric, + } +} +func getBucketRepFailedBytesMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: replicationSubsystem, + Name: failedBytes, + Help: "Total number of bytes failed at least once to replicate.", + Type: gaugeMetric, + } +} +func getBucketRepSentBytesMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: replicationSubsystem, + Name: sentBytes, + Help: "Total number of bytes replicated to the target bucket.", + Type: gaugeMetric, + } +} +func getBucketRepReceivedBytesMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: replicationSubsystem, + Name: receivedBytes, + Help: "Total number of bytes replicated to this bucket from another source bucket.", + Type: gaugeMetric, + } +} +func getBucketObjectDistributionMD() MetricDescription { + return MetricDescription{ + Namespace: bucketMetricNamespace, + Subsystem: objectsSubsystem, + Name: sizeDistribution, + Help: "Distribution of object sizes in the bucket, includes label for the bucket name.", + Type: histogramMetric, + } +} +func getInterNodeSentBytesMD() MetricDescription { + return MetricDescription{ + Namespace: interNodeMetricNamespace, + Subsystem: trafficSubsystem, + Name: sentBytes, + Help: "Total number of bytes sent to the other peer nodes.", + Type: counterMetric, + } +} +func getInterNodeReceivedBytesMD() MetricDescription { + return MetricDescription{ + Namespace: interNodeMetricNamespace, + Subsystem: trafficSubsystem, + Name: receivedBytes, + Help: "Total number of bytes received from other peer nodes.", + Type: counterMetric, + } +} +func getS3SentBytesMD() MetricDescription { + return MetricDescription{ + Namespace: s3MetricNamespace, + Subsystem: 
trafficSubsystem,
+		Name:      sentBytes,
+		Help:      "Total number of s3 bytes sent",
+		Type:      counterMetric,
+	}
+}
+func getS3ReceivedBytesMD() MetricDescription {
+	return MetricDescription{
+		Namespace: s3MetricNamespace,
+		Subsystem: trafficSubsystem,
+		Name:      receivedBytes,
+		Help:      "Total number of s3 bytes received.",
+		Type:      counterMetric,
+	}
+}
+func getS3RequestsInFlightMD() MetricDescription {
+	return MetricDescription{
+		Namespace: s3MetricNamespace,
+		Subsystem: requestsSubsystem,
+		Name:      inflightTotal,
+		Help:      "Total number of S3 requests currently in flight.",
+		Type:      gaugeMetric, // in-flight count rises and falls, so it is not a counter
+	}
+}
+func getS3RequestsTotalMD() MetricDescription {
+	return MetricDescription{
+		Namespace: s3MetricNamespace,
+		Subsystem: requestsSubsystem,
+		Name:      total,
+		Help:      "Total number S3 requests",
+		Type:      counterMetric,
+	}
+}
+func getS3RequestsErrorsMD() MetricDescription {
+	return MetricDescription{
+		Namespace: s3MetricNamespace,
+		Subsystem: requestsSubsystem,
+		Name:      errorsTotal,
+		Help:      "Total number S3 requests with errors",
+		Type:      counterMetric,
+	}
+}
+func getCacheHitsTotalMD() MetricDescription {
+	return MetricDescription{
+		Namespace: minioNamespace,
+		Subsystem: cacheSubsystem,
+		Name:      hitsTotal,
+		Help:      "Total number of disk cache hits",
+		Type:      counterMetric,
+	}
+}
+func getCacheHitsMissedTotalMD() MetricDescription {
+	return MetricDescription{
+		Namespace: minioNamespace,
+		Subsystem: cacheSubsystem,
+		Name:      missedTotal,
+		Help:      "Total number of disk cache misses",
+		Type:      counterMetric,
+	}
+}
+func getCacheUsagePercentMD() MetricDescription {
+	return MetricDescription{
+		Namespace: minioNamespace,
+		Subsystem: cacheSubsystem, // same subsystem as the other cache metrics
+		Name:      usagePercent,
+		Help:      "Total percentage cache usage",
+		Type:      gaugeMetric,
+	}
+}
+func getCacheUsageInfoMD() MetricDescription {
+	return MetricDescription{
+		Namespace: minioNamespace,
+		Subsystem: cacheSubsystem,
+		Name:      usageInfo,
+		Help:      "Total percentage cache usage, value of 1 indicates high and 0 low, label level is 
set as well", + Type: gaugeMetric, + } +} +func getCacheUsedBytesMD() MetricDescription { + return MetricDescription{ + Namespace: minioNamespace, + Subsystem: cacheSubsystem, + Name: usedBytes, + Help: "Current cache usage in bytes", + Type: gaugeMetric, + } +} +func getCacheTotalBytesMD() MetricDescription { + return MetricDescription{ + Namespace: minioNamespace, + Subsystem: cacheSubsystem, + Name: totalBytes, + Help: "Total size of cache disk in bytes", + Type: gaugeMetric, + } +} +func getCacheSentBytesMD() MetricDescription { + return MetricDescription{ + Namespace: minioNamespace, + Subsystem: cacheSubsystem, + Name: sentBytes, + Help: "Total number of bytes served from cache", + Type: counterMetric, + } +} +func getHealObjectsTotalMD() MetricDescription { + return MetricDescription{ + Namespace: healMetricNamespace, + Subsystem: objectsSubsystem, + Name: total, + Help: "Objects scanned in current self healing run", + Type: gaugeMetric, + } +} +func getHealObjectsHealTotalMD() MetricDescription { + return MetricDescription{ + Namespace: healMetricNamespace, + Subsystem: objectsSubsystem, + Name: healTotal, + Help: "Objects healed in current self healing run", + Type: gaugeMetric, + } +} +func getHealObjectsFailTotalMD() MetricDescription { + return MetricDescription{ + Namespace: healMetricNamespace, + Subsystem: objectsSubsystem, + Name: errorsTotal, + Help: "Objects for which healing failed in current self healing run", + Type: gaugeMetric, + } +} +func getHealLastActivityTimeMD() MetricDescription { + return MetricDescription{ + Namespace: healMetricNamespace, + Subsystem: timeSubsystem, + Name: lastActivityTime, + Help: "Time elapsed (in nano seconds) since last self healing activity. 
This is set to -1 until initial self heal activity", + Type: gaugeMetric, + } +} +func getNodeOnlineTotalMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: nodesSubsystem, + Name: onlineTotal, + Help: "Total number of MinIO nodes online.", + Type: gaugeMetric, + } +} +func getNodeOfflineTotalMD() MetricDescription { + return MetricDescription{ + Namespace: clusterMetricNamespace, + Subsystem: nodesSubsystem, + Name: offlineTotal, + Help: "Total number of MinIO nodes offline.", + Type: gaugeMetric, + } +} +func getMinIOVersionMD() MetricDescription { + return MetricDescription{ + Namespace: minIOMetricNamespace, + Subsystem: softwareSubsystem, + Name: versionInfo, + Help: "MinIO Release tag for the server", + Type: gaugeMetric, + } +} +func getMinIOCommitMD() MetricDescription { + return MetricDescription{ + Namespace: minIOMetricNamespace, + Subsystem: softwareSubsystem, + Name: commitInfo, + Help: "Git commit hash for the MinIO release.", + Type: gaugeMetric, + } +} +func getS3TTFBDistributionMD() MetricDescription { + return MetricDescription{ + Namespace: s3MetricNamespace, + Subsystem: timeSubsystem, + Name: ttfbDistribution, + Help: "Distribution of the time to first byte across API calls.", + Type: gaugeMetric, + } +} + +func getS3TTFBMetric() MetricsGroup { + return MetricsGroup{ + Metrics: []Metric{}, + initialize: func(ctx context.Context, metrics *MetricsGroup) { + + // Read prometheus metric on this channel + ch := make(chan prometheus.Metric) + var wg sync.WaitGroup + wg.Add(1) + + // Read prometheus histogram data and convert it to internal metric data + go func() { + defer wg.Done() + for promMetric := range ch { + dtoMetric := &dto.Metric{} + err := promMetric.Write(dtoMetric) + if err != nil { + logger.LogIf(GlobalContext, err) + return + } + h := dtoMetric.GetHistogram() + for _, b := range h.Bucket { + labels := make(map[string]string) + for _, lp := range dtoMetric.GetLabel() { + 
labels[*lp.Name] = *lp.Value + } + labels["le"] = fmt.Sprintf("%.3f", *b.UpperBound) + metric := Metric{ + Description: getS3TTFBDistributionMD(), + VariableLabels: labels, + Value: float64(b.GetCumulativeCount()), + } + metrics.Metrics = append(metrics.Metrics, metric) + } + } + + }() + + httpRequestsDuration.Collect(ch) + close(ch) + wg.Wait() + }, + } +}
+
+// getMinioVersionMetrics returns a group whose initializer appends two
+// value-less metrics carrying the build's commit ID and version as labels.
+func getMinioVersionMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(_ context.Context, m *MetricsGroup) {
+			m.Metrics = append(m.Metrics,
+				Metric{
+					Description:    getMinIOCommitMD(),
+					VariableLabels: map[string]string{"commit": CommitID},
+				},
+				Metric{
+					Description:    getMinIOVersionMD(),
+					VariableLabels: map[string]string{"version": Version},
+				},
+			)
+		},
+	}
+}
+
+// getNodeHealthMetrics returns a group pre-populated with the node online and
+// offline metrics; the initializer fills their values from GetPeerOnlineCount.
+func getNodeHealthMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{
+			{Description: getNodeOnlineTotalMD()},
+			{Description: getNodeOfflineTotalMD()},
+		},
+		initialize: func(_ context.Context, m *MetricsGroup) {
+			nodesUp, nodesDown := GetPeerOnlineCount()
+			// The metrics are matched by name rather than by position.
+			for i := range m.Metrics {
+				switch m.Metrics[i].Description.Name {
+				case onlineTotal:
+					m.Metrics[i].Value = float64(nodesUp)
+				case offlineTotal:
+					m.Metrics[i].Value = float64(nodesDown)
+				}
+			}
+		},
+	}
+}
+
+// getMinioHealingMetrics returns a group describing background healing. The
+// initializer adds nothing unless the server runs in erasure mode and the
+// background heal sequence is registered.
+func getMinioHealingMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(_ context.Context, m *MetricsGroup) {
+			if !globalIsErasure {
+				return
+			}
+			bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
+			if !exists {
+				return
+			}
+			// Time elapsed since the last heal activity; stays zero when no
+			// activity was recorded. The reported value is in nanoseconds
+			// because a time.Duration is converted directly.
+			var dur time.Duration
+			if !bgSeq.lastHealActivity.IsZero() {
+				dur = time.Since(bgSeq.lastHealActivity)
+			}
+			m.Metrics = append(m.Metrics, Metric{
+				Description: getHealLastActivityTimeMD(),
+				Value:       float64(dur),
+			})
+			m.Metrics = append(m.Metrics, getObjectsScanned(bgSeq)...)
+			m.Metrics = append(m.Metrics, getScannedItems(bgSeq)...)
+			m.Metrics = append(m.Metrics, getFailedItems(bgSeq)...)
+		},
+	}
+}
+
+// getFailedItems converts the heal sequence's failed-item counters into
+// metrics. Each map key is split on "," into a mount path and a volume status.
+func getFailedItems(seq *healSequence) (m []Metric) {
+	failed := seq.gethealFailedItemsMap()
+	m = make([]Metric, 0, len(failed))
+	for k, v := range failed {
+		s := strings.Split(k, ",")
+		if len(s) < 2 {
+			// Malformed key: skip it rather than panic on s[1] while a
+			// metrics scrape is being served.
+			continue
+		}
+		m = append(m, Metric{
+			Description: getHealObjectsFailTotalMD(),
+			VariableLabels: map[string]string{
+				"mount_path":    s[0],
+				"volume_status": s[1],
+			},
+			Value: float64(v),
+		})
+	}
+	return
+}
+
+// getScannedItems reports one metric per item type from the heal sequence's
+// healed-items map (note: despite its name it reads getHealedItemsMap).
+func getScannedItems(seq *healSequence) (m []Metric) {
+	healed := seq.getHealedItemsMap()
+	m = make([]Metric, 0, len(healed))
+	for k, v := range healed {
+		m = append(m, Metric{
+			Description:    getHealObjectsHealTotalMD(),
+			VariableLabels: map[string]string{"type": string(k)},
+			Value:          float64(v),
+		})
+	}
+	return
+}
+
+// getObjectsScanned reports one metric per item type from the heal sequence's
+// scanned-items map.
+func getObjectsScanned(seq *healSequence) (m []Metric) {
+	scanned := seq.getScannedItemsMap()
+	m = make([]Metric, 0, len(scanned))
+	for k, v := range scanned {
+		m = append(m, Metric{
+			Description:    getHealObjectsTotalMD(),
+			VariableLabels: map[string]string{"type": string(k)},
+			Value:          float64(v),
+		})
+	}
+	return
+}
+
+// getCacheMetrics returns a group reporting disk-cache hit/miss/served
+// counters plus per-cache-disk usage figures. Nothing is reported until the
+// cache layer is available.
+func getCacheMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, m *MetricsGroup) {
+			cacheObjLayer := newCachedObjectLayerFn()
+			// Service not initialized yet
+			if cacheObjLayer == nil {
+				return
+			}
+			// Fetch the stats handle once instead of once per metric.
+			stats := cacheObjLayer.CacheStats()
+			m.Metrics = append(m.Metrics, Metric{
+				Description: getCacheHitsTotalMD(),
+				Value:       float64(stats.getHits()),
+			})
+			m.Metrics = append(m.Metrics, Metric{
+				Description: getCacheHitsMissedTotalMD(),
+				Value:       float64(stats.getMisses()),
+			})
+			m.Metrics = append(m.Metrics, Metric{
+				Description: getCacheSentBytesMD(),
+				Value:       float64(stats.getBytesServed()),
+			})
+			for _, cdStats := range stats.GetDiskStats() {
+				m.Metrics = append(m.Metrics, Metric{
+					Description:    getCacheUsagePercentMD(),
+					Value:          float64(cdStats.UsagePercent),
+					VariableLabels: map[string]string{"disk": cdStats.Dir},
+				})
+				m.Metrics = append(m.Metrics, Metric{
+					Description:    getCacheUsageInfoMD(),
+					Value:          float64(cdStats.UsageState),
+					VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
+				})
+				m.Metrics = append(m.Metrics, Metric{
+					Description:    getCacheUsedBytesMD(),
+					Value:          float64(cdStats.UsageSize),
+					VariableLabels: map[string]string{"disk": cdStats.Dir},
+				})
+				m.Metrics = append(m.Metrics, Metric{
+					Description:    getCacheTotalBytesMD(),
+					Value:          float64(cdStats.TotalCapacity),
+					VariableLabels: map[string]string{"disk": cdStats.Dir},
+				})
+			}
+		},
+	}
+}
+
+// getHTTPMetrics returns a group reporting, per S3 API name, the in-flight
+// request count, the total request count and the total error count.
+func getHTTPMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+			httpStats := globalHTTPStats.toServerHTTPStats()
+			for api, value := range httpStats.CurrentS3Requests.APIStats {
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getS3RequestsInFlightMD(),
+					Value:          float64(value),
+					VariableLabels: map[string]string{"api": api},
+				})
+			}
+			for api, value := range httpStats.TotalS3Requests.APIStats {
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getS3RequestsTotalMD(),
+					Value:          float64(value),
+					VariableLabels: map[string]string{"api": api},
+				})
+			}
+			for api, value := range httpStats.TotalS3Errors.APIStats {
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getS3RequestsErrorsMD(),
+					Value:          float64(value),
+					VariableLabels: map[string]string{"api": api},
+				})
+			}
+		},
+	}
+}
+
+// getNetworkMetrics returns a group reporting internode and S3 byte counters
+// (sent and received) taken from the global connection statistics.
+func getNetworkMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+			connStats := globalConnStats.toServerConnStats()
+			metrics.Metrics = append(metrics.Metrics,
+				Metric{
+					Description: getInterNodeSentBytesMD(),
+					Value:       float64(connStats.TotalOutputBytes),
+				},
+				Metric{
+					Description: getInterNodeReceivedBytesMD(),
+					Value:       float64(connStats.TotalInputBytes),
+				},
+				Metric{
+					Description: getS3SentBytesMD(),
+					Value:       float64(connStats.S3OutputBytes),
+				},
+				Metric{
+					Description: getS3ReceivedBytesMD(),
+					Value:       float64(connStats.S3InputBytes),
+				},
+			)
+		},
+	}
+}
+
+// getBucketUsageMetrics returns a group reporting per-bucket size, object
+// count, replication byte counters (only when any is non-zero) and the object
+// size histogram. Nothing is reported in gateway mode, before the object
+// layer exists, when usage cannot be loaded, or before the first usage scan.
+func getBucketUsageMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+			objLayer := newObjectLayerFn()
+			// Service not initialized yet, or running as a gateway.
+			if objLayer == nil || globalIsGateway {
+				return
+			}
+
+			dataUsageInfo, err := loadDataUsageFromBackend(GlobalContext, objLayer)
+			if err != nil {
+				return
+			}
+
+			// data usage has not captured any data yet.
+			if dataUsageInfo.LastUpdate.IsZero() {
+				return
+			}
+			for bucket, usage := range dataUsageInfo.BucketsUsage {
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getBucketUsageTotalBytesMD(),
+					Value:          float64(usage.Size),
+					VariableLabels: map[string]string{"bucket": bucket},
+				})
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getBucketUsageObjectsTotalMD(),
+					Value:          float64(usage.ObjectsCount),
+					VariableLabels: map[string]string{"bucket": bucket},
+				})
+
+				if usage.hasReplicationUsage() {
+					metrics.Metrics = append(metrics.Metrics, Metric{
+						Description:    getBucketRepPendingBytesMD(),
+						Value:          float64(usage.ReplicationPendingSize),
+						VariableLabels: map[string]string{"bucket": bucket},
+					})
+					metrics.Metrics = append(metrics.Metrics, Metric{
+						Description:    getBucketRepFailedBytesMD(),
+						Value:          float64(usage.ReplicationFailedSize),
+						VariableLabels: map[string]string{"bucket": bucket},
+					})
+					metrics.Metrics = append(metrics.Metrics, Metric{
+						Description:    getBucketRepSentBytesMD(),
+						Value:          float64(usage.ReplicatedSize),
+						VariableLabels: map[string]string{"bucket": bucket},
+					})
+					metrics.Metrics = append(metrics.Metrics, Metric{
+						Description:    getBucketRepReceivedBytesMD(),
+						Value:          float64(usage.ReplicaSize),
+						VariableLabels: map[string]string{"bucket": bucket},
+					})
+				}
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:          getBucketObjectDistributionMD(),
+					Histogram:            usage.ObjectSizesHistogram,
+					HistogramBucketLabel: "range",
+					VariableLabels:       map[string]string{"bucket": bucket},
+				})
+
+			}
+		},
+	}
+}
+
+// getLocalStorageMetrics returns a group reporting used, free and total bytes
+// for every disk returned by getLocalDisks, labelled by drive path.
+func getLocalStorageMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+			disks := getLocalDisks(globalEndpoints)
+			for _, disk := range disks {
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getNodeDiskUsedBytesMD(),
+					Value:          float64(disk.UsedSpace),
+					VariableLabels: map[string]string{"disk": disk.DrivePath},
+				})
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getNodeDiskFreeBytesMD(),
+					Value:          float64(disk.AvailableSpace),
+					VariableLabels: map[string]string{"disk": disk.DrivePath},
+				})
+
+				metrics.Metrics = append(metrics.Metrics, Metric{
+					Description:    getNodeDiskTotalBytesMD(),
+					Value:          float64(disk.TotalSpace),
+					VariableLabels: map[string]string{"disk": disk.DrivePath},
+				})
+			}
+		},
+	}
+}
+
+// getClusterStorageMetrics returns a group reporting cluster-wide raw and
+// usable capacity (total and free) and the number of offline and online
+// disks. Nothing is reported before the object layer exists.
+func getClusterStorageMetrics() MetricsGroup {
+	return MetricsGroup{
+		Metrics: []Metric{},
+		initialize: func(ctx context.Context, metrics *MetricsGroup) {
+
+			objLayer := newObjectLayerFn()
+			// Service not initialized yet
+			if objLayer == nil {
+				return
+			}
+
+			// Fetch disk space info once, ignore errors: a failed call yields
+			// a zero StorageInfo and the metrics below are reported from it.
+			storageInfo, _ := objLayer.StorageInfo(GlobalContext)
+			onlineDisks, offlineDisks := getOnlineOfflineDisksStats(storageInfo.Disks)
+
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterCapacityTotalBytesMD(),
+				Value:       float64(GetTotalCapacity(ctx)),
+			})
+
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterCapacityFreeBytesMD(),
+				Value:       float64(GetTotalCapacityFree(ctx)),
+			})
+
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterCapacityUsageBytesMD(),
+				Value:       GetTotalUsableCapacity(ctx, storageInfo),
+			})
+
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterCapacityUsageFreeBytesMD(),
+				Value:       GetTotalUsableCapacityFree(ctx, storageInfo),
+			})
+
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterDiskOfflineTotalMD(),
+				Value:       float64(offlineDisks.Sum()),
+			})
+
+			// The online metric must count online disks only; it previously
+			// reported online+offline merged together.
+			metrics.Metrics = append(metrics.Metrics, Metric{
+				Description: getClusterDiskOnlineTotalMD(),
+				Value:       float64(onlineDisks.Sum()),
+			})
+		},
+	}
+}
+
+// hasReplicationUsage reports whether any of the bucket's replication byte
+// counters is non-zero.
+func (b *BucketUsageInfo) hasReplicationUsage() bool {
+	return b.ReplicationPendingSize > 0 ||
+		b.ReplicationFailedSize > 0 ||
+		b.ReplicatedSize > 0 ||
+		b.ReplicaSize > 0
+}
+
+// minioClusterCollector is the Prometheus collector that serves metrics for
+// the whole cluster (local metrics plus those fetched from peers).
+type minioClusterCollector struct {
+	desc *prometheus.Desc
+}
+
+// newMinioClusterCollector builds the cluster collector with its single
+// "minio_stats" descriptor.
+func newMinioClusterCollector() *minioClusterCollector {
+	return &minioClusterCollector{
+		desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil),
+	}
+}
+
+// Describe sends the super-set of all possible descriptors of metrics
+func (c *minioClusterCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- c.desc
+}
+
+// Collect is called by the Prometheus registry when collecting metrics.
+func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) { + + var wg sync.WaitGroup + publish := func(in <-chan Metric) { + defer wg.Done() + for metric := range in { + labels, values := getOrderedLabelValueArrays(metric.VariableLabels) + if metric.Description.Type == histogramMetric { + if metric.Histogram == nil { + continue + } + for k, v := range metric.Histogram { + l := append(labels, metric.HistogramBucketLabel) + lv := append(values, k) + out <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(string(metric.Description.Namespace), + string(metric.Description.Subsystem), + string(metric.Description.Name)), + metric.Description.Help, + l, + metric.StaticLabels, + ), + prometheus.GaugeValue, + float64(v), + lv...) + } + continue + } + metricType := prometheus.GaugeValue + switch metric.Description.Type { + case counterMetric: + metricType = prometheus.CounterValue + } + toPost := prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(string(metric.Description.Namespace), + string(metric.Description.Subsystem), + string(metric.Description.Name)), + metric.Description.Help, + labels, + metric.StaticLabels, + ), + metricType, + metric.Value, + values...) + out <- toPost + } + } + + // Call peer api to fetch metrics + peerCh := globalNotificationSys.GetClusterMetrics(GlobalContext) + selfCh := ReportMetrics(GlobalContext, GetAllGenerators) + wg.Add(2) + go publish(peerCh) + go publish(selfCh) + wg.Wait() +} + +// ReportMetrics reports serialized metrics to the channel passed for the metrics generated. 
+func ReportMetrics(ctx context.Context, generators func() []MetricsGenerator) <-chan Metric { + ch := make(chan Metric) + go func() { + defer close(ch) + populateAndPublish(generators, func(m Metric) bool { + if m.VariableLabels == nil { + m.VariableLabels = make(map[string]string) + } + m.VariableLabels[serverName] = GetLocalPeer(globalEndpoints) + for { + select { + case ch <- m: + return true + case <-ctx.Done(): + return false + } + } + }) + }() + return ch +} + +// minioCollectorV2 is the Custom Collector +type minioCollectorV2 struct { + generator func() []MetricsGenerator + desc *prometheus.Desc +} + +// Describe sends the super-set of all possible descriptors of metrics +func (c *minioCollectorV2) Describe(ch chan<- *prometheus.Desc) { + ch <- c.desc +} + +// populateAndPublish populates and then publishes the metrics generated by the generator function. +func populateAndPublish(generatorFn func() []MetricsGenerator, publish func(m Metric) bool) { + generators := generatorFn() + for _, g := range generators { + metrics := g() + metrics.initialize(GlobalContext, &metrics) + for _, metric := range metrics.Metrics { + if !publish(metric) { + return + } + } + } +} + +// Collect is called by the Prometheus registry when collecting metrics. 
+func (c *minioCollectorV2) Collect(ch chan<- prometheus.Metric) {
+
+	// Expose MinIO's version information
+	minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
+
+	populateAndPublish(c.generator, func(metric Metric) bool {
+		// Every metric served by this collector is tagged with the local
+		// server name in addition to its own variable labels.
+		labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
+		values = append(values, GetLocalPeer(globalEndpoints))
+		labels = append(labels, serverName)
+
+		if metric.Description.Type == histogramMetric {
+			if metric.Histogram == nil {
+				return true
+			}
+			for k, v := range metric.Histogram {
+				// Build per-bucket label sets from the shared base. Appending
+				// to labels/values themselves would accumulate one extra
+				// bucket label per iteration, producing duplicate label names
+				// and a MustNewConstMetric panic from the second bucket on.
+				// The full slice expressions force a fresh backing array so
+				// descriptors of different buckets never alias.
+				bucketLabels := append(labels[:len(labels):len(labels)], metric.HistogramBucketLabel)
+				bucketValues := append(values[:len(values):len(values)], k)
+				ch <- prometheus.MustNewConstMetric(
+					prometheus.NewDesc(
+						prometheus.BuildFQName(string(metric.Description.Namespace),
+							string(metric.Description.Subsystem),
+							string(metric.Description.Name)),
+						metric.Description.Help,
+						bucketLabels,
+						metric.StaticLabels,
+					),
+					prometheus.GaugeValue,
+					float64(v),
+					bucketValues...)
+			}
+			return true
+		}
+
+		// Everything that is not declared a counter is exported as a gauge.
+		metricType := prometheus.GaugeValue
+		switch metric.Description.Type {
+		case counterMetric:
+			metricType = prometheus.CounterValue
+		}
+		ch <- prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName(string(metric.Description.Namespace),
+					string(metric.Description.Subsystem),
+					string(metric.Description.Name)),
+				metric.Description.Help,
+				labels,
+				metric.StaticLabels,
+			),
+			metricType,
+			metric.Value,
+			values...)
+		return true
+	})
+}
+
+// getOrderedLabelValueArrays flattens a label map into two parallel slices so
+// that labels[i] is the name belonging to values[i]. The relative order is
+// whatever map iteration yields and differs between calls.
+func getOrderedLabelValueArrays(labelsWithValue map[string]string) (labels, values []string) {
+	labels = make([]string, 0, len(labelsWithValue))
+	values = make([]string, 0, len(labelsWithValue))
+	for l, v := range labelsWithValue {
+		labels = append(labels, l)
+		values = append(values, v)
+	}
+	return
+}
+
+// newMinioCollectorV2 describes the collector
+// and returns reference of minioCollector for version 2
+// It creates the Prometheus Description which is used
+// to define Metric and help string
+func newMinioCollectorV2(generator func() []MetricsGenerator) *minioCollectorV2 {
+	return &minioCollectorV2{
+		generator: generator,
+		desc:      prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil),
+	}
+}
+
+// metricsServerHandler returns the HTTP handler serving cluster-wide metrics
+// from a dedicated registry holding only the cluster collector.
+func metricsServerHandler() http.Handler {
+
+	registry := prometheus.NewRegistry()
+
+	// Report all other metrics
+	err := registry.Register(newMinioClusterCollector())
+	if err != nil {
+		logger.CriticalIf(GlobalContext, err)
+	}
+	// Only the dedicated registry is gathered here; the default gatherer
+	// (golang and process metrics) is not included for this endpoint.
+	gatherers := prometheus.Gatherers{
+		registry,
+	}
+	// Delegate http serving to Prometheus client library, which will call collector.Collect.
+	return promhttp.InstrumentMetricHandler(
+		registry,
+		promhttp.HandlerFor(gatherers,
+			promhttp.HandlerOpts{
+				ErrorHandling: promhttp.ContinueOnError,
+			}),
+	)
+}
+
+func metricsNodeHandler() http.Handler {
+	registry := prometheus.NewRegistry()
+
+	err := registry.Register(newMinioCollectorV2(GetSingleNodeGenerators))
+	if err != nil {
+		logger.CriticalIf(GlobalContext, err)
+	}
+
+	gatherers := prometheus.Gatherers{
+		prometheus.DefaultGatherer,
+		registry,
+	}
+	// Delegate http serving to Prometheus client library, which will call collector.Collect.
+ return promhttp.InstrumentMetricHandler( + registry, + promhttp.HandlerFor(gatherers, + promhttp.HandlerOpts{ + ErrorHandling: promhttp.ContinueOnError, + }), + ) +} diff --git a/cmd/metrics.go b/cmd/metrics.go index b85d46a4e..0606f7d20 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -51,6 +51,17 @@ var ( ) ) +const ( + healMetricsNamespace = "self_heal" + gatewayNamespace = "gateway" + cacheNamespace = "cache" + s3Namespace = "s3" + bucketNamespace = "bucket" + minioNamespace = "minio" + diskNamespace = "disk" + interNodeNamespace = "internode" +) + func init() { prometheus.MustRegister(httpRequestsDuration) prometheus.MustRegister(newMinioCollector()) @@ -81,9 +92,10 @@ func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) { func (c *minioCollector) Collect(ch chan<- prometheus.Metric) { // Expose MinIO's version information - minioVersionInfo.WithLabelValues(Version, CommitID).Set(float64(1.0)) + minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0) storageMetricsPrometheus(ch) + nodeHealthMetricsPrometheus(ch) bucketUsageMetricsPrometheus(ch) networkMetricsPrometheus(ch) httpMetricsPrometheus(ch) @@ -92,6 +104,26 @@ func (c *minioCollector) Collect(ch chan<- prometheus.Metric) { healingMetricsPrometheus(ch) } +func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) { + nodesUp, nodesDown := GetPeerOnlineCount() + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "nodes", "online"), + "Total number of MinIO nodes online", + nil, nil), + prometheus.GaugeValue, + float64(nodesUp), + ) + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "nodes", "offline"), + "Total number of MinIO nodes offline", + nil, nil), + prometheus.GaugeValue, + float64(nodesDown), + ) +} + // collects healing specific metrics for MinIO instance in Prometheus specific format // and sends to given channel func healingMetricsPrometheus(ch chan<- prometheus.Metric) { 
@@ -102,7 +134,6 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) { if !exists { return } - healMetricsNamespace := "self_heal" var dur time.Duration if !bgSeq.lastHealActivity.IsZero() { @@ -172,7 +203,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "bytes_received"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_received"), "Total number of bytes received by current MinIO Gateway "+globalGatewayName+" backend", nil, nil), prometheus.CounterValue, @@ -180,7 +211,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "bytes_sent"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "bytes_sent"), "Total number of bytes sent by current MinIO Gateway to "+globalGatewayName+" backend", nil, nil), prometheus.CounterValue, @@ -189,7 +220,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { s := m.GetRequests() ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "requests"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"), "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", []string{"method"}, nil), prometheus.CounterValue, @@ -198,7 +229,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "requests"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"), "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", []string{"method"}, nil), prometheus.CounterValue, @@ -207,7 +238,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- 
prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "requests"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"), "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", []string{"method"}, nil), prometheus.CounterValue, @@ -216,7 +247,7 @@ func gatewayMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("gateway", globalGatewayName, "requests"), + prometheus.BuildFQName(gatewayNamespace, globalGatewayName, "requests"), "Total number of requests made to "+globalGatewayName+" by current MinIO Gateway", []string{"method"}, nil), prometheus.CounterValue, @@ -236,7 +267,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("cache", "hits", "total"), + prometheus.BuildFQName(cacheNamespace, "hits", "total"), "Total number of disk cache hits in current MinIO instance", nil, nil), prometheus.CounterValue, @@ -244,7 +275,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("cache", "misses", "total"), + prometheus.BuildFQName(cacheNamespace, "misses", "total"), "Total number of disk cache misses in current MinIO instance", nil, nil), prometheus.CounterValue, @@ -252,7 +283,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("cache", "data", "served"), + prometheus.BuildFQName(cacheNamespace, "data", "served"), "Total number of bytes served from cache of current MinIO instance", nil, nil), prometheus.CounterValue, @@ -262,7 +293,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) { // Cache disk usage percentage ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("cache", 
"usage", "percent"), + prometheus.BuildFQName(cacheNamespace, "usage", "percent"), "Total percentage cache usage", []string{"disk"}, nil), prometheus.GaugeValue, @@ -271,7 +302,7 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("cache", "usage", "high"), + prometheus.BuildFQName(cacheNamespace, "usage", "high"), "Indicates cache usage is high or low, relative to current cache 'quota' settings", []string{"disk"}, nil), prometheus.GaugeValue, @@ -309,7 +340,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) { for api, value := range httpStats.CurrentS3Requests.APIStats { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("s3", "requests", "current"), + prometheus.BuildFQName(s3Namespace, "requests", "current"), "Total number of running s3 requests in current MinIO server instance", []string{"api"}, nil), prometheus.CounterValue, @@ -321,7 +352,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) { for api, value := range httpStats.TotalS3Requests.APIStats { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("s3", "requests", "total"), + prometheus.BuildFQName(s3Namespace, "requests", "total"), "Total number of s3 requests in current MinIO server instance", []string{"api"}, nil), prometheus.CounterValue, @@ -333,7 +364,7 @@ func httpMetricsPrometheus(ch chan<- prometheus.Metric) { for api, value := range httpStats.TotalS3Errors.APIStats { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("s3", "errors", "total"), + prometheus.BuildFQName(s3Namespace, "errors", "total"), "Total number of s3 errors in current MinIO server instance", []string{"api"}, nil), prometheus.CounterValue, @@ -351,7 +382,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) { // Network Sent/Received Bytes (internode) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - 
prometheus.BuildFQName("internode", "tx", "bytes_total"), + prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"), "Total number of bytes sent to the other peer nodes by current MinIO server instance", nil, nil), prometheus.CounterValue, @@ -360,7 +391,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("internode", "rx", "bytes_total"), + prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"), "Total number of internode bytes received by current MinIO server instance", nil, nil), prometheus.CounterValue, @@ -370,7 +401,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) { // Network Sent/Received Bytes (Outbound) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("s3", "tx", "bytes_total"), + prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"), "Total number of s3 bytes sent by current MinIO server instance", nil, nil), prometheus.CounterValue, @@ -379,7 +410,7 @@ func networkMetricsPrometheus(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("s3", "rx", "bytes_total"), + prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"), "Total number of s3 bytes received by current MinIO server instance", nil, nil), prometheus.CounterValue, @@ -414,7 +445,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) { // Total space used by bucket ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("bucket", "usage", "size"), + prometheus.BuildFQName(bucketNamespace, "usage", "size"), "Total bucket size", []string{"bucket"}, nil), prometheus.GaugeValue, @@ -423,7 +454,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) { ) ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("bucket", "objects", "count"), + prometheus.BuildFQName(bucketNamespace, "objects", "count"), 
"Total number of objects in a bucket", []string{"bucket"}, nil), prometheus.GaugeValue, @@ -469,7 +500,7 @@ func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) { for k, v := range usageInfo.ObjectSizesHistogram { ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("bucket", "objects", "histogram"), + prometheus.BuildFQName(bucketNamespace, "objects", "histogram"), "Total number of objects of different sizes in a bucket", []string{"bucket", "object_size"}, nil), prometheus.GaugeValue, @@ -497,10 +528,50 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) { onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks) totalDisks := offlineDisks.Merge(onlineDisks) + // Report total capacity + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"), + "Total capacity online in the cluster", + nil, nil), + prometheus.GaugeValue, + float64(GetTotalCapacity(GlobalContext)), + ) + + // Report total capacity free + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"), + "Total free capacity online in the cluster", + nil, nil), + prometheus.GaugeValue, + float64(GetTotalCapacityFree(GlobalContext)), + ) + + s, _ := objLayer.StorageInfo(GlobalContext) + // Report total usable capacity + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"), + "Total usable capacity online in the cluster", + nil, nil), + prometheus.GaugeValue, + GetTotalUsableCapacity(GlobalContext, s), + ) + // Report total usable capacity free + ch <- prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"), + "Total free usable capacity online in the cluster", + nil, nil), + prometheus.GaugeValue, + GetTotalUsableCapacityFree(GlobalContext, s), + ) + // MinIO 
Offline Disks per node ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("minio", "disks", "offline"), + prometheus.BuildFQName(minioNamespace, "disks", "offline"), "Total number of offline disks in current MinIO server instance", nil, nil), prometheus.GaugeValue, @@ -510,7 +581,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) { // MinIO Total Disks per node ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("minio", "disks", "total"), + prometheus.BuildFQName(minioNamespace, "disks", "total"), "Total number of disks for current MinIO server instance", nil, nil), prometheus.GaugeValue, @@ -521,7 +592,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) { // Total disk usage by the disk ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("disk", "storage", "used"), + prometheus.BuildFQName(diskNamespace, "storage", "used"), "Total disk storage used on the disk", []string{"disk"}, nil), prometheus.GaugeValue, @@ -532,7 +603,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) { // Total available space in the disk ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("disk", "storage", "available"), + prometheus.BuildFQName(diskNamespace, "storage", "available"), "Total available space left on the disk", []string{"disk"}, nil), prometheus.GaugeValue, @@ -543,7 +614,7 @@ func storageMetricsPrometheus(ch chan<- prometheus.Metric) { // Total storage space of the disk ch <- prometheus.MustNewConstMetric( prometheus.NewDesc( - prometheus.BuildFQName("disk", "storage", "total"), + prometheus.BuildFQName(diskNamespace, "storage", "total"), "Total space on the disk", []string{"disk"}, nil), prometheus.GaugeValue, diff --git a/cmd/notification-summary.go b/cmd/notification-summary.go new file mode 100644 index 000000000..dcebd676a --- /dev/null +++ b/cmd/notification-summary.go @@ -0,0 +1,54 @@ +/* + * MinIO Cloud 
Storage, (C) 2020 MinIO, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package cmd
+
+import (
+	"context"
+)
+
+// GetTotalCapacity gets the total capacity in the cluster.
+// It sums GetTotalCapacity over every entry returned by DiskHwInfo.
+func GetTotalCapacity(ctx context.Context) (capacity uint64) {
+	d := globalNotificationSys.DiskHwInfo(ctx)
+	for _, s := range d {
+		capacity += s.GetTotalCapacity()
+	}
+	return
+}
+
+// usableCapacityRatio returns data/(data+parity) for the standard storage
+// class reported in s. When both counts are zero the division would be 0/0
+// (NaN), so 1 is returned instead and raw capacity is reported unchanged.
+// NOTE(review): the fallback assumes zero counts mean "no parity overhead";
+// confirm this is the desired value when StorageInfo could not be fetched.
+func usableCapacityRatio(s StorageInfo) float64 {
+	shards := s.Backend.StandardSCData + s.Backend.StandardSCParity
+	if shards == 0 {
+		return 1
+	}
+	return float64(s.Backend.StandardSCData) / float64(shards)
+}
+
+// GetTotalUsableCapacity gets the total usable capacity in the cluster.
+// It is the raw total capacity scaled by the data share of the standard
+// storage class; see usableCapacityRatio for the zero-count case.
+func GetTotalUsableCapacity(ctx context.Context, s StorageInfo) (capacity float64) {
+	raw := GetTotalCapacity(ctx)
+	return float64(raw) * usableCapacityRatio(s)
+}
+
+// GetTotalCapacityFree gets the total capacity free in the cluster.
+// It sums GetTotalFreeCapacity over every entry returned by DiskHwInfo.
+func GetTotalCapacityFree(ctx context.Context) (capacity uint64) {
+	d := globalNotificationSys.DiskHwInfo(ctx)
+	for _, s := range d {
+		capacity += s.GetTotalFreeCapacity()
+	}
+	return
+}
+
+// GetTotalUsableCapacityFree gets the total usable capacity free in the cluster.
+// It is the raw free capacity scaled by the data share of the standard
+// storage class; see usableCapacityRatio for the zero-count case.
+func GetTotalUsableCapacityFree(ctx context.Context, s StorageInfo) (capacity float64) {
+	raw := GetTotalCapacityFree(ctx)
+	return float64(raw) * usableCapacityRatio(s)
+}
diff --git a/cmd/notification.go b/cmd/notification.go index bb8c2a317..1870bd27c 100644 --- a/cmd/notification.go +++ b/cmd/notification.go @@ -51,8 +51,8 @@ type NotificationSys struct { targetResCh chan event.TargetIDResult bucketRulesMap map[string]event.RulesMap bucketRemoteTargetRulesMap map[string]map[event.TargetID]event.RulesMap - peerClients []*peerRESTClient - allPeerClients []*peerRESTClient + peerClients []*peerRESTClient // Excludes self + allPeerClients []*peerRESTClient // Includes nil client for self } // GetARNList - returns available ARNs. @@ -1294,6 +1294,21 @@ func NewNotificationSys(endpoints EndpointServerPools) *NotificationSys { } } +// GetPeerOnlineCount gets the count of online and offline nodes. +func GetPeerOnlineCount() (nodesOnline, nodesOffline int) { + nodesOnline = 1 // Self is always online. + nodesOffline = 0 + servers := globalNotificationSys.ServerInfo() + for _, s := range servers { + if s.State == "ok" { + nodesOnline++ + continue + } + nodesOffline++ + } + return +} + type eventArgs struct { EventName event.Name BucketName string @@ -1428,3 +1443,52 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ... } return consolidatedReport } + +// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
+// It asks every non-nil peer client for its metric stream concurrently and
+// merges the streams of the peers that answered into the returned channel,
+// which is closed once every merged stream has ended or ctx is cancelled.
+// Peers whose request failed are logged and skipped.
+func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) chan Metric {
+	g := errgroup.WithNErrs(len(sys.peerClients))
+	peerChannels := make([]<-chan Metric, len(sys.peerClients))
+	for index := range sys.peerClients {
+		if sys.peerClients[index] == nil {
+			continue
+		}
+		index := index
+		g.Go(func() error {
+			var err error
+			peerChannels[index], err = sys.peerClients[index].GetPeerMetrics(ctx)
+			return err
+		}, index)
+	}
+
+	ch := make(chan Metric)
+	var wg sync.WaitGroup
+	for index, err := range g.Wait() {
+		client := sys.peerClients[index]
+		if client == nil {
+			// No request was issued for this slot: there is no host to log
+			// and no stream to forward (a nil channel would block forever).
+			continue
+		}
+		reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
+			client.host.String())
+		ctx := logger.SetReqInfo(ctx, reqInfo)
+		if err != nil {
+			logger.LogOnceIf(ctx, err, client.host.String())
+			continue
+		}
+		wg.Add(1)
+		go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
+			defer wg.Done()
+			for {
+				select {
+				case m, ok := <-peerChannel:
+					if !ok {
+						return
+					}
+					// Forward, but give up on cancellation so this goroutine
+					// cannot stay blocked on a reader that has gone away.
+					select {
+					case ch <- m:
+					case <-ctx.Done():
+						return
+					}
+				case <-ctx.Done():
+					return
+				}
+			}
+		}(ctx, peerChannels[index], &wg)
+	}
+	// Close the merged channel once every forwarder has finished.
+	go func(wg *sync.WaitGroup, ch chan Metric) {
+		wg.Wait()
+		close(ch)
+	}(&wg, ch)
+	return ch
+}
diff --git a/cmd/object-api-interface.go b/cmd/object-api-interface.go index a7c0143fe..72220d498 100644 --- a/cmd/object-api-interface.go +++ b/cmd/object-api-interface.go @@ -72,6 +72,13 @@ const ( writeLock ) +// BackendMetrics - represents bytes served from backend +type BackendMetrics struct { + bytesReceived uint64 + bytesSent uint64 + requestStats RequestStats +} + // ObjectLayer implements primitives for object API layer.
type ObjectLayer interface {
 	SetDriveCount() int // Only implemented by erasure layer
@@ -143,7 +150,7 @@ type ObjectLayer interface {
 	IsCompressionSupported() bool
 
 	// Backend related metrics
-	GetMetrics(ctx context.Context) (*Metrics, error)
+	GetMetrics(ctx context.Context) (*BackendMetrics, error)
 
 	// Returns health of the backend
 	Health(ctx context.Context, opts HealthOptions) HealthResult
diff --git a/cmd/peer-rest-client.go b/cmd/peer-rest-client.go
index 14db105f1..0cd35e107 100644
--- a/cmd/peer-rest-client.go
+++ b/cmd/peer-rest-client.go
@@ -749,7 +749,7 @@ func (client *peerRESTClient) doListen(listenCh chan interface{}, doneCh <-chan
 	dec := gob.NewDecoder(respBody)
 	for {
 		var ev event.Event
-		if err = dec.Decode(&ev); err != nil {
+		if err := dec.Decode(&ev); err != nil {
 			return
 		}
 		if len(ev.EventVersion) > 0 {
@@ -906,3 +906,24 @@ func (client *peerRESTClient) MonitorBandwidth(ctx context.Context, buckets []st
 	err = dec.Decode(&bandwidthReport)
 	return &bandwidthReport, err
 }
+
+// GetPeerMetrics calls the peer's peerRESTMethodGetPeerMetrics endpoint and
+// returns a channel on which the gob-decoded Metric values are delivered.
+// The channel is closed when decoding fails (which includes reaching the end
+// of the response) or when ctx is done; the response body is drained then.
+func (client *peerRESTClient) GetPeerMetrics(ctx context.Context) (<-chan Metric, error) {
+	respBody, err := client.callWithContext(ctx, peerRESTMethodGetPeerMetrics, nil, nil, -1)
+	if err != nil {
+		return nil, err
+	}
+	dec := gob.NewDecoder(respBody)
+	ch := make(chan Metric)
+	go func(ch chan<- Metric) {
+		// Always release the response and signal completion, whichever
+		// way the loop below ends.
+		defer func() {
+			http.DrainBody(respBody)
+			close(ch)
+		}()
+		for {
+			var metric Metric
+			if err := dec.Decode(&metric); err != nil {
+				return
+			}
+			// Do not block forever on the send if the receiver has
+			// stopped reading because ctx was cancelled.
+			select {
+			case ch <- metric:
+			case <-ctx.Done():
+				return
+			}
+		}
+	}(ch)
+	return ch, nil
+}
diff --git a/cmd/peer-rest-common.go b/cmd/peer-rest-common.go
index d9912b402..deab223c7 100644
--- a/cmd/peer-rest-common.go
+++ b/cmd/peer-rest-common.go
@@ -58,6 +58,7 @@ const (
 	peerRESTMethodGetBandwidth           = "/bandwidth"
 	peerRESTMethodGetMetacacheListing    = "/getmetacache"
 	peerRESTMethodUpdateMetacacheListing = "/updatemetacache"
+	peerRESTMethodGetPeerMetrics         = "/peermetrics"
 )
 
 const (
diff --git a/cmd/peer-rest-server.go b/cmd/peer-rest-server.go
index 2a91a97dd..d96edb49f 100644
--- 
a/cmd/peer-rest-server.go +++ b/cmd/peer-rest-server.go @@ -801,7 +801,7 @@ func (s *peerRESTServer) SignalServiceHandler(w http.ResponseWriter, r *http.Req // ListenHandler sends http trace messages back to peer rest client func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) { if !s.IsValid(w, r) { - s.writeErrorResponse(w, errors.New("Invalid request")) + s.writeErrorResponse(w, errors.New("invalid request")) return } @@ -809,7 +809,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) { var prefix string if len(values[peerRESTListenPrefix]) > 1 { - s.writeErrorResponse(w, errors.New("Invalid request")) + s.writeErrorResponse(w, errors.New("invalid request")) return } @@ -824,7 +824,7 @@ func (s *peerRESTServer) ListenHandler(w http.ResponseWriter, r *http.Request) { var suffix string if len(values[peerRESTListenSuffix]) > 1 { - s.writeErrorResponse(w, errors.New("Invalid request")) + s.writeErrorResponse(w, errors.New("invalid request")) return } @@ -1004,7 +1004,7 @@ func (s *peerRESTServer) IsValid(w http.ResponseWriter, r *http.Request) bool { // GetBandwidth gets the bandwidth for the buckets requested. func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) { if !s.IsValid(w, r) { - s.writeErrorResponse(w, errors.New("Invalid request")) + s.writeErrorResponse(w, errors.New("invalid request")) return } bucketsString := r.URL.Query().Get("buckets") @@ -1025,6 +1025,29 @@ func (s *peerRESTServer) GetBandwidth(w http.ResponseWriter, r *http.Request) { w.(http.Flusher).Flush() } +// GetPeerMetrics gets the metrics to be federated across peers. 
+func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request) {
+	if !s.IsValid(w, r) {
+		s.writeErrorResponse(w, errors.New("invalid request"))
+		// Stop here: an invalid request must not go on to receive a 200
+		// status and the metrics stream after the error response.
+		return
+	}
+	w.WriteHeader(http.StatusOK)
+	w.(http.Flusher).Flush()
+
+	// Stream every reported metric to the caller as a gob-encoded value.
+	enc := gob.NewEncoder(w)
+
+	ch := ReportMetrics(r.Context(), GetGeneratorsForPeer)
+	for m := range ch {
+		if err := enc.Encode(m); err != nil {
+			s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
+			return
+		}
+	}
+	w.(http.Flusher).Flush()
+}
+
 // registerPeerRESTHandlers - register peer rest router.
 func registerPeerRESTHandlers(router *mux.Router) {
 	server := &peerRESTServer{}
@@ -1064,4 +1087,5 @@ func registerPeerRESTHandlers(router *mux.Router) {
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetBandwidth).HandlerFunc(httpTraceHdrs(server.GetBandwidth))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetMetacacheListing).HandlerFunc(httpTraceHdrs(server.GetMetacacheListingHandler))
 	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodUpdateMetacacheListing).HandlerFunc(httpTraceHdrs(server.UpdateMetacacheListingHandler))
+	subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerMetrics).HandlerFunc(httpTraceHdrs(server.GetPeerMetrics))
 }
diff --git a/docs/metrics/README.md b/docs/metrics/README.md
index e537c9e62..d622202cb 100644
--- a/docs/metrics/README.md
+++ b/docs/metrics/README.md
@@ -13,8 +13,15 @@ Read more on how to use these endpoints in [MinIO healthcheck guide](https://git
 
 ### Prometheus Probe
 
-MinIO server exposes Prometheus compatible data on a single endpoint. By default, the endpoint is authenticated.
+MinIO allows reading metrics for the entire cluster from any single node. The cluster wide metrics can be read at
+`
/minio/v2/metrics/cluster`. -- Prometheus data available at `/minio/prometheus/metrics` +The additional node-specific metrics, which include Go and process metrics, are exposed at +`
/minio/v2/metrics/node`. To use this endpoint, setup Prometheus to scrape data from this endpoint. Read more on how to configure and use Prometheus to monitor MinIO server in [How to monitor MinIO server with Prometheus](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/README.md). + +**Deprecated metrics monitoring** + +- Prometheus data available at `/minio/prometheus/metrics` is deprecated. + diff --git a/docs/metrics/prometheus/README.md b/docs/metrics/prometheus/README.md index 9b3c505de..efc195655 100644 --- a/docs/metrics/prometheus/README.md +++ b/docs/metrics/prometheus/README.md @@ -1,8 +1,13 @@ # How to monitor MinIO server with Prometheus [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io) -[Prometheus](https://prometheus.io) is a cloud-native monitoring platform, built originally at SoundCloud. Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. The data collection happens via a pull model over HTTP/HTTPS. Targets to pull data from are discovered via service discovery or static configuration. +[Prometheus](https://prometheus.io) is a cloud-native monitoring platform. -MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/prometheus/metrics`. Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint. +Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. +The data collection happens via a pull model over HTTP/HTTPS. + +MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`. + +Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint. This document explains how to setup Prometheus and configure it to scrape data from MinIO servers. 
@@ -20,7 +25,8 @@ This document explains how to setup Prometheus and configure it to scrape data f - [List of metrics exposed by MinIO](#list-of-metrics-exposed-by-minio) ## Prerequisites -To get started with MinIO, refer [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide). Follow below steps to get started with MinIO monitoring using Prometheus. +To get started with MinIO, refer [MinIO QuickStart Document](https://docs.min.io/docs/minio-quickstart-guide). +Follow below steps to get started with MinIO monitoring using Prometheus. ### 1. Download Prometheus @@ -68,7 +74,7 @@ The command will generate the `scrape_configs` section of the prometheus.yml as scrape_configs: - job_name: minio-job bearer_token: - metrics_path: /minio/prometheus/metrics + metrics_path: /minio/v2/metrics/cluster scheme: http static_configs: - targets: ['localhost:9000'] @@ -77,16 +83,26 @@ scrape_configs: #### 3.2 Public Prometheus config If Prometheus endpoint authentication type is set to `public`. Following prometheus config is sufficient to start scraping metrics data from MinIO. - +This can be collected from any server once per collection. +##### Cluster ```yaml scrape_configs: - job_name: minio-job - metrics_path: /minio/prometheus/metrics + metrics_path: /minio/v2/metrics/cluster + scheme: http + static_configs: + - targets: ['localhost:9000'] +``` +##### Node +Optionally you can also collect per node metrics. This needs to be done on a per server instance. +```yaml +scrape_configs: +- job_name: minio-job + metrics_path: /minio/v2/metrics/node scheme: http static_configs: - targets: ['localhost:9000'] ``` - ### 4. Update `scrape_configs` section in prometheus.yml To authorize every scrape request, copy and paste the generated `scrape_configs` section in the prometheus.yml and restart the Prometheus service. @@ -103,172 +119,16 @@ Here `prometheus.yml` is the name of configuration file. You can now see MinIO m ### 6. 
Configure Grafana -After Prometheus is configured, you can use Grafana to visualize MinIO metrics. Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md). +After Prometheus is configured, you can use Grafana to visualize MinIO metrics. +Refer to the [document here to set up Grafana with MinIO Prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md). ## List of metrics exposed by MinIO -MinIO server exposes the following metrics on `/minio/prometheus/metrics` endpoint. All of these can be accessed via Prometheus dashboard. The full list of exposed metrics along with their definition is available in the demo server at https://play.min.io:9000/minio/prometheus/metrics +MinIO server exposes the following metrics on the `/minio/v2/metrics/cluster` endpoint. +All of these can be accessed via the Prometheus dashboard. +A sample list of exposed metrics along with their definitions can be fetched from the demo server with +`curl https://play.min.io:9000/minio/v2/metrics/cluster` -These are the new set of metrics which will be in effect after `RELEASE.2019-10-16*`. Some of the key changes in this update are listed below. - - Metrics are bound the respective nodes and is not cluster-wide. Each and every node in a cluster will expose its own metrics. - - Additional metrics to cover the s3 and internode traffic statistics were added. - - Metrics that records the http statistics and latencies are labeled to their respective APIs (putobject,getobject etc). - - Disk usage metrics are distributed and labeled to the respective disk paths. +### List of metrics reported -For more details, please check the `Migration guide for the new set of metrics`. - -The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node) - -> NOTES: - > 1. Instance here is one MinIO node. - > 2. 
`s3 requests` exclude internode requests. - -### Default set of information -| name | description | -|:------------|:--------------------------------| -| `go_` | all standard go runtime metrics | -| `process_` | all process level metrics | -| `promhttp_` | all prometheus scrape metrics | - -### MinIO node specific information -| name | description | -|:---------------------------|:-------------------------------------------------------------------------------| -| `minio_version_info` | Current MinIO version with its commit-id | -| `minio_disks_offline` | Total number of offline disks on current MinIO instance | -| `minio_disks_total` | Total number of disks on current MinIO instance | - -### Disk metrics are labeled by 'disk' which indentifies each disk -| name | description | -|:---------------------------|:-------------------------------------------------------------------------------| -| `disk_storage_total` | Total size of the disk | -| `disk_storage_used` | Total disk space used per disk | -| `disk_storage_available` | Total available disk space per disk | - -### S3 API metrics are labeled by 'api' which identifies different S3 API requests -| name | description | -|:---------------------------|:-------------------------------------------------------------------------------| -| `s3_requests_total` | Total number of s3 requests in current MinIO instance | -| `s3_errors_total` | Total number of errors in s3 requests in current MinIO instance | -| `s3_requests_current` | Total number of active s3 requests in current MinIO instance | -| `s3_rx_bytes_total` | Total number of s3 bytes received by current MinIO server instance | -| `s3_tx_bytes_total` | Total number of s3 bytes sent by current MinIO server instance | -| `s3_ttfb_seconds` | Histogram that holds the latency information of the requests | - -#### Internode metrics only available in a distributed setup -| name | description | 
-|:---------------------------|:-------------------------------------------------------------------------------| -| `internode_rx_bytes_total` | Total number of internode bytes received by current MinIO server instance | -| `internode_tx_bytes_total` | Total number of bytes sent to the other nodes by current MinIO server instance | - -Apart from above metrics, MinIO also exposes below mode specific metrics - -### Bucket usage specific metrics -All metrics are labeled by `bucket`, each metric is displayed per bucket. `buckets_objects_histogram` is additionally labeled by `object_size` string which is represented by any of the following values - -- *LESS_THAN_1024_B* -- *BETWEEN_1024_B_AND_1_MB* -- *BETWEEN_1_MB_AND_10_MB* -- *BETWEEN_10_MB_AND_64_MB* -- *BETWEEN_64_MB_AND_128_MB* -- *BETWEEN_128_MB_AND_512_MB* -- *GREATER_THAN_512_MB* - -Units defintions: -- 1 MB = 1024 KB -- 1 KB = 1024 B - -| name | description | -|:------------------------------------|:----------------------------------------------------| -| `bucket_usage_size` | Total size of the bucket | -| `bucket_objects_count` | Total number of objects in a bucket | -| `bucket_objects_histogram` | Total number of objects filtered by different sizes | -| `bucket_replication_pending_size` | Total capacity not replicated | -| `bucket_replication_failed_size` | Total capacity failed to replicate at least once | -| `bucket_replication_successful_size`| Total capacity successfully replicated | -| `bucket_replication_received_size` | Total capacity received as replicated objects | - -### Cache specific metrics - -MinIO Gateway instances enabled with Disk-Caching expose caching related metrics. 
- -#### Global cache metrics -| name | description | -|:---------------------|:--------------------------------------------------| -| `cache_hits_total` | Total number of cache hits | -| `cache_misses_total` | Total number of cache misses | -| `cache_data_served` | Total number of bytes served from cache | - -#### Per disk cache metrics -| name | description | -|:-----------------------|:---------------------------------------------------------------------------------| -| `cache_usage_size` | Total cache usage in bytes | -| `cache_total_capacity` | Total size of cache disk | -| `cache_usage_percent` | Total percentage cache usage | -| `cache_usage_state` | Indicates cache usage is high or low, relative to current cache 'quota' settings | - -`cache_usage_state` holds only two states - -- '1' indicates high disk usage -- '0' indicates low disk usage - -### Gateway specific metrics -MinIO Gateway instance exposes metrics related to Gateway communication with the cloud backend (S3, Azure & GCS Gateway). - -`` changes based on the gateway in use can be 's3', 'gcs' or 'azure'. Other metrics are labeled with `method` that identifies HTTP GET, HEAD, PUT and POST requests to the backend. - -| name | description | -|:----------------------------------------|:---------------------------------------------------------------------------| -| `gateway__requests` | Total number of requests made to the gateway backend | -| `gateway__bytes_sent` | Total number of bytes sent to cloud backend (in PUT & POST Requests) | -| `gateway__bytes_received` | Total number of bytes received from cloud backend (in GET & HEAD Requests) | - -Note that this is currently only support for Azure, S3 and GCS Gateway. - -### MinIO self-healing metrics - `self_heal_*` - -MinIO exposes self-healing related metrics for erasure-code deployments _only_. These metrics are _not_ available on Gateway or Single Node, Single Drive deployments. 
Note that these metrics will be exposed _only_ when there is a relevant event happening on MinIO server. - -| name | description | -|:-------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `self_heal_time_since_last_activity` | Time elapsed since last self-healing related activity | -| `self_heal_objects_scanned` | Number of objects scanned by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned | -| `self_heal_objects_healed` | Number of objects healing by self-healing thread in its current run. This will reset when a fresh self-healing run starts. This is labeled with the object type scanned | -| `self_heal_objects_heal_failed` | Number of objects for which self-healing failed in its current run. This will reset when a fresh self-healing run starts. This is labeled with disk status and its endpoint | - -## Migration guide for the new set of metrics - -This migration guide applies for older releases or any releases before `RELEASE.2019-10-23*` - -### MinIO disk level metrics - `disk_*` - -The migrations include - -- `minio_total_disks` to `minio_disks_total` -- `minio_offline_disks` to `minio_disks_offline` - -### MinIO disk level metrics - `disk_storage_*` - -These metrics have one label. - -- `disk`: Holds the disk path - -The migrations include - -- `minio_disk_storage_used_bytes` to `disk_storage_used` -- `minio_disk_storage_available_bytes` to `disk_storage_available` -- `minio_disk_storage_total_bytes` to `disk_storage_total` - -### MinIO network level metrics - -These metrics are detailed to cover the s3 and internode network statistics. 
- -The migrations include - -- `minio_network_sent_bytes_total` to `s3_tx_bytes_total` and `internode_tx_bytes_total` -- `minio_network_received_bytes_total` to `s3_rx_bytes_total` and `internode_rx_bytes_total` - -Some of the additional metrics added were - -- `s3_requests_total` -- `s3_errors_total` -- `s3_ttfb_seconds` +[The list of metrics reported can be found here](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/list.md) diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md new file mode 100644 index 000000000..9f2cc3716 --- /dev/null +++ b/docs/metrics/prometheus/list.md @@ -0,0 +1,47 @@ +# List of metrics reported cluster wide + +Each metric includes a label for the server that calculated (generated) the metric. + +These metrics can be collected from any MinIO server once per collection. + +| Name | Description | +|:-----------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------| +|`minio_bucket_objects_size_distribution` |Distribution of object sizes in the bucket, includes label for the bucket name. | +|`minio_bucket_replication_failed_bytes` |Total number of bytes failed at least once to replicate. | +|`minio_bucket_replication_pending_bytes` |Total bytes pending to replicate. | +|`minio_bucket_replication_received_bytes` |Total number of bytes replicated to this bucket from another source bucket. | +|`minio_bucket_replication_sent_bytes` |Total number of bytes replicated to the target bucket. | +|`minio_bucket_usage_object_total` |Total number of objects | +|`minio_bucket_usage_total_bytes` |Total bucket size in bytes | +|`minio_cluster_capacity_raw_free_bytes` |Total free capacity online in the cluster. | +|`minio_cluster_capacity_raw_total_bytes` |Total capacity online in the cluster. 
| +|`minio_cluster_capacity_usable_free_bytes` |Total free usable capacity online in the cluster. | +|`minio_cluster_capacity_usable_total_bytes` |Total usable capacity online in the cluster. | +|`minio_cluster_disk_offline_total` |Total disks offline. | +|`minio_cluster_disk_online_total` |Total disks online. | +|`minio_cluster_nodes_offline_total` |Total number of MinIO nodes offline. | +|`minio_cluster_nodes_online_total` |Total number of MinIO nodes online. | +|`minio_heal_objects_error_total` |Objects for which healing failed in current self healing run | +|`minio_heal_objects_heal_total` |Objects healed in current self healing run | +|`minio_heal_objects_total` |Objects scanned in current self healing run | +|`minio_heal_time_last_activity_nano_seconds` |Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity | +|`minio_inter_node_traffic_received_bytes` |Total number of bytes received from other peer nodes. | +|`minio_inter_node_traffic_sent_bytes` |Total number of bytes sent to the other peer nodes. | +|`minio_node_disk_free_bytes` |Total storage available on a disk. | +|`minio_node_disk_total_bytes` |Total storage on a disk. | +|`minio_node_disk_used_bytes` |Total storage used on a disk. | +|`minio_s3_requests_error_total` |Total number S3 requests with errors | +|`minio_s3_requests_inflight_total` |Total number of S3 requests currently in flight. | +|`minio_s3_requests_total` |Total number S3 requests | +|`minio_s3_time_ttbf_seconds_distribution` |Distribution of the time to first byte across API calls. | +|`minio_s3_traffic_received_bytes` |Total number of s3 bytes received. 
| +|`minio_s3_traffic_sent_bytes` |Total number of s3 bytes sent | +|`minio_cache_hits_total` |Total number of disk cache hits | +|`minio_cache_missed_total` |Total number of disk cache misses | +|`minio_cache_sent_bytes` |Total number of bytes served from cache | +|`minio_cache_total_bytes` |Total size of cache disk in bytes | +|`minio_cache_usage_info` |Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well | +|`minio_cache_used_bytes` |Current cache usage in bytes | +|`minio_software_commit_info` |Git commit hash for the MinIO release. | +|`minio_software_version_info` |MinIO Release tag for the server | diff --git a/go.mod b/go.mod index 1409cd1bb..6967a0ac3 100644 --- a/go.mod +++ b/go.mod @@ -65,6 +65,9 @@ require ( github.com/pierrec/lz4 v2.5.2+incompatible github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.8.0 + github.com/quasilyte/go-ruleguard v0.2.1 // indirect + github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3 // indirect + github.com/prometheus/client_model v0.2.0 github.com/rjeczalik/notify v0.9.2 github.com/rs/cors v1.7.0 github.com/secure-io/sio-go v0.3.0 diff --git a/go.sum b/go.sum index 841f1b203..9e5e6f6ae 100644 --- a/go.sum +++ b/go.sum @@ -529,6 +529,9 @@ github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+Gx github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.2.0 h1:wH4vA7pcjKuZzjF7lM8awk4fnuJO6idemZXoKnULUx4= github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= +github.com/quasilyte/go-ruleguard v0.2.1 h1:56eRm0daAyny9UhJnmtJW/UyLZQusukBAB8oT8AHKHo= +github.com/quasilyte/go-ruleguard v0.2.1/go.mod h1:hN2rVc/uS4bQhQKTio2XaSJSafJwqBUWWwtssT3cQmc= +github.com/quasilyte/go-ruleguard/dsl/fluent v0.0.0-20201222093424-5d7e62a465d3/go.mod h1:P7JlQWFT7jDcFZMtUPQbtGzzzxva3rBn6oIF+LPwFcM= github.com/rcrowley/go-metrics 
v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ= @@ -619,6 +622,7 @@ github.com/xdg/stringprep v1.0.0 h1:d9X0esnoa3dFsV0FG35rAT0RIhYFlPq7MiP+DW89La0= github.com/xdg/stringprep v1.0.0/go.mod h1:Jhud4/sHMO4oL310DaZAKk9ZaJ08SJfe+sJh0HrGL1Y= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.etcd.io/bbolt v1.3.3 h1:MUGmc65QhB3pIlaQ5bB4LwqSj6GIonVJXpZiaKNyaKk= go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= @@ -711,6 +715,7 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -782,6 +787,7 @@ golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod 
h1:b+2E5dAYhXwXZwtn
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.0.0-20191216052735-49a3e744a425/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
 golang.org/x/tools v0.0.0-20200103221440-774c71fcf114/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/tools v0.0.0-20200812195022-5ae4c3c160a0/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA=
 golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
 golang.org/x/tools v0.0.0-20210115202250-e0d201561e39 h1:BTs2GMGSMWpgtCpv1CE7vkJTv7XcHdcLLnAMu7UbgTY=
 golang.org/x/tools v0.0.0-20210115202250-e0d201561e39/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
diff --git a/pkg/madmin/health.go b/pkg/madmin/health.go
index 17ffed231..8eb4e3672 100644
--- a/pkg/madmin/health.go
+++ b/pkg/madmin/health.go
@@ -158,8 +158,8 @@ type PerfInfo struct {
 // ServerDrivesInfo - Drive info about all drives in a single MinIO node
 type ServerDrivesInfo struct {
 	Addr     string          `json:"addr"`
-	Serial   []DrivePerfInfo `json:"serial,omitempty"`
-	Parallel []DrivePerfInfo `json:"parallel,omitempty"`
+	Serial   []DrivePerfInfo `json:"serial,omitempty"`   // Drive perf info collected one drive at a time
+	Parallel []DrivePerfInfo `json:"parallel,omitempty"` // Drive perf info collected in parallel
 	Error    string          `json:"error,omitempty"`
 }
 
@@ -316,3 +316,27 @@ func (adm *AdminClient) ServerHealthInfo(ctx context.Context, healthDataTypes []
 
 	return respChan
 }
+
+// GetTotalCapacity returns the sum of the Total field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalCapacity() (capacity uint64) {
+	var sum uint64
+	for _, usage := range s.Usage {
+		sum += usage.Total
+	}
+	return sum
+}
+
+// GetTotalFreeCapacity returns the sum of the Free field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalFreeCapacity() (capacity uint64) {
+	var sum uint64
+	for _, usage := range s.Usage {
+		sum += usage.Free
+	}
+	return sum
+}
+
+// GetTotalUsedCapacity returns the sum of the Used field over every entry in
+// s.Usage.
+func (s *ServerDiskHwInfo) GetTotalUsedCapacity() (capacity uint64) {
+	var sum uint64
+	for _, usage := range s.Usage {
+		sum += usage.Used
+	}
+	return sum
+}