Mirror of https://github.com/minio/minio.git (synced 2024-12-25 06:35:56 -05:00)
move bucket centric metrics to /minio/v2/metrics/bucket handlers (#17663)
Users/customers no longer have a small number of buckets, so we must avoid overpopulating the cluster endpoint; bucket monitoring instead moves to a separate endpoint. Some of this is a breaking change for a couple of metrics, but it is imperative to improve the responsiveness of our Prometheus cluster endpoint. Bonus: added new cluster metrics for usage, objects and histograms.
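For operators, the practical effect is that bucket-level series move off the cluster scrape target onto a dedicated job. Below is a minimal sketch of such a Prometheus scrape job, modeled on the sample this change adds to the Prometheus docs further down in the diff; the bearer token and the `localhost:9000` target are placeholders for your own authenticated MinIO deployment.

```yaml
scrape_configs:
  - job_name: minio-job-bucket            # separate job for bucket-centric metrics
    bearer_token: <secret>                # placeholder: JWT used for the authenticated metrics endpoints
    metrics_path: /minio/v2/metrics/bucket
    scheme: http
    static_configs:
      - targets: ['localhost:9000']       # placeholder MinIO server address
```

If the metrics auth type is set to `public`, the `bearer_token` line can be dropped, as in the public config sample that this change also documents.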
parent 4f257bf1e6
commit 6426b74770
@ -229,7 +229,8 @@ func guessIsMetricsReq(req *http.Request) bool {
|
|||||||
return (aType == authTypeAnonymous || aType == authTypeJWT) &&
|
return (aType == authTypeAnonymous || aType == authTypeJWT) &&
|
||||||
req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
|
req.URL.Path == minioReservedBucketPath+prometheusMetricsPathLegacy ||
|
||||||
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
|
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2ClusterPath ||
|
||||||
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath
|
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2NodePath ||
|
||||||
|
req.URL.Path == minioReservedBucketPath+prometheusMetricsV2BucketPath
|
||||||
}
|
}
|
||||||
|
|
||||||
// guessIsRPCReq - returns true if the request is for an RPC endpoint.
|
// guessIsRPCReq - returns true if the request is for an RPC endpoint.
|
||||||
|
@ -127,6 +127,14 @@ func (bh *bucketHTTPStats) updateHTTPStats(bucket, api string, w *xhttp.Response
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if w != nil {
|
||||||
|
// Increment the prometheus http request response histogram with API, Bucket
|
||||||
|
bucketHTTPRequestsDuration.With(prometheus.Labels{
|
||||||
|
"api": api,
|
||||||
|
"bucket": bucket,
|
||||||
|
}).Observe(w.TimeToFirstByte.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
bh.Lock()
|
bh.Lock()
|
||||||
defer bh.Unlock()
|
defer bh.Unlock()
|
||||||
|
|
||||||
|
@ -27,6 +27,7 @@ import (
|
|||||||
const (
|
const (
|
||||||
prometheusMetricsPathLegacy = "/prometheus/metrics"
|
prometheusMetricsPathLegacy = "/prometheus/metrics"
|
||||||
prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
|
prometheusMetricsV2ClusterPath = "/v2/metrics/cluster"
|
||||||
|
prometheusMetricsV2BucketPath = "/v2/metrics/bucket"
|
||||||
prometheusMetricsV2NodePath = "/v2/metrics/node"
|
prometheusMetricsV2NodePath = "/v2/metrics/node"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -47,14 +48,13 @@ func registerMetricsRouter(router *mux.Router) {
|
|||||||
// metrics router
|
// metrics router
|
||||||
metricsRouter := router.NewRoute().PathPrefix(minioReservedBucketPath).Subrouter()
|
metricsRouter := router.NewRoute().PathPrefix(minioReservedBucketPath).Subrouter()
|
||||||
authType := strings.ToLower(env.Get(EnvPrometheusAuthType, string(prometheusJWT)))
|
authType := strings.ToLower(env.Get(EnvPrometheusAuthType, string(prometheusJWT)))
|
||||||
switch prometheusAuthType(authType) {
|
|
||||||
case prometheusPublic:
|
auth := AuthMiddleware
|
||||||
metricsRouter.Handle(prometheusMetricsPathLegacy, metricsHandler())
|
if prometheusAuthType(authType) == prometheusPublic {
|
||||||
metricsRouter.Handle(prometheusMetricsV2ClusterPath, metricsServerHandler())
|
auth = NoAuthMiddleware
|
||||||
metricsRouter.Handle(prometheusMetricsV2NodePath, metricsNodeHandler())
|
|
||||||
case prometheusJWT:
|
|
||||||
metricsRouter.Handle(prometheusMetricsPathLegacy, AuthMiddleware(metricsHandler()))
|
|
||||||
metricsRouter.Handle(prometheusMetricsV2ClusterPath, AuthMiddleware(metricsServerHandler()))
|
|
||||||
metricsRouter.Handle(prometheusMetricsV2NodePath, AuthMiddleware(metricsNodeHandler()))
|
|
||||||
}
|
}
|
||||||
|
metricsRouter.Handle(prometheusMetricsPathLegacy, auth(metricsHandler()))
|
||||||
|
metricsRouter.Handle(prometheusMetricsV2ClusterPath, auth(metricsServerHandler()))
|
||||||
|
metricsRouter.Handle(prometheusMetricsV2BucketPath, auth(metricsBucketHandler()))
|
||||||
|
metricsRouter.Handle(prometheusMetricsV2NodePath, auth(metricsNodeHandler()))
|
||||||
}
|
}
|
||||||
|
@ -40,24 +40,26 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
nodeCollector *minioNodeCollector
|
nodeCollector *minioNodeCollector
|
||||||
clusterCollector *minioClusterCollector
|
clusterCollector *minioClusterCollector
|
||||||
peerMetricsGroups []*MetricsGroup
|
bucketCollector *minioBucketCollector
|
||||||
|
peerMetricsGroups []*MetricsGroup
|
||||||
|
bucketPeerMetricsGroups []*MetricsGroup
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
clusterMetricsGroups := []*MetricsGroup{
|
clusterMetricsGroups := []*MetricsGroup{
|
||||||
getBucketUsageMetrics(),
|
|
||||||
getNodeHealthMetrics(),
|
getNodeHealthMetrics(),
|
||||||
getClusterStorageMetrics(),
|
getClusterStorageMetrics(),
|
||||||
getClusterTierMetrics(),
|
getClusterTierMetrics(),
|
||||||
|
getClusterUsageMetrics(),
|
||||||
getKMSMetrics(),
|
getKMSMetrics(),
|
||||||
}
|
}
|
||||||
|
|
||||||
peerMetricsGroups = []*MetricsGroup{
|
peerMetricsGroups = []*MetricsGroup{
|
||||||
getCacheMetrics(),
|
getCacheMetrics(),
|
||||||
getGoMetrics(),
|
getGoMetrics(),
|
||||||
getHTTPMetrics(),
|
getHTTPMetrics(false),
|
||||||
getNotificationMetrics(),
|
getNotificationMetrics(),
|
||||||
getLocalStorageMetrics(),
|
getLocalStorageMetrics(),
|
||||||
getMinioProcMetrics(),
|
getMinioProcMetrics(),
|
||||||
@ -82,7 +84,7 @@ func init() {
|
|||||||
getNodeHealthMetrics(),
|
getNodeHealthMetrics(),
|
||||||
getLocalDriveStorageMetrics(),
|
getLocalDriveStorageMetrics(),
|
||||||
getCacheMetrics(),
|
getCacheMetrics(),
|
||||||
getHTTPMetrics(),
|
getHTTPMetrics(false),
|
||||||
getNetworkMetrics(),
|
getNetworkMetrics(),
|
||||||
getMinioVersionMetrics(),
|
getMinioVersionMetrics(),
|
||||||
getS3TTFBMetric(),
|
getS3TTFBMetric(),
|
||||||
@ -90,8 +92,20 @@ func init() {
|
|||||||
getDistLockMetrics(),
|
getDistLockMetrics(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bucketMetricsGroups := []*MetricsGroup{
|
||||||
|
getBucketUsageMetrics(),
|
||||||
|
getHTTPMetrics(true),
|
||||||
|
getBucketTTFBMetric(),
|
||||||
|
}
|
||||||
|
|
||||||
|
bucketPeerMetricsGroups = []*MetricsGroup{
|
||||||
|
getHTTPMetrics(true),
|
||||||
|
getBucketTTFBMetric(),
|
||||||
|
}
|
||||||
|
|
||||||
nodeCollector = newMinioCollectorNode(nodeGroups)
|
nodeCollector = newMinioCollectorNode(nodeGroups)
|
||||||
clusterCollector = newMinioClusterCollector(allMetricsGroups)
|
clusterCollector = newMinioClusterCollector(allMetricsGroups)
|
||||||
|
bucketCollector = newMinioBucketCollector(bucketMetricsGroups)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MetricNamespace is top level grouping of metrics to create the metric name.
|
// MetricNamespace is top level grouping of metrics to create the metric name.
|
||||||
@ -121,11 +135,13 @@ const (
|
|||||||
ioSubsystem MetricSubsystem = "io"
|
ioSubsystem MetricSubsystem = "io"
|
||||||
nodesSubsystem MetricSubsystem = "nodes"
|
nodesSubsystem MetricSubsystem = "nodes"
|
||||||
objectsSubsystem MetricSubsystem = "objects"
|
objectsSubsystem MetricSubsystem = "objects"
|
||||||
|
bucketsSubsystem MetricSubsystem = "bucket"
|
||||||
processSubsystem MetricSubsystem = "process"
|
processSubsystem MetricSubsystem = "process"
|
||||||
replicationSubsystem MetricSubsystem = "replication"
|
replicationSubsystem MetricSubsystem = "replication"
|
||||||
requestsSubsystem MetricSubsystem = "requests"
|
requestsSubsystem MetricSubsystem = "requests"
|
||||||
requestsRejectedSubsystem MetricSubsystem = "requests_rejected"
|
requestsRejectedSubsystem MetricSubsystem = "requests_rejected"
|
||||||
timeSubsystem MetricSubsystem = "time"
|
timeSubsystem MetricSubsystem = "time"
|
||||||
|
ttfbSubsystem MetricSubsystem = "requests_ttfb"
|
||||||
trafficSubsystem MetricSubsystem = "traffic"
|
trafficSubsystem MetricSubsystem = "traffic"
|
||||||
softwareSubsystem MetricSubsystem = "software"
|
softwareSubsystem MetricSubsystem = "software"
|
||||||
sysCallSubsystem MetricSubsystem = "syscall"
|
sysCallSubsystem MetricSubsystem = "syscall"
|
||||||
@ -192,7 +208,7 @@ const (
|
|||||||
|
|
||||||
sizeDistribution = "size_distribution"
|
sizeDistribution = "size_distribution"
|
||||||
versionDistribution = "version_distribution"
|
versionDistribution = "version_distribution"
|
||||||
ttfbDistribution = "ttfb_seconds_distribution"
|
ttfbDistribution = "seconds_distribution"
|
||||||
|
|
||||||
lastActivityTime = "last_activity_nano_seconds"
|
lastActivityTime = "last_activity_nano_seconds"
|
||||||
startTime = "starttime_seconds"
|
startTime = "starttime_seconds"
|
||||||
@ -308,6 +324,16 @@ func (g *MetricsGroup) Get() (metrics []Metric) {
|
|||||||
return metrics
|
return metrics
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getClusterBucketsTotalMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: bucketsSubsystem,
|
||||||
|
Name: total,
|
||||||
|
Help: "Total number of buckets in the cluster",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getClusterCapacityTotalBytesMD() MetricDescription {
|
func getClusterCapacityTotalBytesMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
@ -528,6 +554,36 @@ func getBucketUsageTotalBytesMD() MetricDescription {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getClusterUsageTotalBytesMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: usageSubsystem,
|
||||||
|
Name: totalBytes,
|
||||||
|
Help: "Total cluster usage in bytes",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getClusterUsageObjectsTotalMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: usageSubsystem,
|
||||||
|
Name: objectTotal,
|
||||||
|
Help: "Total number of objects in a cluster",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getClusterUsageVersionsTotalMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: usageSubsystem,
|
||||||
|
Name: versionTotal,
|
||||||
|
Help: "Total number of versions (includes delete marker) in a cluster",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getBucketUsageObjectsTotalMD() MetricDescription {
|
func getBucketUsageObjectsTotalMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: bucketMetricNamespace,
|
Namespace: bucketMetricNamespace,
|
||||||
@ -543,7 +599,7 @@ func getBucketUsageVersionsTotalMD() MetricDescription {
|
|||||||
Namespace: bucketMetricNamespace,
|
Namespace: bucketMetricNamespace,
|
||||||
Subsystem: usageSubsystem,
|
Subsystem: usageSubsystem,
|
||||||
Name: versionTotal,
|
Name: versionTotal,
|
||||||
Help: "Total number of versions",
|
Help: "Total number of versions (includes delete marker)",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -598,6 +654,26 @@ func getBucketRepFailedOperationsMD() MetricDescription {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getClusterObjectDistributionMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: objectsSubsystem,
|
||||||
|
Name: sizeDistribution,
|
||||||
|
Help: "Distribution of object sizes across a cluster",
|
||||||
|
Type: histogramMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getClusterObjectVersionsMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: objectsSubsystem,
|
||||||
|
Name: versionDistribution,
|
||||||
|
Help: "Distribution of object sizes across a cluster",
|
||||||
|
Type: histogramMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getBucketObjectDistributionMD() MetricDescription {
|
func getBucketObjectDistributionMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: bucketMetricNamespace,
|
Namespace: bucketMetricNamespace,
|
||||||
@ -961,9 +1037,19 @@ func getMinIOCommitMD() MetricDescription {
|
|||||||
func getS3TTFBDistributionMD() MetricDescription {
|
func getS3TTFBDistributionMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: s3MetricNamespace,
|
Namespace: s3MetricNamespace,
|
||||||
Subsystem: timeSubsystem,
|
Subsystem: ttfbSubsystem,
|
||||||
Name: ttfbDistribution,
|
Name: ttfbDistribution,
|
||||||
Help: "Distribution of the time to first byte across API calls",
|
Help: "Distribution of time to first byte across API calls",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func getBucketTTFBDistributionMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: bucketMetricNamespace,
|
||||||
|
Subsystem: ttfbSubsystem,
|
||||||
|
Name: ttfbDistribution,
|
||||||
|
Help: "Distribution of time to first byte across API calls per bucket",
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1234,6 +1320,51 @@ func getGoMetrics() *MetricsGroup {
|
|||||||
return mg
|
return mg
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getBucketTTFBMetric() *MetricsGroup {
|
||||||
|
mg := &MetricsGroup{
|
||||||
|
cacheInterval: 10 * time.Second,
|
||||||
|
}
|
||||||
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
||||||
|
// Read prometheus metric on this channel
|
||||||
|
ch := make(chan prometheus.Metric)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(1)
|
||||||
|
|
||||||
|
// Read prometheus histogram data and convert it to internal metric data
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
for promMetric := range ch {
|
||||||
|
dtoMetric := &dto.Metric{}
|
||||||
|
err := promMetric.Write(dtoMetric)
|
||||||
|
if err != nil {
|
||||||
|
logger.LogIf(GlobalContext, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
h := dtoMetric.GetHistogram()
|
||||||
|
for _, b := range h.Bucket {
|
||||||
|
labels := make(map[string]string)
|
||||||
|
for _, lp := range dtoMetric.GetLabel() {
|
||||||
|
labels[*lp.Name] = *lp.Value
|
||||||
|
}
|
||||||
|
labels["le"] = fmt.Sprintf("%.3f", *b.UpperBound)
|
||||||
|
metric := Metric{
|
||||||
|
Description: getBucketTTFBDistributionMD(),
|
||||||
|
VariableLabels: labels,
|
||||||
|
Value: float64(b.GetCumulativeCount()),
|
||||||
|
}
|
||||||
|
metrics = append(metrics, metric)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
bucketHTTPRequestsDuration.Collect(ch)
|
||||||
|
close(ch)
|
||||||
|
wg.Wait()
|
||||||
|
return
|
||||||
|
})
|
||||||
|
return mg
|
||||||
|
}
|
||||||
|
|
||||||
func getS3TTFBMetric() *MetricsGroup {
|
func getS3TTFBMetric() *MetricsGroup {
|
||||||
mg := &MetricsGroup{
|
mg := &MetricsGroup{
|
||||||
cacheInterval: 10 * time.Second,
|
cacheInterval: 10 * time.Second,
|
||||||
@ -1912,84 +2043,87 @@ func getNotificationMetrics() *MetricsGroup {
|
|||||||
return mg
|
return mg
|
||||||
}
|
}
|
||||||
|
|
||||||
func getHTTPMetrics() *MetricsGroup {
|
func getHTTPMetrics(bucketOnly bool) *MetricsGroup {
|
||||||
mg := &MetricsGroup{
|
mg := &MetricsGroup{
|
||||||
cacheInterval: 10 * time.Second,
|
cacheInterval: 10 * time.Second,
|
||||||
}
|
}
|
||||||
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
||||||
httpStats := globalHTTPStats.toServerHTTPStats()
|
if !bucketOnly {
|
||||||
metrics = make([]Metric, 0, 3+
|
httpStats := globalHTTPStats.toServerHTTPStats()
|
||||||
len(httpStats.CurrentS3Requests.APIStats)+
|
metrics = make([]Metric, 0, 3+
|
||||||
len(httpStats.TotalS3Requests.APIStats)+
|
len(httpStats.CurrentS3Requests.APIStats)+
|
||||||
len(httpStats.TotalS3Errors.APIStats)+
|
len(httpStats.TotalS3Requests.APIStats)+
|
||||||
len(httpStats.TotalS35xxErrors.APIStats)+
|
len(httpStats.TotalS3Errors.APIStats)+
|
||||||
len(httpStats.TotalS34xxErrors.APIStats))
|
len(httpStats.TotalS35xxErrors.APIStats)+
|
||||||
metrics = append(metrics, Metric{
|
len(httpStats.TotalS34xxErrors.APIStats))
|
||||||
Description: getS3RejectedAuthRequestsTotalMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.TotalS3RejectedAuth),
|
Description: getS3RejectedAuthRequestsTotalMD(),
|
||||||
})
|
Value: float64(httpStats.TotalS3RejectedAuth),
|
||||||
metrics = append(metrics, Metric{
|
})
|
||||||
Description: getS3RejectedTimestampRequestsTotalMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.TotalS3RejectedTime),
|
Description: getS3RejectedTimestampRequestsTotalMD(),
|
||||||
})
|
Value: float64(httpStats.TotalS3RejectedTime),
|
||||||
metrics = append(metrics, Metric{
|
})
|
||||||
Description: getS3RejectedHeaderRequestsTotalMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.TotalS3RejectedHeader),
|
Description: getS3RejectedHeaderRequestsTotalMD(),
|
||||||
})
|
Value: float64(httpStats.TotalS3RejectedHeader),
|
||||||
metrics = append(metrics, Metric{
|
})
|
||||||
Description: getS3RejectedInvalidRequestsTotalMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.TotalS3RejectedInvalid),
|
Description: getS3RejectedInvalidRequestsTotalMD(),
|
||||||
})
|
Value: float64(httpStats.TotalS3RejectedInvalid),
|
||||||
metrics = append(metrics, Metric{
|
})
|
||||||
Description: getS3RequestsInQueueMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.S3RequestsInQueue),
|
Description: getS3RequestsInQueueMD(),
|
||||||
})
|
Value: float64(httpStats.S3RequestsInQueue),
|
||||||
metrics = append(metrics, Metric{
|
})
|
||||||
Description: getIncomingS3RequestsMD(),
|
metrics = append(metrics, Metric{
|
||||||
Value: float64(httpStats.S3RequestsIncoming),
|
Description: getIncomingS3RequestsMD(),
|
||||||
})
|
Value: float64(httpStats.S3RequestsIncoming),
|
||||||
|
})
|
||||||
|
|
||||||
for api, value := range httpStats.CurrentS3Requests.APIStats {
|
for api, value := range httpStats.CurrentS3Requests.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RequestsInFlightMD(),
|
Description: getS3RequestsInFlightMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
for api, value := range httpStats.TotalS3Requests.APIStats {
|
for api, value := range httpStats.TotalS3Requests.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RequestsTotalMD(),
|
Description: getS3RequestsTotalMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
for api, value := range httpStats.TotalS3Errors.APIStats {
|
for api, value := range httpStats.TotalS3Errors.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RequestsErrorsMD(),
|
Description: getS3RequestsErrorsMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
for api, value := range httpStats.TotalS35xxErrors.APIStats {
|
for api, value := range httpStats.TotalS35xxErrors.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3Requests5xxErrorsMD(),
|
Description: getS3Requests5xxErrorsMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
for api, value := range httpStats.TotalS34xxErrors.APIStats {
|
for api, value := range httpStats.TotalS34xxErrors.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3Requests4xxErrorsMD(),
|
Description: getS3Requests4xxErrorsMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getS3RequestsCanceledMD(),
|
Description: getS3RequestsCanceledMD(),
|
||||||
Value: float64(value),
|
Value: float64(value),
|
||||||
VariableLabels: map[string]string{"api": api},
|
VariableLabels: map[string]string{"api": api},
|
||||||
})
|
})
|
||||||
|
}
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for bucket, inOut := range globalBucketConnStats.getS3InOutBytes() {
|
for bucket, inOut := range globalBucketConnStats.getS3InOutBytes() {
|
||||||
@ -2100,6 +2234,105 @@ func getNetworkMetrics() *MetricsGroup {
|
|||||||
return mg
|
return mg
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getClusterUsageMetrics() *MetricsGroup {
|
||||||
|
mg := &MetricsGroup{
|
||||||
|
cacheInterval: 1 * time.Minute,
|
||||||
|
}
|
||||||
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
||||||
|
objLayer := newObjectLayerFn()
|
||||||
|
// Service not initialized yet
|
||||||
|
if objLayer == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics = make([]Metric, 0, 50)
|
||||||
|
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer)
|
||||||
|
if err != nil {
|
||||||
|
logger.LogIf(ctx, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// data usage has not captured any data yet.
|
||||||
|
if dataUsageInfo.LastUpdate.IsZero() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getUsageLastScanActivityMD(),
|
||||||
|
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
|
||||||
|
})
|
||||||
|
|
||||||
|
var (
|
||||||
|
clusterSize uint64
|
||||||
|
clusterBuckets uint64
|
||||||
|
clusterObjectsCount uint64
|
||||||
|
clusterVersionsCount uint64
|
||||||
|
)
|
||||||
|
|
||||||
|
clusterObjectSizesHistogram := map[string]uint64{}
|
||||||
|
clusterVersionsHistogram := map[string]uint64{}
|
||||||
|
for _, usage := range dataUsageInfo.BucketsUsage {
|
||||||
|
clusterBuckets++
|
||||||
|
clusterSize += usage.Size
|
||||||
|
clusterObjectsCount += usage.ObjectsCount
|
||||||
|
clusterVersionsCount += usage.VersionsCount
|
||||||
|
for k, v := range usage.ObjectSizesHistogram {
|
||||||
|
v1, ok := clusterObjectSizesHistogram[k]
|
||||||
|
if !ok {
|
||||||
|
clusterObjectSizesHistogram[k] = v
|
||||||
|
} else {
|
||||||
|
v1 += v
|
||||||
|
clusterObjectSizesHistogram[k] = v1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for k, v := range usage.ObjectVersionsHistogram {
|
||||||
|
v1, ok := clusterVersionsHistogram[k]
|
||||||
|
if !ok {
|
||||||
|
clusterVersionsHistogram[k] = v
|
||||||
|
} else {
|
||||||
|
v1 += v
|
||||||
|
clusterVersionsHistogram[k] = v1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterUsageTotalBytesMD(),
|
||||||
|
Value: float64(clusterSize),
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterUsageObjectsTotalMD(),
|
||||||
|
Value: float64(clusterObjectsCount),
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterUsageVersionsTotalMD(),
|
||||||
|
Value: float64(clusterVersionsCount),
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterObjectDistributionMD(),
|
||||||
|
Histogram: clusterObjectSizesHistogram,
|
||||||
|
HistogramBucketLabel: "range",
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterObjectVersionsMD(),
|
||||||
|
Histogram: clusterVersionsHistogram,
|
||||||
|
HistogramBucketLabel: "range",
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics = append(metrics, Metric{
|
||||||
|
Description: getClusterBucketsTotalMD(),
|
||||||
|
Value: float64(clusterBuckets),
|
||||||
|
})
|
||||||
|
|
||||||
|
return
|
||||||
|
})
|
||||||
|
return mg
|
||||||
|
}
|
||||||
|
|
||||||
func getBucketUsageMetrics() *MetricsGroup {
|
func getBucketUsageMetrics() *MetricsGroup {
|
||||||
mg := &MetricsGroup{
|
mg := &MetricsGroup{
|
||||||
cacheInterval: 1 * time.Minute,
|
cacheInterval: 1 * time.Minute,
|
||||||
@ -2199,6 +2432,7 @@ func getBucketUsageMetrics() *MetricsGroup {
|
|||||||
HistogramBucketLabel: "range",
|
HistogramBucketLabel: "range",
|
||||||
VariableLabels: map[string]string{"bucket": bucket},
|
VariableLabels: map[string]string{"bucket": bucket},
|
||||||
})
|
})
|
||||||
|
|
||||||
metrics = append(metrics, Metric{
|
metrics = append(metrics, Metric{
|
||||||
Description: getBucketObjectVersionsMD(),
|
Description: getBucketObjectVersionsMD(),
|
||||||
Histogram: usage.ObjectVersionsHistogram,
|
Histogram: usage.ObjectVersionsHistogram,
|
||||||
@ -2598,6 +2832,77 @@ func getKMSMetrics() *MetricsGroup {
|
|||||||
return mg
|
return mg
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type minioBucketCollector struct {
|
||||||
|
metricsGroups []*MetricsGroup
|
||||||
|
desc *prometheus.Desc
|
||||||
|
}
|
||||||
|
|
||||||
|
func newMinioBucketCollector(metricsGroups []*MetricsGroup) *minioBucketCollector {
|
||||||
|
return &minioBucketCollector{
|
||||||
|
metricsGroups: metricsGroups,
|
||||||
|
desc: prometheus.NewDesc("minio_bucket_stats", "Statistics exposed by MinIO server cluster wide per bucket", nil, nil),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Describe sends the super-set of all possible descriptors of metrics
|
||||||
|
func (c *minioBucketCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||||
|
ch <- c.desc
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect is called by the Prometheus registry when collecting metrics.
|
||||||
|
func (c *minioBucketCollector) Collect(out chan<- prometheus.Metric) {
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
publish := func(in <-chan Metric) {
|
||||||
|
defer wg.Done()
|
||||||
|
for metric := range in {
|
||||||
|
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
|
||||||
|
if metric.Description.Type == histogramMetric {
|
||||||
|
if metric.Histogram == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for k, v := range metric.Histogram {
|
||||||
|
out <- prometheus.MustNewConstMetric(
|
||||||
|
prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
||||||
|
string(metric.Description.Subsystem),
|
||||||
|
string(metric.Description.Name)),
|
||||||
|
metric.Description.Help,
|
||||||
|
append(labels, metric.HistogramBucketLabel),
|
||||||
|
metric.StaticLabels,
|
||||||
|
),
|
||||||
|
prometheus.GaugeValue,
|
||||||
|
float64(v),
|
||||||
|
append(values, k)...)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
metricType := prometheus.GaugeValue
|
||||||
|
if metric.Description.Type == counterMetric {
|
||||||
|
metricType = prometheus.CounterValue
|
||||||
|
}
|
||||||
|
toPost := prometheus.MustNewConstMetric(
|
||||||
|
prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
||||||
|
string(metric.Description.Subsystem),
|
||||||
|
string(metric.Description.Name)),
|
||||||
|
metric.Description.Help,
|
||||||
|
labels,
|
||||||
|
metric.StaticLabels,
|
||||||
|
),
|
||||||
|
metricType,
|
||||||
|
metric.Value,
|
||||||
|
values...)
|
||||||
|
out <- toPost
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call peer api to fetch metrics
|
||||||
|
wg.Add(2)
|
||||||
|
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
|
||||||
|
go publish(globalNotificationSys.GetBucketMetrics(GlobalContext))
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
type minioClusterCollector struct {
|
type minioClusterCollector struct {
|
||||||
metricsGroups []*MetricsGroup
|
metricsGroups []*MetricsGroup
|
||||||
desc *prometheus.Desc
|
desc *prometheus.Desc
|
||||||
@ -2791,6 +3096,49 @@ func newMinioCollectorNode(metricsGroups []*MetricsGroup) *minioNodeCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func metricsBucketHandler() http.Handler {
|
||||||
|
registry := prometheus.NewRegistry()
|
||||||
|
|
||||||
|
// Report all other metrics
|
||||||
|
logger.CriticalIf(GlobalContext, registry.Register(bucketCollector))
|
||||||
|
|
||||||
|
// DefaultGatherers include golang metrics and process metrics.
|
||||||
|
gatherers := prometheus.Gatherers{
|
||||||
|
registry,
|
||||||
|
}
|
||||||
|
|
||||||
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
||||||
|
if ok {
|
||||||
|
tc.FuncName = "handler.MetricsBucket"
|
||||||
|
tc.ResponseRecorder.LogErrBody = true
|
||||||
|
}
|
||||||
|
|
||||||
|
mfs, err := gatherers.Gather()
|
||||||
|
if err != nil {
|
||||||
|
if len(mfs) == 0 {
|
||||||
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
contentType := expfmt.Negotiate(r.Header)
|
||||||
|
w.Header().Set("Content-Type", string(contentType))
|
||||||
|
|
||||||
|
enc := expfmt.NewEncoder(w, contentType)
|
||||||
|
for _, mf := range mfs {
|
||||||
|
if err := enc.Encode(mf); err != nil {
|
||||||
|
// client may disconnect for any reasons
|
||||||
|
// we do not have to log this.
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if closer, ok := enc.(expfmt.Closer); ok {
|
||||||
|
closer.Close()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func metricsServerHandler() http.Handler {
|
func metricsServerHandler() http.Handler {
|
||||||
registry := prometheus.NewRegistry()
|
registry := prometheus.NewRegistry()
|
||||||
|
|
||||||
|
@ -39,6 +39,14 @@ var (
|
|||||||
},
|
},
|
||||||
[]string{"api"},
|
[]string{"api"},
|
||||||
)
|
)
|
||||||
|
bucketHTTPRequestsDuration = prometheus.NewHistogramVec(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Name: "s3_ttfb_seconds",
|
||||||
|
Help: "Time taken by requests served by current MinIO server instance per bucket",
|
||||||
|
Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10},
|
||||||
|
},
|
||||||
|
[]string{"api", "bucket"},
|
||||||
|
)
|
||||||
minioVersionInfo = prometheus.NewGaugeVec(
|
minioVersionInfo = prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: "minio",
|
Namespace: "minio",
|
||||||
@ -614,6 +622,11 @@ func metricsHandler() http.Handler {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NoAuthMiddleware no auth middle ware.
|
||||||
|
func NoAuthMiddleware(h http.Handler) http.Handler {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
// AuthMiddleware checks if the bearer token is valid and authorized.
|
// AuthMiddleware checks if the bearer token is valid and authorized.
|
||||||
func AuthMiddleware(h http.Handler) http.Handler {
|
func AuthMiddleware(h http.Handler) http.Handler {
|
||||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Copyright (c) 2015-2021 MinIO, Inc.
|
// Copyright (c) 2015-2023 MinIO, Inc.
|
||||||
//
|
//
|
||||||
// This file is part of MinIO Object Storage stack
|
// This file is part of MinIO Object Storage stack
|
||||||
//
|
//
|
||||||
@ -121,11 +121,11 @@ func (g *NotificationGroup) Go(ctx context.Context, f func() error, index int, a
|
|||||||
func (sys *NotificationSys) DeletePolicy(policyName string) []NotificationPeerErr {
|
func (sys *NotificationSys) DeletePolicy(policyName string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.DeletePolicy(policyName)
|
return client.DeletePolicy(policyName)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -136,11 +136,11 @@ func (sys *NotificationSys) DeletePolicy(policyName string) []NotificationPeerEr
|
|||||||
func (sys *NotificationSys) LoadPolicy(policyName string) []NotificationPeerErr {
|
func (sys *NotificationSys) LoadPolicy(policyName string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.LoadPolicy(policyName)
|
return client.LoadPolicy(policyName)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -151,11 +151,11 @@ func (sys *NotificationSys) LoadPolicy(policyName string) []NotificationPeerErr
|
|||||||
func (sys *NotificationSys) LoadPolicyMapping(userOrGroup string, userType IAMUserType, isGroup bool) []NotificationPeerErr {
|
func (sys *NotificationSys) LoadPolicyMapping(userOrGroup string, userType IAMUserType, isGroup bool) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.LoadPolicyMapping(userOrGroup, userType, isGroup)
|
return client.LoadPolicyMapping(userOrGroup, userType, isGroup)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -166,11 +166,11 @@ func (sys *NotificationSys) LoadPolicyMapping(userOrGroup string, userType IAMUs
|
|||||||
func (sys *NotificationSys) DeleteUser(accessKey string) []NotificationPeerErr {
|
func (sys *NotificationSys) DeleteUser(accessKey string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.DeleteUser(accessKey)
|
return client.DeleteUser(accessKey)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -181,11 +181,11 @@ func (sys *NotificationSys) DeleteUser(accessKey string) []NotificationPeerErr {
|
|||||||
func (sys *NotificationSys) LoadUser(accessKey string, temp bool) []NotificationPeerErr {
|
func (sys *NotificationSys) LoadUser(accessKey string, temp bool) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.LoadUser(accessKey, temp)
|
return client.LoadUser(accessKey, temp)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -196,11 +196,13 @@ func (sys *NotificationSys) LoadUser(accessKey string, temp bool) []Notification
|
|||||||
func (sys *NotificationSys) LoadGroup(group string) []NotificationPeerErr {
|
func (sys *NotificationSys) LoadGroup(group string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error { return client.LoadGroup(group) }, idx, *client.host)
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
|
return client.LoadGroup(group)
|
||||||
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
return ng.Wait()
|
return ng.Wait()
|
||||||
}
|
}
|
||||||
@ -209,11 +211,11 @@ func (sys *NotificationSys) LoadGroup(group string) []NotificationPeerErr {
|
|||||||
func (sys *NotificationSys) DeleteServiceAccount(accessKey string) []NotificationPeerErr {
|
func (sys *NotificationSys) DeleteServiceAccount(accessKey string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.DeleteServiceAccount(accessKey)
|
return client.DeleteServiceAccount(accessKey)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -224,11 +226,11 @@ func (sys *NotificationSys) DeleteServiceAccount(accessKey string) []Notificatio
|
|||||||
func (sys *NotificationSys) LoadServiceAccount(accessKey string) []NotificationPeerErr {
|
func (sys *NotificationSys) LoadServiceAccount(accessKey string) []NotificationPeerErr {
|
||||||
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
ng := WithNPeers(len(sys.peerClients)).WithRetries(1)
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
return client.LoadServiceAccount(accessKey)
|
return client.LoadServiceAccount(accessKey)
|
||||||
}, idx, *client.host)
|
}, idx, *client.host)
|
||||||
}
|
}
|
||||||
@ -240,12 +242,12 @@ func (sys *NotificationSys) BackgroundHealStatus() ([]madmin.BgHealState, []Noti
|
|||||||
ng := WithNPeers(len(sys.peerClients))
|
ng := WithNPeers(len(sys.peerClients))
|
||||||
states := make([]madmin.BgHealState, len(sys.peerClients))
|
states := make([]madmin.BgHealState, len(sys.peerClients))
|
||||||
for idx, client := range sys.peerClients {
|
for idx, client := range sys.peerClients {
|
||||||
if client == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
idx := idx
|
idx := idx
|
||||||
client := client
|
client := client
|
||||||
ng.Go(GlobalContext, func() error {
|
ng.Go(GlobalContext, func() error {
|
||||||
|
if client == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
st, err := client.BackgroundHealStatus()
|
st, err := client.BackgroundHealStatus()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -1101,6 +1103,65 @@ func (sys *NotificationSys) GetBandwidthReports(ctx context.Context, buckets ...
|
|||||||
return consolidatedReport
|
return consolidatedReport
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetBucketMetrics - gets the cluster level bucket metrics from all nodes excluding self.
|
||||||
|
func (sys *NotificationSys) GetBucketMetrics(ctx context.Context) <-chan Metric {
|
||||||
|
if sys == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
g := errgroup.WithNErrs(len(sys.peerClients))
|
||||||
|
peerChannels := make([]<-chan Metric, len(sys.peerClients))
|
||||||
|
for index := range sys.peerClients {
|
||||||
|
index := index
|
||||||
|
g.Go(func() error {
|
||||||
|
if sys.peerClients[index] == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
peerChannels[index], err = sys.peerClients[index].GetPeerBucketMetrics(ctx)
|
||||||
|
return err
|
||||||
|
}, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
ch := make(chan Metric)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
for index, err := range g.Wait() {
|
||||||
|
if err != nil {
|
||||||
|
if sys.peerClients[index] != nil {
|
||||||
|
reqInfo := (&logger.ReqInfo{}).AppendTags("peerAddress",
|
||||||
|
sys.peerClients[index].host.String())
|
||||||
|
logger.LogOnceIf(logger.SetReqInfo(ctx, reqInfo), err, sys.peerClients[index].host.String())
|
||||||
|
} else {
|
||||||
|
logger.LogOnceIf(ctx, err, "peer-offline")
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
wg.Add(1)
|
||||||
|
go func(ctx context.Context, peerChannel <-chan Metric, wg *sync.WaitGroup) {
|
||||||
|
defer wg.Done()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case m, ok := <-peerChannel:
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case ch <- m:
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}(ctx, peerChannels[index], &wg)
|
||||||
|
}
|
||||||
|
go func(wg *sync.WaitGroup, ch chan Metric) {
|
||||||
|
wg.Wait()
|
||||||
|
close(ch)
|
||||||
|
}(&wg, ch)
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
|
// GetClusterMetrics - gets the cluster metrics from all nodes excluding self.
|
||||||
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
|
func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric {
|
||||||
if sys == nil {
|
if sys == nil {
|
||||||
@ -1109,11 +1170,11 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric
|
|||||||
g := errgroup.WithNErrs(len(sys.peerClients))
|
g := errgroup.WithNErrs(len(sys.peerClients))
|
||||||
peerChannels := make([]<-chan Metric, len(sys.peerClients))
|
peerChannels := make([]<-chan Metric, len(sys.peerClients))
|
||||||
for index := range sys.peerClients {
|
for index := range sys.peerClients {
|
||||||
if sys.peerClients[index] == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
index := index
|
index := index
|
||||||
g.Go(func() error {
|
g.Go(func() error {
|
||||||
|
if sys.peerClients[index] == nil {
|
||||||
|
return errPeerNotReachable
|
||||||
|
}
|
||||||
var err error
|
var err error
|
||||||
peerChannels[index], err = sys.peerClients[index].GetPeerMetrics(ctx)
|
peerChannels[index], err = sys.peerClients[index].GetPeerMetrics(ctx)
|
||||||
return err
|
return err
|
||||||
@ -1142,7 +1203,11 @@ func (sys *NotificationSys) GetClusterMetrics(ctx context.Context) <-chan Metric
|
|||||||
if !ok {
|
if !ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
ch <- m
|
select {
|
||||||
|
case ch <- m:
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -839,6 +839,33 @@ func (client *peerRESTClient) GetPeerMetrics(ctx context.Context) (<-chan Metric
|
|||||||
return ch, nil
|
return ch, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (client *peerRESTClient) GetPeerBucketMetrics(ctx context.Context) (<-chan Metric, error) {
|
||||||
|
respBody, err := client.callWithContext(ctx, peerRESTMethodGetPeerBucketMetrics, nil, nil, -1)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
dec := gob.NewDecoder(respBody)
|
||||||
|
ch := make(chan Metric)
|
||||||
|
go func(ch chan<- Metric) {
|
||||||
|
defer func() {
|
||||||
|
xhttp.DrainBody(respBody)
|
||||||
|
close(ch)
|
||||||
|
}()
|
||||||
|
for {
|
||||||
|
var metric Metric
|
||||||
|
if err := dec.Decode(&metric); err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case ch <- metric:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}(ch)
|
||||||
|
return ch, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (client *peerRESTClient) SpeedTest(ctx context.Context, opts speedTestOpts) (SpeedTestResult, error) {
|
func (client *peerRESTClient) SpeedTest(ctx context.Context, opts speedTestOpts) (SpeedTestResult, error) {
|
||||||
values := make(url.Values)
|
values := make(url.Values)
|
||||||
values.Set(peerRESTSize, strconv.Itoa(opts.objectSize))
|
values.Set(peerRESTSize, strconv.Itoa(opts.objectSize))
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
const (
|
const (
|
||||||
peerRESTVersion = "v31" // Add replication MRF
|
peerRESTVersion = "v32" // Add bucket peer metrics
|
||||||
|
|
||||||
peerRESTVersionPrefix = SlashSeparator + peerRESTVersion
|
peerRESTVersionPrefix = SlashSeparator + peerRESTVersion
|
||||||
peerRESTPrefix = minioReservedBucketPath + "/peer"
|
peerRESTPrefix = minioReservedBucketPath + "/peer"
|
||||||
@ -65,6 +65,7 @@ const (
|
|||||||
peerRESTMethodGetMetacacheListing = "/getmetacache"
|
peerRESTMethodGetMetacacheListing = "/getmetacache"
|
||||||
peerRESTMethodUpdateMetacacheListing = "/updatemetacache"
|
peerRESTMethodUpdateMetacacheListing = "/updatemetacache"
|
||||||
peerRESTMethodGetPeerMetrics = "/peermetrics"
|
peerRESTMethodGetPeerMetrics = "/peermetrics"
|
||||||
|
peerRESTMethodGetPeerBucketMetrics = "/peerbucketmetrics"
|
||||||
peerRESTMethodLoadTransitionTierConfig = "/loadtransitiontierconfig"
|
peerRESTMethodLoadTransitionTierConfig = "/loadtransitiontierconfig"
|
||||||
peerRESTMethodSpeedTest = "/speedtest"
|
peerRESTMethodSpeedTest = "/speedtest"
|
||||||
peerRESTMethodDriveSpeedTest = "/drivespeedtest"
|
peerRESTMethodDriveSpeedTest = "/drivespeedtest"
|
||||||
|
@ -1207,6 +1207,23 @@ func (s *peerRESTServer) GetPeerMetrics(w http.ResponseWriter, r *http.Request)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetPeerBucketMetrics gets the metrics to be federated across peers.
|
||||||
|
func (s *peerRESTServer) GetPeerBucketMetrics(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if !s.IsValid(w, r) {
|
||||||
|
s.writeErrorResponse(w, errors.New("invalid request"))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
enc := gob.NewEncoder(w)
|
||||||
|
|
||||||
|
for m := range ReportMetrics(r.Context(), bucketPeerMetricsGroups) {
|
||||||
|
if err := enc.Encode(m); err != nil {
|
||||||
|
s.writeErrorResponse(w, errors.New("Encoding metric failed: "+err.Error()))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *peerRESTServer) SpeedTestHandler(w http.ResponseWriter, r *http.Request) {
|
func (s *peerRESTServer) SpeedTestHandler(w http.ResponseWriter, r *http.Request) {
|
||||||
if !s.IsValid(w, r) {
|
if !s.IsValid(w, r) {
|
||||||
s.writeErrorResponse(w, errors.New("invalid request"))
|
s.writeErrorResponse(w, errors.New("invalid request"))
|
||||||
@ -1431,6 +1448,7 @@ func registerPeerRESTHandlers(router *mux.Router) {
|
|||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetMetacacheListing).HandlerFunc(h(server.GetMetacacheListingHandler))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetMetacacheListing).HandlerFunc(h(server.GetMetacacheListingHandler))
|
||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodUpdateMetacacheListing).HandlerFunc(h(server.UpdateMetacacheListingHandler))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodUpdateMetacacheListing).HandlerFunc(h(server.UpdateMetacacheListingHandler))
|
||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerMetrics).HandlerFunc(h(server.GetPeerMetrics))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerMetrics).HandlerFunc(h(server.GetPeerMetrics))
|
||||||
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodGetPeerBucketMetrics).HandlerFunc(h(server.GetPeerBucketMetrics))
|
||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodLoadTransitionTierConfig).HandlerFunc(h(server.LoadTransitionTierConfigHandler))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodLoadTransitionTierConfig).HandlerFunc(h(server.LoadTransitionTierConfigHandler))
|
||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodSpeedTest).HandlerFunc(h(server.SpeedTestHandler))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodSpeedTest).HandlerFunc(h(server.SpeedTestHandler))
|
||||||
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodDriveSpeedTest).HandlerFunc(h(server.DriveSpeedTestHandler))
|
subrouter.Methods(http.MethodPost).Path(peerRESTVersionPrefix + peerRESTMethodDriveSpeedTest).HandlerFunc(h(server.DriveSpeedTestHandler))
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
# How to monitor MinIO server with Prometheus [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io)
|
# How to monitor MinIO server with Prometheus? [![Slack](https://slack.min.io/slack?type=svg)](https://slack.min.io)
|
||||||
|
|
||||||
[Prometheus](https://prometheus.io) is a cloud-native monitoring platform.
|
[Prometheus](https://prometheus.io) is a cloud-native monitoring platform. Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. The data collection happens via a pull model over HTTP/HTTPS. Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from following endpoints.
|
||||||
|
|
||||||
Prometheus offers a multi-dimensional data model with time series data identified by metric name and key/value pairs. The data collection happens via a pull model over HTTP/HTTPS.
|
- MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`.
|
||||||
|
- MinIO exports Prometheus compatible data by default which is bucket centric as an authorized endpoint at `/minio/v2/metrics/bucket`.
|
||||||
|
|
||||||
MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`. Users looking to monitor their MinIO instances can point Prometheus configuration to scrape data from this endpoint. This document explains how to setup Prometheus and configure it to scrape data from MinIO servers.
|
This document explains how to setup Prometheus and configure it to scrape data from MinIO servers.
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
@ -52,6 +53,8 @@ The Prometheus endpoint in MinIO requires authentication by default. Prometheus
|
|||||||
|
|
||||||
The command will generate the `scrape_configs` section of the prometheus.yml as follows:
|
The command will generate the `scrape_configs` section of the prometheus.yml as follows:
|
||||||
|
|
||||||
|
##### Cluster
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: minio-job
|
- job_name: minio-job
|
||||||
@ -62,6 +65,18 @@ scrape_configs:
|
|||||||
- targets: ['localhost:9000']
|
- targets: ['localhost:9000']
|
||||||
```
|
```
|
||||||
|
|
||||||
|
##### Bucket centric
|
||||||
|
|
||||||
|
```
|
||||||
|
- job_name: minio-job-bucket
|
||||||
|
bearer_token: <secret>
|
||||||
|
metrics_path: /minio/v2/metrics/bucket
|
||||||
|
scheme: http
|
||||||
|
static_configs:
|
||||||
|
- targets: ['localhost:9000']
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
#### 3.2 Public Prometheus config
|
#### 3.2 Public Prometheus config
|
||||||
|
|
||||||
If Prometheus endpoint authentication type is set to `public`. Following prometheus config is sufficient to start scraping metrics data from MinIO.
|
If Prometheus endpoint authentication type is set to `public`. Following prometheus config is sufficient to start scraping metrics data from MinIO.
|
||||||
@ -78,6 +93,17 @@ scrape_configs:
|
|||||||
- targets: ['localhost:9000']
|
- targets: ['localhost:9000']
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### Bucket centric
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: minio-job-bucket
|
||||||
|
metrics_path: /minio/v2/metrics/bucket
|
||||||
|
scheme: http
|
||||||
|
static_configs:
|
||||||
|
- targets: ['localhost:9000']
|
||||||
|
```
|
||||||
|
|
||||||
##### Node (optional)
|
##### Node (optional)
|
||||||
|
|
||||||
Optionally you can also collect per node metrics. This needs to be done on a per server instance.
|
Optionally you can also collect per node metrics. This needs to be done on a per server instance.
|
||||||
@ -109,17 +135,19 @@ Prometheus sets the `Host` header to `domain:port` as part of HTTP operations ag
|
|||||||
|
|
||||||
### 6. Configure Grafana
|
### 6. Configure Grafana
|
||||||
|
|
||||||
After Prometheus is configured, you can use Grafana to visualize MinIO metrics.
|
After Prometheus is configured, you can use Grafana to visualize MinIO metrics. Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).
|
||||||
Refer the [document here to setup Grafana with MinIO prometheus metrics](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/grafana/README.md).
|
|
||||||
|
|
||||||
## List of metrics exposed by MinIO
|
## List of metrics exposed by MinIO
|
||||||
|
|
||||||
MinIO server exposes the following metrics on `/minio/v2/metrics/cluster` endpoint. All of these can be accessed via Prometheus dashboard. A sample list of exposed metrics along with their definition is available in the demo server at
|
- MinIO exports Prometheus compatible data by default as an authorized endpoint at `/minio/v2/metrics/cluster`.
|
||||||
|
- MinIO exports Prometheus compatible data by default which is bucket centric as an authorized endpoint at `/minio/v2/metrics/bucket`.
|
||||||
|
|
||||||
|
All of these can be accessed via Prometheus dashboard. A sample list of exposed metrics along with their definition is available on our public demo server at
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
curl https://play.min.io/minio/v2/metrics/cluster
|
curl https://play.min.io/minio/v2/metrics/cluster
|
||||||
```
|
```
|
||||||
|
|
||||||
### List of metrics reported
|
### List of metrics reported Cluster and Bucket level
|
||||||
|
|
||||||
[The list of metrics reported can be here](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/list.md)
|
[The list of metrics reported can be here](https://github.com/minio/minio/blob/master/docs/metrics/prometheus/list.md)
|
||||||
|

@ -500,7 +500,7 @@
"targets": [
  {
    "exemplar": true,
    "expr": "max(minio_cluster_usage_total_bytes{job=\"$scrape_jobs\"} by (instance,server))",
    "interval": "",
    "legendFormat": "Usage",
    "refId": "A",
@ -564,7 +564,7 @@
"targets": [
  {
    "exemplar": true,
    "expr": "max by (range) (minio_cluster_objects_size_distribution{job=\"$scrape_jobs\"} by (range))",
    "format": "time_series",
    "instant": false,
    "interval": "",
@ -1492,7 +1492,7 @@
"targets": [
  {
    "exemplar": true,
    "expr": "topk(1, minio_cluster_usage_object_total{job=\"$scrape_jobs\"} by (instance))",
    "format": "time_series",
    "instant": false,
    "interval": "1m",
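
The dashboard panels above previously summed per-bucket series, for example `sum(minio_bucket_usage_total_bytes{job=\"$scrape_jobs\"}) by (instance,server)`, and now read the pre-aggregated `minio_cluster_*` series directly. If an existing setup still depends on a cluster-wide rollup of the per-bucket series, a Prometheus recording rule can approximate it; the sketch below assumes the bucket endpoint is scraped under the job name `minio-job-bucket`.

```yaml
groups:
  - name: minio-bucket-rollups
    rules:
      # Cluster-wide usage rolled up from per-bucket series (job name is an assumption).
      - record: job:minio_bucket_usage_total_bytes:sum
        expr: sum(minio_bucket_usage_total_bytes{job="minio-job-bucket"}) by (instance, server)
```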
@ -4,113 +4,131 @@ Each metric includes a label for the server that calculated the metric. Each met

These metrics can be obtained from any MinIO server once per collection.

| Name | Description |
|:-----|:------------|
| `minio_audit_failed_messages` | Total number of messages that failed to send since start. |
| `minio_audit_target_queue_length` | Number of unsent messages in queue for target. |
| `minio_audit_total_messages` | Total number of messages sent since start. |
| `minio_cache_hits_total` | Total number of drive cache hits. |
| `minio_cache_missed_total` | Total number of drive cache misses. |
| `minio_cache_sent_bytes` | Total number of bytes served from cache. |
| `minio_cache_total_bytes` | Total size of cache drive in bytes. |
| `minio_cache_usage_info` | Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well. |
| `minio_cache_used_bytes` | Current cache usage in bytes. |
| `minio_cluster_capacity_raw_free_bytes` | Total free capacity online in the cluster. |
| `minio_cluster_capacity_raw_total_bytes` | Total capacity online in the cluster. |
| `minio_cluster_capacity_usable_free_bytes` | Total free usable capacity online in the cluster. |
| `minio_cluster_capacity_usable_total_bytes` | Total usable capacity online in the cluster. |
| `minio_cluster_objects_size_distribution` | Distribution of object sizes across a cluster. |
| `minio_cluster_objects_version_distribution` | Distribution of objects by number of versions across a cluster. |
| `minio_cluster_usage_object_total` | Total number of objects in a cluster. |
| `minio_cluster_usage_total_bytes` | Total cluster usage in bytes. |
| `minio_cluster_usage_version_total` | Total number of versions (includes delete marker) in a cluster. |
| `minio_cluster_buckets_total` | Total number of buckets in the cluster. |
| `minio_cluster_disk_offline_total` | Total drives offline. |
| `minio_cluster_disk_online_total` | Total drives online. |
| `minio_cluster_disk_total` | Total drives. |
| `minio_cluster_ilm_transitioned_bytes` | Total bytes transitioned to a tier. |
| `minio_cluster_ilm_transitioned_objects` | Total number of objects transitioned to a tier. |
| `minio_cluster_ilm_transitioned_versions` | Total number of versions transitioned to a tier. |
| `minio_cluster_kms_online` | Reports whether the KMS is online (1) or offline (0). |
| `minio_cluster_kms_request_error` | Number of KMS requests that failed due to some error (HTTP 4xx status code). |
| `minio_cluster_kms_request_failure` | Number of KMS requests that failed due to some internal failure (HTTP 5xx status code). |
| `minio_cluster_kms_request_success` | Number of KMS requests that succeeded. |
| `minio_cluster_kms_uptime` | The time the KMS has been up and running in seconds. |
| `minio_cluster_nodes_offline_total` | Total number of MinIO nodes offline. |
| `minio_cluster_nodes_online_total` | Total number of MinIO nodes online. |
| `minio_heal_objects_errors_total` | Objects for which healing failed in current self healing run. |
| `minio_heal_objects_heal_total` | Objects healed in current self healing run. |
| `minio_heal_objects_total` | Objects scanned in current self healing run. |
| `minio_heal_time_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last self healing activity. |
| `minio_inter_node_traffic_dial_avg_time` | Average time of internode TCP dial calls. |
| `minio_inter_node_traffic_dial_errors` | Total number of internode TCP dial timeouts and errors. |
| `minio_inter_node_traffic_errors_total` | Total number of failed internode calls. |
| `minio_inter_node_traffic_received_bytes` | Total number of bytes received from other peer nodes. |
| `minio_inter_node_traffic_sent_bytes` | Total number of bytes sent to the other peer nodes. |
| `minio_minio_update_percent` | Total percentage cache usage. |
| `minio_node_disk_free_bytes` | Total storage available on a drive. |
| `minio_node_disk_free_inodes` | Total free inodes. |
| `minio_node_disk_latency_us` | Average last minute latency in µs for drive API storage operations. |
| `minio_node_disk_offline_total` | Total drives offline. |
| `minio_node_disk_online_total` | Total drives online. |
| `minio_node_disk_total` | Total drives. |
| `minio_node_disk_total_bytes` | Total storage on a drive. |
| `minio_node_disk_used_bytes` | Total storage used on a drive. |
| `minio_node_file_descriptor_limit_total` | Limit on total number of open file descriptors for the MinIO Server process. |
| `minio_node_file_descriptor_open_total` | Total number of open file descriptors by the MinIO Server process. |
| `minio_node_go_routine_total` | Total number of goroutines running. |
| `minio_node_iam_last_sync_duration_millis` | Last successful IAM data sync duration in milliseconds. |
| `minio_node_iam_since_last_sync_millis` | Time (in milliseconds) since last successful IAM data sync. |
| `minio_node_iam_sync_failures` | Number of failed IAM data syncs since server start. |
| `minio_node_iam_sync_successes` | Number of successful IAM data syncs since server start. |
| `minio_node_ilm_expiry_pending_tasks` | Number of pending ILM expiry tasks in the queue. |
| `minio_node_ilm_transition_active_tasks` | Number of active ILM transition tasks. |
| `minio_node_ilm_transition_pending_tasks` | Number of pending ILM transition tasks in the queue. |
| `minio_node_ilm_versions_scanned` | Total number of object versions checked for ILM actions since server start. |
| `minio_node_io_rchar_bytes` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar. |
| `minio_node_io_read_bytes` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes. |
| `minio_node_io_wchar_bytes` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar. |
| `minio_node_io_write_bytes` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes. |
| `minio_node_process_cpu_total_seconds` | Total user and system CPU time spent in seconds. |
| `minio_node_process_resident_memory_bytes` | Resident memory size in bytes. |
| `minio_node_process_starttime_seconds` | Start time for MinIO process per node, time in seconds since Unix epoch. |
| `minio_node_process_uptime_seconds` | Uptime for MinIO process per node in seconds. |
| `minio_node_scanner_bucket_scans_finished` | Total number of bucket scans finished since server start. |
| `minio_node_scanner_bucket_scans_started` | Total number of bucket scans started since server start. |
| `minio_node_scanner_directories_scanned` | Total number of directories scanned since server start. |
| `minio_node_scanner_objects_scanned` | Total number of unique objects scanned since server start. |
| `minio_node_scanner_versions_scanned` | Total number of object versions scanned since server start. |
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr. |
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw. |
| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. |
| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. |
| `minio_s3_requests_4xx_errors_total` | Total number of S3 requests with (4xx) errors. |
| `minio_s3_requests_5xx_errors_total` | Total number of S3 requests with (5xx) errors. |
| `minio_s3_requests_canceled_total` | Total number of S3 requests canceled by the client. |
| `minio_s3_requests_errors_total` | Total number of S3 requests with (4xx and 5xx) errors. |
| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. |
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
| `minio_s3_requests_rejected_auth_total` | Total number of S3 requests rejected for auth failure. |
| `minio_s3_requests_rejected_header_total` | Total number of S3 requests rejected for invalid header. |
| `minio_s3_requests_rejected_invalid_total` | Total number of invalid S3 requests. |
| `minio_s3_requests_rejected_timestamp_total` | Total number of S3 requests rejected for invalid timestamp. |
| `minio_s3_requests_total` | Total number of S3 requests. |
| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. |
| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
| `minio_s3_traffic_received_bytes` | Total number of S3 bytes received. |
| `minio_s3_traffic_sent_bytes` | Total number of S3 bytes sent. |
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
| `minio_software_version_info` | MinIO Release tag for the server. |
| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last scan activity. |
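
A common use of these cluster metrics is alerting on remaining usable capacity. The rule below is a minimal sketch, assuming the cluster endpoint is scraped under the job name `minio-job` and a 10% threshold that you would tune for your deployment.

```yaml
groups:
  - name: minio-cluster-alerts
    rules:
      - alert: MinIOClusterLowUsableCapacity
        # Fires when less than 10% of usable capacity remains (assumed threshold).
        expr: |
          minio_cluster_capacity_usable_free_bytes{job="minio-job"}
            / minio_cluster_capacity_usable_total_bytes{job="minio-job"} < 0.10
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: MinIO cluster usable capacity is below 10%.
```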

# List of metrics exported at the bucket level

Each metric includes a label for the server that calculated the metric and a label that distinguishes the bucket.

These metrics can be obtained from any MinIO server once per collection.

| Name | Description |
|:-----|:------------|
| `minio_bucket_objects_size_distribution` | Distribution of object sizes in the bucket, includes label for the bucket name. |
| `minio_bucket_objects_version_distribution` | Distribution of objects by number of versions in a bucket. |
| `minio_bucket_quota_total_bytes` | Total bucket quota size in bytes. |
| `minio_bucket_replication_failed_bytes` | Total number of bytes that failed at least once to replicate. |
| `minio_bucket_replication_failed_count` | Total number of objects which failed replication. |
| `minio_bucket_replication_latency_ms` | Replication latency in milliseconds. |
| `minio_bucket_replication_received_bytes` | Total number of bytes replicated to this bucket from another source bucket. |
| `minio_bucket_replication_sent_bytes` | Total number of bytes replicated to the target bucket. |
| `minio_bucket_traffic_received_bytes` | Total number of S3 bytes received for this bucket. |
| `minio_bucket_traffic_sent_bytes` | Total number of S3 bytes sent for this bucket. |
| `minio_bucket_usage_object_total` | Total number of objects. |
| `minio_bucket_usage_version_total` | Total number of versions (includes delete marker). |
| `minio_bucket_usage_total_bytes` | Total bucket size in bytes. |
| `minio_bucket_requests_4xx_errors_total` | Total number of S3 requests with (4xx) errors on a bucket. |
| `minio_bucket_requests_5xx_errors_total` | Total number of S3 requests with (5xx) errors on a bucket. |
| `minio_bucket_requests_inflight_total` | Total number of S3 requests currently in flight on a bucket. |
| `minio_bucket_requests_total` | Total number of S3 requests on a bucket. |
| `minio_bucket_requests_canceled_total` | Total number of S3 requests canceled by the client. |
| `minio_bucket_requests_ttfb_seconds_distribution` | Distribution of time to first byte across API calls per bucket. |
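
To illustrate how the per-bucket series can be consumed, here is a rough sketch of an alerting rule on replication failures. It assumes the bucket endpoint is scraped under the job name `minio-job-bucket`, that the metric accumulates like a counter, and that the distinguishing label is named `bucket`.

```yaml
groups:
  - name: minio-bucket-alerts
    rules:
      - alert: MinIOBucketReplicationFailures
        # Any growth in the failed-replication count over the last 15 minutes (assumed semantics).
        expr: increase(minio_bucket_replication_failed_count{job="minio-job-bucket"}[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Replication failures detected on bucket {{ $labels.bucket }}"
```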