minio/cmd/metrics-v2.go

2497 lines
69 KiB
Go
Raw Normal View History

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"context"
"fmt"
"net/http"
"runtime"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/minio/kes"
"github.com/minio/madmin-go"
"github.com/minio/minio/internal/bucket/lifecycle"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/rest"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/prometheus/procfs"
)
var (
nodeCollector *minioNodeCollector
clusterCollector *minioClusterCollector
peerMetricsGroups []*MetricsGroup
)
func init() {
clusterMetricsGroups := []*MetricsGroup{
getBucketUsageMetrics(),
getMinioHealingMetrics(),
getNodeHealthMetrics(),
getClusterStorageMetrics(),
getClusterTierMetrics(),
getKMSMetrics(),
}
peerMetricsGroups = []*MetricsGroup{
getCacheMetrics(),
getGoMetrics(),
getHTTPMetrics(),
getNotificationMetrics(),
getLocalStorageMetrics(),
getMinioProcMetrics(),
getMinioVersionMetrics(),
getNetworkMetrics(),
getS3TTFBMetric(),
getILMNodeMetrics(),
getScannerNodeMetrics(),
getIAMNodeMetrics(),
getKMSNodeMetrics(),
}
allMetricsGroups := func() (allMetrics []*MetricsGroup) {
allMetrics = append(allMetrics, clusterMetricsGroups...)
allMetrics = append(allMetrics, peerMetricsGroups...)
return allMetrics
}()
nodeCollector = newMinioCollectorNode([]*MetricsGroup{
getNodeHealthMetrics(),
getLocalDriveStorageMetrics(),
getCacheMetrics(),
getHTTPMetrics(),
getNetworkMetrics(),
getMinioVersionMetrics(),
getS3TTFBMetric(),
getNotificationMetrics(),
})
clusterCollector = newMinioClusterCollector(allMetricsGroups)
}
// MetricNamespace is top level grouping of metrics to create the metric name.
type MetricNamespace string
// MetricSubsystem is the sub grouping for metrics within a namespace.
type MetricSubsystem string
const (
bucketMetricNamespace MetricNamespace = "minio_bucket"
clusterMetricNamespace MetricNamespace = "minio_cluster"
healMetricNamespace MetricNamespace = "minio_heal"
interNodeMetricNamespace MetricNamespace = "minio_inter_node"
nodeMetricNamespace MetricNamespace = "minio_node"
minioMetricNamespace MetricNamespace = "minio"
s3MetricNamespace MetricNamespace = "minio_s3"
)
const (
cacheSubsystem MetricSubsystem = "cache"
capacityRawSubsystem MetricSubsystem = "capacity_raw"
capacityUsableSubsystem MetricSubsystem = "capacity_usable"
diskSubsystem MetricSubsystem = "disk"
fileDescriptorSubsystem MetricSubsystem = "file_descriptor"
goRoutines MetricSubsystem = "go_routine"
ioSubsystem MetricSubsystem = "io"
nodesSubsystem MetricSubsystem = "nodes"
objectsSubsystem MetricSubsystem = "objects"
processSubsystem MetricSubsystem = "process"
replicationSubsystem MetricSubsystem = "replication"
requestsSubsystem MetricSubsystem = "requests"
requestsRejectedSubsystem MetricSubsystem = "requests_rejected"
timeSubsystem MetricSubsystem = "time"
trafficSubsystem MetricSubsystem = "traffic"
softwareSubsystem MetricSubsystem = "software"
sysCallSubsystem MetricSubsystem = "syscall"
usageSubsystem MetricSubsystem = "usage"
quotaSubsystem MetricSubsystem = "quota"
ilmSubsystem MetricSubsystem = "ilm"
scannerSubsystem MetricSubsystem = "scanner"
iamSubsystem MetricSubsystem = "iam"
kmsSubsystem MetricSubsystem = "kms"
notifySubsystem MetricSubsystem = "notify"
2022-11-10 19:20:21 +01:00
auditSubsystem MetricSubsystem = "audit"
)
// MetricName are the individual names for the metric.
type MetricName string
const (
authTotal MetricName = "auth_total"
canceledTotal MetricName = "canceled_total"
errorsTotal MetricName = "errors_total"
headerTotal MetricName = "header_total"
healTotal MetricName = "heal_total"
hitsTotal MetricName = "hits_total"
inflightTotal MetricName = "inflight_total"
invalidTotal MetricName = "invalid_total"
limitTotal MetricName = "limit_total"
missedTotal MetricName = "missed_total"
waitingTotal MetricName = "waiting_total"
incomingTotal MetricName = "incoming_total"
objectTotal MetricName = "object_total"
offlineTotal MetricName = "offline_total"
onlineTotal MetricName = "online_total"
openTotal MetricName = "open_total"
readTotal MetricName = "read_total"
timestampTotal MetricName = "timestamp_total"
writeTotal MetricName = "write_total"
total MetricName = "total"
freeInodes MetricName = "free_inodes"
failedCount MetricName = "failed_count"
failedBytes MetricName = "failed_bytes"
freeBytes MetricName = "free_bytes"
readBytes MetricName = "read_bytes"
rcharBytes MetricName = "rchar_bytes"
receivedBytes MetricName = "received_bytes"
latencyMilliSec MetricName = "latency_ms"
sentBytes MetricName = "sent_bytes"
totalBytes MetricName = "total_bytes"
usedBytes MetricName = "used_bytes"
writeBytes MetricName = "write_bytes"
wcharBytes MetricName = "wchar_bytes"
latencyMicroSec MetricName = "latency_us"
latencyNanoSec MetricName = "latency_ns"
usagePercent MetricName = "update_percent"
commitInfo MetricName = "commit_info"
usageInfo MetricName = "usage_info"
versionInfo MetricName = "version_info"
sizeDistribution = "size_distribution"
ttfbDistribution = "ttfb_seconds_distribution"
lastActivityTime = "last_activity_nano_seconds"
startTime = "starttime_seconds"
2021-03-20 21:23:27 -07:00
upTime = "uptime_seconds"
memory = "resident_memory_bytes"
cpu = "cpu_total_seconds"
expiryPendingTasks MetricName = "expiry_pending_tasks"
transitionPendingTasks MetricName = "transition_pending_tasks"
transitionActiveTasks MetricName = "transition_active_tasks"
transitionedBytes MetricName = "transitioned_bytes"
transitionedObjects MetricName = "transitioned_objects"
transitionedVersions MetricName = "transitioned_versions"
kmsOnline = "online"
kmsRequestsSuccess = "request_success"
kmsRequestsError = "request_error"
kmsRequestsFail = "request_failure"
kmsUptime = "uptime"
)
const (
serverName = "server"
)
// MetricType for the types of metrics supported
type MetricType string
const (
gaugeMetric = "gaugeMetric"
counterMetric = "counterMetric"
histogramMetric = "histogramMetric"
)
// MetricDescription describes the metric
type MetricDescription struct {
Namespace MetricNamespace `json:"MetricNamespace"`
Subsystem MetricSubsystem `json:"Subsystem"`
Name MetricName `json:"MetricName"`
Help string `json:"Help"`
Type MetricType `json:"Type"`
}
// Metric captures the details for a metric
type Metric struct {
Description MetricDescription `json:"Description"`
StaticLabels map[string]string `json:"StaticLabels"`
Value float64 `json:"Value"`
VariableLabels map[string]string `json:"VariableLabels"`
HistogramBucketLabel string `json:"HistogramBucketLabel"`
Histogram map[string]uint64 `json:"Histogram"`
}
// MetricsGroup are a group of metrics that are initialized together.
type MetricsGroup struct {
metricsCache timedValue
cacheInterval time.Duration
}
// RegisterRead register the metrics populator function to be used
// to populate new values upon cache invalidation.
func (g *MetricsGroup) RegisterRead(read func(ctx context.Context) []Metric) {
g.metricsCache.Once.Do(func() {
g.metricsCache.Relax = true
g.metricsCache.TTL = g.cacheInterval
g.metricsCache.Update = func() (interface{}, error) {
return read(GlobalContext), nil
}
})
}
func (m *Metric) copyMetric() Metric {
metric := Metric{
Description: m.Description,
Value: m.Value,
HistogramBucketLabel: m.HistogramBucketLabel,
StaticLabels: make(map[string]string),
VariableLabels: make(map[string]string),
Histogram: make(map[string]uint64),
}
for k, v := range m.StaticLabels {
metric.StaticLabels[k] = v
}
for k, v := range m.VariableLabels {
metric.VariableLabels[k] = v
}
for k, v := range m.Histogram {
metric.Histogram[k] = v
}
return metric
}
// Get - returns cached value always upton the configured TTL,
// once the TTL expires "read()" registered function is called
// to return the new values and updated.
func (g *MetricsGroup) Get() (metrics []Metric) {
c, _ := g.metricsCache.Get()
m, ok := c.([]Metric)
if !ok {
return []Metric{}
}
metrics = make([]Metric, 0, len(m))
for i := range m {
metrics = append(metrics, m[i].copyMetric())
}
return metrics
}
func getClusterCapacityTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: capacityRawSubsystem,
Name: totalBytes,
Help: "Total capacity online in the cluster",
Type: gaugeMetric,
}
}
func getClusterCapacityFreeBytesMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: capacityRawSubsystem,
Name: freeBytes,
Help: "Total free capacity online in the cluster",
Type: gaugeMetric,
}
}
func getClusterCapacityUsageBytesMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: capacityUsableSubsystem,
Name: totalBytes,
Help: "Total usable capacity online in the cluster",
Type: gaugeMetric,
}
}
func getClusterCapacityUsageFreeBytesMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: capacityUsableSubsystem,
Name: freeBytes,
Help: "Total free usable capacity online in the cluster",
Type: gaugeMetric,
}
}
func getNodeDriveAPILatencyMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: latencyMicroSec,
Help: "Average last minute latency in µs for drive API storage operations",
Type: gaugeMetric,
}
}
func getNodeDriveUsedBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: usedBytes,
Help: "Total storage used on a drive",
Type: gaugeMetric,
}
}
func getNodeDriveFreeBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: freeBytes,
Help: "Total storage available on a drive",
Type: gaugeMetric,
}
}
func getClusterDrivesOfflineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Name: offlineTotal,
Help: "Total drives offline",
Type: gaugeMetric,
}
}
func getClusterDrivesOnlineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Name: onlineTotal,
Help: "Total drives online",
Type: gaugeMetric,
}
}
func getClusterDrivesTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Name: total,
Help: "Total drives",
Type: gaugeMetric,
}
}
func getNodeDrivesOfflineTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: offlineTotal,
Help: "Total drives offline",
Type: gaugeMetric,
}
}
func getNodeDrivesOnlineTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: onlineTotal,
Help: "Total drives online",
Type: gaugeMetric,
}
}
func getNodeDrivesTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: total,
Help: "Total drives",
Type: gaugeMetric,
}
}
func getNodeDrivesFreeInodes() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: freeInodes,
Help: "Total free inodes",
Type: gaugeMetric,
}
}
func getNodeDriveTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Name: totalBytes,
Help: "Total storage on a drive",
Type: gaugeMetric,
}
}
func getUsageLastScanActivityMD() MetricDescription {
return MetricDescription{
Namespace: minioMetricNamespace,
Subsystem: usageSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last scan activity. This is set to 0 until first scan cycle",
Type: gaugeMetric,
}
}
func getBucketUsageQuotaTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: quotaSubsystem,
Name: totalBytes,
Help: "Total bucket quota size in bytes",
Type: gaugeMetric,
}
}
func getBucketTrafficReceivedBytes() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: trafficSubsystem,
Name: receivedBytes,
Help: "Total number of S3 bytes received for this bucket",
Type: gaugeMetric,
}
}
func getBucketTrafficSentBytes() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: trafficSubsystem,
Name: sentBytes,
Help: "Total number of S3 bytes sent for this bucket",
Type: gaugeMetric,
}
}
func getBucketUsageTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: usageSubsystem,
Name: totalBytes,
Help: "Total bucket size in bytes",
Type: gaugeMetric,
}
}
func getBucketUsageObjectsTotalMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: usageSubsystem,
Name: objectTotal,
Help: "Total number of objects",
Type: gaugeMetric,
}
}
func getBucketRepLatencyMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: latencyMilliSec,
Help: "Replication latency in milliseconds",
Type: histogramMetric,
}
}
func getBucketRepFailedBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: failedBytes,
Help: "Total number of bytes failed at least once to replicate",
Type: gaugeMetric,
}
}
func getBucketRepSentBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: sentBytes,
Help: "Total number of bytes replicated to the target bucket",
Type: gaugeMetric,
}
}
func getBucketRepReceivedBytesMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: receivedBytes,
Help: "Total number of bytes replicated to this bucket from another source bucket",
Type: gaugeMetric,
}
}
func getBucketRepFailedOperationsMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: replicationSubsystem,
Name: failedCount,
Help: "Total number of objects which failed replication",
Type: gaugeMetric,
}
}
func getBucketObjectDistributionMD() MetricDescription {
return MetricDescription{
Namespace: bucketMetricNamespace,
Subsystem: objectsSubsystem,
Name: sizeDistribution,
Help: "Distribution of object sizes in the bucket, includes label for the bucket name",
Type: histogramMetric,
}
}
func getInternodeFailedRequests() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: errorsTotal,
Help: "Total number of failed internode calls",
Type: counterMetric,
}
}
func getInternodeTCPDialTimeout() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: "dial_errors",
Help: "Total number of internode TCP dial timeouts and errors",
Type: counterMetric,
}
}
func getInternodeTCPAvgDuration() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: "dial_avg_time",
Help: "Average time of internodes TCP dial calls",
Type: gaugeMetric,
}
}
func getInterNodeSentBytesMD() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: sentBytes,
Help: "Total number of bytes sent to the other peer nodes",
Type: counterMetric,
}
}
func getInterNodeReceivedBytesMD() MetricDescription {
return MetricDescription{
Namespace: interNodeMetricNamespace,
Subsystem: trafficSubsystem,
Name: receivedBytes,
Help: "Total number of bytes received from other peer nodes",
Type: counterMetric,
}
}
func getS3SentBytesMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: trafficSubsystem,
Name: sentBytes,
Help: "Total number of s3 bytes sent",
Type: counterMetric,
}
}
func getS3ReceivedBytesMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: trafficSubsystem,
Name: receivedBytes,
Help: "Total number of s3 bytes received",
Type: counterMetric,
}
}
func getS3RequestsInFlightMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: inflightTotal,
Help: "Total number of S3 requests currently in flight",
Type: gaugeMetric,
}
}
func getS3RequestsInQueueMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: waitingTotal,
Help: "Number of S3 requests in the waiting queue",
Type: gaugeMetric,
}
}
func getIncomingS3RequestsMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: incomingTotal,
Help: "Volatile number of total incoming S3 requests",
Type: gaugeMetric,
}
}
func getS3RequestsTotalMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: total,
Help: "Total number S3 requests",
Type: counterMetric,
}
}
func getS3RequestsErrorsMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: errorsTotal,
Help: "Total number S3 requests with (4xx and 5xx) errors",
Type: counterMetric,
}
}
func getS3Requests4xxErrorsMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: "4xx_" + errorsTotal,
Help: "Total number S3 requests with (4xx) errors",
Type: counterMetric,
}
}
func getS3Requests5xxErrorsMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: "5xx_" + errorsTotal,
Help: "Total number S3 requests with (5xx) errors",
Type: counterMetric,
}
}
func getS3RequestsCanceledMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsSubsystem,
Name: canceledTotal,
Help: "Total number S3 requests that were canceled from the client while processing",
Type: counterMetric,
}
}
func getS3RejectedAuthRequestsTotalMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsRejectedSubsystem,
Name: authTotal,
Help: "Total number S3 requests rejected for auth failure",
Type: counterMetric,
}
}
func getS3RejectedHeaderRequestsTotalMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsRejectedSubsystem,
Name: headerTotal,
Help: "Total number S3 requests rejected for invalid header",
Type: counterMetric,
}
}
func getS3RejectedTimestampRequestsTotalMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsRejectedSubsystem,
Name: timestampTotal,
Help: "Total number S3 requests rejected for invalid timestamp",
Type: counterMetric,
}
}
func getS3RejectedInvalidRequestsTotalMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: requestsRejectedSubsystem,
Name: invalidTotal,
Help: "Total number S3 invalid requests",
Type: counterMetric,
}
}
func getCacheHitsTotalMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: hitsTotal,
Help: "Total number of drive cache hits",
Type: counterMetric,
}
}
func getCacheHitsMissedTotalMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: missedTotal,
Help: "Total number of drive cache misses",
Type: counterMetric,
}
}
func getCacheUsagePercentMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: minioNamespace,
Name: usagePercent,
Help: "Total percentage cache usage",
Type: gaugeMetric,
}
}
func getCacheUsageInfoMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: usageInfo,
Help: "Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well",
Type: gaugeMetric,
}
}
func getCacheUsedBytesMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: usedBytes,
Help: "Current cache usage in bytes",
Type: gaugeMetric,
}
}
func getCacheTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: totalBytes,
Help: "Total size of cache drive in bytes",
Type: gaugeMetric,
}
}
func getCacheSentBytesMD() MetricDescription {
return MetricDescription{
Namespace: minioNamespace,
Subsystem: cacheSubsystem,
Name: sentBytes,
Help: "Total number of bytes served from cache",
Type: counterMetric,
}
}
func getHealObjectsTotalMD() MetricDescription {
return MetricDescription{
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: total,
Help: "Objects scanned in current self healing run",
Type: gaugeMetric,
}
}
func getHealObjectsHealTotalMD() MetricDescription {
return MetricDescription{
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: healTotal,
Help: "Objects healed in current self healing run",
Type: gaugeMetric,
}
}
func getHealObjectsFailTotalMD() MetricDescription {
return MetricDescription{
Namespace: healMetricNamespace,
Subsystem: objectsSubsystem,
Name: errorsTotal,
Help: "Objects for which healing failed in current self healing run",
Type: gaugeMetric,
}
}
func getHealLastActivityTimeMD() MetricDescription {
return MetricDescription{
Namespace: healMetricNamespace,
Subsystem: timeSubsystem,
Name: lastActivityTime,
Help: "Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity",
Type: gaugeMetric,
}
}
func getNodeOnlineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: nodesSubsystem,
Name: onlineTotal,
Help: "Total number of MinIO nodes online",
Type: gaugeMetric,
}
}
func getNodeOfflineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: nodesSubsystem,
Name: offlineTotal,
Help: "Total number of MinIO nodes offline",
Type: gaugeMetric,
}
}
func getMinIOVersionMD() MetricDescription {
return MetricDescription{
Namespace: minioMetricNamespace,
Subsystem: softwareSubsystem,
Name: versionInfo,
Help: "MinIO Release tag for the server",
Type: gaugeMetric,
}
}
func getMinIOCommitMD() MetricDescription {
return MetricDescription{
Namespace: minioMetricNamespace,
Subsystem: softwareSubsystem,
Name: commitInfo,
Help: "Git commit hash for the MinIO release",
Type: gaugeMetric,
}
}
func getS3TTFBDistributionMD() MetricDescription {
return MetricDescription{
Namespace: s3MetricNamespace,
Subsystem: timeSubsystem,
Name: ttfbDistribution,
Help: "Distribution of the time to first byte across API calls",
Type: gaugeMetric,
}
}
func getMinioFDOpenMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: fileDescriptorSubsystem,
Name: openTotal,
Help: "Total number of open file descriptors by the MinIO Server process",
Type: gaugeMetric,
}
}
func getMinioFDLimitMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: fileDescriptorSubsystem,
Name: limitTotal,
Help: "Limit on total number of open file descriptors for the MinIO Server process",
Type: gaugeMetric,
}
}
func getMinioProcessIOWriteBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ioSubsystem,
Name: writeBytes,
Help: "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes",
Type: counterMetric,
}
}
func getMinioProcessIOReadBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ioSubsystem,
Name: readBytes,
Help: "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes",
Type: counterMetric,
}
}
func getMinioProcessIOWriteCachedBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ioSubsystem,
Name: wcharBytes,
Help: "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar",
Type: counterMetric,
}
}
func getMinioProcessIOReadCachedBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ioSubsystem,
Name: rcharBytes,
Help: "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar",
Type: counterMetric,
}
}
func getMinIOProcessSysCallRMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: sysCallSubsystem,
Name: readTotal,
Help: "Total read SysCalls to the kernel. /proc/[pid]/io syscr",
Type: counterMetric,
}
}
func getMinIOProcessSysCallWMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: sysCallSubsystem,
Name: writeTotal,
Help: "Total write SysCalls to the kernel. /proc/[pid]/io syscw",
Type: counterMetric,
}
}
func getMinIOGORoutineCountMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: goRoutines,
Name: total,
Help: "Total number of go routines running",
Type: gaugeMetric,
}
}
func getMinIOProcessStartTimeMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: processSubsystem,
Name: startTime,
Help: "Start time for MinIO process per node, time in seconds since Unix epoc",
2021-03-20 21:23:27 -07:00
Type: gaugeMetric,
}
}
2021-03-20 21:23:27 -07:00
func getMinIOProcessUptimeMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: processSubsystem,
Name: upTime,
Help: "Uptime for MinIO process per node in seconds",
Type: gaugeMetric,
}
}
func getMinIOProcessResidentMemory() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: processSubsystem,
Name: memory,
Help: "Resident memory size in bytes",
Type: gaugeMetric,
}
}
func getMinIOProcessCPUTime() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: processSubsystem,
Name: cpu,
Help: "Total user and system CPU time spent in seconds",
Type: counterMetric,
}
}
func getMinioProcMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
if runtime.GOOS == "windows" {
return nil
}
metrics = make([]Metric, 0, 20)
p, err := procfs.Self()
if err != nil {
logger.LogOnceIf(ctx, err, string(nodeMetricNamespace))
return
}
openFDs, _ := p.FileDescriptorsLen()
l, _ := p.Limits()
io, _ := p.IO()
stat, _ := p.Stat()
startTime, _ := stat.StartTime()
if openFDs > 0 {
metrics = append(metrics,
Metric{
Description: getMinioFDOpenMD(),
Value: float64(openFDs),
},
)
}
if l.OpenFiles > 0 {
metrics = append(metrics,
Metric{
Description: getMinioFDLimitMD(),
Value: float64(l.OpenFiles),
})
}
if io.SyscR > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessSysCallRMD(),
Value: float64(io.SyscR),
})
}
if io.SyscW > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessSysCallWMD(),
Value: float64(io.SyscW),
})
}
if io.ReadBytes > 0 {
metrics = append(metrics,
Metric{
Description: getMinioProcessIOReadBytesMD(),
Value: float64(io.ReadBytes),
})
}
if io.WriteBytes > 0 {
metrics = append(metrics,
Metric{
Description: getMinioProcessIOWriteBytesMD(),
Value: float64(io.WriteBytes),
})
}
if io.RChar > 0 {
metrics = append(metrics,
Metric{
Description: getMinioProcessIOReadCachedBytesMD(),
Value: float64(io.RChar),
})
}
if io.WChar > 0 {
metrics = append(metrics,
Metric{
Description: getMinioProcessIOWriteCachedBytesMD(),
Value: float64(io.WChar),
})
}
if startTime > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessStartTimeMD(),
Value: startTime,
})
}
if !globalBootTime.IsZero() {
metrics = append(metrics,
Metric{
Description: getMinIOProcessUptimeMD(),
Value: time.Since(globalBootTime).Seconds(),
})
}
if stat.ResidentMemory() > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessResidentMemory(),
Value: float64(stat.ResidentMemory()),
})
}
if stat.CPUTime() > 0 {
metrics = append(metrics,
Metric{
Description: getMinIOProcessCPUTime(),
Value: stat.CPUTime(),
})
}
return
})
return mg
}
func getGoMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
metrics = append(metrics, Metric{
Description: getMinIOGORoutineCountMD(),
Value: float64(runtime.NumGoroutine()),
})
return
})
return mg
}
func getS3TTFBMetric() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
// Read prometheus metric on this channel
ch := make(chan prometheus.Metric)
var wg sync.WaitGroup
wg.Add(1)
// Read prometheus histogram data and convert it to internal metric data
go func() {
defer wg.Done()
for promMetric := range ch {
dtoMetric := &dto.Metric{}
err := promMetric.Write(dtoMetric)
if err != nil {
logger.LogIf(GlobalContext, err)
return
}
h := dtoMetric.GetHistogram()
for _, b := range h.Bucket {
labels := make(map[string]string)
for _, lp := range dtoMetric.GetLabel() {
labels[*lp.Name] = *lp.Value
}
labels["le"] = fmt.Sprintf("%.3f", *b.UpperBound)
metric := Metric{
Description: getS3TTFBDistributionMD(),
VariableLabels: labels,
Value: float64(b.GetCumulativeCount()),
}
metrics = append(metrics, metric)
}
}
}()
httpRequestsDuration.Collect(ch)
close(ch)
wg.Wait()
return
})
return mg
}
func getTransitionPendingTasksMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: transitionPendingTasks,
Help: "Number of pending ILM transition tasks in the queue",
Type: gaugeMetric,
}
}
func getTransitionActiveTasksMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: transitionActiveTasks,
Help: "Number of active ILM transition tasks",
Type: gaugeMetric,
}
}
func getExpiryPendingTasksMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: expiryPendingTasks,
Help: "Number of pending ILM expiry tasks in the queue",
Type: gaugeMetric,
}
}
func getILMNodeMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) []Metric {
expPendingTasks := Metric{
Description: getExpiryPendingTasksMD(),
}
trPendingTasks := Metric{
Description: getTransitionPendingTasksMD(),
}
trActiveTasks := Metric{
Description: getTransitionActiveTasksMD(),
}
if globalExpiryState != nil {
expPendingTasks.Value = float64(globalExpiryState.PendingTasks())
}
if globalTransitionState != nil {
trPendingTasks.Value = float64(globalTransitionState.PendingTasks())
trActiveTasks.Value = float64(globalTransitionState.ActiveTasks())
}
return []Metric{
expPendingTasks,
trPendingTasks,
trActiveTasks,
}
})
return mg
}
func getScannerNodeMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) []Metric {
metrics := []Metric{
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "objects_scanned",
Help: "Total number of unique objects scanned since server start",
Type: counterMetric,
},
2022-07-05 14:45:49 -07:00
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions scanned since server start",
Type: counterMetric,
},
2022-07-05 14:45:49 -07:00
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "directories_scanned",
Help: "Total number of directories scanned since server start",
Type: counterMetric,
},
2022-07-05 14:45:49 -07:00
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_started",
Help: "Total number of bucket scans started since server start",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: scannerSubsystem,
Name: "bucket_scans_finished",
Help: "Total number of bucket scans finished since server start",
Type: counterMetric,
},
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: "versions_scanned",
Help: "Total number of object versions checked for ilm actions since server start",
Type: counterMetric,
},
2022-07-05 14:45:49 -07:00
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
},
}
2022-07-05 14:45:49 -07:00
for i := range globalScannerMetrics.actions {
action := lifecycle.Action(i)
2022-07-05 14:45:49 -07:00
v := globalScannerMetrics.lifetimeActions(action)
if v == 0 {
continue
}
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: ilmSubsystem,
Name: MetricName("action_count_" + toSnake(action.String())),
Help: "Total action outcome of lifecycle checks since server start",
Type: counterMetric,
},
Value: float64(v),
})
}
return metrics
})
return mg
}
func getIAMNodeMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
lastSyncTime := atomic.LoadUint64(&globalIAMSys.LastRefreshTimeUnixNano)
var sinceLastSyncMillis uint64
if lastSyncTime != 0 {
sinceLastSyncMillis = (uint64(time.Now().UnixNano()) - lastSyncTime) / uint64(time.Millisecond)
}
metrics = []Metric{
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "last_sync_duration_millis",
Help: "Last successful IAM data sync duration in milliseconds",
Type: gaugeMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.LastRefreshDurationMilliseconds)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "since_last_sync_millis",
Help: "Time (in milliseconds) since last successful IAM data sync. This is set to 0 until the first sync after server start.",
Type: gaugeMetric,
},
Value: float64(sinceLastSyncMillis),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_successes",
Help: "Number of successful IAM data syncs since server start.",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
},
{
Description: MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: iamSubsystem,
Name: "sync_failures",
Help: "Number of failed IAM data syncs since server start.",
Type: counterMetric,
},
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
},
}
return metrics
})
return mg
}
func getMinioVersionMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
metrics = append(metrics, Metric{
Description: getMinIOCommitMD(),
VariableLabels: map[string]string{"commit": CommitID},
})
metrics = append(metrics, Metric{
Description: getMinIOVersionMD(),
VariableLabels: map[string]string{"version": Version},
})
return
})
return mg
}
func getNodeHealthMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
metrics = make([]Metric, 0, 16)
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
metrics = append(metrics, Metric{
Description: getNodeOnlineTotalMD(),
Value: float64(nodesUp),
})
metrics = append(metrics, Metric{
Description: getNodeOfflineTotalMD(),
Value: float64(nodesDown),
})
return
})
return mg
}
func getMinioHealingMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
metrics = make([]Metric, 0, 5)
bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if !exists {
return
}
if bgSeq.lastHealActivity.IsZero() {
return
}
metrics = append(metrics, Metric{
Description: getHealLastActivityTimeMD(),
Value: float64(time.Since(bgSeq.lastHealActivity)),
})
metrics = append(metrics, getObjectsScanned(bgSeq)...)
metrics = append(metrics, getHealedItems(bgSeq)...)
metrics = append(metrics, getFailedItems(bgSeq)...)
return
})
return mg
}
func getFailedItems(seq *healSequence) (m []Metric) {
m = make([]Metric, 0, 1)
for k, v := range seq.gethealFailedItemsMap() {
s := strings.Split(k, ",")
m = append(m, Metric{
Description: getHealObjectsFailTotalMD(),
VariableLabels: map[string]string{
"mount_path": s[0],
"volume_status": s[1],
},
Value: float64(v),
})
}
return
}
func getHealedItems(seq *healSequence) (m []Metric) {
items := seq.getHealedItemsMap()
m = make([]Metric, 0, len(items))
for k, v := range items {
m = append(m, Metric{
Description: getHealObjectsHealTotalMD(),
VariableLabels: map[string]string{"type": string(k)},
Value: float64(v),
})
}
return
}
func getObjectsScanned(seq *healSequence) (m []Metric) {
items := seq.getScannedItemsMap()
m = make([]Metric, 0, len(items))
for k, v := range items {
m = append(m, Metric{
Description: getHealObjectsTotalMD(),
VariableLabels: map[string]string{"type": string(k)},
Value: float64(v),
})
}
return
}
func getCacheMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
metrics = make([]Metric, 0, 20)
cacheObjLayer := newCachedObjectLayerFn()
// Service not initialized yet
if cacheObjLayer == nil {
return
}
metrics = append(metrics, Metric{
Description: getCacheHitsTotalMD(),
Value: float64(cacheObjLayer.CacheStats().getHits()),
})
metrics = append(metrics, Metric{
Description: getCacheHitsMissedTotalMD(),
Value: float64(cacheObjLayer.CacheStats().getMisses()),
})
metrics = append(metrics, Metric{
Description: getCacheSentBytesMD(),
Value: float64(cacheObjLayer.CacheStats().getBytesServed()),
})
for _, cdStats := range cacheObjLayer.CacheStats().GetDiskStats() {
metrics = append(metrics, Metric{
Description: getCacheUsagePercentMD(),
Value: float64(cdStats.UsagePercent),
VariableLabels: map[string]string{"disk": cdStats.Dir},
})
metrics = append(metrics, Metric{
Description: getCacheUsageInfoMD(),
Value: float64(cdStats.UsageState),
VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
})
metrics = append(metrics, Metric{
Description: getCacheUsedBytesMD(),
Value: float64(cdStats.UsageSize),
VariableLabels: map[string]string{"disk": cdStats.Dir},
})
metrics = append(metrics, Metric{
Description: getCacheTotalBytesMD(),
Value: float64(cdStats.TotalCapacity),
VariableLabels: map[string]string{"disk": cdStats.Dir},
})
}
return
})
return mg
}
func getNotificationMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) []Metric {
stats := globalConfigTargetList.Stats()
metrics := make([]Metric, 0, 1+len(stats.TargetStats))
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: minioNamespace,
Subsystem: notifySubsystem,
Name: "current_send_in_progress",
Help: "Number of concurrent async Send calls active to all targets",
Type: gaugeMetric,
},
Value: float64(stats.CurrentSendCalls),
})
for _, st := range stats.TargetStats {
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: minioNamespace,
Subsystem: notifySubsystem,
Name: "target_queue_length",
Help: "Number of unsent notifications in queue for target",
Type: gaugeMetric,
},
VariableLabels: map[string]string{"target_id": st.ID.ID, "target_name": st.ID.Name},
Value: float64(st.CurrentQueue),
})
}
2022-11-10 19:20:21 +01:00
// Audit and system:
audit := logger.CurrentStats()
for id, st := range audit {
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: minioNamespace,
Subsystem: auditSubsystem,
Name: "target_queue_length",
Help: "Number of unsent messages in queue for target",
Type: gaugeMetric,
},
VariableLabels: map[string]string{"target_id": id},
Value: float64(st.QueueLength),
})
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: minioNamespace,
Subsystem: auditSubsystem,
Name: "total_messages",
Help: "Total number of messages sent since start",
Type: counterMetric,
},
VariableLabels: map[string]string{"target_id": id},
Value: float64(st.TotalMessages),
})
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: minioNamespace,
Subsystem: auditSubsystem,
Name: "failed_messages",
Help: "Total number of messages that failed to send since start",
Type: counterMetric,
},
VariableLabels: map[string]string{"target_id": id},
Value: float64(st.FailedMessages),
})
}
return metrics
})
return mg
}
func getHTTPMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
httpStats := globalHTTPStats.toServerHTTPStats()
metrics = make([]Metric, 0, 3+
len(httpStats.CurrentS3Requests.APIStats)+
len(httpStats.TotalS3Requests.APIStats)+
len(httpStats.TotalS3Errors.APIStats)+
len(httpStats.TotalS35xxErrors.APIStats)+
len(httpStats.TotalS34xxErrors.APIStats))
metrics = append(metrics, Metric{
Description: getS3RejectedAuthRequestsTotalMD(),
Value: float64(httpStats.TotalS3RejectedAuth),
})
metrics = append(metrics, Metric{
Description: getS3RejectedTimestampRequestsTotalMD(),
Value: float64(httpStats.TotalS3RejectedTime),
})
metrics = append(metrics, Metric{
Description: getS3RejectedHeaderRequestsTotalMD(),
Value: float64(httpStats.TotalS3RejectedHeader),
})
metrics = append(metrics, Metric{
Description: getS3RejectedInvalidRequestsTotalMD(),
Value: float64(httpStats.TotalS3RejectedInvalid),
})
metrics = append(metrics, Metric{
Description: getS3RequestsInQueueMD(),
Value: float64(httpStats.S3RequestsInQueue),
})
metrics = append(metrics, Metric{
Description: getIncomingS3RequestsMD(),
Value: float64(httpStats.S3RequestsIncoming),
})
for api, value := range httpStats.CurrentS3Requests.APIStats {
metrics = append(metrics, Metric{
Description: getS3RequestsInFlightMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
for api, value := range httpStats.TotalS3Requests.APIStats {
metrics = append(metrics, Metric{
Description: getS3RequestsTotalMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
for api, value := range httpStats.TotalS3Errors.APIStats {
metrics = append(metrics, Metric{
Description: getS3RequestsErrorsMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
for api, value := range httpStats.TotalS35xxErrors.APIStats {
metrics = append(metrics, Metric{
Description: getS3Requests5xxErrorsMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
for api, value := range httpStats.TotalS34xxErrors.APIStats {
metrics = append(metrics, Metric{
Description: getS3Requests4xxErrorsMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
for api, value := range httpStats.TotalS3Canceled.APIStats {
metrics = append(metrics, Metric{
Description: getS3RequestsCanceledMD(),
Value: float64(value),
VariableLabels: map[string]string{"api": api},
})
}
return
})
return mg
}
func getNetworkMetrics() *MetricsGroup {
mg := &MetricsGroup{}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
metrics = make([]Metric, 0, 10)
connStats := globalConnStats.toServerConnStats()
rpcStats := rest.GetRPCStats()
if globalIsDistErasure {
metrics = append(metrics, Metric{
Description: getInternodeFailedRequests(),
Value: float64(rpcStats.Errs),
})
metrics = append(metrics, Metric{
Description: getInternodeTCPDialTimeout(),
Value: float64(rpcStats.DialErrs),
})
metrics = append(metrics, Metric{
Description: getInternodeTCPAvgDuration(),
Value: float64(rpcStats.DialAvgDuration),
})
metrics = append(metrics, Metric{
Description: getInterNodeSentBytesMD(),
Value: float64(connStats.TotalOutputBytes),
})
metrics = append(metrics, Metric{
Description: getInterNodeReceivedBytesMD(),
Value: float64(connStats.TotalInputBytes),
})
}
metrics = append(metrics, Metric{
Description: getS3SentBytesMD(),
Value: float64(connStats.S3OutputBytes),
})
metrics = append(metrics, Metric{
Description: getS3ReceivedBytesMD(),
Value: float64(connStats.S3InputBytes),
})
return
})
return mg
}
func getBucketUsageMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil {
return
}
metrics = make([]Metric, 0, 50)
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer)
if err != nil {
return
}
// data usage has not captured any data yet.
if dataUsageInfo.LastUpdate.IsZero() {
return
}
metrics = append(metrics, Metric{
Description: getUsageLastScanActivityMD(),
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
})
bucketReplStats := globalReplicationStats.getAllLatest(dataUsageInfo.BucketsUsage)
for bucket, usage := range dataUsageInfo.BucketsUsage {
stats := bucketReplStats[bucket]
quota, _ := globalBucketQuotaSys.Get(ctx, bucket)
metrics = append(metrics, Metric{
Description: getBucketUsageTotalBytesMD(),
Value: float64(usage.Size),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketUsageObjectsTotalMD(),
Value: float64(usage.ObjectsCount),
VariableLabels: map[string]string{"bucket": bucket},
})
metrics = append(metrics, Metric{
Description: getBucketRepReceivedBytesMD(),
Value: float64(stats.ReplicaSize),
VariableLabels: map[string]string{"bucket": bucket},
})
if quota != nil && quota.Quota > 0 {
metrics = append(metrics, Metric{
Description: getBucketUsageQuotaTotalBytesMD(),
Value: float64(quota.Quota),
VariableLabels: map[string]string{"bucket": bucket},
})
}
recvBytes := globalBucketConnStats.getS3InputBytes(bucket)
if recvBytes > 0 {
metrics = append(metrics, Metric{
Description: getBucketTrafficReceivedBytes(),
Value: float64(recvBytes),
VariableLabels: map[string]string{"bucket": bucket},
})
}
sentBytes := globalBucketConnStats.getS3OutputBytes(bucket)
if sentBytes > 0 {
metrics = append(metrics, Metric{
Description: getBucketTrafficSentBytes(),
Value: float64(sentBytes),
VariableLabels: map[string]string{"bucket": bucket},
})
}
if stats.hasReplicationUsage() {
for arn, stat := range stats.Stats {
metrics = append(metrics, Metric{
Description: getBucketRepFailedBytesMD(),
Value: float64(stat.FailedSize),
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
})
metrics = append(metrics, Metric{
Description: getBucketRepSentBytesMD(),
Value: float64(stat.ReplicatedSize),
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
})
metrics = append(metrics, Metric{
Description: getBucketRepFailedOperationsMD(),
Value: float64(stat.FailedCount),
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
})
metrics = append(metrics, Metric{
Description: getBucketRepLatencyMD(),
HistogramBucketLabel: "range",
Histogram: stat.Latency.getUploadLatency(),
VariableLabels: map[string]string{"bucket": bucket, "operation": "upload", "targetArn": arn},
})
}
}
metrics = append(metrics, Metric{
Description: getBucketObjectDistributionMD(),
Histogram: usage.ObjectSizesHistogram,
HistogramBucketLabel: "range",
VariableLabels: map[string]string{"bucket": bucket},
})
}
return
})
return mg
}
func getClusterTransitionedBytesMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: ilmSubsystem,
Name: transitionedBytes,
Help: "Total bytes transitioned to a tier",
Type: gaugeMetric,
}
}
func getClusterTransitionedObjectsMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: ilmSubsystem,
Name: transitionedObjects,
Help: "Total number of objects transitioned to a tier",
Type: gaugeMetric,
}
}
func getClusterTransitionedVersionsMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: ilmSubsystem,
Name: transitionedVersions,
Help: "Total number of versions transitioned to a tier",
Type: gaugeMetric,
}
}
func getClusterTierMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
2022-10-24 17:44:15 -07:00
if objLayer == nil {
return
}
if globalTierConfigMgr.Empty() {
return
}
dui, err := loadDataUsageFromBackend(GlobalContext, objLayer)
if err != nil {
return
}
// data usage has not captured any tier stats yet.
if dui.TierStats == nil {
return
}
return dui.tierMetrics()
})
return mg
}
func getLocalStorageMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil {
return
}
metrics = make([]Metric, 0, 50)
storageInfo := objLayer.LocalStorageInfo(ctx)
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
totalDrives := onlineDrives.Merge(offlineDrives)
for _, disk := range storageInfo.Disks {
metrics = append(metrics, Metric{
Description: getNodeDriveUsedBytesMD(),
Value: float64(disk.UsedSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveFreeBytesMD(),
Value: float64(disk.AvailableSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveTotalBytesMD(),
Value: float64(disk.TotalSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDrivesFreeInodes(),
Value: float64(disk.FreeInodes),
VariableLabels: map[string]string{"disk": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDrivesOfflineTotalMD(),
Value: float64(offlineDrives.Sum()),
})
metrics = append(metrics, Metric{
Description: getNodeDrivesOnlineTotalMD(),
Value: float64(onlineDrives.Sum()),
})
metrics = append(metrics, Metric{
Description: getNodeDrivesTotalMD(),
Value: float64(totalDrives.Sum()),
})
}
return
})
return mg
}
func getLocalDriveStorageMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 3 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil {
return
}
storageInfo := objLayer.LocalStorageInfo(ctx)
if storageInfo.Backend.Type == madmin.FS {
return
}
metrics = make([]Metric, 0, 50)
for _, disk := range storageInfo.Disks {
if disk.Metrics == nil {
continue
}
2022-07-05 14:45:49 -07:00
for apiName, latency := range disk.Metrics.LastMinute {
metrics = append(metrics, Metric{
Description: getNodeDriveAPILatencyMD(),
2022-07-05 14:45:49 -07:00
Value: float64(latency.Avg().Microseconds()),
VariableLabels: map[string]string{"disk": disk.DrivePath, "api": "storage." + apiName},
})
}
}
return
})
return mg
}
func getClusterStorageMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil {
return
}
// Fetch disk space info, ignore errors
metrics = make([]Metric, 0, 10)
storageInfo := objLayer.StorageInfo(ctx)
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
totalDrives := onlineDrives.Merge(offlineDrives)
metrics = append(metrics, Metric{
Description: getClusterCapacityTotalBytesMD(),
Value: float64(GetTotalCapacity(storageInfo.Disks)),
})
metrics = append(metrics, Metric{
Description: getClusterCapacityFreeBytesMD(),
Value: float64(GetTotalCapacityFree(storageInfo.Disks)),
})
metrics = append(metrics, Metric{
Description: getClusterCapacityUsageBytesMD(),
Value: float64(GetTotalUsableCapacity(storageInfo.Disks, storageInfo)),
})
metrics = append(metrics, Metric{
Description: getClusterCapacityUsageFreeBytesMD(),
Value: float64(GetTotalUsableCapacityFree(storageInfo.Disks, storageInfo)),
})
metrics = append(metrics, Metric{
Description: getClusterDrivesOfflineTotalMD(),
Value: float64(offlineDrives.Sum()),
})
metrics = append(metrics, Metric{
Description: getClusterDrivesOnlineTotalMD(),
Value: float64(onlineDrives.Sum()),
})
metrics = append(metrics, Metric{
Description: getClusterDrivesTotalMD(),
Value: float64(totalDrives.Sum()),
})
return
})
return mg
}
func getKMSNodeMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil || GlobalKMS == nil {
return
}
const (
Online = 1
Offline = 0
)
desc := MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: kmsSubsystem,
Name: kmsOnline,
Help: "Reports whether the KMS is online (1) or offline (0)",
Type: gaugeMetric,
}
_, err := GlobalKMS.Metrics(ctx)
if _, ok := kes.IsConnError(err); ok {
return []Metric{{
Description: desc,
Value: float64(Offline),
}}
}
return []Metric{{
Description: desc,
Value: float64(Online),
}}
})
return mg
}
func getKMSMetrics() *MetricsGroup {
mg := &MetricsGroup{
cacheInterval: 10 * time.Second,
}
mg.RegisterRead(func(ctx context.Context) []Metric {
objLayer := newObjectLayerFn()
// Service not initialized yet
2022-10-24 17:44:15 -07:00
if objLayer == nil || GlobalKMS == nil {
return []Metric{}
}
metrics := make([]Metric, 0, 4)
metric, err := GlobalKMS.Metrics(ctx)
if err != nil {
return metrics
}
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: kmsSubsystem,
Name: kmsRequestsSuccess,
Help: "Number of KMS requests that succeeded",
Type: counterMetric,
},
Value: float64(metric.RequestOK),
})
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: kmsSubsystem,
Name: kmsRequestsError,
Help: "Number of KMS requests that failed due to some error. (HTTP 4xx status code)",
Type: counterMetric,
},
Value: float64(metric.RequestErr),
})
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: kmsSubsystem,
Name: kmsRequestsFail,
Help: "Number of KMS requests that failed due to some internal failure. (HTTP 5xx status code)",
Type: counterMetric,
},
Value: float64(metric.RequestFail),
})
metrics = append(metrics, Metric{
Description: MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: kmsSubsystem,
Name: kmsUptime,
Help: "The time the KMS has been up and running in seconds.",
Type: counterMetric,
},
Value: metric.UpTime.Seconds(),
})
return metrics
})
return mg
}
type minioClusterCollector struct {
metricsGroups []*MetricsGroup
desc *prometheus.Desc
}
func newMinioClusterCollector(metricsGroups []*MetricsGroup) *minioClusterCollector {
return &minioClusterCollector{
metricsGroups: metricsGroups,
desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server per cluster", nil, nil),
}
}
// Describe sends the super-set of all possible descriptors of metrics
func (c *minioClusterCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.desc
}
// Collect is called by the Prometheus registry when collecting metrics.
func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
var wg sync.WaitGroup
publish := func(in <-chan Metric) {
defer wg.Done()
for metric := range in {
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
if metric.Description.Type == histogramMetric {
if metric.Histogram == nil {
continue
}
for k, v := range metric.Histogram {
out <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(string(metric.Description.Namespace),
string(metric.Description.Subsystem),
string(metric.Description.Name)),
metric.Description.Help,
append(labels, metric.HistogramBucketLabel),
metric.StaticLabels,
),
prometheus.GaugeValue,
float64(v),
append(values, k)...)
}
continue
}
metricType := prometheus.GaugeValue
switch metric.Description.Type {
case counterMetric:
metricType = prometheus.CounterValue
}
toPost := prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(string(metric.Description.Namespace),
string(metric.Description.Subsystem),
string(metric.Description.Name)),
metric.Description.Help,
labels,
metric.StaticLabels,
),
metricType,
metric.Value,
values...)
out <- toPost
}
}
// Call peer api to fetch metrics
wg.Add(2)
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
go publish(globalNotificationSys.GetClusterMetrics(GlobalContext))
wg.Wait()
}
// ReportMetrics reports serialized metrics to the channel passed for the metrics generated.
func ReportMetrics(ctx context.Context, metricsGroups []*MetricsGroup) <-chan Metric {
ch := make(chan Metric)
go func() {
defer close(ch)
populateAndPublish(metricsGroups, func(m Metric) bool {
if m.VariableLabels == nil {
m.VariableLabels = make(map[string]string)
}
m.VariableLabels[serverName] = globalLocalNodeName
for {
select {
case ch <- m:
return true
case <-ctx.Done():
return false
}
}
})
}()
return ch
}
// minioNodeCollector is the Custom Collector
type minioNodeCollector struct {
metricsGroups []*MetricsGroup
desc *prometheus.Desc
}
// Describe sends the super-set of all possible descriptors of metrics
func (c *minioNodeCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.desc
}
// populateAndPublish populates and then publishes the metrics generated by the generator function.
func populateAndPublish(metricsGroups []*MetricsGroup, publish func(m Metric) bool) {
for _, mg := range metricsGroups {
if mg == nil {
continue
}
for _, metric := range mg.Get() {
if !publish(metric) {
return
}
}
}
}
// Collect is called by the Prometheus registry when collecting metrics.
func (c *minioNodeCollector) Collect(ch chan<- prometheus.Metric) {
// Expose MinIO's version information
minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
populateAndPublish(c.metricsGroups, func(metric Metric) bool {
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
values = append(values, globalLocalNodeName)
labels = append(labels, serverName)
if metric.Description.Type == histogramMetric {
if metric.Histogram == nil {
return true
}
for k, v := range metric.Histogram {
labels = append(labels, metric.HistogramBucketLabel)
values = append(values, k)
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(string(metric.Description.Namespace),
string(metric.Description.Subsystem),
string(metric.Description.Name)),
metric.Description.Help,
labels,
metric.StaticLabels,
),
prometheus.GaugeValue,
float64(v),
values...)
}
return true
}
metricType := prometheus.GaugeValue
switch metric.Description.Type {
case counterMetric:
metricType = prometheus.CounterValue
}
ch <- prometheus.MustNewConstMetric(
prometheus.NewDesc(
prometheus.BuildFQName(string(metric.Description.Namespace),
string(metric.Description.Subsystem),
string(metric.Description.Name)),
metric.Description.Help,
labels,
metric.StaticLabels,
),
metricType,
metric.Value,
values...)
return true
})
}
func getOrderedLabelValueArrays(labelsWithValue map[string]string) (labels, values []string) {
labels = make([]string, 0)
values = make([]string, 0)
for l, v := range labelsWithValue {
labels = append(labels, l)
values = append(values, v)
}
return
}
// newMinioCollectorNode describes the collector
// and returns reference of minioCollector for version 2
// It creates the Prometheus Description which is used
// to define Metric and help string
func newMinioCollectorNode(metricsGroups []*MetricsGroup) *minioNodeCollector {
return &minioNodeCollector{
metricsGroups: metricsGroups,
desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server per node", nil, nil),
}
}
func metricsServerHandler() http.Handler {
registry := prometheus.NewRegistry()
// Report all other metrics
logger.CriticalIf(GlobalContext, registry.Register(clusterCollector))
// DefaultGatherers include golang metrics and process metrics.
gatherers := prometheus.Gatherers{
registry,
}
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tc, ok := r.Context().Value(contextTraceReqKey).(*traceCtxt)
if ok {
tc.funcName = "handler.MetricsCluster"
tc.responseRecorder.LogErrBody = true
}
mfs, err := gatherers.Gather()
if err != nil {
if len(mfs) == 0 {
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
return
}
}
contentType := expfmt.Negotiate(r.Header)
w.Header().Set("Content-Type", string(contentType))
enc := expfmt.NewEncoder(w, contentType)
for _, mf := range mfs {
if err := enc.Encode(mf); err != nil {
logger.LogIf(r.Context(), err)
return
}
}
if closer, ok := enc.(expfmt.Closer); ok {
closer.Close()
}
})
}
func metricsNodeHandler() http.Handler {
registry := prometheus.NewRegistry()
logger.CriticalIf(GlobalContext, registry.Register(nodeCollector))
if err := registry.Register(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{
Namespace: minioNamespace,
ReportErrors: true,
})); err != nil {
logger.CriticalIf(GlobalContext, err)
}
if err := registry.Register(prometheus.NewGoCollector()); err != nil {
logger.CriticalIf(GlobalContext, err)
}
gatherers := prometheus.Gatherers{
registry,
}
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tc, ok := r.Context().Value(contextTraceReqKey).(*traceCtxt)
if ok {
tc.funcName = "handler.MetricsNode"
tc.responseRecorder.LogErrBody = true
}
mfs, err := gatherers.Gather()
if err != nil {
if len(mfs) == 0 {
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
return
}
}
contentType := expfmt.Negotiate(r.Header)
w.Header().Set("Content-Type", string(contentType))
enc := expfmt.NewEncoder(w, contentType)
for _, mf := range mfs {
if err := enc.Encode(mf); err != nil {
logger.LogIf(r.Context(), err)
return
}
}
if closer, ok := enc.(expfmt.Closer); ok {
closer.Close()
}
})
}
func toSnake(camel string) (snake string) {
var b strings.Builder
l := len(camel)
for i, v := range camel {
// A is 65, a is 97
if v >= 'a' {
b.WriteRune(v)
continue
}
// v is capital letter here
// disregard first letter
// add underscore if last letter is capital letter
// add underscore when previous letter is lowercase
// add underscore when next letter is lowercase
if (i != 0 || i == l-1) && ((i > 0 && rune(camel[i-1]) >= 'a') ||
(i < l-1 && rune(camel[i+1]) >= 'a')) {
b.WriteRune('_')
}
b.WriteRune(v + 'a' - 'A')
}
return b.String()
}