mirror of
https://github.com/minio/minio.git
synced 2025-01-11 23:13:23 -05:00
4160 lines
122 KiB
Go
4160 lines
122 KiB
Go
// Copyright (c) 2015-2024 MinIO, Inc.
|
|
//
|
|
// This file is part of MinIO Object Storage stack
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math"
|
|
"net/http"
|
|
"runtime"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/minio/kes-go"
|
|
"github.com/minio/madmin-go/v3"
|
|
"github.com/minio/minio/internal/bucket/lifecycle"
|
|
xioutil "github.com/minio/minio/internal/ioutil"
|
|
"github.com/minio/minio/internal/logger"
|
|
"github.com/minio/minio/internal/mcontext"
|
|
"github.com/minio/minio/internal/rest"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
dto "github.com/prometheus/client_model/go"
|
|
"github.com/prometheus/common/expfmt"
|
|
"github.com/prometheus/procfs"
|
|
)
|
|
|
|
var (
|
|
nodeCollector *minioNodeCollector
|
|
clusterCollector *minioClusterCollector
|
|
bucketCollector *minioBucketCollector
|
|
peerMetricsGroups []*MetricsGroup
|
|
bucketPeerMetricsGroups []*MetricsGroup
|
|
)
|
|
|
|
func init() {
|
|
clusterMetricsGroups := []*MetricsGroup{
|
|
getNodeHealthMetrics(MetricsGroupOpts{dependGlobalNotificationSys: true}),
|
|
getClusterStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getClusterTierMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getClusterUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getKMSMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
|
|
getClusterHealthMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
|
|
getReplicationSiteMetrics(MetricsGroupOpts{dependGlobalSiteReplicationSys: true}),
|
|
getBatchJobsMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
}
|
|
|
|
peerMetricsGroups = []*MetricsGroup{
|
|
getGoMetrics(),
|
|
getHTTPMetrics(MetricsGroupOpts{}),
|
|
getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
|
|
getMinioProcMetrics(),
|
|
getMinioVersionMetrics(),
|
|
getNetworkMetrics(),
|
|
getS3TTFBMetric(),
|
|
getILMNodeMetrics(),
|
|
getScannerNodeMetrics(),
|
|
getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
|
|
getKMSNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependGlobalKMS: true}),
|
|
getMinioHealingMetrics(MetricsGroupOpts{dependGlobalBackgroundHealState: true}),
|
|
getWebhookMetrics(),
|
|
getTierMetrics(),
|
|
}
|
|
|
|
allMetricsGroups := func() (allMetrics []*MetricsGroup) {
|
|
allMetrics = append(allMetrics, clusterMetricsGroups...)
|
|
allMetrics = append(allMetrics, peerMetricsGroups...)
|
|
return allMetrics
|
|
}()
|
|
|
|
nodeGroups := []*MetricsGroup{
|
|
getNodeHealthMetrics(MetricsGroupOpts{dependGlobalNotificationSys: true}),
|
|
getHTTPMetrics(MetricsGroupOpts{}),
|
|
getNetworkMetrics(),
|
|
getMinioVersionMetrics(),
|
|
getS3TTFBMetric(),
|
|
getTierMetrics(),
|
|
getNotificationMetrics(MetricsGroupOpts{dependGlobalLambdaTargetList: true}),
|
|
getDistLockMetrics(MetricsGroupOpts{dependGlobalIsDistErasure: true, dependGlobalLockServer: true}),
|
|
getIAMNodeMetrics(MetricsGroupOpts{dependGlobalAuthNPlugin: true, dependGlobalIAMSys: true}),
|
|
getLocalStorageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getReplicationNodeMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true, dependBucketTargetSys: true}),
|
|
}
|
|
|
|
bucketMetricsGroups := []*MetricsGroup{
|
|
getBucketUsageMetrics(MetricsGroupOpts{dependGlobalObjectAPI: true}),
|
|
getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
|
|
getBucketTTFBMetric(),
|
|
}
|
|
|
|
bucketPeerMetricsGroups = []*MetricsGroup{
|
|
getHTTPMetrics(MetricsGroupOpts{bucketOnly: true}),
|
|
getBucketTTFBMetric(),
|
|
}
|
|
|
|
nodeCollector = newMinioCollectorNode(nodeGroups)
|
|
clusterCollector = newMinioClusterCollector(allMetricsGroups)
|
|
bucketCollector = newMinioBucketCollector(bucketMetricsGroups)
|
|
}
|
|
|
|
// MetricNamespace is the top-level grouping of metrics; it forms the
// leading part of a metric name.
type MetricNamespace string

// MetricSubsystem is the grouping of metrics inside a namespace.
type MetricSubsystem string

// Namespaces used by the metric descriptions in this file.
const (
	bucketMetricNamespace    MetricNamespace = "minio_bucket"
	clusterMetricNamespace   MetricNamespace = "minio_cluster"
	healMetricNamespace      MetricNamespace = "minio_heal"
	interNodeMetricNamespace MetricNamespace = "minio_inter_node"
	nodeMetricNamespace      MetricNamespace = "minio_node"
	minioMetricNamespace     MetricNamespace = "minio"
	s3MetricNamespace        MetricNamespace = "minio_s3"
)

// Subsystems used by the metric descriptions in this file.
const (
	cacheSubsystem            MetricSubsystem = "cache"
	capacityRawSubsystem      MetricSubsystem = "capacity_raw"
	capacityUsableSubsystem   MetricSubsystem = "capacity_usable"
	driveSubsystem            MetricSubsystem = "drive"
	interfaceSubsystem        MetricSubsystem = "if"
	memSubsystem              MetricSubsystem = "mem"
	cpuSubsystem              MetricSubsystem = "cpu_avg"
	storageClassSubsystem     MetricSubsystem = "storage_class"
	fileDescriptorSubsystem   MetricSubsystem = "file_descriptor"
	goRoutines                MetricSubsystem = "go_routine"
	ioSubsystem               MetricSubsystem = "io"
	nodesSubsystem            MetricSubsystem = "nodes"
	objectsSubsystem          MetricSubsystem = "objects"
	bucketsSubsystem          MetricSubsystem = "bucket"
	processSubsystem          MetricSubsystem = "process"
	replicationSubsystem      MetricSubsystem = "replication"
	requestsSubsystem         MetricSubsystem = "requests"
	requestsRejectedSubsystem MetricSubsystem = "requests_rejected"
	timeSubsystem             MetricSubsystem = "time"
	ttfbSubsystem             MetricSubsystem = "requests_ttfb"
	trafficSubsystem          MetricSubsystem = "traffic"
	softwareSubsystem         MetricSubsystem = "software"
	sysCallSubsystem          MetricSubsystem = "syscall"
	usageSubsystem            MetricSubsystem = "usage"
	quotaSubsystem            MetricSubsystem = "quota"
	ilmSubsystem              MetricSubsystem = "ilm"
	tierSubsystem             MetricSubsystem = "tier"
	scannerSubsystem          MetricSubsystem = "scanner"
	iamSubsystem              MetricSubsystem = "iam"
	kmsSubsystem              MetricSubsystem = "kms"
	notifySubsystem           MetricSubsystem = "notify"
	lambdaSubsystem           MetricSubsystem = "lambda"
	auditSubsystem            MetricSubsystem = "audit"
	webhookSubsystem          MetricSubsystem = "webhook"
)
|
|
|
|
// MetricName is the individual name of a metric inside its namespace and
// subsystem.
type MetricName string

// Metric names. Constants declared without an explicit type are untyped
// string constants.
const (
	// Generic totals.
	authTotal         MetricName = "auth_total"
	canceledTotal     MetricName = "canceled_total"
	errorsTotal       MetricName = "errors_total"
	headerTotal       MetricName = "header_total"
	healTotal         MetricName = "heal_total"
	hitsTotal         MetricName = "hits_total"
	inflightTotal     MetricName = "inflight_total"
	invalidTotal      MetricName = "invalid_total"
	limitTotal        MetricName = "limit_total"
	missedTotal       MetricName = "missed_total"
	waitingTotal      MetricName = "waiting_total"
	incomingTotal     MetricName = "incoming_total"
	objectTotal       MetricName = "object_total"
	versionTotal      MetricName = "version_total"
	deleteMarkerTotal MetricName = "deletemarker_total"
	offlineTotal      MetricName = "offline_total"
	onlineTotal       MetricName = "online_total"
	openTotal         MetricName = "open_total"
	readTotal         MetricName = "read_total"
	timestampTotal    MetricName = "timestamp_total"
	writeTotal        MetricName = "write_total"
	total             MetricName = "total"
	freeInodes        MetricName = "free_inodes"

	// Failure counts and bytes per time window.
	lastMinFailedCount  MetricName = "last_minute_failed_count"
	lastMinFailedBytes  MetricName = "last_minute_failed_bytes"
	lastHourFailedCount MetricName = "last_hour_failed_count"
	lastHourFailedBytes MetricName = "last_hour_failed_bytes"
	totalFailedCount    MetricName = "total_failed_count"
	totalFailedBytes    MetricName = "total_failed_bytes"

	// Workers, queues and transfer rates.
	currActiveWorkers  MetricName = "current_active_workers"
	avgActiveWorkers   MetricName = "average_active_workers"
	maxActiveWorkers   MetricName = "max_active_workers"
	recentBacklogCount MetricName = "recent_backlog_count"
	currInQueueCount   MetricName = "last_minute_queued_count"
	currInQueueBytes   MetricName = "last_minute_queued_bytes"
	receivedCount      MetricName = "received_count"
	sentCount          MetricName = "sent_count"
	currTransferRate   MetricName = "current_transfer_rate"
	avgTransferRate    MetricName = "average_transfer_rate"
	maxTransferRate    MetricName = "max_transfer_rate"
	credentialErrors   MetricName = "credential_errors"

	// Link latency.
	currLinkLatency MetricName = "current_link_latency_ms"
	avgLinkLatency  MetricName = "average_link_latency_ms"
	maxLinkLatency  MetricName = "max_link_latency_ms"

	// Link availability.
	linkOnline                MetricName = "link_online"
	linkOfflineDuration       MetricName = "link_offline_duration_seconds"
	linkDowntimeTotalDuration MetricName = "link_downtime_duration_seconds"

	// Queue averages and maxima.
	avgInQueueCount MetricName = "average_queued_count"
	avgInQueueBytes MetricName = "average_queued_bytes"
	maxInQueueCount MetricName = "max_queued_count"
	maxInQueueBytes MetricName = "max_queued_bytes"

	// Byte counts and millisecond latency.
	freeBytes       MetricName = "free_bytes"
	readBytes       MetricName = "read_bytes"
	rcharBytes      MetricName = "rchar_bytes"
	receivedBytes   MetricName = "received_bytes"
	latencyMilliSec MetricName = "latency_ms"
	sentBytes       MetricName = "sent_bytes"
	totalBytes      MetricName = "total_bytes"
	usedBytes       MetricName = "used_bytes"
	writeBytes      MetricName = "write_bytes"
	wcharBytes      MetricName = "wchar_bytes"

	// Finer-grained latency units.
	latencyMicroSec MetricName = "latency_us"
	latencyNanoSec  MetricName = "latency_ns"

	// Info metrics.
	commitInfo  MetricName = "commit_info"
	usageInfo   MetricName = "usage_info"
	versionInfo MetricName = "version_info"

	// Distributions (untyped).
	sizeDistribution    = "size_distribution"
	versionDistribution = "version_distribution"
	ttfbDistribution    = "seconds_distribution"
	ttlbDistribution    = "ttlb_seconds_distribution"

	// Activity, process time and memory names (untyped).
	lastActivityTime = "last_activity_nano_seconds"
	startTime        = "starttime_seconds"
	upTime           = "uptime_seconds"
	memory           = "resident_memory_bytes"
	vmemory          = "virtual_memory_bytes"
	cpu              = "cpu_total_seconds"

	// Expiry and transition tasks.
	expiryPendingTasks     MetricName = "expiry_pending_tasks"
	transitionPendingTasks MetricName = "transition_pending_tasks"
	transitionActiveTasks  MetricName = "transition_active_tasks"
	transitionMissedTasks  MetricName = "transition_missed_immediate_tasks"

	// Transitioned data.
	transitionedBytes    MetricName = "transitioned_bytes"
	transitionedObjects  MetricName = "transitioned_objects"
	transitionedVersions MetricName = "transitioned_versions"

	// Tier requests.
	tierRequestsSuccess MetricName = "requests_success"
	tierRequestsFailure MetricName = "requests_failure"

	// KMS names (untyped).
	kmsOnline          = "online"
	kmsRequestsSuccess = "request_success"
	kmsRequestsError   = "request_error"
	kmsRequestsFail    = "request_failure"
	kmsUptime          = "uptime"

	// Webhook names (untyped).
	webhookOnline         = "online"
	webhookQueueLength    = "queue_length"
	webhookTotalMessages  = "total_messages"
	webhookFailedMessages = "failed_messages"
)
|
|
|
|
// serverName holds the string "server".
// NOTE(review): presumably used as a metric label key; no use is visible here.
const serverName = "server"
|
|
|
|
// MetricType names the kind of a metric.
type MetricType string

// Supported metric kinds. They are untyped string constants, so they can be
// assigned to a MetricType field without conversion.
const (
	gaugeMetric     = "gaugeMetric"
	counterMetric   = "counterMetric"
	histogramMetric = "histogramMetric"
)
|
|
|
|
// MetricDescription describes the metric
|
|
type MetricDescription struct {
|
|
Namespace MetricNamespace `json:"MetricNamespace"`
|
|
Subsystem MetricSubsystem `json:"Subsystem"`
|
|
Name MetricName `json:"MetricName"`
|
|
Help string `json:"Help"`
|
|
Type MetricType `json:"Type"`
|
|
}
|
|
|
|
// Metric captures the details for a metric
|
|
type Metric struct {
|
|
Description MetricDescription `json:"Description"`
|
|
StaticLabels map[string]string `json:"StaticLabels"`
|
|
Value float64 `json:"Value"`
|
|
VariableLabels map[string]string `json:"VariableLabels"`
|
|
HistogramBucketLabel string `json:"HistogramBucketLabel"`
|
|
Histogram map[string]uint64 `json:"Histogram"`
|
|
}
|
|
|
|
// MetricsGroup are a group of metrics that are initialized together.
|
|
type MetricsGroup struct {
|
|
metricsCache timedValue
|
|
cacheInterval time.Duration
|
|
metricsGroupOpts MetricsGroupOpts
|
|
}
|
|
|
|
// MetricsGroupOpts are the options used to initialize a metrics group.
// Each depend* flag names a global that must be available before the group
// produces metrics; RegisterRead returns an empty list otherwise.
type MetricsGroupOpts struct {
	dependGlobalObjectAPI          bool
	dependGlobalAuthNPlugin        bool
	dependGlobalSiteReplicationSys bool
	dependGlobalNotificationSys    bool
	dependGlobalKMS                bool
	// bucketOnly is not a dependency flag.
	// NOTE(review): presumably restricts the group to bucket metrics; confirm.
	bucketOnly                      bool
	dependGlobalLambdaTargetList    bool
	dependGlobalIAMSys              bool
	dependGlobalLockServer          bool
	dependGlobalIsDistErasure       bool
	dependGlobalBackgroundHealState bool
	dependBucketTargetSys           bool
}
|
|
|
|
// RegisterRead register the metrics populator function to be used
|
|
// to populate new values upon cache invalidation.
|
|
func (g *MetricsGroup) RegisterRead(read func(ctx context.Context) []Metric) {
|
|
g.metricsCache.Once.Do(func() {
|
|
g.metricsCache.Relax = true
|
|
g.metricsCache.TTL = g.cacheInterval
|
|
g.metricsCache.Update = func() (interface{}, error) {
|
|
if g.metricsGroupOpts.dependGlobalObjectAPI {
|
|
objLayer := newObjectLayerFn()
|
|
// Service not initialized yet
|
|
if objLayer == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalAuthNPlugin {
|
|
if globalAuthNPlugin == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalSiteReplicationSys {
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalNotificationSys {
|
|
if globalNotificationSys == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalKMS {
|
|
if GlobalKMS == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalLambdaTargetList {
|
|
if globalLambdaTargetList == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalIAMSys {
|
|
if globalIAMSys == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalLockServer {
|
|
if globalLockServer == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalIsDistErasure {
|
|
if !globalIsDistErasure {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependGlobalBackgroundHealState {
|
|
if globalBackgroundHealState == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
if g.metricsGroupOpts.dependBucketTargetSys {
|
|
if globalBucketTargetSys == nil {
|
|
return []Metric{}, nil
|
|
}
|
|
}
|
|
return read(GlobalContext), nil
|
|
}
|
|
})
|
|
}
|
|
|
|
func (m *Metric) copyMetric() Metric {
|
|
metric := Metric{
|
|
Description: m.Description,
|
|
Value: m.Value,
|
|
HistogramBucketLabel: m.HistogramBucketLabel,
|
|
StaticLabels: make(map[string]string),
|
|
VariableLabels: make(map[string]string),
|
|
Histogram: make(map[string]uint64),
|
|
}
|
|
for k, v := range m.StaticLabels {
|
|
metric.StaticLabels[k] = v
|
|
}
|
|
for k, v := range m.VariableLabels {
|
|
metric.VariableLabels[k] = v
|
|
}
|
|
for k, v := range m.Histogram {
|
|
metric.Histogram[k] = v
|
|
}
|
|
return metric
|
|
}
|
|
|
|
// Get - returns cached value always upton the configured TTL,
|
|
// once the TTL expires "read()" registered function is called
|
|
// to return the new values and updated.
|
|
func (g *MetricsGroup) Get() (metrics []Metric) {
|
|
c, _ := g.metricsCache.Get()
|
|
m, ok := c.([]Metric)
|
|
if !ok {
|
|
return []Metric{}
|
|
}
|
|
|
|
metrics = make([]Metric, 0, len(m))
|
|
for i := range m {
|
|
metrics = append(metrics, m[i].copyMetric())
|
|
}
|
|
return metrics
|
|
}
|
|
|
|
func getClusterBucketsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: bucketsSubsystem,
|
|
Name: total,
|
|
Help: "Total number of buckets in the cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterCapacityTotalBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: capacityRawSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total capacity online in the cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterCapacityFreeBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: capacityRawSubsystem,
|
|
Name: freeBytes,
|
|
Help: "Total free capacity online in the cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterCapacityUsageBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: capacityUsableSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total usable capacity online in the cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterCapacityUsageFreeBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: capacityUsableSubsystem,
|
|
Name: freeBytes,
|
|
Help: "Total free usable capacity online in the cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveAPILatencyMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: latencyMicroSec,
|
|
Help: "Average last minute latency in µs for drive API storage operations",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveUsedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: usedBytes,
|
|
Help: "Total storage used on a drive",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveTimeoutErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: "errors_timeout",
|
|
Help: "Total number of drive timeout errors since server start",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveAvailablityErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: "errors_availability",
|
|
Help: "Total number of drive I/O errors, permission denied and timeouts since server start",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveWaitingIOMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: "io_waiting",
|
|
Help: "Total number I/O operations waiting on drive",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveTokensIOMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: "io_tokens",
|
|
Help: "Total number concurrent I/O operations configured on drive",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveFreeBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: freeBytes,
|
|
Help: "Total storage available on a drive",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterDrivesOfflineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: offlineTotal,
|
|
Help: "Total drives offline in this cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterDrivesOnlineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: onlineTotal,
|
|
Help: "Total drives online in this cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterDrivesTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: total,
|
|
Help: "Total drives in this cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDrivesOfflineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: offlineTotal,
|
|
Help: "Total drives offline in this node",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDrivesOnlineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: onlineTotal,
|
|
Help: "Total drives online in this node",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDrivesTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: total,
|
|
Help: "Total drives in this node",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeStandardParityMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: storageClassSubsystem,
|
|
Name: "standard_parity",
|
|
Help: "standard storage class parity",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeRRSParityMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: storageClassSubsystem,
|
|
Name: "rrs_parity",
|
|
Help: "reduced redundancy storage class parity",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDrivesFreeInodesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: freeInodes,
|
|
Help: "Free inodes on a drive",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeDriveTotalBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: driveSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total storage on a drive",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getUsageLastScanActivityMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: minioMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: lastActivityTime,
|
|
Help: "Time elapsed (in nano seconds) since last scan activity.",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketUsageQuotaTotalBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: quotaSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total bucket quota size in bytes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketTrafficReceivedBytes() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: receivedBytes,
|
|
Help: "Total number of S3 bytes received for this bucket",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketTrafficSentBytes() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: sentBytes,
|
|
Help: "Total number of S3 bytes sent for this bucket",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketUsageTotalBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total bucket size in bytes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterUsageTotalBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: totalBytes,
|
|
Help: "Total cluster usage in bytes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterUsageObjectsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: objectTotal,
|
|
Help: "Total number of objects in a cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterUsageVersionsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: versionTotal,
|
|
Help: "Total number of versions (includes delete marker) in a cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterUsageDeleteMarkersTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: deleteMarkerTotal,
|
|
Help: "Total number of delete markers in a cluster",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketUsageObjectsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: objectTotal,
|
|
Help: "Total number of objects",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketUsageVersionsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: versionTotal,
|
|
Help: "Total number of versions (includes delete marker)",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketUsageDeleteMarkersTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: usageSubsystem,
|
|
Name: deleteMarkerTotal,
|
|
Help: "Total number of delete markers",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterObjectDistributionMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: sizeDistribution,
|
|
Help: "Distribution of object sizes across a cluster",
|
|
Type: histogramMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterObjectVersionsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: versionDistribution,
|
|
Help: "Distribution of object versions across a cluster",
|
|
Type: histogramMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkLatencyCurrMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: currLinkLatency,
|
|
Help: "Replication current link latency in milliseconds",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkOnlineMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: linkOnline,
|
|
Help: "Reports whether replication link is online (1) or offline(0)",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkCurrOfflineDurationMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: linkOfflineDuration,
|
|
Help: "Duration of replication link being offline in seconds since last offline event",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkTotalOfflineDurationMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: linkDowntimeTotalDuration,
|
|
Help: "Total downtime of replication link in seconds since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketRepLatencyMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: latencyMilliSec,
|
|
Help: "Replication latency in milliseconds",
|
|
Type: histogramMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedBytesLastMinuteMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: lastMinFailedBytes,
|
|
Help: "Total number of bytes failed at least once to replicate in the last full minute",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedOperationsLastMinuteMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: lastMinFailedCount,
|
|
Help: "Total number of objects which failed replication in the last full minute",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedBytesLastHourMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: lastHourFailedBytes,
|
|
Help: "Total number of bytes failed at least once to replicate in the last hour",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedOperationsLastHourMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: lastHourFailedCount,
|
|
Help: "Total number of objects which failed replication in the last hour",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedBytesTotalMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: totalFailedBytes,
|
|
Help: "Total number of bytes failed at least once to replicate since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepFailedOperationsTotalMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: totalFailedCount,
|
|
Help: "Total number of objects which failed replication since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepSentBytesMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: sentBytes,
|
|
Help: "Total number of bytes replicated to the target",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepSentOperationsMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: sentCount,
|
|
Help: "Total number of objects replicated to the target",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepReceivedBytesMD(namespace MetricNamespace) MetricDescription {
|
|
helpText := "Total number of bytes replicated to this bucket from another source bucket"
|
|
if namespace == clusterMetricNamespace {
|
|
helpText = "Total number of bytes replicated to this cluster from site replication peer"
|
|
}
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: receivedBytes,
|
|
Help: helpText,
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getRepReceivedOperationsMD(namespace MetricNamespace) MetricDescription {
|
|
help := "Total number of objects received by this cluster"
|
|
if namespace == bucketMetricNamespace {
|
|
help = "Total number of objects received by this bucket from another source bucket"
|
|
}
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: receivedCount,
|
|
Help: help,
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplMRFFailedOperationsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: recentBacklogCount,
|
|
Help: "Total number of objects seen in replication backlog in the last 5 minutes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepCredentialErrorsMD(namespace MetricNamespace) MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: namespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: credentialErrors,
|
|
Help: "Total number of replication credential errors since server start",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplCurrQueuedOperationsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: currInQueueCount,
|
|
Help: "Total number of objects queued for replication in the last full minute",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplCurrQueuedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: currInQueueBytes,
|
|
Help: "Total number of bytes queued for replication in the last full minute",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplActiveWorkersCountMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: currActiveWorkers,
|
|
Help: "Total number of active replication workers",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplAvgActiveWorkersCountMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: avgActiveWorkers,
|
|
Help: "Average number of active replication workers",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplMaxActiveWorkersCountMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: maxActiveWorkers,
|
|
Help: "Maximum number of active replication workers seen since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplCurrentTransferRateMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: currTransferRate,
|
|
Help: "Current replication transfer rate in bytes/sec",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkLatencyMaxMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: maxLinkLatency,
|
|
Help: "Maximum replication link latency in milliseconds seen since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterRepLinkLatencyAvgMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: avgLinkLatency,
|
|
Help: "Average replication link latency in milliseconds",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplAvgQueuedOperationsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: avgInQueueCount,
|
|
Help: "Average number of objects queued for replication since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplAvgQueuedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: avgInQueueBytes,
|
|
Help: "Average number of bytes queued for replication since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplMaxQueuedOperationsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: maxInQueueCount,
|
|
Help: "Maximum number of objects queued for replication since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplMaxQueuedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: maxInQueueBytes,
|
|
Help: "Maximum number of bytes queued for replication since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplAvgTransferRateMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: avgTransferRate,
|
|
Help: "Average replication transfer rate in bytes/sec",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterReplMaxTransferRateMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: replicationSubsystem,
|
|
Name: maxTransferRate,
|
|
Help: "Maximum replication transfer rate in bytes/sec seen since server start",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketObjectDistributionMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: sizeDistribution,
|
|
Help: "Distribution of object sizes in the bucket, includes label for the bucket name",
|
|
Type: histogramMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketObjectVersionsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: versionDistribution,
|
|
Help: "Distribution of object sizes in the bucket, includes label for the bucket name",
|
|
Type: histogramMetric,
|
|
}
|
|
}
|
|
|
|
func getInternodeFailedRequests() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: interNodeMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: errorsTotal,
|
|
Help: "Total number of failed internode calls",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getInternodeTCPDialTimeout() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: interNodeMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: "dial_errors",
|
|
Help: "Total number of internode TCP dial timeouts and errors",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getInternodeTCPAvgDuration() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: interNodeMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: "dial_avg_time",
|
|
Help: "Average time of internodes TCP dial calls",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getInterNodeSentBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: interNodeMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: sentBytes,
|
|
Help: "Total number of bytes sent to the other peer nodes",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getInterNodeReceivedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: interNodeMetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: receivedBytes,
|
|
Help: "Total number of bytes received from other peer nodes",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3SentBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: sentBytes,
|
|
Help: "Total number of s3 bytes sent",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3ReceivedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: trafficSubsystem,
|
|
Name: receivedBytes,
|
|
Help: "Total number of s3 bytes received",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RequestsInFlightMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: inflightTotal,
|
|
Help: "Total number of S3 requests currently in flight",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RequestsInQueueMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: waitingTotal,
|
|
Help: "Total number of S3 requests in the waiting queue",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getIncomingS3RequestsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: incomingTotal,
|
|
Help: "Total number of incoming S3 requests",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: total,
|
|
Help: "Total number of S3 requests",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RequestsErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: errorsTotal,
|
|
Help: "Total number of S3 requests with (4xx and 5xx) errors",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3Requests4xxErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: "4xx_" + errorsTotal,
|
|
Help: "Total number of S3 requests with (4xx) errors",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3Requests5xxErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: "5xx_" + errorsTotal,
|
|
Help: "Total number of S3 requests with (5xx) errors",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RequestsCanceledMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: canceledTotal,
|
|
Help: "Total number of S3 requests that were canceled by the client",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RejectedAuthRequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsRejectedSubsystem,
|
|
Name: authTotal,
|
|
Help: "Total number of S3 requests rejected for auth failure",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RejectedHeaderRequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsRejectedSubsystem,
|
|
Name: headerTotal,
|
|
Help: "Total number of S3 requests rejected for invalid header",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RejectedTimestampRequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsRejectedSubsystem,
|
|
Name: timestampTotal,
|
|
Help: "Total number of S3 requests rejected for invalid timestamp",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getS3RejectedInvalidRequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: requestsRejectedSubsystem,
|
|
Name: invalidTotal,
|
|
Help: "Total number of invalid S3 requests",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getHealObjectsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: healMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: total,
|
|
Help: "Objects scanned in current self healing run",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getHealObjectsHealTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: healMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: healTotal,
|
|
Help: "Objects healed in current self healing run",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getHealObjectsFailTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: healMetricNamespace,
|
|
Subsystem: objectsSubsystem,
|
|
Name: errorsTotal,
|
|
Help: "Objects for which healing failed in current self healing run",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getHealLastActivityTimeMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: healMetricNamespace,
|
|
Subsystem: timeSubsystem,
|
|
Name: lastActivityTime,
|
|
Help: "Time elapsed (in nano seconds) since last self healing activity.",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeOnlineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: nodesSubsystem,
|
|
Name: onlineTotal,
|
|
Help: "Total number of MinIO nodes online",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getNodeOfflineTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: nodesSubsystem,
|
|
Name: offlineTotal,
|
|
Help: "Total number of MinIO nodes offline",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOVersionMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: minioMetricNamespace,
|
|
Subsystem: softwareSubsystem,
|
|
Name: versionInfo,
|
|
Help: "MinIO Release tag for the server",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOCommitMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: minioMetricNamespace,
|
|
Subsystem: softwareSubsystem,
|
|
Name: commitInfo,
|
|
Help: "Git commit hash for the MinIO release",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getS3TTFBDistributionMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: s3MetricNamespace,
|
|
Subsystem: ttfbSubsystem,
|
|
Name: ttfbDistribution,
|
|
Help: "Distribution of time to first byte across API calls",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketTTFBDistributionMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: ttfbSubsystem,
|
|
Name: ttfbDistribution,
|
|
Help: "Distribution of time to first byte across API calls per bucket",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioFDOpenMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: fileDescriptorSubsystem,
|
|
Name: openTotal,
|
|
Help: "Total number of open file descriptors by the MinIO Server process",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioFDLimitMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: fileDescriptorSubsystem,
|
|
Name: limitTotal,
|
|
Help: "Limit on total number of open file descriptors for the MinIO Server process",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioProcessIOWriteBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ioSubsystem,
|
|
Name: writeBytes,
|
|
Help: "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioProcessIOReadBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ioSubsystem,
|
|
Name: readBytes,
|
|
Help: "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioProcessIOWriteCachedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ioSubsystem,
|
|
Name: wcharBytes,
|
|
Help: "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioProcessIOReadCachedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ioSubsystem,
|
|
Name: rcharBytes,
|
|
Help: "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessSysCallRMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: sysCallSubsystem,
|
|
Name: readTotal,
|
|
Help: "Total read SysCalls to the kernel. /proc/[pid]/io syscr",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessSysCallWMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: sysCallSubsystem,
|
|
Name: writeTotal,
|
|
Help: "Total write SysCalls to the kernel. /proc/[pid]/io syscw",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOGORoutineCountMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: goRoutines,
|
|
Name: total,
|
|
Help: "Total number of go routines running",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessStartTimeMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: processSubsystem,
|
|
Name: startTime,
|
|
Help: "Start time for MinIO process per node, time in seconds since Unix epoc",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessUptimeMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: processSubsystem,
|
|
Name: upTime,
|
|
Help: "Uptime for MinIO process per node in seconds",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessResidentMemory() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: processSubsystem,
|
|
Name: memory,
|
|
Help: "Resident memory size in bytes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessVirtualMemory() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: processSubsystem,
|
|
Name: memory,
|
|
Help: "Virtual memory size in bytes",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getMinIOProcessCPUTime() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: processSubsystem,
|
|
Name: cpu,
|
|
Help: "Total user and system CPU time spent in seconds",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getMinioProcMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
if runtime.GOOS == "windows" {
|
|
return nil
|
|
}
|
|
|
|
p, err := procfs.Self()
|
|
if err != nil {
|
|
logger.LogOnceIf(ctx, err, string(nodeMetricNamespace))
|
|
return
|
|
}
|
|
|
|
openFDs, _ := p.FileDescriptorsLen()
|
|
l, _ := p.Limits()
|
|
io, _ := p.IO()
|
|
stat, _ := p.Stat()
|
|
startTime, _ := stat.StartTime()
|
|
|
|
metrics = make([]Metric, 0, 20)
|
|
|
|
if openFDs > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioFDOpenMD(),
|
|
Value: float64(openFDs),
|
|
},
|
|
)
|
|
}
|
|
|
|
if l.OpenFiles > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioFDLimitMD(),
|
|
Value: float64(l.OpenFiles),
|
|
})
|
|
}
|
|
|
|
if io.SyscR > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessSysCallRMD(),
|
|
Value: float64(io.SyscR),
|
|
})
|
|
}
|
|
|
|
if io.SyscW > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessSysCallWMD(),
|
|
Value: float64(io.SyscW),
|
|
})
|
|
}
|
|
|
|
if io.ReadBytes > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioProcessIOReadBytesMD(),
|
|
Value: float64(io.ReadBytes),
|
|
})
|
|
}
|
|
|
|
if io.WriteBytes > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioProcessIOWriteBytesMD(),
|
|
Value: float64(io.WriteBytes),
|
|
})
|
|
}
|
|
|
|
if io.RChar > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioProcessIOReadCachedBytesMD(),
|
|
Value: float64(io.RChar),
|
|
})
|
|
}
|
|
|
|
if io.WChar > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinioProcessIOWriteCachedBytesMD(),
|
|
Value: float64(io.WChar),
|
|
})
|
|
}
|
|
|
|
if startTime > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessStartTimeMD(),
|
|
Value: startTime,
|
|
})
|
|
}
|
|
|
|
if !globalBootTime.IsZero() {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessUptimeMD(),
|
|
Value: time.Since(globalBootTime).Seconds(),
|
|
})
|
|
}
|
|
|
|
if stat.ResidentMemory() > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessResidentMemory(),
|
|
Value: float64(stat.ResidentMemory()),
|
|
})
|
|
}
|
|
|
|
if stat.VirtualMemory() > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessVirtualMemory(),
|
|
Value: float64(stat.VirtualMemory()),
|
|
})
|
|
}
|
|
|
|
if stat.CPUTime() > 0 {
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: getMinIOProcessCPUTime(),
|
|
Value: stat.CPUTime(),
|
|
})
|
|
}
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getGoMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
metrics = append(metrics, Metric{
|
|
Description: getMinIOGORoutineCountMD(),
|
|
Value: float64(runtime.NumGoroutine()),
|
|
})
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
// getHistogramMetrics fetches histogram metrics and returns it in a []Metric
|
|
// Note: Typically used in MetricGroup.RegisterRead
|
|
func getHistogramMetrics(hist *prometheus.HistogramVec, desc MetricDescription) []Metric {
|
|
ch := make(chan prometheus.Metric)
|
|
go func() {
|
|
defer xioutil.SafeClose(ch)
|
|
// Collects prometheus metrics from hist and sends it over ch
|
|
hist.Collect(ch)
|
|
}()
|
|
|
|
// Converts metrics received into internal []Metric type
|
|
var metrics []Metric
|
|
for promMetric := range ch {
|
|
dtoMetric := &dto.Metric{}
|
|
err := promMetric.Write(dtoMetric)
|
|
if err != nil {
|
|
// Log error and continue to receive other metric
|
|
// values
|
|
logger.LogIf(GlobalContext, err)
|
|
continue
|
|
}
|
|
|
|
h := dtoMetric.GetHistogram()
|
|
for _, b := range h.Bucket {
|
|
labels := make(map[string]string)
|
|
for _, lp := range dtoMetric.GetLabel() {
|
|
labels[*lp.Name] = *lp.Value
|
|
}
|
|
labels["le"] = fmt.Sprintf("%.3f", *b.UpperBound)
|
|
metric := Metric{
|
|
Description: desc,
|
|
VariableLabels: labels,
|
|
Value: float64(b.GetCumulativeCount()),
|
|
}
|
|
metrics = append(metrics, metric)
|
|
}
|
|
// add metrics with +Inf label
|
|
labels1 := make(map[string]string)
|
|
for _, lp := range dtoMetric.GetLabel() {
|
|
labels1[*lp.Name] = *lp.Value
|
|
}
|
|
labels1["le"] = fmt.Sprintf("%.3f", math.Inf(+1))
|
|
metrics = append(metrics, Metric{
|
|
Description: desc,
|
|
VariableLabels: labels1,
|
|
Value: dtoMetric.Counter.GetValue(),
|
|
})
|
|
}
|
|
return metrics
|
|
}
|
|
|
|
func getBucketTTFBMetric() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
return getHistogramMetrics(bucketHTTPRequestsDuration, getBucketTTFBDistributionMD())
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getS3TTFBMetric() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
return getHistogramMetrics(httpRequestsDuration, getS3TTFBDistributionMD())
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getTierMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
return globalTierMetrics.Report()
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getTransitionPendingTasksMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionPendingTasks,
|
|
Help: "Number of pending ILM transition tasks in the queue",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getTransitionActiveTasksMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionActiveTasks,
|
|
Help: "Number of active ILM transition tasks",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getTransitionMissedTasksMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionMissedTasks,
|
|
Help: "Number of missed immediate ILM transition tasks",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getExpiryPendingTasksMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: expiryPendingTasks,
|
|
Help: "Number of pending ILM expiry tasks in the queue",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketS3RequestsInFlightMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: inflightTotal,
|
|
Help: "Total number of S3 requests currently in flight on a bucket",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketS3RequestsTotalMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: total,
|
|
Help: "Total number of S3 requests on a bucket",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketS3Requests4xxErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: "4xx_" + errorsTotal,
|
|
Help: "Total number of S3 requests with (4xx) errors on a bucket",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketS3Requests5xxErrorsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: "5xx_" + errorsTotal,
|
|
Help: "Total number of S3 requests with (5xx) errors on a bucket",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getBucketS3RequestsCanceledMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: requestsSubsystem,
|
|
Name: canceledTotal,
|
|
Help: "Total number of S3 requests that were canceled from the client while processing on a bucket",
|
|
Type: counterMetric,
|
|
}
|
|
}
|
|
|
|
func getILMNodeMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) []Metric {
|
|
expPendingTasks := Metric{
|
|
Description: getExpiryPendingTasksMD(),
|
|
}
|
|
trPendingTasks := Metric{
|
|
Description: getTransitionPendingTasksMD(),
|
|
}
|
|
trActiveTasks := Metric{
|
|
Description: getTransitionActiveTasksMD(),
|
|
}
|
|
trMissedTasks := Metric{
|
|
Description: getTransitionMissedTasksMD(),
|
|
}
|
|
if globalExpiryState != nil {
|
|
expPendingTasks.Value = float64(globalExpiryState.PendingTasks())
|
|
}
|
|
if globalTransitionState != nil {
|
|
trPendingTasks.Value = float64(globalTransitionState.PendingTasks())
|
|
trActiveTasks.Value = float64(globalTransitionState.ActiveTasks())
|
|
trMissedTasks.Value = float64(globalTransitionState.MissedImmediateTasks())
|
|
}
|
|
return []Metric{
|
|
expPendingTasks,
|
|
trPendingTasks,
|
|
trActiveTasks,
|
|
trMissedTasks,
|
|
}
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getScannerNodeMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) []Metric {
|
|
metrics := []Metric{
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: scannerSubsystem,
|
|
Name: "objects_scanned",
|
|
Help: "Total number of unique objects scanned since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanObject)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: scannerSubsystem,
|
|
Name: "versions_scanned",
|
|
Help: "Total number of object versions scanned since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricApplyVersion)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: scannerSubsystem,
|
|
Name: "directories_scanned",
|
|
Help: "Total number of directories scanned since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanFolder)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: scannerSubsystem,
|
|
Name: "bucket_scans_started",
|
|
Help: "Total number of bucket scans started since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive) + uint64(globalScannerMetrics.activeDrives())),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: scannerSubsystem,
|
|
Name: "bucket_scans_finished",
|
|
Help: "Total number of bucket scans finished since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricScanBucketDrive)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: "versions_scanned",
|
|
Help: "Total number of object versions checked for ilm actions since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(globalScannerMetrics.lifetime(scannerMetricILM)),
|
|
},
|
|
}
|
|
for i := range globalScannerMetrics.actions {
|
|
action := lifecycle.Action(i)
|
|
v := globalScannerMetrics.lifetimeActions(action)
|
|
if v == 0 {
|
|
continue
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: MetricName("action_count_" + toSnake(action.String())),
|
|
Help: "Total action outcome of lifecycle checks since server start",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(v),
|
|
})
|
|
}
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getIAMNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
|
|
lastSyncTime := atomic.LoadUint64(&globalIAMSys.LastRefreshTimeUnixNano)
|
|
var sinceLastSyncMillis uint64
|
|
if lastSyncTime != 0 {
|
|
sinceLastSyncMillis = (uint64(time.Now().UnixNano()) - lastSyncTime) / uint64(time.Millisecond)
|
|
}
|
|
|
|
pluginAuthNMetrics := globalAuthNPlugin.Metrics()
|
|
metrics = []Metric{
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "last_sync_duration_millis",
|
|
Help: "Last successful IAM data sync duration in milliseconds",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(atomic.LoadUint64(&globalIAMSys.LastRefreshDurationMilliseconds)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "since_last_sync_millis",
|
|
Help: "Time (in milliseconds) since last successful IAM data sync.",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(sinceLastSyncMillis),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "sync_successes",
|
|
Help: "Number of successful IAM data syncs since server start.",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshSuccesses)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "sync_failures",
|
|
Help: "Number of failed IAM data syncs since server start.",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(atomic.LoadUint64(&globalIAMSys.TotalRefreshFailures)),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_last_succ_seconds",
|
|
Help: "When plugin authentication is configured, returns time (in seconds) since the last successful request to the service",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: pluginAuthNMetrics.LastReachableSecs,
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_last_fail_seconds",
|
|
Help: "When plugin authentication is configured, returns time (in seconds) since the last failed request to the service",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: pluginAuthNMetrics.LastUnreachableSecs,
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_total_requests_minute",
|
|
Help: "When plugin authentication is configured, returns total requests count in the last full minute",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(pluginAuthNMetrics.TotalRequests),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_failed_requests_minute",
|
|
Help: "When plugin authentication is configured, returns failed requests count in the last full minute",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(pluginAuthNMetrics.FailedRequests),
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_succ_avg_rtt_ms_minute",
|
|
Help: "When plugin authentication is configured, returns average round-trip-time of successful requests in the last full minute",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: pluginAuthNMetrics.AvgSuccRTTMs,
|
|
},
|
|
{
|
|
Description: MetricDescription{
|
|
Namespace: nodeMetricNamespace,
|
|
Subsystem: iamSubsystem,
|
|
Name: "plugin_authn_service_succ_max_rtt_ms_minute",
|
|
Help: "When plugin authentication is configured, returns maximum round-trip-time of successful requests in the last full minute",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: pluginAuthNMetrics.MaxSuccRTTMs,
|
|
},
|
|
}
|
|
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
// replication metrics for each node - published to the cluster endpoint with nodename as label
|
|
func getReplicationNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
const (
|
|
Online = 1
|
|
Offline = 0
|
|
)
|
|
|
|
mg.RegisterRead(func(_ context.Context) []Metric {
|
|
var ml []Metric
|
|
// common operational metrics for bucket replication and site replication - published
|
|
// at cluster level
|
|
if globalReplicationStats != nil {
|
|
qs := globalReplicationStats.getNodeQueueStatsSummary()
|
|
activeWorkersCount := Metric{
|
|
Description: getClusterReplActiveWorkersCountMD(),
|
|
}
|
|
avgActiveWorkersCount := Metric{
|
|
Description: getClusterReplAvgActiveWorkersCountMD(),
|
|
}
|
|
maxActiveWorkersCount := Metric{
|
|
Description: getClusterReplMaxActiveWorkersCountMD(),
|
|
}
|
|
currInQueueCount := Metric{
|
|
Description: getClusterReplCurrQueuedOperationsMD(),
|
|
}
|
|
currInQueueBytes := Metric{
|
|
Description: getClusterReplCurrQueuedBytesMD(),
|
|
}
|
|
|
|
currTransferRate := Metric{
|
|
Description: getClusterReplCurrentTransferRateMD(),
|
|
}
|
|
avgQueueCount := Metric{
|
|
Description: getClusterReplAvgQueuedOperationsMD(),
|
|
}
|
|
avgQueueBytes := Metric{
|
|
Description: getClusterReplAvgQueuedBytesMD(),
|
|
}
|
|
maxQueueCount := Metric{
|
|
Description: getClusterReplMaxQueuedOperationsMD(),
|
|
}
|
|
maxQueueBytes := Metric{
|
|
Description: getClusterReplMaxQueuedBytesMD(),
|
|
}
|
|
avgTransferRate := Metric{
|
|
Description: getClusterReplAvgTransferRateMD(),
|
|
}
|
|
maxTransferRate := Metric{
|
|
Description: getClusterReplMaxTransferRateMD(),
|
|
}
|
|
mrfCount := Metric{
|
|
Description: getClusterReplMRFFailedOperationsMD(),
|
|
Value: float64(qs.MRFStats.LastFailedCount),
|
|
}
|
|
|
|
if qs.QStats.Avg.Count > 0 || qs.QStats.Curr.Count > 0 {
|
|
qt := qs.QStats
|
|
currInQueueBytes.Value = qt.Curr.Bytes
|
|
currInQueueCount.Value = qt.Curr.Count
|
|
avgQueueBytes.Value = qt.Avg.Bytes
|
|
avgQueueCount.Value = qt.Avg.Count
|
|
maxQueueBytes.Value = qt.Max.Bytes
|
|
maxQueueCount.Value = qt.Max.Count
|
|
}
|
|
activeWorkersCount.Value = float64(qs.ActiveWorkers.Curr)
|
|
avgActiveWorkersCount.Value = float64(qs.ActiveWorkers.Avg)
|
|
maxActiveWorkersCount.Value = float64(qs.ActiveWorkers.Max)
|
|
|
|
if len(qs.XferStats) > 0 {
|
|
tots := qs.XferStats[Total]
|
|
currTransferRate.Value = tots.Curr
|
|
avgTransferRate.Value = tots.Avg
|
|
maxTransferRate.Value = tots.Peak
|
|
}
|
|
ml = []Metric{
|
|
activeWorkersCount,
|
|
avgActiveWorkersCount,
|
|
maxActiveWorkersCount,
|
|
currInQueueCount,
|
|
currInQueueBytes,
|
|
avgQueueCount,
|
|
avgQueueBytes,
|
|
maxQueueCount,
|
|
maxQueueBytes,
|
|
currTransferRate,
|
|
avgTransferRate,
|
|
maxTransferRate,
|
|
mrfCount,
|
|
}
|
|
}
|
|
for ep, health := range globalBucketTargetSys.healthStats() {
|
|
// link latency current
|
|
m := Metric{
|
|
Description: getClusterRepLinkLatencyCurrMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
m.Value = float64(health.latency.curr / time.Millisecond)
|
|
ml = append(ml, m)
|
|
|
|
// link latency average
|
|
m = Metric{
|
|
Description: getClusterRepLinkLatencyAvgMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
m.Value = float64(health.latency.avg / time.Millisecond)
|
|
ml = append(ml, m)
|
|
|
|
// link latency max
|
|
m = Metric{
|
|
Description: getClusterRepLinkLatencyMaxMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
m.Value = float64(health.latency.peak / time.Millisecond)
|
|
ml = append(ml, m)
|
|
|
|
linkOnline := Metric{
|
|
Description: getClusterRepLinkOnlineMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
online := Offline
|
|
if health.Online {
|
|
online = Online
|
|
}
|
|
linkOnline.Value = float64(online)
|
|
ml = append(ml, linkOnline)
|
|
offlineDuration := Metric{
|
|
Description: getClusterRepLinkCurrOfflineDurationMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
currDowntime := time.Duration(0)
|
|
if !health.Online && !health.lastOnline.IsZero() {
|
|
currDowntime = UTCNow().Sub(health.lastOnline)
|
|
}
|
|
offlineDuration.Value = float64(currDowntime / time.Second)
|
|
ml = append(ml, offlineDuration)
|
|
|
|
downtimeDuration := Metric{
|
|
Description: getClusterRepLinkTotalOfflineDurationMD(),
|
|
VariableLabels: map[string]string{
|
|
"endpoint": ep,
|
|
},
|
|
}
|
|
dwntime := currDowntime
|
|
if health.offlineDuration > currDowntime {
|
|
dwntime = health.offlineDuration
|
|
}
|
|
downtimeDuration.Value = float64(dwntime / time.Second)
|
|
ml = append(ml, downtimeDuration)
|
|
|
|
}
|
|
return ml
|
|
})
|
|
return mg
|
|
}
|
|
|
|
// replication metrics for site replication
|
|
func getReplicationSiteMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) []Metric {
|
|
ml := []Metric{}
|
|
|
|
// metrics pertinent to site replication - overall roll up.
|
|
if globalSiteReplicationSys.isEnabled() {
|
|
m, err := globalSiteReplicationSys.getSiteMetrics(GlobalContext)
|
|
if err != nil {
|
|
logger.LogIf(GlobalContext, err)
|
|
return ml
|
|
}
|
|
ml = append(ml, Metric{
|
|
Description: getRepReceivedBytesMD(clusterMetricNamespace),
|
|
Value: float64(m.ReplicaSize),
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepReceivedOperationsMD(clusterMetricNamespace),
|
|
Value: float64(m.ReplicaCount),
|
|
})
|
|
|
|
for _, stat := range m.Metrics {
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedBytesLastMinuteMD(clusterMetricNamespace),
|
|
Value: float64(stat.Failed.LastMinute.Bytes),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedOperationsLastMinuteMD(clusterMetricNamespace),
|
|
Value: stat.Failed.LastMinute.Count,
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedBytesLastHourMD(clusterMetricNamespace),
|
|
Value: float64(stat.Failed.LastHour.Bytes),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedOperationsLastHourMD(clusterMetricNamespace),
|
|
Value: stat.Failed.LastHour.Count,
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedBytesTotalMD(clusterMetricNamespace),
|
|
Value: float64(stat.Failed.Totals.Bytes),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepFailedOperationsTotalMD(clusterMetricNamespace),
|
|
Value: stat.Failed.Totals.Count,
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
|
|
ml = append(ml, Metric{
|
|
Description: getRepSentBytesMD(clusterMetricNamespace),
|
|
Value: float64(stat.ReplicatedSize),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
ml = append(ml, Metric{
|
|
Description: getRepSentOperationsMD(clusterMetricNamespace),
|
|
Value: float64(stat.ReplicatedCount),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
|
|
if c, ok := stat.Failed.ErrCounts["AccessDenied"]; ok {
|
|
ml = append(ml, Metric{
|
|
Description: getClusterRepCredentialErrorsMD(clusterMetricNamespace),
|
|
Value: float64(c),
|
|
VariableLabels: map[string]string{"endpoint": stat.Endpoint},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
return ml
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getMinioVersionMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
|
|
metrics = append(metrics, Metric{
|
|
Description: getMinIOCommitMD(),
|
|
VariableLabels: map[string]string{"commit": CommitID},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getMinIOVersionMD(),
|
|
VariableLabels: map[string]string{"version": Version},
|
|
})
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getNodeHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
|
|
metrics = make([]Metric, 0, 16)
|
|
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeOnlineTotalMD(),
|
|
Value: float64(nodesUp),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeOfflineTotalMD(),
|
|
Value: float64(nodesDown),
|
|
})
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(_ context.Context) (metrics []Metric) {
|
|
bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
if bgSeq.lastHealActivity.IsZero() {
|
|
return
|
|
}
|
|
|
|
metrics = make([]Metric, 0, 5)
|
|
metrics = append(metrics, Metric{
|
|
Description: getHealLastActivityTimeMD(),
|
|
Value: float64(time.Since(bgSeq.lastHealActivity)),
|
|
})
|
|
metrics = append(metrics, getObjectsScanned(bgSeq)...)
|
|
metrics = append(metrics, getHealedItems(bgSeq)...)
|
|
metrics = append(metrics, getFailedItems(bgSeq)...)
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getFailedItems(seq *healSequence) (m []Metric) {
|
|
items := seq.gethealFailedItemsMap()
|
|
m = make([]Metric, 0, len(items))
|
|
for k, v := range items {
|
|
s := strings.Split(k, ",")
|
|
m = append(m, Metric{
|
|
Description: getHealObjectsFailTotalMD(),
|
|
VariableLabels: map[string]string{
|
|
"mount_path": s[0],
|
|
"volume_status": s[1],
|
|
},
|
|
Value: float64(v),
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
func getHealedItems(seq *healSequence) (m []Metric) {
|
|
items := seq.getHealedItemsMap()
|
|
m = make([]Metric, 0, len(items))
|
|
for k, v := range items {
|
|
m = append(m, Metric{
|
|
Description: getHealObjectsHealTotalMD(),
|
|
VariableLabels: map[string]string{"type": string(k)},
|
|
Value: float64(v),
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
func getObjectsScanned(seq *healSequence) (m []Metric) {
|
|
items := seq.getScannedItemsMap()
|
|
m = make([]Metric, 0, len(items))
|
|
for k, v := range items {
|
|
m = append(m, Metric{
|
|
Description: getHealObjectsTotalMD(),
|
|
VariableLabels: map[string]string{"type": string(k)},
|
|
Value: float64(v),
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
func getDistLockMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
if !globalIsDistErasure {
|
|
return []Metric{}
|
|
}
|
|
|
|
st := globalLockServer.stats()
|
|
|
|
metrics := make([]Metric, 0, 3)
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: "locks",
|
|
Name: "total",
|
|
Help: "Number of current locks on this peer",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(st.Total),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: "locks",
|
|
Name: "write_total",
|
|
Help: "Number of current WRITE locks on this peer",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(st.Writes),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: "locks",
|
|
Name: "read_total",
|
|
Help: "Number of current READ locks on this peer",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(st.Reads),
|
|
})
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getNotificationMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
metrics := make([]Metric, 0, 3)
|
|
|
|
if globalEventNotifier != nil {
|
|
nstats := globalEventNotifier.targetList.Stats()
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "current_send_in_progress",
|
|
Help: "Number of concurrent async Send calls active to all targets (deprecated, please use 'minio_notify_target_current_send_in_progress' instead)",
|
|
Type: gaugeMetric,
|
|
},
|
|
Value: float64(nstats.CurrentSendCalls),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "events_skipped_total",
|
|
Help: "Events that were skipped to be sent to the targets due to the in-memory queue being full",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(nstats.EventsSkipped),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "events_errors_total",
|
|
Help: "Events that were failed to be sent to the targets (deprecated, please use 'minio_notify_target_failed_events' instead)",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(nstats.EventsErrorsTotal),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "events_sent_total",
|
|
Help: "Total number of events sent to the targets (deprecated, please use 'minio_notify_target_total_events' instead)",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(nstats.TotalEvents),
|
|
})
|
|
for id, st := range nstats.TargetStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "target_total_events",
|
|
Help: "Total number of events sent (or) queued to the target",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id.ID, "target_name": id.Name},
|
|
Value: float64(st.TotalEvents),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "target_failed_events",
|
|
Help: "Number of events failed to be sent (or) queued to the target",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id.ID, "target_name": id.Name},
|
|
Value: float64(st.FailedEvents),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "target_current_send_in_progress",
|
|
Help: "Number of concurrent async Send calls active to the target",
|
|
Type: gaugeMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id.ID, "target_name": id.Name},
|
|
Value: float64(st.CurrentSendCalls),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: notifySubsystem,
|
|
Name: "target_queue_length",
|
|
Help: "Number of events currently staged in the queue_dir configured for the target",
|
|
Type: gaugeMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id.ID, "target_name": id.Name},
|
|
Value: float64(st.CurrentQueue),
|
|
})
|
|
}
|
|
}
|
|
|
|
lstats := globalLambdaTargetList.Stats()
|
|
for _, st := range lstats.TargetStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: lambdaSubsystem,
|
|
Name: "active_requests",
|
|
Help: "Number of in progress requests",
|
|
},
|
|
VariableLabels: map[string]string{"target_id": st.ID.ID, "target_name": st.ID.Name},
|
|
Value: float64(st.ActiveRequests),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: lambdaSubsystem,
|
|
Name: "total_requests",
|
|
Help: "Total number of requests sent since start",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": st.ID.ID, "target_name": st.ID.Name},
|
|
Value: float64(st.TotalRequests),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: lambdaSubsystem,
|
|
Name: "failed_requests",
|
|
Help: "Total number of requests that failed to send since start",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": st.ID.ID, "target_name": st.ID.Name},
|
|
Value: float64(st.FailedRequests),
|
|
})
|
|
}
|
|
|
|
// Audit and system:
|
|
audit := logger.CurrentStats()
|
|
for id, st := range audit {
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: auditSubsystem,
|
|
Name: "target_queue_length",
|
|
Help: "Number of unsent messages in queue for target",
|
|
Type: gaugeMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id},
|
|
Value: float64(st.QueueLength),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: auditSubsystem,
|
|
Name: "total_messages",
|
|
Help: "Total number of messages sent since start",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id},
|
|
Value: float64(st.TotalMessages),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: minioNamespace,
|
|
Subsystem: auditSubsystem,
|
|
Name: "failed_messages",
|
|
Help: "Total number of messages that failed to send since start",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: map[string]string{"target_id": id},
|
|
Value: float64(st.FailedMessages),
|
|
})
|
|
}
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getHTTPMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
if !mg.metricsGroupOpts.bucketOnly {
|
|
httpStats := globalHTTPStats.toServerHTTPStats()
|
|
metrics = make([]Metric, 0, 3+
|
|
len(httpStats.CurrentS3Requests.APIStats)+
|
|
len(httpStats.TotalS3Requests.APIStats)+
|
|
len(httpStats.TotalS3Errors.APIStats)+
|
|
len(httpStats.TotalS35xxErrors.APIStats)+
|
|
len(httpStats.TotalS34xxErrors.APIStats))
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RejectedAuthRequestsTotalMD(),
|
|
Value: float64(httpStats.TotalS3RejectedAuth),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RejectedTimestampRequestsTotalMD(),
|
|
Value: float64(httpStats.TotalS3RejectedTime),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RejectedHeaderRequestsTotalMD(),
|
|
Value: float64(httpStats.TotalS3RejectedHeader),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RejectedInvalidRequestsTotalMD(),
|
|
Value: float64(httpStats.TotalS3RejectedInvalid),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RequestsInQueueMD(),
|
|
Value: float64(httpStats.S3RequestsInQueue),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getIncomingS3RequestsMD(),
|
|
Value: float64(httpStats.S3RequestsIncoming),
|
|
})
|
|
|
|
for api, value := range httpStats.CurrentS3Requests.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RequestsInFlightMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
for api, value := range httpStats.TotalS3Requests.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RequestsTotalMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
for api, value := range httpStats.TotalS3Errors.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RequestsErrorsMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
for api, value := range httpStats.TotalS35xxErrors.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3Requests5xxErrorsMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
for api, value := range httpStats.TotalS34xxErrors.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3Requests4xxErrorsMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3RequestsCanceledMD(),
|
|
Value: float64(value),
|
|
VariableLabels: map[string]string{"api": api},
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
for bucket, inOut := range globalBucketConnStats.getS3InOutBytes() {
|
|
recvBytes := inOut.In
|
|
if recvBytes > 0 {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketTrafficReceivedBytes(),
|
|
Value: float64(recvBytes),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
}
|
|
sentBytes := inOut.Out
|
|
if sentBytes > 0 {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketTrafficSentBytes(),
|
|
Value: float64(sentBytes),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
}
|
|
|
|
httpStats := globalBucketHTTPStats.load(bucket)
|
|
for k, v := range httpStats.currentS3Requests.Load() {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketS3RequestsInFlightMD(),
|
|
Value: float64(v),
|
|
VariableLabels: map[string]string{"bucket": bucket, "api": k},
|
|
})
|
|
}
|
|
|
|
for k, v := range httpStats.totalS3Requests.Load() {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketS3RequestsTotalMD(),
|
|
Value: float64(v),
|
|
VariableLabels: map[string]string{"bucket": bucket, "api": k},
|
|
})
|
|
}
|
|
|
|
for k, v := range httpStats.totalS3Canceled.Load() {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketS3RequestsCanceledMD(),
|
|
Value: float64(v),
|
|
VariableLabels: map[string]string{"bucket": bucket, "api": k},
|
|
})
|
|
}
|
|
|
|
for k, v := range httpStats.totalS34xxErrors.Load() {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketS3Requests4xxErrorsMD(),
|
|
Value: float64(v),
|
|
VariableLabels: map[string]string{"bucket": bucket, "api": k},
|
|
})
|
|
}
|
|
|
|
for k, v := range httpStats.totalS35xxErrors.Load() {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketS3Requests5xxErrorsMD(),
|
|
Value: float64(v),
|
|
VariableLabels: map[string]string{"bucket": bucket, "api": k},
|
|
})
|
|
}
|
|
}
|
|
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getNetworkMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
metrics = make([]Metric, 0, 10)
|
|
connStats := globalConnStats.toServerConnStats()
|
|
rpcStats := rest.GetRPCStats()
|
|
if globalIsDistErasure {
|
|
metrics = append(metrics, Metric{
|
|
Description: getInternodeFailedRequests(),
|
|
Value: float64(rpcStats.Errs),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getInternodeTCPDialTimeout(),
|
|
Value: float64(rpcStats.DialErrs),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getInternodeTCPAvgDuration(),
|
|
Value: float64(rpcStats.DialAvgDuration),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getInterNodeSentBytesMD(),
|
|
Value: float64(connStats.internodeOutputBytes),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getInterNodeReceivedBytesMD(),
|
|
Value: float64(connStats.internodeInputBytes),
|
|
})
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3SentBytesMD(),
|
|
Value: float64(connStats.s3OutputBytes),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getS3ReceivedBytesMD(),
|
|
Value: float64(connStats.s3InputBytes),
|
|
})
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getClusterUsageMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
if objLayer == nil {
|
|
return
|
|
}
|
|
|
|
metrics = make([]Metric, 0, 50)
|
|
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
return
|
|
}
|
|
|
|
// data usage has not captured any data yet.
|
|
if dataUsageInfo.LastUpdate.IsZero() {
|
|
return
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getUsageLastScanActivityMD(),
|
|
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
|
|
})
|
|
|
|
var (
|
|
clusterSize uint64
|
|
clusterBuckets uint64
|
|
clusterObjectsCount uint64
|
|
clusterVersionsCount uint64
|
|
clusterDeleteMarkersCount uint64
|
|
)
|
|
|
|
clusterObjectSizesHistogram := map[string]uint64{}
|
|
clusterVersionsHistogram := map[string]uint64{}
|
|
for _, usage := range dataUsageInfo.BucketsUsage {
|
|
clusterBuckets++
|
|
clusterSize += usage.Size
|
|
clusterObjectsCount += usage.ObjectsCount
|
|
clusterVersionsCount += usage.VersionsCount
|
|
clusterDeleteMarkersCount += usage.DeleteMarkersCount
|
|
for k, v := range usage.ObjectSizesHistogram {
|
|
v1, ok := clusterObjectSizesHistogram[k]
|
|
if !ok {
|
|
clusterObjectSizesHistogram[k] = v
|
|
} else {
|
|
v1 += v
|
|
clusterObjectSizesHistogram[k] = v1
|
|
}
|
|
}
|
|
for k, v := range usage.ObjectVersionsHistogram {
|
|
v1, ok := clusterVersionsHistogram[k]
|
|
if !ok {
|
|
clusterVersionsHistogram[k] = v
|
|
} else {
|
|
v1 += v
|
|
clusterVersionsHistogram[k] = v1
|
|
}
|
|
}
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterUsageTotalBytesMD(),
|
|
Value: float64(clusterSize),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterUsageObjectsTotalMD(),
|
|
Value: float64(clusterObjectsCount),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterUsageVersionsTotalMD(),
|
|
Value: float64(clusterVersionsCount),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterUsageDeleteMarkersTotalMD(),
|
|
Value: float64(clusterDeleteMarkersCount),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterObjectDistributionMD(),
|
|
Histogram: clusterObjectSizesHistogram,
|
|
HistogramBucketLabel: "range",
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterObjectVersionsMD(),
|
|
Histogram: clusterVersionsHistogram,
|
|
HistogramBucketLabel: "range",
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterBucketsTotalMD(),
|
|
Value: float64(clusterBuckets),
|
|
})
|
|
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getBucketUsageMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
|
|
metrics = make([]Metric, 0, 50)
|
|
dataUsageInfo, err := loadDataUsageFromBackend(ctx, objLayer)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
return
|
|
}
|
|
|
|
// data usage has not captured any data yet.
|
|
if dataUsageInfo.LastUpdate.IsZero() {
|
|
return
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getUsageLastScanActivityMD(),
|
|
Value: float64(time.Since(dataUsageInfo.LastUpdate)),
|
|
})
|
|
|
|
var bucketReplStats map[string]BucketStats
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
bucketReplStats = globalReplicationStats.getAllLatest(dataUsageInfo.BucketsUsage)
|
|
}
|
|
for bucket, usage := range dataUsageInfo.BucketsUsage {
|
|
quota, _ := globalBucketQuotaSys.Get(ctx, bucket)
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketUsageTotalBytesMD(),
|
|
Value: float64(usage.Size),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketUsageObjectsTotalMD(),
|
|
Value: float64(usage.ObjectsCount),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketUsageVersionsTotalMD(),
|
|
Value: float64(usage.VersionsCount),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketUsageDeleteMarkersTotalMD(),
|
|
Value: float64(usage.DeleteMarkersCount),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
|
|
if quota != nil && quota.Quota > 0 {
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketUsageQuotaTotalBytesMD(),
|
|
Value: float64(quota.Quota),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
}
|
|
if !globalSiteReplicationSys.isEnabled() {
|
|
var stats BucketReplicationStats
|
|
s, ok := bucketReplStats[bucket]
|
|
if ok {
|
|
stats = s.ReplicationStats
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepReceivedBytesMD(bucketMetricNamespace),
|
|
Value: float64(stats.ReplicaSize),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepReceivedOperationsMD(bucketMetricNamespace),
|
|
Value: float64(stats.ReplicaCount),
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
}
|
|
if stats.hasReplicationUsage() {
|
|
for arn, stat := range stats.Stats {
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedBytesLastMinuteMD(bucketMetricNamespace),
|
|
Value: float64(stat.Failed.LastMinute.Bytes),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedOperationsLastMinuteMD(bucketMetricNamespace),
|
|
Value: stat.Failed.LastMinute.Count,
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedBytesLastHourMD(bucketMetricNamespace),
|
|
Value: float64(stat.Failed.LastHour.Bytes),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedOperationsLastHourMD(bucketMetricNamespace),
|
|
Value: stat.Failed.LastHour.Count,
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedBytesTotalMD(bucketMetricNamespace),
|
|
Value: float64(stat.Failed.Totals.Bytes),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepFailedOperationsTotalMD(bucketMetricNamespace),
|
|
Value: stat.Failed.Totals.Count,
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepSentBytesMD(bucketMetricNamespace),
|
|
Value: float64(stat.ReplicatedSize),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getRepSentOperationsMD(bucketMetricNamespace),
|
|
Value: float64(stat.ReplicatedCount),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketRepLatencyMD(),
|
|
HistogramBucketLabel: "range",
|
|
Histogram: stat.Latency.getUploadLatency(),
|
|
VariableLabels: map[string]string{"bucket": bucket, "operation": "upload", "targetArn": arn},
|
|
})
|
|
if c, ok := stat.Failed.ErrCounts["AccessDenied"]; ok {
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterRepCredentialErrorsMD(bucketMetricNamespace),
|
|
Value: float64(c),
|
|
VariableLabels: map[string]string{"bucket": bucket, "targetArn": arn},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketObjectDistributionMD(),
|
|
Histogram: usage.ObjectSizesHistogram,
|
|
HistogramBucketLabel: "range",
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getBucketObjectVersionsMD(),
|
|
Histogram: usage.ObjectVersionsHistogram,
|
|
HistogramBucketLabel: "range",
|
|
VariableLabels: map[string]string{"bucket": bucket},
|
|
})
|
|
}
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getClusterTransitionedBytesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionedBytes,
|
|
Help: "Total bytes transitioned to a tier",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterTransitionedObjectsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionedObjects,
|
|
Help: "Total number of objects transitioned to a tier",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterTransitionedVersionsMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: ilmSubsystem,
|
|
Name: transitionedVersions,
|
|
Help: "Total number of versions transitioned to a tier",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterTierMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
|
|
if globalTierConfigMgr.Empty() {
|
|
return
|
|
}
|
|
|
|
dui, err := loadDataUsageFromBackend(ctx, objLayer)
|
|
if err != nil {
|
|
logger.LogIf(ctx, err)
|
|
return
|
|
}
|
|
// data usage has not captured any tier stats yet.
|
|
if dui.TierStats == nil {
|
|
return
|
|
}
|
|
|
|
return dui.tierMetrics()
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
|
|
metrics = make([]Metric, 0, 50)
|
|
storageInfo := objLayer.LocalStorageInfo(ctx, true)
|
|
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
|
|
totalDrives := onlineDrives.Merge(offlineDrives)
|
|
|
|
for _, disk := range storageInfo.Disks {
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveUsedBytesMD(),
|
|
Value: float64(disk.UsedSpace),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveFreeBytesMD(),
|
|
Value: float64(disk.AvailableSpace),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveTotalBytesMD(),
|
|
Value: float64(disk.TotalSpace),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDrivesFreeInodesMD(),
|
|
Value: float64(disk.FreeInodes),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
if disk.Metrics != nil {
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveTimeoutErrorsMD(),
|
|
Value: float64(disk.Metrics.TotalErrorsTimeout),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveAvailablityErrorsMD(),
|
|
Value: float64(disk.Metrics.TotalErrorsAvailability),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveWaitingIOMD(),
|
|
Value: float64(disk.Metrics.TotalWaiting),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveTokensIOMD(),
|
|
Value: float64(disk.Metrics.TotalTokens),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
|
})
|
|
|
|
for apiName, latency := range disk.Metrics.LastMinute {
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDriveAPILatencyMD(),
|
|
Value: float64(latency.Avg().Microseconds()),
|
|
VariableLabels: map[string]string{"drive": disk.DrivePath, "api": "storage." + apiName},
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDrivesOfflineTotalMD(),
|
|
Value: float64(offlineDrives.Sum()),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDrivesOnlineTotalMD(),
|
|
Value: float64(onlineDrives.Sum()),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeDrivesTotalMD(),
|
|
Value: float64(totalDrives.Sum()),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeStandardParityMD(),
|
|
Value: float64(storageInfo.Backend.StandardSCParity),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getNodeRRSParityMD(),
|
|
Value: float64(storageInfo.Backend.RRSCParity),
|
|
})
|
|
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getClusterWriteQuorumMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "write",
|
|
Name: "quorum",
|
|
Help: "Maximum write quorum across all pools and sets",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterHealthStatusMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "status",
|
|
Help: "Get current cluster health status",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterErasureSetHealthStatusMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "erasure_set_status",
|
|
Help: "Get current health status for this erasure set",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterErasureSetReadQuorumMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "erasure_set_read_quorum",
|
|
Help: "Get the read quorum for this erasure set",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterErasureSetWriteQuorumMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "erasure_set_write_quorum",
|
|
Help: "Get the write quorum for this erasure set",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterErasureSetOnlineDrivesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "erasure_set_online_drives",
|
|
Help: "Get the count of the online drives in this erasure set",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterErasureSetHealingDrivesMD() MetricDescription {
|
|
return MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: "health",
|
|
Name: "erasure_set_healing_drives",
|
|
Help: "Get the count of healing drives of this erasure set",
|
|
Type: gaugeMetric,
|
|
}
|
|
}
|
|
|
|
func getClusterHealthMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
|
|
opts := HealthOptions{}
|
|
result := objLayer.Health(ctx, opts)
|
|
|
|
metrics = make([]Metric, 0, 2+4*len(result.ESHealth))
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterWriteQuorumMD(),
|
|
Value: float64(result.WriteQuorum),
|
|
})
|
|
|
|
health := 1
|
|
if !result.Healthy {
|
|
health = 0
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterHealthStatusMD(),
|
|
Value: float64(health),
|
|
})
|
|
|
|
for _, h := range result.ESHealth {
|
|
labels := map[string]string{
|
|
"pool": strconv.Itoa(h.PoolID),
|
|
"set": strconv.Itoa(h.SetID),
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterErasureSetReadQuorumMD(),
|
|
VariableLabels: labels,
|
|
Value: float64(h.ReadQuorum),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterErasureSetWriteQuorumMD(),
|
|
VariableLabels: labels,
|
|
Value: float64(h.WriteQuorum),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterErasureSetOnlineDrivesMD(),
|
|
VariableLabels: labels,
|
|
Value: float64(h.HealthyDrives),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterErasureSetHealingDrivesMD(),
|
|
VariableLabels: labels,
|
|
Value: float64(h.HealingDrives),
|
|
})
|
|
|
|
health := 1
|
|
if !h.Healthy {
|
|
health = 0
|
|
}
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterErasureSetHealthStatusMD(),
|
|
VariableLabels: labels,
|
|
Value: float64(health),
|
|
})
|
|
}
|
|
|
|
return
|
|
})
|
|
|
|
return mg
|
|
}
|
|
|
|
func getBatchJobsMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
var m madmin.RealtimeMetrics
|
|
mLocal := collectLocalMetrics(madmin.MetricsBatchJobs, collectMetricsOpts{})
|
|
m.Merge(&mLocal)
|
|
|
|
mRemote := collectRemoteMetrics(ctx, madmin.MetricsBatchJobs, collectMetricsOpts{})
|
|
m.Merge(&mRemote)
|
|
|
|
if m.Aggregated.BatchJobs == nil {
|
|
return
|
|
}
|
|
|
|
for _, mj := range m.Aggregated.BatchJobs.Jobs {
|
|
jtype := toSnake(mj.JobType)
|
|
var objects, objectsFailed float64
|
|
var bucket string
|
|
switch madmin.BatchJobType(mj.JobType) {
|
|
case madmin.BatchJobReplicate:
|
|
objects = float64(mj.Replicate.Objects)
|
|
objectsFailed = float64(mj.Replicate.ObjectsFailed)
|
|
bucket = mj.Replicate.Bucket
|
|
case madmin.BatchJobKeyRotate:
|
|
objects = float64(mj.KeyRotate.Objects)
|
|
objectsFailed = float64(mj.KeyRotate.ObjectsFailed)
|
|
bucket = mj.KeyRotate.Bucket
|
|
case madmin.BatchJobExpire:
|
|
objects = float64(mj.Expired.Objects)
|
|
objectsFailed = float64(mj.Expired.ObjectsFailed)
|
|
bucket = mj.Expired.Bucket
|
|
}
|
|
metrics = append(metrics,
|
|
Metric{
|
|
Description: MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: "batch",
|
|
Name: MetricName(jtype + "_objects"),
|
|
Help: "Get successfully completed batch job " + jtype + "objects",
|
|
Type: counterMetric,
|
|
},
|
|
Value: objects,
|
|
VariableLabels: map[string]string{"bucket": bucket, "jobId": mj.JobID},
|
|
},
|
|
Metric{
|
|
Description: MetricDescription{
|
|
Namespace: bucketMetricNamespace,
|
|
Subsystem: "batch",
|
|
Name: MetricName(jtype + "_objects_failed"),
|
|
Help: "Get failed batch job " + jtype + "objects",
|
|
Type: counterMetric,
|
|
},
|
|
Value: objectsFailed,
|
|
VariableLabels: map[string]string{"bucket": bucket, "jobId": mj.JobID},
|
|
},
|
|
)
|
|
}
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getClusterStorageMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 1 * time.Minute,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
|
|
// Fetch disk space info, ignore errors
|
|
metrics = make([]Metric, 0, 10)
|
|
storageInfo := objLayer.StorageInfo(ctx, true)
|
|
onlineDrives, offlineDrives := getOnlineOfflineDisksStats(storageInfo.Disks)
|
|
totalDrives := onlineDrives.Merge(offlineDrives)
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterCapacityTotalBytesMD(),
|
|
Value: float64(GetTotalCapacity(storageInfo.Disks)),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterCapacityFreeBytesMD(),
|
|
Value: float64(GetTotalCapacityFree(storageInfo.Disks)),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterCapacityUsageBytesMD(),
|
|
Value: float64(GetTotalUsableCapacity(storageInfo.Disks, storageInfo)),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterCapacityUsageFreeBytesMD(),
|
|
Value: float64(GetTotalUsableCapacityFree(storageInfo.Disks, storageInfo)),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterDrivesOfflineTotalMD(),
|
|
Value: float64(offlineDrives.Sum()),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterDrivesOnlineTotalMD(),
|
|
Value: float64(onlineDrives.Sum()),
|
|
})
|
|
|
|
metrics = append(metrics, Metric{
|
|
Description: getClusterDrivesTotalMD(),
|
|
Value: float64(totalDrives.Sum()),
|
|
})
|
|
return
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getKMSNodeMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
|
|
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
|
const (
|
|
Online = 1
|
|
Offline = 0
|
|
)
|
|
desc := MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: kmsSubsystem,
|
|
Name: kmsOnline,
|
|
Help: "Reports whether the KMS is online (1) or offline (0)",
|
|
Type: gaugeMetric,
|
|
}
|
|
_, err := GlobalKMS.Metrics(ctx)
|
|
if _, ok := kes.IsConnError(err); ok {
|
|
return []Metric{{
|
|
Description: desc,
|
|
Value: float64(Offline),
|
|
}}
|
|
}
|
|
return []Metric{{
|
|
Description: desc,
|
|
Value: float64(Online),
|
|
}}
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getWebhookMetrics() *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
}
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
tgts := append(logger.SystemTargets(), logger.AuditTargets()...)
|
|
metrics := make([]Metric, 0, len(tgts)*4)
|
|
for _, t := range tgts {
|
|
isOnline := 0
|
|
if t.IsOnline(ctx) {
|
|
isOnline = 1
|
|
}
|
|
labels := map[string]string{
|
|
"name": t.String(),
|
|
"endpoint": t.Endpoint(),
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: webhookSubsystem,
|
|
Name: webhookOnline,
|
|
Help: "Is the webhook online?",
|
|
Type: gaugeMetric,
|
|
},
|
|
VariableLabels: labels,
|
|
Value: float64(isOnline),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: webhookSubsystem,
|
|
Name: webhookQueueLength,
|
|
Help: "Webhook queue length",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: labels,
|
|
Value: float64(t.Stats().QueueLength),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: webhookSubsystem,
|
|
Name: webhookTotalMessages,
|
|
Help: "Total number of messages sent to this target",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: labels,
|
|
Value: float64(t.Stats().TotalMessages),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: webhookSubsystem,
|
|
Name: webhookFailedMessages,
|
|
Help: "Number of messages that failed to send",
|
|
Type: counterMetric,
|
|
},
|
|
VariableLabels: labels,
|
|
Value: float64(t.Stats().FailedMessages),
|
|
})
|
|
}
|
|
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func getKMSMetrics(opts MetricsGroupOpts) *MetricsGroup {
|
|
mg := &MetricsGroup{
|
|
cacheInterval: 10 * time.Second,
|
|
metricsGroupOpts: opts,
|
|
}
|
|
|
|
mg.RegisterRead(func(ctx context.Context) []Metric {
|
|
metrics := make([]Metric, 0, 4)
|
|
metric, err := GlobalKMS.Metrics(ctx)
|
|
if err != nil {
|
|
return metrics
|
|
}
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: kmsSubsystem,
|
|
Name: kmsRequestsSuccess,
|
|
Help: "Number of KMS requests that succeeded",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(metric.RequestOK),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: kmsSubsystem,
|
|
Name: kmsRequestsError,
|
|
Help: "Number of KMS requests that failed due to some error. (HTTP 4xx status code)",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(metric.RequestErr),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: kmsSubsystem,
|
|
Name: kmsRequestsFail,
|
|
Help: "Number of KMS requests that failed due to some internal failure. (HTTP 5xx status code)",
|
|
Type: counterMetric,
|
|
},
|
|
Value: float64(metric.RequestFail),
|
|
})
|
|
metrics = append(metrics, Metric{
|
|
Description: MetricDescription{
|
|
Namespace: clusterMetricNamespace,
|
|
Subsystem: kmsSubsystem,
|
|
Name: kmsUptime,
|
|
Help: "The time the KMS has been up and running in seconds.",
|
|
Type: counterMetric,
|
|
},
|
|
Value: metric.UpTime.Seconds(),
|
|
})
|
|
|
|
return metrics
|
|
})
|
|
return mg
|
|
}
|
|
|
|
func collectMetric(metric Metric, labels []string, values []string, metricName string, out chan<- prometheus.Metric) {
|
|
if metric.Description.Type == histogramMetric {
|
|
if metric.Histogram == nil {
|
|
return
|
|
}
|
|
for k, v := range metric.Histogram {
|
|
pmetric, err := prometheus.NewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
|
string(metric.Description.Subsystem),
|
|
string(metric.Description.Name)),
|
|
metric.Description.Help,
|
|
append(labels, metric.HistogramBucketLabel),
|
|
metric.StaticLabels,
|
|
),
|
|
prometheus.GaugeValue,
|
|
float64(v),
|
|
append(values, k)...)
|
|
if err != nil {
|
|
// Enable for debugging
|
|
if serverDebugLog {
|
|
logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v+%v", err, values, metric.Histogram), metricName+"-metrics-histogram")
|
|
}
|
|
} else {
|
|
out <- pmetric
|
|
}
|
|
}
|
|
return
|
|
}
|
|
metricType := prometheus.GaugeValue
|
|
if metric.Description.Type == counterMetric {
|
|
metricType = prometheus.CounterValue
|
|
}
|
|
pmetric, err := prometheus.NewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
|
string(metric.Description.Subsystem),
|
|
string(metric.Description.Name)),
|
|
metric.Description.Help,
|
|
labels,
|
|
metric.StaticLabels,
|
|
),
|
|
metricType,
|
|
metric.Value,
|
|
values...)
|
|
if err != nil {
|
|
// Enable for debugging
|
|
if serverDebugLog {
|
|
logger.LogOnceIf(GlobalContext, fmt.Errorf("unable to validate prometheus metric (%w) %v", err, values), metricName+"-metrics")
|
|
}
|
|
} else {
|
|
out <- pmetric
|
|
}
|
|
}
|
|
|
|
type minioBucketCollector struct {
|
|
metricsGroups []*MetricsGroup
|
|
desc *prometheus.Desc
|
|
}
|
|
|
|
func newMinioBucketCollector(metricsGroups []*MetricsGroup) *minioBucketCollector {
|
|
return &minioBucketCollector{
|
|
metricsGroups: metricsGroups,
|
|
desc: prometheus.NewDesc("minio_bucket_stats", "Statistics exposed by MinIO server cluster wide per bucket", nil, nil),
|
|
}
|
|
}
|
|
|
|
// Describe sends the super-set of all possible descriptors of metrics
|
|
func (c *minioBucketCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- c.desc
|
|
}
|
|
|
|
// Collect is called by the Prometheus registry when collecting metrics.
|
|
func (c *minioBucketCollector) Collect(out chan<- prometheus.Metric) {
|
|
var wg sync.WaitGroup
|
|
publish := func(in <-chan Metric) {
|
|
defer wg.Done()
|
|
for metric := range in {
|
|
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
|
|
collectMetric(metric, labels, values, "bucket", out)
|
|
}
|
|
}
|
|
|
|
// Call peer api to fetch metrics
|
|
wg.Add(2)
|
|
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
|
|
go publish(globalNotificationSys.GetBucketMetrics(GlobalContext))
|
|
wg.Wait()
|
|
}
|
|
|
|
type minioClusterCollector struct {
|
|
metricsGroups []*MetricsGroup
|
|
desc *prometheus.Desc
|
|
}
|
|
|
|
func newMinioClusterCollector(metricsGroups []*MetricsGroup) *minioClusterCollector {
|
|
return &minioClusterCollector{
|
|
metricsGroups: metricsGroups,
|
|
desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server per cluster", nil, nil),
|
|
}
|
|
}
|
|
|
|
// Describe sends the super-set of all possible descriptors of metrics
|
|
func (c *minioClusterCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- c.desc
|
|
}
|
|
|
|
// Collect is called by the Prometheus registry when collecting metrics.
|
|
func (c *minioClusterCollector) Collect(out chan<- prometheus.Metric) {
|
|
var wg sync.WaitGroup
|
|
publish := func(in <-chan Metric) {
|
|
defer wg.Done()
|
|
for metric := range in {
|
|
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
|
|
collectMetric(metric, labels, values, "cluster", out)
|
|
}
|
|
}
|
|
|
|
// Call peer api to fetch metrics
|
|
wg.Add(2)
|
|
go publish(ReportMetrics(GlobalContext, c.metricsGroups))
|
|
go publish(globalNotificationSys.GetClusterMetrics(GlobalContext))
|
|
wg.Wait()
|
|
}
|
|
|
|
// ReportMetrics reports serialized metrics to the channel passed for the metrics generated.
|
|
func ReportMetrics(ctx context.Context, metricsGroups []*MetricsGroup) <-chan Metric {
|
|
ch := make(chan Metric)
|
|
go func() {
|
|
defer xioutil.SafeClose(ch)
|
|
populateAndPublish(metricsGroups, func(m Metric) bool {
|
|
if m.VariableLabels == nil {
|
|
m.VariableLabels = make(map[string]string)
|
|
}
|
|
m.VariableLabels[serverName] = globalLocalNodeName
|
|
for {
|
|
select {
|
|
case ch <- m:
|
|
return true
|
|
case <-ctx.Done():
|
|
return false
|
|
}
|
|
}
|
|
})
|
|
}()
|
|
return ch
|
|
}
|
|
|
|
// minioNodeCollector is the Custom Collector
|
|
type minioNodeCollector struct {
|
|
metricsGroups []*MetricsGroup
|
|
desc *prometheus.Desc
|
|
}
|
|
|
|
// Describe sends the super-set of all possible descriptors of metrics
|
|
func (c *minioNodeCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- c.desc
|
|
}
|
|
|
|
// populateAndPublish populates and then publishes the metrics generated by the generator function.
|
|
func populateAndPublish(metricsGroups []*MetricsGroup, publish func(m Metric) bool) {
|
|
for _, mg := range metricsGroups {
|
|
if mg == nil {
|
|
continue
|
|
}
|
|
for _, metric := range mg.Get() {
|
|
if !publish(metric) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Collect is called by the Prometheus registry when collecting metrics.
|
|
func (c *minioNodeCollector) Collect(ch chan<- prometheus.Metric) {
|
|
// Expose MinIO's version information
|
|
minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
|
|
|
|
populateAndPublish(c.metricsGroups, func(metric Metric) bool {
|
|
labels, values := getOrderedLabelValueArrays(metric.VariableLabels)
|
|
values = append(values, globalLocalNodeName)
|
|
labels = append(labels, serverName)
|
|
|
|
if metric.Description.Type == histogramMetric {
|
|
if metric.Histogram == nil {
|
|
return true
|
|
}
|
|
for k, v := range metric.Histogram {
|
|
labels = append(labels, metric.HistogramBucketLabel)
|
|
values = append(values, k)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
|
string(metric.Description.Subsystem),
|
|
string(metric.Description.Name)),
|
|
metric.Description.Help,
|
|
labels,
|
|
metric.StaticLabels,
|
|
),
|
|
prometheus.GaugeValue,
|
|
float64(v),
|
|
values...)
|
|
}
|
|
return true
|
|
}
|
|
|
|
metricType := prometheus.GaugeValue
|
|
if metric.Description.Type == counterMetric {
|
|
metricType = prometheus.CounterValue
|
|
}
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(string(metric.Description.Namespace),
|
|
string(metric.Description.Subsystem),
|
|
string(metric.Description.Name)),
|
|
metric.Description.Help,
|
|
labels,
|
|
metric.StaticLabels,
|
|
),
|
|
metricType,
|
|
metric.Value,
|
|
values...)
|
|
return true
|
|
})
|
|
}
|
|
|
|
// getOrderedLabelValueArrays splits a label map into two parallel slices:
// labels[i] is a label name and values[i] its value. The pairs follow map
// iteration order, so their order is unspecified, but names and values always
// stay aligned. Both slices are non-nil, even for a nil or empty map.
func getOrderedLabelValueArrays(labelsWithValue map[string]string) (labels, values []string) {
	count := len(labelsWithValue)
	labels = make([]string, count)
	values = make([]string, count)

	next := 0
	for name, val := range labelsWithValue {
		labels[next], values[next] = name, val
		next++
	}
	return labels, values
}
|
|
|
|
// newMinioCollectorNode describes the collector
|
|
// and returns reference of minioCollector for version 2
|
|
// It creates the Prometheus Description which is used
|
|
// to define Metric and help string
|
|
func newMinioCollectorNode(metricsGroups []*MetricsGroup) *minioNodeCollector {
|
|
return &minioNodeCollector{
|
|
metricsGroups: metricsGroups,
|
|
desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server per node", nil, nil),
|
|
}
|
|
}
|
|
|
|
func metricsHTTPHandler(c prometheus.Collector, funcName string) http.Handler {
|
|
registry := prometheus.NewRegistry()
|
|
|
|
// Report all other metrics
|
|
logger.CriticalIf(GlobalContext, registry.Register(c))
|
|
|
|
// DefaultGatherers include golang metrics and process metrics.
|
|
gatherers := prometheus.Gatherers{
|
|
registry,
|
|
}
|
|
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
|
if ok {
|
|
tc.FuncName = funcName
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
mfs, err := gatherers.Gather()
|
|
if err != nil && len(mfs) == 0 {
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
|
|
return
|
|
}
|
|
|
|
contentType := expfmt.Negotiate(r.Header)
|
|
w.Header().Set("Content-Type", string(contentType))
|
|
|
|
enc := expfmt.NewEncoder(w, contentType)
|
|
for _, mf := range mfs {
|
|
if err := enc.Encode(mf); err != nil {
|
|
// client may disconnect for any reasons
|
|
// we do not have to log this.
|
|
return
|
|
}
|
|
}
|
|
if closer, ok := enc.(expfmt.Closer); ok {
|
|
closer.Close()
|
|
}
|
|
})
|
|
}
|
|
|
|
func metricsBucketHandler() http.Handler {
|
|
return metricsHTTPHandler(bucketCollector, "handler.MetricsBucket")
|
|
}
|
|
|
|
func metricsServerHandler() http.Handler {
|
|
registry := prometheus.NewRegistry()
|
|
|
|
// Report all other metrics
|
|
logger.CriticalIf(GlobalContext, registry.Register(clusterCollector))
|
|
|
|
// DefaultGatherers include golang metrics and process metrics.
|
|
gatherers := prometheus.Gatherers{
|
|
registry,
|
|
}
|
|
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
|
if ok {
|
|
tc.FuncName = "handler.MetricsCluster"
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
mfs, err := gatherers.Gather()
|
|
if err != nil && len(mfs) == 0 {
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
|
|
return
|
|
}
|
|
|
|
contentType := expfmt.Negotiate(r.Header)
|
|
w.Header().Set("Content-Type", string(contentType))
|
|
|
|
enc := expfmt.NewEncoder(w, contentType)
|
|
for _, mf := range mfs {
|
|
if err := enc.Encode(mf); err != nil {
|
|
// client may disconnect for any reasons
|
|
// we do not have to log this.
|
|
return
|
|
}
|
|
}
|
|
if closer, ok := enc.(expfmt.Closer); ok {
|
|
closer.Close()
|
|
}
|
|
})
|
|
}
|
|
|
|
func metricsNodeHandler() http.Handler {
|
|
registry := prometheus.NewRegistry()
|
|
|
|
logger.CriticalIf(GlobalContext, registry.Register(nodeCollector))
|
|
if err := registry.Register(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{
|
|
Namespace: minioNamespace,
|
|
ReportErrors: true,
|
|
})); err != nil {
|
|
logger.CriticalIf(GlobalContext, err)
|
|
}
|
|
if err := registry.Register(prometheus.NewGoCollector()); err != nil {
|
|
logger.CriticalIf(GlobalContext, err)
|
|
}
|
|
gatherers := prometheus.Gatherers{
|
|
registry,
|
|
}
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
|
if ok {
|
|
tc.FuncName = "handler.MetricsNode"
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
mfs, err := gatherers.Gather()
|
|
if err != nil {
|
|
if len(mfs) == 0 {
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
|
|
return
|
|
}
|
|
}
|
|
|
|
contentType := expfmt.Negotiate(r.Header)
|
|
w.Header().Set("Content-Type", string(contentType))
|
|
|
|
enc := expfmt.NewEncoder(w, contentType)
|
|
for _, mf := range mfs {
|
|
if err := enc.Encode(mf); err != nil {
|
|
logger.LogIf(r.Context(), err)
|
|
return
|
|
}
|
|
}
|
|
if closer, ok := enc.(expfmt.Closer); ok {
|
|
closer.Close()
|
|
}
|
|
})
|
|
}
|
|
|
|
// toSnake converts a camel-case identifier to snake case, for example
// "JobID" to "job_id" and "HTTPServer" to "http_server".
//
// Only ASCII upper-case letters are converted. Every other rune (lower-case
// letters, digits, '_', '-', non-ASCII text) is copied through unchanged, so
// input that is already snake case is returned as is.
//
// An underscore is inserted before an upper-case letter that is not the first
// character when the previous byte or the next byte is lower case. A byte
// counts as lower case when it is >= 'a', which includes non-ASCII bytes.
func toSnake(camel string) (snake string) {
	var b strings.Builder
	b.Grow(len(camel) + 4) // room for a few inserted underscores

	last := len(camel) - 1
	for i, v := range camel {
		if v < 'A' || v > 'Z' {
			// Not an upper-case ASCII letter: nothing to convert.
			b.WriteRune(v)
			continue
		}

		// v is an upper-case letter, so it is one byte wide and the byte
		// indexes i-1 and i+1 address its direct neighbours.
		prevLower := i > 0 && camel[i-1] >= 'a'
		nextLower := i < last && camel[i+1] >= 'a'
		if i != 0 && (prevLower || nextLower) {
			b.WriteByte('_')
		}
		b.WriteRune(v + 'a' - 'A')
	}
	return b.String()
}
|