minio/cmd/metrics-v3.go
Shireesh Anjal 5808190398
Add more metrics to v3/cluster/erasure-set (#19714)
Metrics being added:

- read_tolerance: Number of drive failures that can be tolerated without
  disrupting read operations
- write_tolerance: Number of drive failures that can be tolerated without
  disrupting write operations
- read_health: Health of the erasure set in a pool for read operations
  (1=healthy, 0=unhealthy)
- write_health: Health of the erasure set in a pool for write operations
  (1=healthy, 0=unhealthy)
2024-05-14 00:25:56 -07:00

391 lines
11 KiB
Go

// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"slices"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
)
// Collector paths.
//
// Every path below is relative to the top-level /minio/metrics/v3 metrics
// endpoint, and each one serves its own set of V3 metrics.
//
// Per-bucket collector paths always begin with /bucket; the bucket name is
// appended when building the endpoint. For example, with the collector path
// /bucket/api, the endpoint for the bucket "mybucket" is
// /minio/metrics/v3/bucket/api/mybucket
const (
	// API paths.
	apiRequestsCollectorPath collectorPath = "/api/requests"
	apiBucketCollectorPath   collectorPath = "/bucket/api"

	// System paths.
	systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
	systemDriveCollectorPath            collectorPath = "/system/drive"
	systemMemoryCollectorPath           collectorPath = "/system/memory"
	systemCPUCollectorPath              collectorPath = "/system/cpu"
	systemProcessCollectorPath          collectorPath = "/system/process"

	// Debug paths.
	debugGoCollectorPath collectorPath = "/debug/go"

	// Cluster paths.
	clusterHealthCollectorPath       collectorPath = "/cluster/health"
	clusterUsageObjectsCollectorPath collectorPath = "/cluster/usage/objects"
	clusterUsageBucketsCollectorPath collectorPath = "/cluster/usage/buckets"
	clusterErasureSetCollectorPath   collectorPath = "/cluster/erasure-set"
	clusterNotificationCollectorPath collectorPath = "/cluster/notification"
	clusterIAMCollectorPath          collectorPath = "/cluster/iam"

	// Audit paths.
	auditCollectorPath collectorPath = "/audit"
)
// clusterBasePath is the common prefix of every cluster-level collector path.
const clusterBasePath = "/cluster"
// metricsV3Collection bundles the lookup tables that newMetricGroups builds
// for the V3 metrics endpoints.
type metricsV3Collection struct {
// mgMap holds the non-bucket metrics groups, keyed by collector path (as
// populated by newMetricGroups).
mgMap map[collectorPath]*MetricsGroup
// bucketMGMap holds the bucket metrics groups, keyed by collector path (as
// populated by newMetricGroups).
bucketMGMap map[collectorPath]*MetricsGroup
// Gatherers for non-bucket MetricsGroup's
//
// NOTE(review): newMetricGroups currently stores a gatherer for every
// collector path, including the bucket groups and the Go collector path -
// confirm whether the "non-bucket" wording is still accurate.
mgGatherers map[collectorPath]prometheus.Gatherer
// collectorPaths lists the collector paths that have a gatherer;
// newMetricGroups stores them sorted.
collectorPaths []collectorPath
}
// newMetricGroups builds every V3 metrics group, attaches a shared metrics
// cache to each of them and registers the resulting collectors with r.
//
// Every collector path (all metrics groups plus the standard Go collector
// under debugGoCollectorPath) gets its own sub-registry. That sub-registry is
// registered with r and is also stored as the gatherer for the path.
//
// Groups whose collector path does not start with clusterBasePath receive the
// serverName label (with value globalLocalNodeName) through AddExtraLabels.
//
// The returned collection holds the non-bucket and bucket groups in separate
// maps, the per-path gatherers and the sorted list of collector paths.
//
// Registration goes through MustRegister, so this function panics if a
// collector cannot be registered.
func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
	// Create all metric groups.
	apiRequestsMG := NewMetricsGroup(apiRequestsCollectorPath,
		[]MetricDescriptor{
			apiRejectedAuthTotalMD,
			apiRejectedHeaderTotalMD,
			apiRejectedTimestampTotalMD,
			apiRejectedInvalidTotalMD,

			apiRequestsWaitingTotalMD,
			apiRequestsIncomingTotalMD,
			apiRequestsInFlightTotalMD,
			apiRequestsTotalMD,
			apiRequestsErrorsTotalMD,
			apiRequests5xxErrorsTotalMD,
			apiRequests4xxErrorsTotalMD,
			apiRequestsCanceledTotalMD,

			apiRequestsTTFBSecondsDistributionMD,

			apiTrafficSentBytesMD,
			apiTrafficRecvBytesMD,
		},
		JoinLoaders(loadAPIRequestsHTTPMetrics, loadAPIRequestsTTFBMetrics,
			loadAPIRequestsNetworkMetrics),
	)

	apiBucketMG := NewBucketMetricsGroup(apiBucketCollectorPath,
		[]MetricDescriptor{
			apiBucketTrafficRecvBytesMD,
			apiBucketTrafficSentBytesMD,

			apiBucketRequestsInFlightMD,
			apiBucketRequestsTotalMD,
			apiBucketRequestsCanceledMD,
			apiBucketRequests4xxErrorsMD,
			apiBucketRequests5xxErrorsMD,

			apiBucketRequestsTTFBSecondsDistributionMD,
		},
		JoinBucketLoaders(loadAPIBucketHTTPMetrics, loadAPIBucketTTFBMetrics),
	)

	systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath,
		[]MetricDescriptor{
			internodeErrorsTotalMD,
			internodeDialedErrorsTotalMD,
			internodeDialAvgTimeNanosMD,
			internodeSentBytesTotalMD,
			internodeRecvBytesTotalMD,
		},
		loadNetworkInternodeMetrics,
	)

	systemMemoryMG := NewMetricsGroup(systemMemoryCollectorPath,
		[]MetricDescriptor{
			memTotalMD,
			memUsedMD,
			memFreeMD,
			memAvailableMD,
			memBuffersMD,
			memCacheMD,
			memSharedMD,
			memUsedPercMD,
		},
		loadMemoryMetrics,
	)

	systemCPUMG := NewMetricsGroup(systemCPUCollectorPath,
		[]MetricDescriptor{
			sysCPUAvgIdleMD,
			sysCPUAvgIOWaitMD,
			sysCPULoadMD,
			sysCPULoadPercMD,
			sysCPUNiceMD,
			sysCPUStealMD,
			sysCPUSystemMD,
			sysCPUUserMD,
		},
		loadCPUMetrics,
	)

	systemProcessMG := NewMetricsGroup(systemProcessCollectorPath,
		[]MetricDescriptor{
			processLocksReadTotalMD,
			processLocksWriteTotalMD,
			processCPUTotalSecondsMD,
			processGoRoutineTotalMD,
			processIORCharBytesMD,
			processIOReadBytesMD,
			processIOWCharBytesMD,
			processIOWriteBytesMD,
			processStarttimeSecondsMD,
			processUptimeSecondsMD,
			processFileDescriptorLimitTotalMD,
			processFileDescriptorOpenTotalMD,
			processSyscallReadTotalMD,
			processSyscallWriteTotalMD,
			processResidentMemoryBytesMD,
			processVirtualMemoryBytesMD,
			processVirtualMemoryMaxBytesMD,
		},
		loadProcessMetrics,
	)

	systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
		[]MetricDescriptor{
			driveUsedBytesMD,
			driveFreeBytesMD,
			driveTotalBytesMD,
			driveUsedInodesMD,
			driveFreeInodesMD,
			driveTotalInodesMD,
			driveTimeoutErrorsMD,
			driveIOErrorsMD,
			driveAvailabilityErrorsMD,
			driveWaitingIOMD,
			driveAPILatencyMD,
			driveHealthMD,

			driveOfflineCountMD,
			driveOnlineCountMD,
			driveCountMD,

			// iostat related
			driveReadsPerSecMD,
			driveReadsKBPerSecMD,
			driveReadsAwaitMD,
			driveWritesPerSecMD,
			driveWritesKBPerSecMD,
			driveWritesAwaitMD,
			drivePercUtilMD,
		},
		loadDriveMetrics,
	)

	clusterHealthMG := NewMetricsGroup(clusterHealthCollectorPath,
		[]MetricDescriptor{
			healthDrivesOfflineCountMD,
			healthDrivesOnlineCountMD,
			healthDrivesCountMD,

			healthNodesOfflineCountMD,
			healthNodesOnlineCountMD,

			healthCapacityRawTotalBytesMD,
			healthCapacityRawFreeBytesMD,
			healthCapacityUsableTotalBytesMD,
			healthCapacityUsableFreeBytesMD,
		},
		JoinLoaders(loadClusterHealthDriveMetrics,
			loadClusterHealthNodeMetrics,
			loadClusterHealthCapacityMetrics),
	)

	clusterUsageObjectsMG := NewMetricsGroup(clusterUsageObjectsCollectorPath,
		[]MetricDescriptor{
			usageSinceLastUpdateSecondsMD,
			usageTotalBytesMD,
			usageObjectsCountMD,
			usageVersionsCountMD,
			usageDeleteMarkersCountMD,
			usageBucketsCountMD,
			usageObjectsDistributionMD,
			usageVersionsDistributionMD,
		},
		loadClusterUsageObjectMetrics,
	)

	clusterUsageBucketsMG := NewBucketMetricsGroup(clusterUsageBucketsCollectorPath,
		[]MetricDescriptor{
			usageSinceLastUpdateSecondsMD,
			usageBucketTotalBytesMD,
			usageBucketObjectsTotalMD,
			usageBucketVersionsCountMD,
			usageBucketDeleteMarkersCountMD,
			usageBucketQuotaTotalBytesMD,
			usageBucketObjectSizeDistributionMD,
			usageBucketObjectVersionCountDistributionMD,
		},
		loadClusterUsageBucketMetrics,
	)

	clusterErasureSetMG := NewMetricsGroup(clusterErasureSetCollectorPath,
		[]MetricDescriptor{
			erasureSetOverallWriteQuorumMD,
			erasureSetOverallHealthMD,
			erasureSetReadQuorumMD,
			erasureSetWriteQuorumMD,
			erasureSetOnlineDrivesCountMD,
			erasureSetHealingDrivesCountMD,
			erasureSetHealthMD,
			erasureSetReadToleranceMD,
			erasureSetWriteToleranceMD,
			erasureSetReadHealthMD,
			erasureSetWriteHealthMD,
		},
		loadClusterErasureSetMetrics,
	)

	clusterNotificationMG := NewMetricsGroup(clusterNotificationCollectorPath,
		[]MetricDescriptor{
			notificationCurrentSendInProgressMD,
			notificationEventsErrorsTotalMD,
			notificationEventsSentTotalMD,
			notificationEventsSkippedTotalMD,
		},
		loadClusterNotificationMetrics,
	)

	clusterIAMMG := NewMetricsGroup(clusterIAMCollectorPath,
		[]MetricDescriptor{
			lastSyncDurationMillisMD,
			pluginAuthnServiceFailedRequestsMinuteMD,
			pluginAuthnServiceLastFailSecondsMD,
			pluginAuthnServiceLastSuccSecondsMD,
			pluginAuthnServiceSuccAvgRttMsMinuteMD,
			pluginAuthnServiceSuccMaxRttMsMinuteMD,
			pluginAuthnServiceTotalRequestsMinuteMD,
			sinceLastSyncMillisMD,
			syncFailuresMD,
			syncSuccessesMD,
		},
		loadClusterIAMMetrics,
	)

	auditMG := NewMetricsGroup(auditCollectorPath,
		[]MetricDescriptor{
			auditFailedMessagesMD,
			auditTargetQueueLengthMD,
			auditTotalMessagesMD,
		},
		loadAuditMetrics,
	)

	allMetricGroups := []*MetricsGroup{
		apiRequestsMG,
		apiBucketMG,

		systemNetworkInternodeMG,
		systemDriveMG,
		systemMemoryMG,
		systemCPUMG,
		systemProcessMG,

		clusterHealthMG,
		clusterUsageObjectsMG,
		clusterUsageBucketsMG,
		clusterErasureSetMG,
		clusterNotificationMG,
		clusterIAMMG,

		auditMG,
	}

	// Bucket metrics are special: they always include the bucket label. Their
	// loaders require a list of buckets, and that list is not known until the
	// request is made. So bucket metrics groups are kept in a separate map
	// and handled specially.

	// Add the serverName label to all non-cluster metrics groups, attach the
	// shared cache to every group and sort the groups into the two maps.
	metricsCache := newMetricsCache()
	mgMap := make(map[collectorPath]*MetricsGroup)
	bucketMGMap := make(map[collectorPath]*MetricsGroup)
	for _, mg := range allMetricGroups {
		if !strings.HasPrefix(string(mg.CollectorPath), clusterBasePath) {
			mg.AddExtraLabels(
				serverName, globalLocalNodeName,
				// The poolIndex label is currently disabled:
				// poolIndex, strconv.Itoa(globalLocalPoolIdx),
			)
		}
		mg.SetCache(metricsCache)
		if mg.IsBucketMetricsGroup() {
			bucketMGMap[mg.CollectorPath] = mg
		} else {
			mgMap[mg.CollectorPath] = mg
		}
	}

	// Prepare to register the collectors. Other than `MetricGroup` collectors,
	// we also have standard collectors like `GoCollector`.

	// Create all Non-`MetricGroup` collectors here.
	//
	// The map is deliberately not named `collectors`: that would shadow the
	// imported prometheus `collectors` package for the rest of this function.
	collectorsByPath := map[collectorPath]prometheus.Collector{
		debugGoCollectorPath: collectors.NewGoCollector(),
	}

	// Add all `MetricGroup` collectors to the map.
	for _, mg := range allMetricGroups {
		collectorsByPath[mg.CollectorPath] = mg
	}

	// Helper function to register collectors in a fresh sub-registry, register
	// that sub-registry with r, and return it as the gatherer.
	mustRegister := func(c ...prometheus.Collector) prometheus.Gatherer {
		subRegistry := prometheus.NewRegistry()
		for _, col := range c {
			subRegistry.MustRegister(col)
		}
		r.MustRegister(subRegistry)
		return subRegistry
	}

	// Register all collectors and create gatherers for them.
	gatherers := make(map[collectorPath]prometheus.Gatherer, len(collectorsByPath))
	collectorPaths := make([]collectorPath, 0, len(collectorsByPath))
	for path, collector := range collectorsByPath {
		gatherers[path] = mustRegister(collector)
		collectorPaths = append(collectorPaths, path)
	}
	// Map iteration order is random; sort so the path list is deterministic.
	slices.Sort(collectorPaths)

	return &metricsV3Collection{
		mgMap:          mgMap,
		bucketMGMap:    bucketMGMap,
		mgGatherers:    gatherers,
		collectorPaths: collectorPaths,
	}
}