// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"slices"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/collectors"
)

// Collector paths.
//
// These are paths under the top-level /minio/metrics/v3 metrics endpoint. Each
// of these paths returns a set of V3 metrics.
//
// Per-bucket metrics endpoints always start with /bucket and the bucket name is
// appended to the path. e.g.
if the collector path is /bucket/api, the endpoint // for the bucket "mybucket" would be /minio/metrics/v3/bucket/api/mybucket const ( apiRequestsCollectorPath collectorPath = "/api/requests" bucketAPICollectorPath collectorPath = "/bucket/api" bucketReplicationCollectorPath collectorPath = "/bucket/replication" systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode" systemDriveCollectorPath collectorPath = "/system/drive" systemMemoryCollectorPath collectorPath = "/system/memory" systemCPUCollectorPath collectorPath = "/system/cpu" systemProcessCollectorPath collectorPath = "/system/process" debugGoCollectorPath collectorPath = "/debug/go" clusterHealthCollectorPath collectorPath = "/cluster/health" clusterUsageObjectsCollectorPath collectorPath = "/cluster/usage/objects" clusterUsageBucketsCollectorPath collectorPath = "/cluster/usage/buckets" clusterErasureSetCollectorPath collectorPath = "/cluster/erasure-set" clusterIAMCollectorPath collectorPath = "/cluster/iam" clusterConfigCollectorPath collectorPath = "/cluster/config" ilmCollectorPath collectorPath = "/ilm" auditCollectorPath collectorPath = "/audit" loggerWebhookCollectorPath collectorPath = "/logger/webhook" replicationCollectorPath collectorPath = "/replication" notificationCollectorPath collectorPath = "/notification" scannerCollectorPath collectorPath = "/scanner" ) const ( clusterBasePath = "/cluster" ) type metricsV3Collection struct { mgMap map[collectorPath]*MetricsGroup bucketMGMap map[collectorPath]*MetricsGroup // Gatherers for non-bucket MetricsGroup's mgGatherers map[collectorPath]prometheus.Gatherer collectorPaths []collectorPath } func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { // Create all metric groups. 
apiRequestsMG := NewMetricsGroup(apiRequestsCollectorPath, []MetricDescriptor{ apiRejectedAuthTotalMD, apiRejectedHeaderTotalMD, apiRejectedTimestampTotalMD, apiRejectedInvalidTotalMD, apiRequestsWaitingTotalMD, apiRequestsIncomingTotalMD, apiRequestsInFlightTotalMD, apiRequestsTotalMD, apiRequestsErrorsTotalMD, apiRequests5xxErrorsTotalMD, apiRequests4xxErrorsTotalMD, apiRequestsCanceledTotalMD, apiRequestsTTFBSecondsDistributionMD, apiTrafficSentBytesMD, apiTrafficRecvBytesMD, }, JoinLoaders(loadAPIRequestsHTTPMetrics, loadAPIRequestsTTFBMetrics, loadAPIRequestsNetworkMetrics), ) bucketAPIMG := NewBucketMetricsGroup(bucketAPICollectorPath, []MetricDescriptor{ bucketAPITrafficRecvBytesMD, bucketAPITrafficSentBytesMD, bucketAPIRequestsInFlightMD, bucketAPIRequestsTotalMD, bucketAPIRequestsCanceledMD, bucketAPIRequests4xxErrorsMD, bucketAPIRequests5xxErrorsMD, bucketAPIRequestsTTFBSecondsDistributionMD, }, JoinBucketLoaders(loadBucketAPIHTTPMetrics, loadBucketAPITTFBMetrics), ) bucketReplicationMG := NewBucketMetricsGroup(bucketReplicationCollectorPath, []MetricDescriptor{ bucketReplLastHrFailedBytesMD, bucketReplLastHrFailedCountMD, bucketReplLastMinFailedBytesMD, bucketReplLastMinFailedCountMD, bucketReplLatencyMsMD, bucketReplProxiedDeleteTaggingRequestsTotalMD, bucketReplProxiedGetRequestsFailuresMD, bucketReplProxiedGetRequestsTotalMD, bucketReplProxiedGetTaggingRequestsFailuresMD, bucketReplProxiedGetTaggingRequestsTotalMD, bucketReplProxiedHeadRequestsFailuresMD, bucketReplProxiedHeadRequestsTotalMD, bucketReplProxiedPutTaggingRequestsFailuresMD, bucketReplProxiedPutTaggingRequestsTotalMD, bucketReplSentBytesMD, bucketReplSentCountMD, bucketReplTotalFailedBytesMD, bucketReplTotalFailedCountMD, bucketReplProxiedDeleteTaggingRequestsFailuresMD, }, loadBucketReplicationMetrics, ) systemNetworkInternodeMG := NewMetricsGroup(systemNetworkInternodeCollectorPath, []MetricDescriptor{ internodeErrorsTotalMD, internodeDialedErrorsTotalMD, internodeDialAvgTimeNanosMD, 
internodeSentBytesTotalMD, internodeRecvBytesTotalMD, }, loadNetworkInternodeMetrics, ) systemMemoryMG := NewMetricsGroup(systemMemoryCollectorPath, []MetricDescriptor{ memTotalMD, memUsedMD, memFreeMD, memAvailableMD, memBuffersMD, memCacheMD, memSharedMD, memUsedPercMD, }, loadMemoryMetrics, ) systemCPUMG := NewMetricsGroup(systemCPUCollectorPath, []MetricDescriptor{ sysCPUAvgIdleMD, sysCPUAvgIOWaitMD, sysCPULoadMD, sysCPULoadPercMD, sysCPUNiceMD, sysCPUStealMD, sysCPUSystemMD, sysCPUUserMD, }, loadCPUMetrics, ) systemProcessMG := NewMetricsGroup(systemProcessCollectorPath, []MetricDescriptor{ processLocksReadTotalMD, processLocksWriteTotalMD, processCPUTotalSecondsMD, processGoRoutineTotalMD, processIORCharBytesMD, processIOReadBytesMD, processIOWCharBytesMD, processIOWriteBytesMD, processStarttimeSecondsMD, processUptimeSecondsMD, processFileDescriptorLimitTotalMD, processFileDescriptorOpenTotalMD, processSyscallReadTotalMD, processSyscallWriteTotalMD, processResidentMemoryBytesMD, processVirtualMemoryBytesMD, processVirtualMemoryMaxBytesMD, }, loadProcessMetrics, ) systemDriveMG := NewMetricsGroup(systemDriveCollectorPath, []MetricDescriptor{ driveUsedBytesMD, driveFreeBytesMD, driveTotalBytesMD, driveUsedInodesMD, driveFreeInodesMD, driveTotalInodesMD, driveTimeoutErrorsMD, driveIOErrorsMD, driveAvailabilityErrorsMD, driveWaitingIOMD, driveAPILatencyMD, driveHealthMD, driveOfflineCountMD, driveOnlineCountMD, driveCountMD, // iostat related driveReadsPerSecMD, driveReadsKBPerSecMD, driveReadsAwaitMD, driveWritesPerSecMD, driveWritesKBPerSecMD, driveWritesAwaitMD, drivePercUtilMD, }, loadDriveMetrics, ) clusterHealthMG := NewMetricsGroup(clusterHealthCollectorPath, []MetricDescriptor{ healthDrivesOfflineCountMD, healthDrivesOnlineCountMD, healthDrivesCountMD, healthNodesOfflineCountMD, healthNodesOnlineCountMD, healthCapacityRawTotalBytesMD, healthCapacityRawFreeBytesMD, healthCapacityUsableTotalBytesMD, healthCapacityUsableFreeBytesMD, }, 
JoinLoaders(loadClusterHealthDriveMetrics, loadClusterHealthNodeMetrics, loadClusterHealthCapacityMetrics), ) clusterUsageObjectsMG := NewMetricsGroup(clusterUsageObjectsCollectorPath, []MetricDescriptor{ usageSinceLastUpdateSecondsMD, usageTotalBytesMD, usageObjectsCountMD, usageVersionsCountMD, usageDeleteMarkersCountMD, usageBucketsCountMD, usageObjectsDistributionMD, usageVersionsDistributionMD, }, loadClusterUsageObjectMetrics, ) clusterUsageBucketsMG := NewMetricsGroup(clusterUsageBucketsCollectorPath, []MetricDescriptor{ usageSinceLastUpdateSecondsMD, usageBucketTotalBytesMD, usageBucketObjectsTotalMD, usageBucketVersionsCountMD, usageBucketDeleteMarkersCountMD, usageBucketQuotaTotalBytesMD, usageBucketObjectSizeDistributionMD, usageBucketObjectVersionCountDistributionMD, }, loadClusterUsageBucketMetrics, ) clusterErasureSetMG := NewMetricsGroup(clusterErasureSetCollectorPath, []MetricDescriptor{ erasureSetOverallWriteQuorumMD, erasureSetOverallHealthMD, erasureSetReadQuorumMD, erasureSetWriteQuorumMD, erasureSetOnlineDrivesCountMD, erasureSetHealingDrivesCountMD, erasureSetHealthMD, erasureSetReadToleranceMD, erasureSetWriteToleranceMD, erasureSetReadHealthMD, erasureSetWriteHealthMD, }, loadClusterErasureSetMetrics, ) clusterNotificationMG := NewMetricsGroup(notificationCollectorPath, []MetricDescriptor{ notificationCurrentSendInProgressMD, notificationEventsErrorsTotalMD, notificationEventsSentTotalMD, notificationEventsSkippedTotalMD, }, loadClusterNotificationMetrics, ) clusterIAMMG := NewMetricsGroup(clusterIAMCollectorPath, []MetricDescriptor{ lastSyncDurationMillisMD, pluginAuthnServiceFailedRequestsMinuteMD, pluginAuthnServiceLastFailSecondsMD, pluginAuthnServiceLastSuccSecondsMD, pluginAuthnServiceSuccAvgRttMsMinuteMD, pluginAuthnServiceSuccMaxRttMsMinuteMD, pluginAuthnServiceTotalRequestsMinuteMD, sinceLastSyncMillisMD, syncFailuresMD, syncSuccessesMD, }, loadClusterIAMMetrics, ) clusterReplicationMG := NewMetricsGroup(replicationCollectorPath, 
[]MetricDescriptor{ replicationAverageActiveWorkersMD, replicationAverageQueuedBytesMD, replicationAverageQueuedCountMD, replicationAverageDataTransferRateMD, replicationCurrentActiveWorkersMD, replicationCurrentDataTransferRateMD, replicationLastMinuteQueuedBytesMD, replicationLastMinuteQueuedCountMD, replicationMaxActiveWorkersMD, replicationMaxQueuedBytesMD, replicationMaxQueuedCountMD, replicationMaxDataTransferRateMD, }, loadClusterReplicationMetrics, ) clusterConfigMG := NewMetricsGroup(clusterConfigCollectorPath, []MetricDescriptor{ configRRSParityMD, configStandardParityMD, }, loadClusterConfigMetrics, ) scannerMG := NewMetricsGroup(scannerCollectorPath, []MetricDescriptor{ scannerBucketScansFinishedMD, scannerBucketScansStartedMD, scannerDirectoriesScannedMD, scannerObjectsScannedMD, scannerVersionsScannedMD, scannerLastActivitySecondsMD, }, loadClusterScannerMetrics, ) loggerWebhookMG := NewMetricsGroup(loggerWebhookCollectorPath, []MetricDescriptor{ webhookFailedMessagesMD, webhookQueueLengthMD, webhookTotalMessagesMD, }, loadLoggerWebhookMetrics, ) auditMG := NewMetricsGroup(auditCollectorPath, []MetricDescriptor{ auditFailedMessagesMD, auditTargetQueueLengthMD, auditTotalMessagesMD, }, loadAuditMetrics, ) ilmMG := NewMetricsGroup(ilmCollectorPath, []MetricDescriptor{ ilmExpiryPendingTasksMD, ilmTransitionActiveTasksMD, ilmTransitionPendingTasksMD, ilmTransitionMissedImmediateTasksMD, ilmVersionsScannedMD, }, loadILMMetrics, ) allMetricGroups := []*MetricsGroup{ apiRequestsMG, bucketAPIMG, bucketReplicationMG, systemNetworkInternodeMG, systemDriveMG, systemMemoryMG, systemCPUMG, systemProcessMG, clusterHealthMG, clusterUsageObjectsMG, clusterUsageBucketsMG, clusterErasureSetMG, clusterNotificationMG, clusterIAMMG, clusterReplicationMG, clusterConfigMG, ilmMG, scannerMG, auditMG, loggerWebhookMG, } // Bucket metrics are special, they always include the bucket label. 
These // metrics required a list of buckets to be passed to the loader, and the list // of buckets is not known until the request is made. So we keep a separate // map for bucket metrics and handle them specially. // Add the serverName and poolIndex labels to all non-cluster metrics. // // Also create metric group maps and set the cache. metricsCache := newMetricsCache() mgMap := make(map[collectorPath]*MetricsGroup) bucketMGMap := make(map[collectorPath]*MetricsGroup) for _, mg := range allMetricGroups { if !strings.HasPrefix(string(mg.CollectorPath), clusterBasePath) { mg.AddExtraLabels( serverName, globalLocalNodeName, // poolIndex, strconv.Itoa(globalLocalPoolIdx), ) } mg.SetCache(metricsCache) if mg.IsBucketMetricsGroup() { bucketMGMap[mg.CollectorPath] = mg } else { mgMap[mg.CollectorPath] = mg } } // Prepare to register the collectors. Other than `MetricGroup` collectors, // we also have standard collectors like `GoCollector`. // Create all Non-`MetricGroup` collectors here. collectors := map[collectorPath]prometheus.Collector{ debugGoCollectorPath: collectors.NewGoCollector(), } // Add all `MetricGroup` collectors to the map. for _, mg := range allMetricGroups { collectors[mg.CollectorPath] = mg } // Helper function to register a collector and return a gatherer for it. mustRegister := func(c ...prometheus.Collector) prometheus.Gatherer { subRegistry := prometheus.NewRegistry() for _, col := range c { subRegistry.MustRegister(col) } r.MustRegister(subRegistry) return subRegistry } // Register all collectors and create gatherers for them. gatherers := make(map[collectorPath]prometheus.Gatherer, len(collectors)) collectorPaths := make([]collectorPath, 0, len(collectors)) for path, collector := range collectors { gatherers[path] = mustRegister(collector) collectorPaths = append(collectorPaths, path) } slices.Sort(collectorPaths) return &metricsV3Collection{ mgMap: mgMap, bucketMGMap: bucketMGMap, mgGatherers: gatherers, collectorPaths: collectorPaths, } }