add two more drive metrics when metrics is available (#17854)

This commit is contained in:
Harshavardhana 2023-08-15 10:55:47 -07:00 committed by GitHub
parent 406ea4f281
commit c4ca0a5a57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 104 additions and 98 deletions

View File

@ -28,7 +28,6 @@ import (
"time"
"github.com/minio/kes-go"
"github.com/minio/madmin-go/v3"
"github.com/minio/minio/internal/bucket/lifecycle"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/mcontext"
@ -83,7 +82,6 @@ func init() {
nodeGroups := []*MetricsGroup{
getNodeHealthMetrics(),
getLocalDriveStorageMetrics(),
getCacheMetrics(),
getHTTPMetrics(false),
getNetworkMetrics(),
@ -129,7 +127,7 @@ const (
cacheSubsystem MetricSubsystem = "cache"
capacityRawSubsystem MetricSubsystem = "capacity_raw"
capacityUsableSubsystem MetricSubsystem = "capacity_usable"
diskSubsystem MetricSubsystem = "disk"
driveSubsystem MetricSubsystem = "drive"
storageClassSubsystem MetricSubsystem = "storage_class"
fileDescriptorSubsystem MetricSubsystem = "file_descriptor"
goRoutines MetricSubsystem = "go_routine"
@ -379,7 +377,7 @@ func getClusterCapacityUsageFreeBytesMD() MetricDescription {
func getNodeDriveAPILatencyMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: latencyMicroSec,
Help: "Average last minute latency in µs for drive API storage operations",
Type: gaugeMetric,
@ -389,17 +387,37 @@ func getNodeDriveAPILatencyMD() MetricDescription {
// getNodeDriveUsedBytesMD returns the description of the node-namespace gauge
// that reports the total storage used on a drive.
func getNodeDriveUsedBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: usedBytes,
Help: "Total storage used on a drive",
Type: gaugeMetric,
}
}
// getNodeDriveTimeoutErrorsMD builds the description of the node-namespace
// drive counter that tracks timeout errors seen since the server started.
func getNodeDriveTimeoutErrorsMD() MetricDescription {
	var md MetricDescription
	md.Namespace = nodeMetricNamespace
	md.Subsystem = driveSubsystem
	md.Name = "errors_timeout"
	md.Help = "Total number of timeout errors since server start"
	md.Type = counterMetric
	return md
}
// getNodeDriveAvailablityErrorsMD builds the description of the node-namespace
// drive counter that tracks availability errors (I/O errors, permission denied
// and timeouts) seen since the server started.
//
// NOTE: the identifier spells "Availablity"; it is kept as-is because callers
// reference this exact name.
func getNodeDriveAvailablityErrorsMD() MetricDescription {
	var md MetricDescription
	md.Namespace = nodeMetricNamespace
	md.Subsystem = driveSubsystem
	md.Name = "errors_availability"
	md.Help = "Total number of I/O errors, permission denied and timeouts since server start"
	md.Type = counterMetric
	return md
}
func getNodeDriveFreeBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: freeBytes,
Help: "Total storage available on a drive",
Type: gaugeMetric,
@ -409,9 +427,9 @@ func getNodeDriveFreeBytesMD() MetricDescription {
// getClusterDrivesOfflineTotalMD returns the description of the
// cluster-namespace gauge that counts the drives currently offline.
func getClusterDrivesOfflineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: offlineTotal,
Help: "Total drives offline",
Help: "Total drives offline in this cluster",
Type: gaugeMetric,
}
}
@ -419,9 +437,9 @@ func getClusterDrivesOfflineTotalMD() MetricDescription {
// getClusterDrivesOnlineTotalMD returns the description of the
// cluster-namespace gauge that counts the drives currently online.
func getClusterDrivesOnlineTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: onlineTotal,
Help: "Total drives online",
Help: "Total drives online in this cluster",
Type: gaugeMetric,
}
}
@ -429,9 +447,9 @@ func getClusterDrivesOnlineTotalMD() MetricDescription {
// getClusterDrivesTotalMD returns the description of the cluster-namespace
// gauge that reports the total number of drives.
func getClusterDrivesTotalMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: total,
Help: "Total drives",
Help: "Total drives in this cluster",
Type: gaugeMetric,
}
}
@ -439,9 +457,9 @@ func getClusterDrivesTotalMD() MetricDescription {
// getNodeDrivesOfflineTotalMD returns the description of the node-namespace
// gauge that counts the drives currently offline.
func getNodeDrivesOfflineTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: offlineTotal,
Help: "Total drives offline",
Help: "Total drives offline in this node",
Type: gaugeMetric,
}
}
@ -449,9 +467,9 @@ func getNodeDrivesOfflineTotalMD() MetricDescription {
// getNodeDrivesOnlineTotalMD returns the description of the node-namespace
// gauge that counts the drives currently online.
func getNodeDrivesOnlineTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: onlineTotal,
Help: "Total drives online",
Help: "Total drives online in this node",
Type: gaugeMetric,
}
}
@ -459,9 +477,9 @@ func getNodeDrivesOnlineTotalMD() MetricDescription {
// getNodeDrivesTotalMD returns the description of the node-namespace gauge
// that reports the total number of drives.
func getNodeDrivesTotalMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: total,
Help: "Total drives",
Help: "Total drives in this node",
Type: gaugeMetric,
}
}
@ -489,7 +507,7 @@ func getNodeRRSParityMD() MetricDescription {
func getNodeDrivesFreeInodes() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: freeInodes,
Help: "Total free inodes",
Type: gaugeMetric,
@ -499,7 +517,7 @@ func getNodeDrivesFreeInodes() MetricDescription {
func getNodeDriveTotalBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: diskSubsystem,
Subsystem: driveSubsystem,
Name: totalBytes,
Help: "Total storage on a drive",
Type: gaugeMetric,
@ -1888,22 +1906,22 @@ func getCacheMetrics() *MetricsGroup {
metrics = append(metrics, Metric{
Description: getCacheUsagePercentMD(),
Value: float64(cdStats.UsagePercent),
VariableLabels: map[string]string{"disk": cdStats.Dir},
VariableLabels: map[string]string{"drive": cdStats.Dir},
})
metrics = append(metrics, Metric{
Description: getCacheUsageInfoMD(),
Value: float64(cdStats.UsageState),
VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
VariableLabels: map[string]string{"drive": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
})
metrics = append(metrics, Metric{
Description: getCacheUsedBytesMD(),
Value: float64(cdStats.UsageSize),
VariableLabels: map[string]string{"disk": cdStats.Dir},
VariableLabels: map[string]string{"drive": cdStats.Dir},
})
metrics = append(metrics, Metric{
Description: getCacheTotalBytesMD(),
Value: float64(cdStats.TotalCapacity),
VariableLabels: map[string]string{"disk": cdStats.Dir},
VariableLabels: map[string]string{"drive": cdStats.Dir},
})
}
return
@ -2560,26 +2578,48 @@ func getLocalStorageMetrics() *MetricsGroup {
metrics = append(metrics, Metric{
Description: getNodeDriveUsedBytesMD(),
Value: float64(disk.UsedSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveFreeBytesMD(),
Value: float64(disk.AvailableSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveTotalBytesMD(),
Value: float64(disk.TotalSpace),
VariableLabels: map[string]string{"disk": disk.DrivePath},
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDrivesFreeInodes(),
Value: float64(disk.FreeInodes),
VariableLabels: map[string]string{"disk": disk.DrivePath},
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
if disk.Metrics != nil {
metrics = append(metrics, Metric{
Description: getNodeDriveTimeoutErrorsMD(),
Value: float64(disk.Metrics.TotalErrorsTimeout),
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveAvailablityErrorsMD(),
Value: float64(disk.Metrics.TotalErrorsAvailability),
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
for apiName, latency := range disk.Metrics.LastMinute {
metrics = append(metrics, Metric{
Description: getNodeDriveAPILatencyMD(),
Value: float64(latency.Avg().Microseconds()),
VariableLabels: map[string]string{"drive": disk.DrivePath, "api": "storage." + apiName},
})
}
}
}
metrics = append(metrics, Metric{
@ -2612,39 +2652,6 @@ func getLocalStorageMetrics() *MetricsGroup {
return mg
}
// getLocalDriveStorageMetrics builds a metrics group, cached for one minute,
// whose reader emits the last-minute average latency (in microseconds) of each
// storage API call for every local drive that carries metrics.
func getLocalDriveStorageMetrics() *MetricsGroup {
	group := &MetricsGroup{cacheInterval: time.Minute}

	readLatencies := func(ctx context.Context) (metrics []Metric) {
		layer := newObjectLayerFn()
		if layer == nil {
			// Object layer is not initialized yet; nothing to report.
			return nil
		}

		info := layer.LocalStorageInfo(ctx)
		if info.Backend.Type == madmin.FS {
			// No drive latency metrics are produced for the FS backend type.
			return nil
		}

		metrics = make([]Metric, 0, 50)
		for _, drive := range info.Disks {
			// Drives without a metrics snapshot contribute nothing.
			if drive.Metrics != nil {
				for api, lat := range drive.Metrics.LastMinute {
					labels := map[string]string{
						"disk": drive.DrivePath,
						"api":  "storage." + api,
					}
					metrics = append(metrics, Metric{
						Description:    getNodeDriveAPILatencyMD(),
						Value:          float64(lat.Avg().Microseconds()),
						VariableLabels: labels,
					})
				}
			}
		}
		return metrics
	}

	group.RegisterRead(readLatencies)
	return group
}
func getClusterWriteQuorumMD() MetricDescription {
return MetricDescription{
Namespace: clusterMetricNamespace,

View File

@ -984,7 +984,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "minio_cluster_disk_online_total{job=\"$scrape_jobs\"}",
"expr": "minio_cluster_drive_online_total{job=\"$scrape_jobs\"}",
"format": "table",
"hide": false,
"instant": true,
@ -1418,7 +1418,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "minio_cluster_disk_offline_total{job=\"$scrape_jobs\"}",
"expr": "minio_cluster_drive_offline_total{job=\"$scrape_jobs\"}",
"format": "table",
"hide": false,
"instant": true,
@ -2389,7 +2389,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "minio_node_disk_used_bytes{job=\"$scrape_jobs\"}",
"expr": "minio_node_drive_used_bytes{job=\"$scrape_jobs\"}",
"format": "time_series",
"instant": false,
"interval": "",
@ -2479,7 +2479,7 @@
"uid": "${DS_PROMETHEUS}"
},
"exemplar": true,
"expr": "minio_node_disk_free_inodes{job=\"$scrape_jobs\"}",
"expr": "minio_node_drive_free_inodes{job=\"$scrape_jobs\"}",
"format": "time_series",
"instant": false,
"interval": "",
@ -2905,4 +2905,4 @@
"uid": "TgmJnqnnk",
"version": 1,
"weekStart": ""
}
}

View File

@ -27,9 +27,9 @@ These metrics can be obtained from any MinIO server once per collection.
| `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster |
| `minio_cluster_usage_total_bytes` | Total cluster usage in bytes |
| `minio_cluster_buckets_total` | Total number of buckets in the cluster |
| `minio_cluster_disk_offline_total` | Total drives offline. |
| `minio_cluster_disk_online_total` | Total drives online. |
| `minio_cluster_disk_total` | Total drives. |
| `minio_cluster_drive_offline_total` | Total drives offline in this cluster. |
| `minio_cluster_drive_online_total` | Total drives online in this cluster. |
| `minio_cluster_drive_total` | Total drives in this cluster. |
| `minio_cluster_ilm_transitioned_bytes` | Total bytes transitioned to a tier. |
| `minio_cluster_ilm_transitioned_objects` | Total number of objects transitioned to a tier. |
| `minio_cluster_ilm_transitioned_versions` | Total number of versions transitioned to a tier. |
@ -51,15 +51,34 @@ These metrics can be obtained from any MinIO server once per collection.
| `minio_inter_node_traffic_errors_total` | Total number of failed internode calls. |
| `minio_inter_node_traffic_received_bytes` | Total number of bytes received from other peer nodes. |
| `minio_inter_node_traffic_sent_bytes` | Total number of bytes sent to the other peer nodes. |
| `minio_minio_update_percent` | Total percentage cache usage. |
| `minio_node_disk_free_bytes` | Total storage available on a drive. |
| `minio_node_disk_free_inodes` | Total free inodes. |
| `minio_node_disk_latency_us` | Average last minute latency in µs for drive API storage operations. |
| `minio_node_disk_offline_total` | Total drives offline. |
| `minio_node_disk_online_total` | Total drives online. |
| `minio_node_disk_total` | Total drives. |
| `minio_node_disk_total_bytes` | Total storage on a drive. |
| `minio_node_disk_used_bytes` | Total storage used on a drive. |
| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. |
| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. |
| `minio_s3_requests_4xx_errors_total` | Total number S3 requests with (4xx) errors. |
| `minio_s3_requests_5xx_errors_total` | Total number S3 requests with (5xx) errors. |
| `minio_s3_requests_canceled_total` | Total number S3 requests canceled by the client. |
| `minio_s3_requests_errors_total` | Total number S3 requests with (4xx and 5xx) errors. |
| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. |
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
| `minio_s3_requests_rejected_auth_total` | Total number S3 requests rejected for auth failure. |
| `minio_s3_requests_rejected_header_total` | Total number S3 requests rejected for invalid header. |
| `minio_s3_requests_rejected_invalid_total` | Total number S3 invalid requests. |
| `minio_s3_requests_rejected_timestamp_total` | Total number S3 requests rejected for invalid timestamp. |
| `minio_s3_requests_total` | Total number S3 requests. |
| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. |
| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
| `minio_s3_traffic_received_bytes` | Total number of s3 bytes received. |
| `minio_s3_traffic_sent_bytes` | Total number of s3 bytes sent. |
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
| `minio_software_version_info` | MinIO Release tag for the server. |
| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last scan activity. |
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
| `minio_node_drive_free_inodes` | Total free inodes. |
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
| `minio_node_drive_offline_total` | Total drives offline in this node. |
| `minio_node_drive_online_total` | Total drives online in this node. |
| `minio_node_drive_total` | Total drives in this node. |
| `minio_node_drive_total_bytes` | Total storage on a drive. |
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
| `minio_node_file_descriptor_limit_total` | Limit on total number of open file descriptors for the MinIO Server process. |
| `minio_node_file_descriptor_open_total` | Total number of open file descriptors by the MinIO Server process. |
| `minio_node_go_routine_total` | Total number of go routines running. |
@ -86,26 +105,6 @@ These metrics can be obtained from any MinIO server once per collection.
| `minio_node_scanner_versions_scanned` | Total number of object versions scanned since server start. |
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr. |
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw. |
| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. |
| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. |
| `minio_s3_requests_4xx_errors_total` | Total number S3 requests with (4xx) errors. |
| `minio_s3_requests_5xx_errors_total` | Total number S3 requests with (5xx) errors. |
| `minio_s3_requests_canceled_total` | Total number S3 requests canceled by the client. |
| `minio_s3_requests_errors_total` | Total number S3 requests with (4xx and 5xx) errors. |
| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. |
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
| `minio_s3_requests_rejected_auth_total` | Total number S3 requests rejected for auth failure. |
| `minio_s3_requests_rejected_header_total` | Total number S3 requests rejected for invalid header. |
| `minio_s3_requests_rejected_invalid_total` | Total number S3 invalid requests. |
| `minio_s3_requests_rejected_timestamp_total` | Total number S3 requests rejected for invalid timestamp. |
| `minio_s3_requests_total` | Total number S3 requests. |
| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. |
| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
| `minio_s3_traffic_received_bytes` | Total number of s3 bytes received. |
| `minio_s3_traffic_sent_bytes` | Total number of s3 bytes sent. |
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
| `minio_software_version_info` | MinIO Release tag for the server. |
| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nano seconds) since last scan activity. |
# List of metrics exported per bucket level