mirror of
https://github.com/minio/minio.git
synced 2024-12-24 06:05:55 -05:00
add two more drive metrics when metrics is available (#17854)
This commit is contained in:
parent
406ea4f281
commit
c4ca0a5a57
@ -28,7 +28,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/minio/kes-go"
|
||||
"github.com/minio/madmin-go/v3"
|
||||
"github.com/minio/minio/internal/bucket/lifecycle"
|
||||
"github.com/minio/minio/internal/logger"
|
||||
"github.com/minio/minio/internal/mcontext"
|
||||
@ -83,7 +82,6 @@ func init() {
|
||||
|
||||
nodeGroups := []*MetricsGroup{
|
||||
getNodeHealthMetrics(),
|
||||
getLocalDriveStorageMetrics(),
|
||||
getCacheMetrics(),
|
||||
getHTTPMetrics(false),
|
||||
getNetworkMetrics(),
|
||||
@ -129,7 +127,7 @@ const (
|
||||
cacheSubsystem MetricSubsystem = "cache"
|
||||
capacityRawSubsystem MetricSubsystem = "capacity_raw"
|
||||
capacityUsableSubsystem MetricSubsystem = "capacity_usable"
|
||||
diskSubsystem MetricSubsystem = "disk"
|
||||
driveSubsystem MetricSubsystem = "drive"
|
||||
storageClassSubsystem MetricSubsystem = "storage_class"
|
||||
fileDescriptorSubsystem MetricSubsystem = "file_descriptor"
|
||||
goRoutines MetricSubsystem = "go_routine"
|
||||
@ -379,7 +377,7 @@ func getClusterCapacityUsageFreeBytesMD() MetricDescription {
|
||||
func getNodeDriveAPILatencyMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: latencyMicroSec,
|
||||
Help: "Average last minute latency in µs for drive API storage operations",
|
||||
Type: gaugeMetric,
|
||||
@ -389,17 +387,37 @@ func getNodeDriveAPILatencyMD() MetricDescription {
|
||||
func getNodeDriveUsedBytesMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: usedBytes,
|
||||
Help: "Total storage used on a drive",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getNodeDriveTimeoutErrorsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: "errors_timeout",
|
||||
Help: "Total number of timeout errors since server start",
|
||||
Type: counterMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getNodeDriveAvailablityErrorsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: "errors_availability",
|
||||
Help: "Total number of I/O errors, permission denied and timeouts since server start",
|
||||
Type: counterMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getNodeDriveFreeBytesMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: freeBytes,
|
||||
Help: "Total storage available on a drive",
|
||||
Type: gaugeMetric,
|
||||
@ -409,9 +427,9 @@ func getNodeDriveFreeBytesMD() MetricDescription {
|
||||
func getClusterDrivesOfflineTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: offlineTotal,
|
||||
Help: "Total drives offline",
|
||||
Help: "Total drives offline in this cluster",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -419,9 +437,9 @@ func getClusterDrivesOfflineTotalMD() MetricDescription {
|
||||
func getClusterDrivesOnlineTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: onlineTotal,
|
||||
Help: "Total drives online",
|
||||
Help: "Total drives online in this cluster",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -429,9 +447,9 @@ func getClusterDrivesOnlineTotalMD() MetricDescription {
|
||||
func getClusterDrivesTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: total,
|
||||
Help: "Total drives",
|
||||
Help: "Total drives in this cluster",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -439,9 +457,9 @@ func getClusterDrivesTotalMD() MetricDescription {
|
||||
func getNodeDrivesOfflineTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: offlineTotal,
|
||||
Help: "Total drives offline",
|
||||
Help: "Total drives offline in this node",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -449,9 +467,9 @@ func getNodeDrivesOfflineTotalMD() MetricDescription {
|
||||
func getNodeDrivesOnlineTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: onlineTotal,
|
||||
Help: "Total drives online",
|
||||
Help: "Total drives online in this node",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -459,9 +477,9 @@ func getNodeDrivesOnlineTotalMD() MetricDescription {
|
||||
func getNodeDrivesTotalMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: total,
|
||||
Help: "Total drives",
|
||||
Help: "Total drives in this node",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
@ -489,7 +507,7 @@ func getNodeRRSParityMD() MetricDescription {
|
||||
func getNodeDrivesFreeInodes() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: freeInodes,
|
||||
Help: "Total free inodes",
|
||||
Type: gaugeMetric,
|
||||
@ -499,7 +517,7 @@ func getNodeDrivesFreeInodes() MetricDescription {
|
||||
func getNodeDriveTotalBytesMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: diskSubsystem,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: totalBytes,
|
||||
Help: "Total storage on a drive",
|
||||
Type: gaugeMetric,
|
||||
@ -1888,22 +1906,22 @@ func getCacheMetrics() *MetricsGroup {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getCacheUsagePercentMD(),
|
||||
Value: float64(cdStats.UsagePercent),
|
||||
VariableLabels: map[string]string{"disk": cdStats.Dir},
|
||||
VariableLabels: map[string]string{"drive": cdStats.Dir},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getCacheUsageInfoMD(),
|
||||
Value: float64(cdStats.UsageState),
|
||||
VariableLabels: map[string]string{"disk": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
|
||||
VariableLabels: map[string]string{"drive": cdStats.Dir, "level": cdStats.GetUsageLevelString()},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getCacheUsedBytesMD(),
|
||||
Value: float64(cdStats.UsageSize),
|
||||
VariableLabels: map[string]string{"disk": cdStats.Dir},
|
||||
VariableLabels: map[string]string{"drive": cdStats.Dir},
|
||||
})
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getCacheTotalBytesMD(),
|
||||
Value: float64(cdStats.TotalCapacity),
|
||||
VariableLabels: map[string]string{"disk": cdStats.Dir},
|
||||
VariableLabels: map[string]string{"drive": cdStats.Dir},
|
||||
})
|
||||
}
|
||||
return
|
||||
@ -2560,26 +2578,48 @@ func getLocalStorageMetrics() *MetricsGroup {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveUsedBytesMD(),
|
||||
Value: float64(disk.UsedSpace),
|
||||
VariableLabels: map[string]string{"disk": disk.DrivePath},
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveFreeBytesMD(),
|
||||
Value: float64(disk.AvailableSpace),
|
||||
VariableLabels: map[string]string{"disk": disk.DrivePath},
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveTotalBytesMD(),
|
||||
Value: float64(disk.TotalSpace),
|
||||
VariableLabels: map[string]string{"disk": disk.DrivePath},
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDrivesFreeInodes(),
|
||||
Value: float64(disk.FreeInodes),
|
||||
VariableLabels: map[string]string{"disk": disk.DrivePath},
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
if disk.Metrics != nil {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveTimeoutErrorsMD(),
|
||||
Value: float64(disk.Metrics.TotalErrorsTimeout),
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveAvailablityErrorsMD(),
|
||||
Value: float64(disk.Metrics.TotalErrorsAvailability),
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
for apiName, latency := range disk.Metrics.LastMinute {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveAPILatencyMD(),
|
||||
Value: float64(latency.Avg().Microseconds()),
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath, "api": "storage." + apiName},
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metrics = append(metrics, Metric{
|
||||
@ -2612,39 +2652,6 @@ func getLocalStorageMetrics() *MetricsGroup {
|
||||
return mg
|
||||
}
|
||||
|
||||
func getLocalDriveStorageMetrics() *MetricsGroup {
|
||||
mg := &MetricsGroup{
|
||||
cacheInterval: 1 * time.Minute,
|
||||
}
|
||||
mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
|
||||
objLayer := newObjectLayerFn()
|
||||
// Service not initialized yet
|
||||
if objLayer == nil {
|
||||
return
|
||||
}
|
||||
|
||||
storageInfo := objLayer.LocalStorageInfo(ctx)
|
||||
if storageInfo.Backend.Type == madmin.FS {
|
||||
return
|
||||
}
|
||||
metrics = make([]Metric, 0, 50)
|
||||
for _, disk := range storageInfo.Disks {
|
||||
if disk.Metrics == nil {
|
||||
continue
|
||||
}
|
||||
for apiName, latency := range disk.Metrics.LastMinute {
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getNodeDriveAPILatencyMD(),
|
||||
Value: float64(latency.Avg().Microseconds()),
|
||||
VariableLabels: map[string]string{"disk": disk.DrivePath, "api": "storage." + apiName},
|
||||
})
|
||||
}
|
||||
}
|
||||
return
|
||||
})
|
||||
return mg
|
||||
}
|
||||
|
||||
func getClusterWriteQuorumMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
|
@ -984,7 +984,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"exemplar": true,
|
||||
"expr": "minio_cluster_disk_online_total{job=\"$scrape_jobs\"}",
|
||||
"expr": "minio_cluster_drive_online_total{job=\"$scrape_jobs\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -1418,7 +1418,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"exemplar": true,
|
||||
"expr": "minio_cluster_disk_offline_total{job=\"$scrape_jobs\"}",
|
||||
"expr": "minio_cluster_drive_offline_total{job=\"$scrape_jobs\"}",
|
||||
"format": "table",
|
||||
"hide": false,
|
||||
"instant": true,
|
||||
@ -2389,7 +2389,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"exemplar": true,
|
||||
"expr": "minio_node_disk_used_bytes{job=\"$scrape_jobs\"}",
|
||||
"expr": "minio_node_drive_used_bytes{job=\"$scrape_jobs\"}",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
@ -2479,7 +2479,7 @@
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
},
|
||||
"exemplar": true,
|
||||
"expr": "minio_node_disk_free_inodes{job=\"$scrape_jobs\"}",
|
||||
"expr": "minio_node_drive_free_inodes{job=\"$scrape_jobs\"}",
|
||||
"format": "time_series",
|
||||
"instant": false,
|
||||
"interval": "",
|
||||
@ -2905,4 +2905,4 @@
|
||||
"uid": "TgmJnqnnk",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
}
|
||||
|
@ -27,9 +27,9 @@ These metrics can be obtained from any MinIO server once per collection.
|
||||
| `minio_cluster_usage_deletemarker_total` | Total number of delete markers in a cluster |
|
||||
| `minio_cluster_usage_total_bytes` | Total cluster usage in bytes |
|
||||
| `minio_cluster_buckets_total` | Total number of buckets in the cluster |
|
||||
| `minio_cluster_disk_offline_total` | Total drives offline. |
|
||||
| `minio_cluster_disk_online_total` | Total drives online. |
|
||||
| `minio_cluster_disk_total` | Total drives. |
|
||||
| `minio_cluster_drive_offline_total` | Total drives offline in this cluster. |
|
||||
| `minio_cluster_drive_online_total` | Total drives online in this cluster. |
|
||||
| `minio_cluster_drive_total` | Total drives in this cluster. |
|
||||
| `minio_cluster_ilm_transitioned_bytes` | Total bytes transitioned to a tier. |
|
||||
| `minio_cluster_ilm_transitioned_objects` | Total number of objects transitioned to a tier. |
|
||||
| `minio_cluster_ilm_transitioned_versions` | Total number of versions transitioned to a tier. |
|
||||
@ -51,15 +51,34 @@ These metrics can be obtained from any MinIO server once per collection.
|
||||
| `minio_inter_node_traffic_errors_total` | Total number of failed internode calls. |
|
||||
| `minio_inter_node_traffic_received_bytes` | Total number of bytes received from other peer nodes. |
|
||||
| `minio_inter_node_traffic_sent_bytes` | Total number of bytes sent to the other peer nodes. |
|
||||
| `minio_minio_update_percent` | Total percentage cache usage. |
|
||||
| `minio_node_disk_free_bytes` | Total storage available on a drive. |
|
||||
| `minio_node_disk_free_inodes` | Total free inodes. |
|
||||
| `minio_node_disk_latency_us` | Average last minute latency in µs for drive API storage operations. |
|
||||
| `minio_node_disk_offline_total` | Total drives offline. |
|
||||
| `minio_node_disk_online_total` | Total drives online. |
|
||||
| `minio_node_disk_total` | Total drives. |
|
||||
| `minio_node_disk_total_bytes` | Total storage on a drive. |
|
||||
| `minio_node_disk_used_bytes` | Total storage used on a drive. |
|
||||
| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. |
|
||||
| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. |
|
||||
| `minio_s3_requests_4xx_errors_total` | Total number of S3 requests with (4xx) errors. |
|
||||
| `minio_s3_requests_5xx_errors_total` | Total number of S3 requests with (5xx) errors. |
|
||||
| `minio_s3_requests_canceled_total` | Total number of S3 requests canceled by the client. |
|
||||
| `minio_s3_requests_errors_total` | Total number of S3 requests with (4xx and 5xx) errors. |
|
||||
| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. |
|
||||
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
|
||||
| `minio_s3_requests_rejected_auth_total` | Total number of S3 requests rejected for auth failure. |
|
||||
| `minio_s3_requests_rejected_header_total` | Total number of S3 requests rejected for invalid header. |
|
||||
| `minio_s3_requests_rejected_invalid_total` | Total number of invalid S3 requests. |
|
||||
| `minio_s3_requests_rejected_timestamp_total` | Total number of S3 requests rejected for invalid timestamp. |
|
||||
| `minio_s3_requests_total` | Total number of S3 requests. |
|
||||
| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. |
|
||||
| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
|
||||
| `minio_s3_traffic_received_bytes` | Total number of S3 bytes received. |
|
||||
| `minio_s3_traffic_sent_bytes` | Total number of S3 bytes sent. |
|
||||
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
|
||||
| `minio_software_version_info` | MinIO Release tag for the server. |
|
||||
| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last scan activity. |
|
||||
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
|
||||
| `minio_node_drive_free_inodes` | Total free inodes. |
|
||||
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
|
||||
| `minio_node_drive_offline_total` | Total drives offline in this node. |
|
||||
| `minio_node_drive_online_total` | Total drives online in this node. |
|
||||
| `minio_node_drive_total` | Total drives in this node. |
|
||||
| `minio_node_drive_total_bytes` | Total storage on a drive. |
|
||||
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
|
||||
| `minio_node_file_descriptor_limit_total` | Limit on total number of open file descriptors for the MinIO Server process. |
|
||||
| `minio_node_file_descriptor_open_total` | Total number of open file descriptors by the MinIO Server process. |
|
||||
| `minio_node_go_routine_total` | Total number of go routines running. |
|
||||
@ -86,26 +105,6 @@ These metrics can be obtained from any MinIO server once per collection.
|
||||
| `minio_node_scanner_versions_scanned` | Total number of object versions scanned since server start. |
|
||||
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr. |
|
||||
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw. |
|
||||
| `minio_notify_current_send_in_progress` | Number of concurrent async Send calls active to all targets. |
|
||||
| `minio_notify_target_queue_length` | Number of unsent notifications in queue for target. |
|
||||
| `minio_s3_requests_4xx_errors_total` | Total number of S3 requests with (4xx) errors. |
|
||||
| `minio_s3_requests_5xx_errors_total` | Total number of S3 requests with (5xx) errors. |
|
||||
| `minio_s3_requests_canceled_total` | Total number of S3 requests canceled by the client. |
|
||||
| `minio_s3_requests_errors_total` | Total number of S3 requests with (4xx and 5xx) errors. |
|
||||
| `minio_s3_requests_incoming_total` | Volatile number of total incoming S3 requests. |
|
||||
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
|
||||
| `minio_s3_requests_rejected_auth_total` | Total number of S3 requests rejected for auth failure. |
|
||||
| `minio_s3_requests_rejected_header_total` | Total number of S3 requests rejected for invalid header. |
|
||||
| `minio_s3_requests_rejected_invalid_total` | Total number of invalid S3 requests. |
|
||||
| `minio_s3_requests_rejected_timestamp_total` | Total number of S3 requests rejected for invalid timestamp. |
|
||||
| `minio_s3_requests_total` | Total number of S3 requests. |
|
||||
| `minio_s3_requests_waiting_total` | Number of S3 requests in the waiting queue. |
|
||||
| `minio_s3_requests_ttfb_seconds_distribution` | Distribution of the time to first byte across API calls. |
|
||||
| `minio_s3_traffic_received_bytes` | Total number of S3 bytes received. |
|
||||
| `minio_s3_traffic_sent_bytes` | Total number of S3 bytes sent. |
|
||||
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
|
||||
| `minio_software_version_info` | MinIO Release tag for the server. |
|
||||
| `minio_usage_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since last scan activity. |
|
||||
|
||||
# List of metrics exported per bucket level
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user