mirror of https://github.com/minio/minio.git
fix: prometheus metrics disks_online count when disks are down (#11689)
Prometheus metrics were reporting the total disk count instead of the online disk count when disks were down. This PR fixes that and also adds a new metric for the total disk count (`minio_cluster_disk_total`).
This commit is contained in:
parent
690434514d
commit
2c198ae7b6
|
@ -72,6 +72,7 @@ const (
|
||||||
type MetricName string
|
type MetricName string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
total MetricName = "total"
|
||||||
errorsTotal MetricName = "error_total"
|
errorsTotal MetricName = "error_total"
|
||||||
healTotal MetricName = "heal_total"
|
healTotal MetricName = "heal_total"
|
||||||
hitsTotal MetricName = "hits_total"
|
hitsTotal MetricName = "hits_total"
|
||||||
|
@ -85,7 +86,6 @@ const (
|
||||||
openTotal MetricName = "open_total"
|
openTotal MetricName = "open_total"
|
||||||
readTotal MetricName = "read_total"
|
readTotal MetricName = "read_total"
|
||||||
writeTotal MetricName = "write_total"
|
writeTotal MetricName = "write_total"
|
||||||
total MetricName = "total"
|
|
||||||
|
|
||||||
failedBytes MetricName = "failed_bytes"
|
failedBytes MetricName = "failed_bytes"
|
||||||
freeBytes MetricName = "free_bytes"
|
freeBytes MetricName = "free_bytes"
|
||||||
|
@ -254,7 +254,7 @@ func getNodeDiskFreeBytesMD() MetricDescription {
|
||||||
Type: gaugeMetric,
|
Type: gaugeMetric,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func getClusterDiskOfflineTotalMD() MetricDescription {
|
func getClusterDisksOfflineTotalMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: diskSubsystem,
|
Subsystem: diskSubsystem,
|
||||||
|
@ -264,7 +264,7 @@ func getClusterDiskOfflineTotalMD() MetricDescription {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getClusterDiskOnlineTotalMD() MetricDescription {
|
func getClusterDisksOnlineTotalMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: clusterMetricNamespace,
|
Namespace: clusterMetricNamespace,
|
||||||
Subsystem: diskSubsystem,
|
Subsystem: diskSubsystem,
|
||||||
|
@ -274,6 +274,16 @@ func getClusterDiskOnlineTotalMD() MetricDescription {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getClusterDisksTotalMD() MetricDescription {
|
||||||
|
return MetricDescription{
|
||||||
|
Namespace: clusterMetricNamespace,
|
||||||
|
Subsystem: diskSubsystem,
|
||||||
|
Name: total,
|
||||||
|
Help: "Total disks.",
|
||||||
|
Type: gaugeMetric,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func getNodeDiskTotalBytesMD() MetricDescription {
|
func getNodeDiskTotalBytesMD() MetricDescription {
|
||||||
return MetricDescription{
|
return MetricDescription{
|
||||||
Namespace: nodeMetricNamespace,
|
Namespace: nodeMetricNamespace,
|
||||||
|
@ -1142,7 +1152,7 @@ func getClusterStorageMetrics() MetricsGroup {
|
||||||
// Fetch disk space info, ignore errors
|
// Fetch disk space info, ignore errors
|
||||||
storageInfo, _ := objLayer.StorageInfo(ctx)
|
storageInfo, _ := objLayer.StorageInfo(ctx)
|
||||||
onlineDisks, offlineDisks := getOnlineOfflineDisksStats(storageInfo.Disks)
|
onlineDisks, offlineDisks := getOnlineOfflineDisksStats(storageInfo.Disks)
|
||||||
totalDisks := offlineDisks.Merge(onlineDisks)
|
totalDisks := onlineDisks.Merge(offlineDisks)
|
||||||
|
|
||||||
metrics.Metrics = append(metrics.Metrics, Metric{
|
metrics.Metrics = append(metrics.Metrics, Metric{
|
||||||
Description: getClusterCapacityTotalBytesMD(),
|
Description: getClusterCapacityTotalBytesMD(),
|
||||||
|
@ -1165,12 +1175,17 @@ func getClusterStorageMetrics() MetricsGroup {
|
||||||
})
|
})
|
||||||
|
|
||||||
metrics.Metrics = append(metrics.Metrics, Metric{
|
metrics.Metrics = append(metrics.Metrics, Metric{
|
||||||
Description: getClusterDiskOfflineTotalMD(),
|
Description: getClusterDisksOfflineTotalMD(),
|
||||||
Value: float64(offlineDisks.Sum()),
|
Value: float64(offlineDisks.Sum()),
|
||||||
})
|
})
|
||||||
|
|
||||||
metrics.Metrics = append(metrics.Metrics, Metric{
|
metrics.Metrics = append(metrics.Metrics, Metric{
|
||||||
Description: getClusterDiskOnlineTotalMD(),
|
Description: getClusterDisksOnlineTotalMD(),
|
||||||
|
Value: float64(onlineDisks.Sum()),
|
||||||
|
})
|
||||||
|
|
||||||
|
metrics.Metrics = append(metrics.Metrics, Metric{
|
||||||
|
Description: getClusterDisksTotalMD(),
|
||||||
Value: float64(totalDisks.Sum()),
|
Value: float64(totalDisks.Sum()),
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
|
|
|
@ -5,52 +5,53 @@ Each metric has a label for the server that generated the metric.
|
||||||
|
|
||||||
These metrics can be from any MinIO server once per collection.
|
These metrics can be from any MinIO server once per collection.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|:-----------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------|
|
|:---------------------------------------------|:--------------------------------------------------------------------------------------------------------------------|
|
||||||
|`minio_bucket_objects_size_distribution` |Distribution of object sizes in the bucket, includes label for the bucket name. |
|
| `minio_bucket_objects_size_distribution` | Distribution of object sizes in the bucket, includes label for the bucket name. |
|
||||||
|`minio_bucket_replication_failed_bytes` |Total number of bytes failed at least once to replicate. |
|
| `minio_bucket_replication_failed_bytes` | Total number of bytes failed at least once to replicate. |
|
||||||
|`minio_bucket_replication_pending_bytes` |Total bytes pending to replicate. |
|
| `minio_bucket_replication_pending_bytes` | Total bytes pending to replicate. |
|
||||||
|`minio_bucket_replication_received_bytes` |Total number of bytes replicated to this bucket from another source bucket. |
|
| `minio_bucket_replication_received_bytes` | Total number of bytes replicated to this bucket from another source bucket. |
|
||||||
|`minio_bucket_replication_sent_bytes` |Total number of bytes replicated to the target bucket. |
|
| `minio_bucket_replication_sent_bytes` | Total number of bytes replicated to the target bucket. |
|
||||||
|`minio_bucket_usage_object_total` |Total number of objects |
|
| `minio_bucket_usage_object_total` | Total number of objects |
|
||||||
|`minio_bucket_usage_total_bytes` |Total bucket size in bytes |
|
| `minio_bucket_usage_total_bytes` | Total bucket size in bytes |
|
||||||
|`minio_cluster_capacity_raw_free_bytes` |Total free capacity online in the cluster. |
|
| `minio_cluster_capacity_raw_free_bytes` | Total free capacity online in the cluster. |
|
||||||
|`minio_cluster_capacity_raw_total_bytes` |Total capacity online in the cluster. |
|
| `minio_cluster_capacity_raw_total_bytes` | Total capacity online in the cluster. |
|
||||||
|`minio_cluster_capacity_usable_free_bytes` |Total free usable capacity online in the cluster. |
|
| `minio_cluster_capacity_usable_free_bytes` | Total free usable capacity online in the cluster. |
|
||||||
|`minio_cluster_capacity_usable_total_bytes` |Total usable capacity online in the cluster. |
|
| `minio_cluster_capacity_usable_total_bytes` | Total usable capacity online in the cluster. |
|
||||||
|`minio_cluster_disk_offline_total` |Total disks offline. |
|
| `minio_cluster_disk_total` | Total disks. |
|
||||||
|`minio_cluster_disk_online_total` |Total disks online. |
|
| `minio_cluster_disk_offline_total` | Total disks offline. |
|
||||||
|`minio_cluster_nodes_offline_total` |Total number of MinIO nodes offline. |
|
| `minio_cluster_disk_online_total` | Total disks online. |
|
||||||
|`minio_cluster_nodes_online_total` |Total number of MinIO nodes online. |
|
| `minio_cluster_nodes_offline_total` | Total number of MinIO nodes offline. |
|
||||||
|`minio_heal_objects_error_total` |Objects for which healing failed in current self healing run |
|
| `minio_cluster_nodes_online_total` | Total number of MinIO nodes online. |
|
||||||
|`minio_heal_objects_heal_total` |Objects healed in current self healing run |
|
| `minio_heal_objects_error_total` | Objects for which healing failed in current self healing run |
|
||||||
|`minio_heal_objects_total` |Objects scanned in current self healing run |
|
| `minio_heal_objects_heal_total` | Objects healed in current self healing run |
|
||||||
|`minio_heal_time_last_activity_nano_seconds` |Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity |
|
| `minio_heal_objects_total` | Objects scanned in current self healing run |
|
||||||
|`minio_inter_node_traffic_received_bytes` |Total number of bytes received from other peer nodes. |
|
| `minio_heal_time_last_activity_nano_seconds` | Time elapsed (in nanoseconds) since the last self-healing activity. This is set to -1 until the initial self-heal activity. |
|
||||||
|`minio_inter_node_traffic_sent_bytes` |Total number of bytes sent to the other peer nodes. |
|
| `minio_inter_node_traffic_received_bytes` | Total number of bytes received from other peer nodes. |
|
||||||
|`minio_node_disk_free_bytes` |Total storage available on a disk. |
|
| `minio_inter_node_traffic_sent_bytes` | Total number of bytes sent to the other peer nodes. |
|
||||||
|`minio_node_disk_total_bytes` |Total storage on a disk. |
|
| `minio_node_disk_free_bytes` | Total storage available on a disk. |
|
||||||
|`minio_node_disk_used_bytes` |Total storage used on a disk. |
|
| `minio_node_disk_total_bytes` | Total storage on a disk. |
|
||||||
|`minio_node_file_descriptor_limit_total` |Limit on total number of open file descriptors for the MinIO Server process. |
|
| `minio_node_disk_used_bytes` | Total storage used on a disk. |
|
||||||
|`minio_node_file_descriptor_open_total` |Total number of open file descriptors by the MinIO Server process. |
|
| `minio_node_file_descriptor_limit_total` | Limit on total number of open file descriptors for the MinIO Server process. |
|
||||||
|`minio_node_io_rchar_bytes` |Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar |
|
| `minio_node_file_descriptor_open_total` | Total number of open file descriptors by the MinIO Server process. |
|
||||||
|`minio_node_io_read_bytes` |Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes |
|
| `minio_node_io_rchar_bytes` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar |
|
||||||
|`minio_node_io_wchar_bytes` |Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar |
|
| `minio_node_io_read_bytes` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes |
|
||||||
|`minio_node_io_write_bytes` |Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes |
|
| `minio_node_io_wchar_bytes` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar |
|
||||||
|`minio_node_process_starttime_seconds` |Start time for MinIO process per node in seconds. |
|
| `minio_node_io_write_bytes` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes |
|
||||||
|`minio_node_syscall_read_total` |Total read SysCalls to the kernel. /proc/[pid]/io syscr |
|
| `minio_node_process_starttime_seconds` | Start time for MinIO process per node in seconds. |
|
||||||
|`minio_node_syscall_write_total` |Total write SysCalls to the kernel. /proc/[pid]/io syscw |
|
| `minio_node_syscall_read_total` | Total read SysCalls to the kernel. /proc/[pid]/io syscr |
|
||||||
|`minio_s3_requests_error_total` |Total number S3 requests with errors |
|
| `minio_node_syscall_write_total` | Total write SysCalls to the kernel. /proc/[pid]/io syscw |
|
||||||
|`minio_s3_requests_inflight_total` |Total number of S3 requests currently in flight. |
|
| `minio_s3_requests_error_total` | Total number of S3 requests with errors |
|
||||||
|`minio_s3_requests_total` |Total number S3 requests |
|
| `minio_s3_requests_inflight_total` | Total number of S3 requests currently in flight. |
|
||||||
|`minio_s3_time_ttbf_seconds_distribution` |Distribution of the time to first byte across API calls. |
|
| `minio_s3_requests_total` | Total number of S3 requests |
|
||||||
|`minio_s3_traffic_received_bytes` |Total number of s3 bytes received. |
|
| `minio_s3_time_ttbf_seconds_distribution` | Distribution of the time to first byte across API calls. |
|
||||||
|`minio_s3_traffic_sent_bytes` |Total number of s3 bytes sent |
|
| `minio_s3_traffic_received_bytes` | Total number of s3 bytes received. |
|
||||||
|`minio_cache_hits_total` |Total number of disk cache hits |
|
| `minio_s3_traffic_sent_bytes` | Total number of s3 bytes sent |
|
||||||
|`minio_cache_missed_total` |Total number of disk cache misses |
|
| `minio_cache_hits_total` | Total number of disk cache hits |
|
||||||
|`minio_cache_sent_bytes` |Total number of bytes served from cache |
|
| `minio_cache_missed_total` | Total number of disk cache misses |
|
||||||
|`minio_cache_total_bytes` |Total size of cache disk in bytes |
|
| `minio_cache_sent_bytes` | Total number of bytes served from cache |
|
||||||
|`minio_cache_usage_info` |Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well |
|
| `minio_cache_total_bytes` | Total size of cache disk in bytes |
|
||||||
|`minio_cache_used_bytes` |Current cache usage in bytes |
|
| `minio_cache_usage_info` | Total percentage cache usage, value of 1 indicates high and 0 low, label level is set as well |
|
||||||
|`minio_software_commit_info` |Git commit hash for the MinIO release. |
|
| `minio_cache_used_bytes` | Current cache usage in bytes |
|
||||||
|`minio_software_version_info` |MinIO Release tag for the server |
|
| `minio_software_commit_info` | Git commit hash for the MinIO release. |
|
||||||
|
| `minio_software_version_info` | MinIO Release tag for the server |
|
||||||
|
|
Loading…
Reference in New Issue