mirror of https://github.com/minio/minio.git
add metrics ioerror counter for alerts on I/O errors (#19618)
This commit is contained in:
parent
9a3c992d7a
commit
c54ffde568
|
@ -541,6 +541,16 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription {
|
|||
}
|
||||
}
|
||||
|
||||
// getNodeDriveIOErrorsMD returns the MetricDescription for the drive
// "errors_ioerror" metric: a counter, registered under nodeMetricNamespace
// and driveSubsystem, whose help text describes it as the total number of
// drive I/O errors since server start.
func getNodeDriveIOErrorsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
Subsystem: driveSubsystem,
|
||||
Name: "errors_ioerror",
|
||||
Help: "Total number of drive I/O errors since server start",
|
||||
Type: counterMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getNodeDriveAvailabilityErrorsMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: nodeMetricNamespace,
|
||||
|
@ -3521,6 +3531,12 @@ func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
|
|||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, MetricV2{
|
||||
Description: getNodeDriveIOErrorsMD(),
|
||||
Value: float64(disk.Metrics.TotalErrorsAvailability - disk.Metrics.TotalErrorsTimeout),
|
||||
VariableLabels: map[string]string{"drive": disk.DrivePath},
|
||||
})
|
||||
|
||||
metrics = append(metrics, MetricV2{
|
||||
Description: getNodeDriveAvailabilityErrorsMD(),
|
||||
Value: float64(disk.Metrics.TotalErrorsAvailability),
|
||||
|
|
|
@ -47,6 +47,7 @@ const (
|
|||
driveFreeInodes = "free_inodes"
|
||||
driveTotalInodes = "total_inodes"
|
||||
driveTimeoutErrorsTotal = "timeout_errors_total"
|
||||
driveIOErrorsTotal = "io_errors_total"
|
||||
driveAvailabilityErrorsTotal = "availability_errors_total"
|
||||
driveWaitingIO = "waiting_io"
|
||||
driveAPILatencyMicros = "api_latency_micros"
|
||||
|
@ -82,6 +83,8 @@ var (
|
|||
"Total inodes available on a drive", allDriveLabels...)
|
||||
driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal,
|
||||
"Total timeout errors on a drive", allDriveLabels...)
|
||||
driveIOErrorsMD = NewCounterMD(driveIOErrorsTotal,
|
||||
"Total I/O errors on a drive", allDriveLabels...)
|
||||
driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal,
|
||||
"Total availability errors (I/O errors, timeouts) on a drive",
|
||||
allDriveLabels...)
|
||||
|
@ -167,6 +170,7 @@ func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) {
|
|||
}
|
||||
|
||||
m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...)
|
||||
m.Set(driveIOErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability-disk.Metrics.TotalErrorsTimeout), labels...)
|
||||
m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...)
|
||||
m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...)
|
||||
|
||||
|
|
|
@ -153,6 +153,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
|
|||
driveFreeInodesMD,
|
||||
driveTotalInodesMD,
|
||||
driveTimeoutErrorsMD,
|
||||
driveIOErrorsMD,
|
||||
driveAvailabilityErrorsMD,
|
||||
driveWaitingIOMD,
|
||||
driveAPILatencyMD,
|
||||
|
|
|
@ -194,19 +194,20 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc
|
|||
|
||||
## Drive Metrics
|
||||
|
||||
| Name | Description |
|
||||
|:---------------------------------------|:------------------------------------------------------------------------------------|
|
||||
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
|
||||
| `minio_node_drive_free_inodes` | Total free inodes. |
|
||||
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
|
||||
| `minio_node_drive_offline_total` | Total drives offline in this node. |
|
||||
| `minio_node_drive_online_total` | Total drives online in this node. |
|
||||
| `minio_node_drive_total` | Total drives in this node. |
|
||||
| `minio_node_drive_total_bytes` | Total storage on a drive. |
|
||||
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
|
||||
| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start |
|
||||
| `minio_node_drive_errors_availability` | Total number of drive I/O errors, permission denied and timeouts since server start |
|
||||
| `minio_node_drive_io_waiting` | Total number I/O operations waiting on drive |
|
||||
| Name | Description |
|
||||
|:---------------------------------------|:--------------------------------------------------------------------|
|
||||
| `minio_node_drive_free_bytes` | Total storage available on a drive. |
|
||||
| `minio_node_drive_free_inodes` | Total free inodes. |
|
||||
| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. |
|
||||
| `minio_node_drive_offline_total` | Total drives offline in this node. |
|
||||
| `minio_node_drive_online_total` | Total drives online in this node. |
|
||||
| `minio_node_drive_total` | Total drives in this node. |
|
||||
| `minio_node_drive_total_bytes` | Total storage on a drive. |
|
||||
| `minio_node_drive_used_bytes` | Total storage used on a drive. |
|
||||
| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start |
|
||||
| `minio_node_drive_errors_ioerror` | Total number of drive I/O errors since server start |
|
||||
| `minio_node_drive_errors_availability` | Total number of drive I/O errors and timeouts since server start |
|
||||
| `minio_node_drive_io_waiting` | Total number of I/O operations waiting on drive |
|
||||
|
||||
## Identity and Access Management (IAM) Metrics
|
||||
|
||||
|
|
|
@ -110,6 +110,7 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
|
|||
| `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_io_errors_total` | `counter` | Total I/O errors on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` |
|
||||
| `minio_system_drive_api_latency_micros` | `gauge` | Average last minute latency in µs for drive API storage operations | `drive,api,set_index,drive_index,pool_index,server` |
|
||||
|
|
Loading…
Reference in New Issue