diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 0b59164cb..d307acb13 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -541,6 +541,16 @@ func getNodeDriveTimeoutErrorsMD() MetricDescription { } } +func getNodeDriveIOErrorsMD() MetricDescription { + return MetricDescription{ + Namespace: nodeMetricNamespace, + Subsystem: driveSubsystem, + Name: "errors_ioerror", + Help: "Total number of drive I/O errors since server start", + Type: counterMetric, + } +} + func getNodeDriveAvailabilityErrorsMD() MetricDescription { return MetricDescription{ Namespace: nodeMetricNamespace, @@ -3521,6 +3531,12 @@ func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { VariableLabels: map[string]string{"drive": disk.DrivePath}, }) + metrics = append(metrics, MetricV2{ + Description: getNodeDriveIOErrorsMD(), + Value: float64(disk.Metrics.TotalErrorsAvailability - disk.Metrics.TotalErrorsTimeout), + VariableLabels: map[string]string{"drive": disk.DrivePath}, + }) + metrics = append(metrics, MetricV2{ Description: getNodeDriveAvailabilityErrorsMD(), Value: float64(disk.Metrics.TotalErrorsAvailability), diff --git a/cmd/metrics-v3-system-drive.go b/cmd/metrics-v3-system-drive.go index e1c9bd211..6c3c25548 100644 --- a/cmd/metrics-v3-system-drive.go +++ b/cmd/metrics-v3-system-drive.go @@ -47,6 +47,7 @@ const ( driveFreeInodes = "free_inodes" driveTotalInodes = "total_inodes" driveTimeoutErrorsTotal = "timeout_errors_total" + driveIOErrorsTotal = "io_errors_total" driveAvailabilityErrorsTotal = "availability_errors_total" driveWaitingIO = "waiting_io" driveAPILatencyMicros = "api_latency_micros" @@ -82,6 +83,8 @@ var ( "Total inodes available on a drive", allDriveLabels...) driveTimeoutErrorsMD = NewCounterMD(driveTimeoutErrorsTotal, "Total timeout errors on a drive", allDriveLabels...) + driveIOErrorsMD = NewCounterMD(driveIOErrorsTotal, + "Total I/O errors on a drive", allDriveLabels...) 
driveAvailabilityErrorsMD = NewCounterMD(driveAvailabilityErrorsTotal, "Total availability errors (I/O errors, timeouts) on a drive", allDriveLabels...) @@ -167,6 +170,7 @@ func (m *MetricValues) setDriveAPIMetrics(disk madmin.Disk, labels []string) { } m.Set(driveTimeoutErrorsTotal, float64(disk.Metrics.TotalErrorsTimeout), labels...) + m.Set(driveIOErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability-disk.Metrics.TotalErrorsTimeout), labels...) m.Set(driveAvailabilityErrorsTotal, float64(disk.Metrics.TotalErrorsAvailability), labels...) m.Set(driveWaitingIO, float64(disk.Metrics.TotalWaiting), labels...) diff --git a/cmd/metrics-v3.go b/cmd/metrics-v3.go index 67deecfde..c0c2e19ed 100644 --- a/cmd/metrics-v3.go +++ b/cmd/metrics-v3.go @@ -153,6 +153,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection { driveFreeInodesMD, driveTotalInodesMD, driveTimeoutErrorsMD, + driveIOErrorsMD, driveAvailabilityErrorsMD, driveWaitingIOMD, driveAPILatencyMD, diff --git a/docs/metrics/prometheus/list.md b/docs/metrics/prometheus/list.md index 4647f9e82..17c4bd771 100644 --- a/docs/metrics/prometheus/list.md +++ b/docs/metrics/prometheus/list.md @@ -194,19 +194,20 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc ## Drive Metrics -| Name | Description | -|:---------------------------------------|:------------------------------------------------------------------------------------| -| `minio_node_drive_free_bytes` | Total storage available on a drive. | -| `minio_node_drive_free_inodes` | Total free inodes. | -| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. | -| `minio_node_drive_offline_total` | Total drives offline in this node. | -| `minio_node_drive_online_total` | Total drives online in this node. | -| `minio_node_drive_total` | Total drives in this node. | -| `minio_node_drive_total_bytes` | Total storage on a drive. 
| -| `minio_node_drive_used_bytes` | Total storage used on a drive. | -| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start | -| `minio_node_drive_errors_availability` | Total number of drive I/O errors, permission denied and timeouts since server start | -| `minio_node_drive_io_waiting` | Total number I/O operations waiting on drive | +| Name | Description | +|:---------------------------------------|:--------------------------------------------------------------------| +| `minio_node_drive_free_bytes` | Total storage available on a drive. | +| `minio_node_drive_free_inodes` | Total free inodes. | +| `minio_node_drive_latency_us` | Average last minute latency in µs for drive API storage operations. | +| `minio_node_drive_offline_total` | Total drives offline in this node. | +| `minio_node_drive_online_total` | Total drives online in this node. | +| `minio_node_drive_total` | Total drives in this node. | +| `minio_node_drive_total_bytes` | Total storage on a drive. | +| `minio_node_drive_used_bytes` | Total storage used on a drive. 
| +| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start | +| `minio_node_drive_errors_ioerror` | Total number of drive I/O errors since server start | +| `minio_node_drive_errors_availability` | Total number of drive I/O errors and timeouts since server start | +| `minio_node_drive_io_waiting` | Total number of I/O operations waiting on drive | ## Identity and Access Management (IAM) Metrics diff --git a/docs/metrics/v3.md b/docs/metrics/v3.md index 294f5ee83..38713704d 100644 --- a/docs/metrics/v3.md +++ b/docs/metrics/v3.md @@ -110,6 +110,7 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b | `minio_system_drive_free_inodes` | `gauge` | Total free inodes on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_total_inodes` | `gauge` | Total inodes available on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_timeout_errors_total` | `counter` | Total timeout errors on a drive | `drive,set_index,drive_index,pool_index,server` | +| `minio_system_drive_io_errors_total` | `counter` | Total I/O errors on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_availability_errors_total` | `counter` | Total availability errors (I/O errors, timeouts) on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_waiting_io` | `gauge` | Total waiting I/O operations on a drive | `drive,set_index,drive_index,pool_index,server` | | `minio_system_drive_api_latency_micros` | `gauge` | Average last minute latency in µs for drive API storage operations | `drive,api,set_index,drive_index,pool_index,server` |