Add open FD and FD limit to cluster metrics (#11328)

This commit is contained in:
Ritesh H Shukla 2021-01-22 18:30:16 -08:00 committed by GitHub
parent 43f973c4cf
commit 7575c24037
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 223 additions and 28 deletions

View File

@ -20,6 +20,7 @@ import (
"context" "context"
"fmt" "fmt"
"net/http" "net/http"
"runtime"
"strings" "strings"
"sync" "sync"
"time" "time"
@ -28,6 +29,7 @@ import (
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
dto "github.com/prometheus/client_model/go" dto "github.com/prometheus/client_model/go"
"github.com/prometheus/procfs"
) )
// MetricNamespace is top level grouping of metrics to create the metric name. // MetricNamespace is top level grouping of metrics to create the metric name.
@ -51,43 +53,55 @@ const (
capacityRawSubsystem MetricSubsystem = "capacity_raw" capacityRawSubsystem MetricSubsystem = "capacity_raw"
capacityUsableSubsystem MetricSubsystem = "capacity_usable" capacityUsableSubsystem MetricSubsystem = "capacity_usable"
diskSubsystem MetricSubsystem = "disk" diskSubsystem MetricSubsystem = "disk"
goRoutines MetricSubsystem = "go_routine"
nodesSubsystem MetricSubsystem = "nodes" nodesSubsystem MetricSubsystem = "nodes"
objectsSubsystem MetricSubsystem = "objects" objectsSubsystem MetricSubsystem = "objects"
fileDescriptorSubsystem MetricSubsystem = "file_descriptor"
ioSubsystem MetricSubsystem = "io"
replicationSubsystem MetricSubsystem = "replication" replicationSubsystem MetricSubsystem = "replication"
requestsSubsystem MetricSubsystem = "requests" requestsSubsystem MetricSubsystem = "requests"
timeSubsystem MetricSubsystem = "time" timeSubsystem MetricSubsystem = "time"
trafficSubsystem MetricSubsystem = "traffic" trafficSubsystem MetricSubsystem = "traffic"
sysCallSubsystem MetricSubsystem = "syscall"
usageSubsystem MetricSubsystem = "usage" usageSubsystem MetricSubsystem = "usage"
softwareSubsystem MetricSubsystem = "software" softwareSubsystem MetricSubsystem = "software"
) )
// MetricNames are the individual names for the metric. // MetricName are the individual names for the metric.
type MetricNames string type MetricName string
const ( const (
errorsTotal MetricNames = "error_total" errorsTotal MetricName = "error_total"
healTotal MetricNames = "heal_total" healTotal MetricName = "heal_total"
hitsTotal MetricNames = "hits_total" hitsTotal MetricName = "hits_total"
inflightTotal MetricNames = "inflight_total" inflightTotal MetricName = "inflight_total"
missedTotal MetricNames = "missed_total" limitTotal MetricName = "limit_total"
objectTotal MetricNames = "object_total" missedTotal MetricName = "missed_total"
offlineTotal MetricNames = "offline_total" objectTotal MetricName = "object_total"
onlineTotal MetricNames = "online_total" offlineTotal MetricName = "offline_total"
total MetricNames = "total" onlineTotal MetricName = "online_total"
openTotal MetricName = "open_total"
readTotal MetricName = "read_total"
writeTotal MetricName = "write_total"
total MetricName = "total"
failedBytes MetricNames = "failed_bytes" failedBytes MetricName = "failed_bytes"
freeBytes MetricNames = "free_bytes" freeBytes MetricName = "free_bytes"
pendingBytes MetricNames = "pending_bytes" pendingBytes MetricName = "pending_bytes"
receivedBytes MetricNames = "received_bytes" readBytes MetricName = "read_bytes"
sentBytes MetricNames = "sent_bytes" rcharBytes MetricName = "rchar_bytes"
totalBytes MetricNames = "total_bytes" receivedBytes MetricName = "received_bytes"
usedBytes MetricNames = "used_bytes" sentBytes MetricName = "sent_bytes"
totalBytes MetricName = "total_bytes"
usedBytes MetricName = "used_bytes"
writeBytes MetricName = "write_bytes"
wcharBytes MetricName = "wchar_bytes"
usagePercent MetricNames = "update_percent" usagePercent MetricName = "update_percent"
commitInfo MetricNames = "commit_info" commitInfo MetricName = "commit_info"
usageInfo MetricNames = "usage_info" usageInfo MetricName = "usage_info"
versionInfo MetricNames = "version_info" versionInfo MetricName = "version_info"
sizeDistribution = "size_distribution" sizeDistribution = "size_distribution"
ttfbDistribution = "ttbf_seconds_distribution" ttfbDistribution = "ttbf_seconds_distribution"
@ -112,7 +126,7 @@ const (
type MetricDescription struct { type MetricDescription struct {
Namespace MetricNamespace `json:"MetricNamespace"` Namespace MetricNamespace `json:"MetricNamespace"`
Subsystem MetricSubsystem `json:"Subsystem"` Subsystem MetricSubsystem `json:"Subsystem"`
Name MetricNames `json:"MetricNames"` Name MetricName `json:"MetricName"`
Help string `json:"Help"` Help string `json:"Help"`
Type GaugeMetricType `json:"Type"` Type GaugeMetricType `json:"Type"`
} }
@ -157,12 +171,14 @@ func GetAllGenerators() []MetricsGenerator {
// GetGeneratorsForPeer - gets the generators to report to peer. // GetGeneratorsForPeer - gets the generators to report to peer.
func GetGeneratorsForPeer() []MetricsGenerator { func GetGeneratorsForPeer() []MetricsGenerator {
g := []MetricsGenerator{ g := []MetricsGenerator{
getLocalStorageMetrics, getCacheMetrics,
getMinioVersionMetrics, getGoMetrics,
getHTTPMetrics, getHTTPMetrics,
getLocalStorageMetrics,
getMinioProcMetrics,
getMinioVersionMetrics,
getNetworkMetrics, getNetworkMetrics,
getS3TTFBMetric, getS3TTFBMetric,
getCacheMetrics,
} }
return g return g
} }
@ -534,7 +550,168 @@ func getS3TTFBDistributionMD() MetricDescription {
Type: gaugeMetric, Type: gaugeMetric,
} }
} }
// getMinioFDOpenMD builds the description of the gauge that reports how many
// file descriptors the MinIO server process currently has open (listed in the
// metrics docs as `minio_node_file_descriptor_open_total`).
func getMinioFDOpenMD() MetricDescription {
	desc := MetricDescription{
		Type:      gaugeMetric,
		Help:      "Total number of open file descriptors by the MinIO Server process.",
		Name:      openTotal,
		Subsystem: fileDescriptorSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioFDLimitMD builds the description of the gauge that reports the
// limit on open file descriptors for the MinIO server process (listed in the
// metrics docs as `minio_node_file_descriptor_limit_total`).
func getMinioFDLimitMD() MetricDescription {
	desc := MetricDescription{
		Type:      gaugeMetric,
		Help:      "Limit on total number of open file descriptors for the MinIO Server process.",
		Name:      limitTotal,
		Subsystem: fileDescriptorSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioProcessIOWriteBytesMD builds the description of the counter fed from
// the write_bytes field of /proc/[pid]/io (listed in the metrics docs as
// `minio_node_io_write_bytes`).
func getMinioProcessIOWriteBytesMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes",
		Name:      writeBytes,
		Subsystem: ioSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioProcessIOReadBytesMD builds the description of the counter fed from
// the read_bytes field of /proc/[pid]/io (listed in the metrics docs as
// `minio_node_io_read_bytes`).
func getMinioProcessIOReadBytesMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes",
		Name:      readBytes,
		Subsystem: ioSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioProcessIOWriteCachedBytesMD builds the description of the counter
// fed from the wchar field of /proc/[pid]/io, i.e. bytes written including
// page cache (listed in the metrics docs as `minio_node_io_wchar_bytes`).
func getMinioProcessIOWriteCachedBytesMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar",
		Name:      wcharBytes,
		Subsystem: ioSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioProcessIOReadCachedBytesMD builds the description of the counter fed
// from the rchar field of /proc/[pid]/io, i.e. bytes read including cache
// (listed in the metrics docs as `minio_node_io_rchar_bytes`).
func getMinioProcessIOReadCachedBytesMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar",
		Name:      rcharBytes,
		Subsystem: ioSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinIOProcessSysCallRMD builds the description of the counter fed from the
// syscr field of /proc/[pid]/io, the number of read syscalls (listed in the
// metrics docs as `minio_node_syscall_read_total`).
func getMinIOProcessSysCallRMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total read SysCalls to the kernel. /proc/[pid]/io syscr",
		Name:      readTotal,
		Subsystem: sysCallSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinIOProcessSysCallWMD builds the description of the counter fed from the
// syscw field of /proc/[pid]/io, the number of write syscalls (listed in the
// metrics docs as `minio_node_syscall_write_total`).
func getMinIOProcessSysCallWMD() MetricDescription {
	desc := MetricDescription{
		Type:      counterMetric,
		Help:      "Total write SysCalls to the kernel. /proc/[pid]/io syscw",
		Name:      writeTotal,
		Subsystem: sysCallSubsystem,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinIOGORoutineCountMD builds the description of the gauge that reports
// the number of goroutines running in the process.
func getMinIOGORoutineCountMD() MetricDescription {
	desc := MetricDescription{
		Type:      gaugeMetric,
		Help:      "Total number of go routines running.",
		Name:      total,
		Subsystem: goRoutines,
		Namespace: nodeMetricNamespace,
	}
	return desc
}
// getMinioProcMetrics returns a metrics group whose initializer reports
// process-level statistics of the current process, read through procfs:
//
//   - open file descriptor count and the open-files limit,
//   - read/write syscall counts (syscr, syscw),
//   - bytes read/written (read_bytes, write_bytes, rchar, wchar).
//
// The three procfs sources (fd listing, limits, io) are read independently:
// a failure of one is logged once and only the metrics derived from that
// source are skipped, so e.g. an unreadable /proc/[pid]/io no longer hides
// the file descriptor metrics. If the process handle itself cannot be
// obtained, nothing is reported.
func getMinioProcMetrics() MetricsGroup {
	return MetricsGroup{
		Metrics: []Metric{},
		initialize: func(ctx context.Context, metrics *MetricsGroup) {
			p, err := procfs.Self()
			if err != nil {
				logger.LogOnceIf(ctx, err, nodeMetricNamespace)
				return
			}

			// Open file descriptors.
			if openFDs, fdErr := p.FileDescriptorsLen(); fdErr != nil {
				logger.LogOnceIf(ctx, fdErr, getMinioFDOpenMD())
			} else {
				metrics.Metrics = append(metrics.Metrics, Metric{
					Description: getMinioFDOpenMD(),
					Value:       float64(openFDs),
				})
			}

			// File descriptor limit.
			if limits, limErr := p.Limits(); limErr != nil {
				logger.LogOnceIf(ctx, limErr, getMinioFDLimitMD())
			} else {
				metrics.Metrics = append(metrics.Metrics, Metric{
					Description: getMinioFDLimitMD(),
					Value:       float64(limits.OpenFiles),
				})
			}

			// I/O and syscall counters; all six come from the same read.
			ioStats, ioErr := p.IO()
			if ioErr != nil {
				logger.LogOnceIf(ctx, ioErr, ioSubsystem)
				return
			}
			metrics.Metrics = append(metrics.Metrics,
				Metric{
					Description: getMinIOProcessSysCallRMD(),
					Value:       float64(ioStats.SyscR),
				},
				Metric{
					Description: getMinIOProcessSysCallWMD(),
					Value:       float64(ioStats.SyscW),
				},
				Metric{
					Description: getMinioProcessIOReadBytesMD(),
					Value:       float64(ioStats.ReadBytes),
				},
				Metric{
					Description: getMinioProcessIOWriteBytesMD(),
					Value:       float64(ioStats.WriteBytes),
				},
				Metric{
					Description: getMinioProcessIOReadCachedBytesMD(),
					Value:       float64(ioStats.RChar),
				},
				Metric{
					Description: getMinioProcessIOWriteCachedBytesMD(),
					Value:       float64(ioStats.WChar),
				},
			)
		},
	}
}
// getGoMetrics returns a metrics group whose initializer appends a single
// metric: the current goroutine count as reported by runtime.NumGoroutine.
func getGoMetrics() MetricsGroup {
	collect := func(ctx context.Context, metrics *MetricsGroup) {
		goroutines := Metric{
			Description: getMinIOGORoutineCountMD(),
			Value:       float64(runtime.NumGoroutine()),
		}
		metrics.Metrics = append(metrics.Metrics, goroutines)
	}

	group := MetricsGroup{
		Metrics:    []Metric{},
		initialize: collect,
	}
	return group
}
func getS3TTFBMetric() MetricsGroup { func getS3TTFBMetric() MetricsGroup {
return MetricsGroup{ return MetricsGroup{
Metrics: []Metric{}, Metrics: []Metric{},
@ -1171,9 +1348,18 @@ func metricsNodeHandler() http.Handler {
if err != nil { if err != nil {
logger.CriticalIf(GlobalContext, err) logger.CriticalIf(GlobalContext, err)
} }
err = registry.Register(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{
Namespace: minioNamespace,
ReportErrors: true,
}))
if err != nil {
logger.CriticalIf(GlobalContext, err)
}
err = registry.Register(prometheus.NewGoCollector())
if err != nil {
logger.CriticalIf(GlobalContext, err)
}
gatherers := prometheus.Gatherers{ gatherers := prometheus.Gatherers{
prometheus.DefaultGatherer,
registry, registry,
} }
// Delegate http serving to Prometheus client library, which will call collector.Collect. // Delegate http serving to Prometheus client library, which will call collector.Collect.

View File

@ -31,6 +31,14 @@ These metrics can be from any MinIO server once per collection.
|`minio_node_disk_free_bytes` |Total storage available on a disk. | |`minio_node_disk_free_bytes` |Total storage available on a disk. |
|`minio_node_disk_total_bytes` |Total storage on a disk. | |`minio_node_disk_total_bytes` |Total storage on a disk. |
|`minio_node_disk_used_bytes` |Total storage used on a disk. | |`minio_node_disk_used_bytes` |Total storage used on a disk. |
|`minio_node_file_descriptor_limit_total` |Limit on total number of open file descriptors for the MinIO Server process. |
|`minio_node_file_descriptor_open_total` |Total number of open file descriptors by the MinIO Server process. |
|`minio_node_io_rchar_bytes` |Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar |
|`minio_node_io_read_bytes` |Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes |
|`minio_node_io_wchar_bytes` |Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar |
|`minio_node_io_write_bytes` |Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes |
|`minio_node_syscall_read_total` |Total read SysCalls to the kernel. /proc/[pid]/io syscr |
|`minio_node_syscall_write_total` |Total write SysCalls to the kernel. /proc/[pid]/io syscw |
|`minio_s3_requests_error_total` |Total number S3 requests with errors | |`minio_s3_requests_error_total` |Total number S3 requests with errors |
|`minio_s3_requests_inflight_total` |Total number of S3 requests currently in flight. | |`minio_s3_requests_inflight_total` |Total number of S3 requests currently in flight. |
|`minio_s3_requests_total` |Total number S3 requests | |`minio_s3_requests_total` |Total number S3 requests |

1
go.mod
View File

@ -66,6 +66,7 @@ require (
github.com/pkg/errors v0.9.1 github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.8.0 github.com/prometheus/client_golang v1.8.0
github.com/prometheus/client_model v0.2.0 github.com/prometheus/client_model v0.2.0
github.com/prometheus/procfs v0.2.0
github.com/rjeczalik/notify v0.9.2 github.com/rjeczalik/notify v0.9.2
github.com/rs/cors v1.7.0 github.com/rs/cors v1.7.0
github.com/secure-io/sio-go v0.3.0 github.com/secure-io/sio-go v0.3.0