mirror of
https://github.com/minio/minio.git
synced 2025-01-16 09:13:16 -05:00
f1302c40fe
Services are unfrozen before `initBackgroundReplication` is finished. This means that the globalReplicationStats write is racy. Switch to an atomic pointer. Provide the `ReplicationPool` with the stats, so it doesn't have to be grabbed from the atomic pointer on every use. All other loads and checks are nil, and calls return empty values when stats still haven't been initialized.
574 lines
17 KiB
Go
574 lines
17 KiB
Go
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
//
|
|
// This file is part of MinIO Object Storage stack
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/minio/minio/internal/auth"
|
|
"github.com/minio/minio/internal/logger"
|
|
"github.com/minio/minio/internal/mcontext"
|
|
"github.com/minio/pkg/v3/policy"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/common/expfmt"
|
|
)
|
|
|
|
var (
|
|
httpRequestsDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "s3_ttfb_seconds",
|
|
Help: "Time taken by requests served by current MinIO server instance",
|
|
Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10},
|
|
},
|
|
[]string{"api"},
|
|
)
|
|
bucketHTTPRequestsDuration = prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "s3_ttfb_seconds",
|
|
Help: "Time taken by requests served by current MinIO server instance per bucket",
|
|
Buckets: []float64{.05, .1, .25, .5, 1, 2.5, 5, 10},
|
|
},
|
|
[]string{"api", "bucket"},
|
|
)
|
|
minioVersionInfo = prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "minio",
|
|
Name: "version_info",
|
|
Help: "Version of current MinIO server instance",
|
|
},
|
|
[]string{
|
|
// current version
|
|
"version",
|
|
// commit-id of the current version
|
|
"commit",
|
|
},
|
|
)
|
|
)
|
|
|
|
const (
|
|
healMetricsNamespace = "self_heal"
|
|
cacheNamespace = "cache"
|
|
s3Namespace = "s3"
|
|
bucketNamespace = "bucket"
|
|
minioNamespace = "minio"
|
|
diskNamespace = "disk"
|
|
interNodeNamespace = "internode"
|
|
)
|
|
|
|
func init() {
|
|
prometheus.MustRegister(httpRequestsDuration)
|
|
prometheus.MustRegister(newMinioCollector())
|
|
prometheus.MustRegister(minioVersionInfo)
|
|
}
|
|
|
|
// newMinioCollector describes the collector
|
|
// and returns reference of minioCollector
|
|
// It creates the Prometheus Description which is used
|
|
// to define metric and help string
|
|
func newMinioCollector() *minioCollector {
|
|
return &minioCollector{
|
|
desc: prometheus.NewDesc("minio_stats", "Statistics exposed by MinIO server", nil, nil),
|
|
}
|
|
}
|
|
|
|
// minioCollector is the Custom Collector
|
|
type minioCollector struct {
|
|
desc *prometheus.Desc
|
|
}
|
|
|
|
// Describe sends the super-set of all possible descriptors of metrics
|
|
func (c *minioCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- c.desc
|
|
}
|
|
|
|
// Collect is called by the Prometheus registry when collecting metrics.
|
|
func (c *minioCollector) Collect(ch chan<- prometheus.Metric) {
|
|
// Expose MinIO's version information
|
|
minioVersionInfo.WithLabelValues(Version, CommitID).Set(1.0)
|
|
|
|
storageMetricsPrometheus(ch)
|
|
nodeHealthMetricsPrometheus(ch)
|
|
bucketUsageMetricsPrometheus(ch)
|
|
networkMetricsPrometheus(ch)
|
|
httpMetricsPrometheus(ch)
|
|
healingMetricsPrometheus(ch)
|
|
}
|
|
|
|
func nodeHealthMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
nodesUp, nodesDown := globalNotificationSys.GetPeerOnlineCount()
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "nodes", "online"),
|
|
"Total number of MinIO nodes online",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(nodesUp),
|
|
)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "nodes", "offline"),
|
|
"Total number of MinIO nodes offline",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(nodesDown),
|
|
)
|
|
}
|
|
|
|
// collects healing specific metrics for MinIO instance in Prometheus specific format
|
|
// and sends to given channel
|
|
func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
bgSeq, exists := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
var dur time.Duration
|
|
if !bgSeq.lastHealActivity.IsZero() {
|
|
dur = time.Since(bgSeq.lastHealActivity)
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(healMetricsNamespace, "time", "since_last_activity"),
|
|
"Time elapsed (in nano seconds) since last self healing activity. This is set to -1 until initial self heal activity",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(dur),
|
|
)
|
|
for k, v := range bgSeq.getScannedItemsMap() {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "scanned"),
|
|
"Objects scanned since uptime",
|
|
[]string{"type"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(v), string(k),
|
|
)
|
|
}
|
|
for k, v := range bgSeq.getHealedItemsMap() {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "healed"),
|
|
"Objects healed since uptime",
|
|
[]string{"type"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(v), string(k),
|
|
)
|
|
}
|
|
for k, v := range bgSeq.getHealFailedItemsMap() {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(healMetricsNamespace, "objects", "heal_failed"),
|
|
"Objects for which healing failed since uptime",
|
|
[]string{"type"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(v), string(k),
|
|
)
|
|
}
|
|
}
|
|
|
|
// collects http metrics for MinIO server in Prometheus specific format
|
|
// and sends to given channel
|
|
func httpMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
httpStats := globalHTTPStats.toServerHTTPStats(true)
|
|
|
|
for api, value := range httpStats.CurrentS3Requests.APIStats {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "requests", "current"),
|
|
"Total number of running s3 requests in current MinIO server instance",
|
|
[]string{"api"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(value),
|
|
api,
|
|
)
|
|
}
|
|
|
|
for api, value := range httpStats.TotalS3Requests.APIStats {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "requests", "total"),
|
|
"Total number of s3 requests in current MinIO server instance",
|
|
[]string{"api"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(value),
|
|
api,
|
|
)
|
|
}
|
|
|
|
for api, value := range httpStats.TotalS3Errors.APIStats {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "errors", "total"),
|
|
"Total number of s3 errors in current MinIO server instance",
|
|
[]string{"api"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(value),
|
|
api,
|
|
)
|
|
}
|
|
|
|
for api, value := range httpStats.TotalS3Canceled.APIStats {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "canceled", "total"),
|
|
"Total number of client canceled s3 request in current MinIO server instance",
|
|
[]string{"api"}, nil),
|
|
prometheus.CounterValue,
|
|
float64(value),
|
|
api,
|
|
)
|
|
}
|
|
}
|
|
|
|
// collects network metrics for MinIO server in Prometheus specific format
|
|
// and sends to given channel
|
|
func networkMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
connStats := globalConnStats.toServerConnStats()
|
|
|
|
// Network Sent/Received Bytes (internode)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(interNodeNamespace, "tx", "bytes_total"),
|
|
"Total number of bytes sent to the other peer nodes by current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.CounterValue,
|
|
float64(connStats.internodeOutputBytes),
|
|
)
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(interNodeNamespace, "rx", "bytes_total"),
|
|
"Total number of internode bytes received by current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.CounterValue,
|
|
float64(connStats.internodeInputBytes),
|
|
)
|
|
|
|
// Network Sent/Received Bytes (Outbound)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "tx", "bytes_total"),
|
|
"Total number of s3 bytes sent by current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.CounterValue,
|
|
float64(connStats.s3OutputBytes),
|
|
)
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(s3Namespace, "rx", "bytes_total"),
|
|
"Total number of s3 bytes received by current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.CounterValue,
|
|
float64(connStats.s3InputBytes),
|
|
)
|
|
}
|
|
|
|
// Populates prometheus with bucket usage metrics, this metrics
|
|
// is only enabled if scanner is enabled.
|
|
func bucketUsageMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
// Service not initialized yet
|
|
if objLayer == nil {
|
|
return
|
|
}
|
|
|
|
dataUsageInfo, err := loadDataUsageFromBackend(GlobalContext, objLayer)
|
|
if err != nil {
|
|
return
|
|
}
|
|
// data usage has not captured any data yet.
|
|
if dataUsageInfo.LastUpdate.IsZero() {
|
|
return
|
|
}
|
|
|
|
for bucket, usageInfo := range dataUsageInfo.BucketsUsage {
|
|
stat := globalReplicationStats.Load().getLatestReplicationStats(bucket)
|
|
// Total space used by bucket
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(bucketNamespace, "usage", "size"),
|
|
"Total bucket size",
|
|
[]string{"bucket"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(usageInfo.Size),
|
|
bucket,
|
|
)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(bucketNamespace, "objects", "count"),
|
|
"Total number of objects in a bucket",
|
|
[]string{"bucket"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(usageInfo.ObjectsCount),
|
|
bucket,
|
|
)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName("bucket", "replication", "successful_size"),
|
|
"Total capacity replicated to destination",
|
|
[]string{"bucket"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(stat.ReplicationStats.ReplicatedSize),
|
|
bucket,
|
|
)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName("bucket", "replication", "received_size"),
|
|
"Total capacity replicated to this instance",
|
|
[]string{"bucket"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(stat.ReplicationStats.ReplicaSize),
|
|
bucket,
|
|
)
|
|
|
|
for k, v := range usageInfo.ObjectSizesHistogram {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
|
|
"Total number of objects of different sizes in a bucket",
|
|
[]string{"bucket", "object_size"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(v),
|
|
bucket,
|
|
k,
|
|
)
|
|
}
|
|
for k, v := range usageInfo.ObjectVersionsHistogram {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(bucketNamespace, "objects", "histogram"),
|
|
"Total number of versions of objects in a bucket",
|
|
[]string{"bucket", "object_versions"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(v),
|
|
bucket,
|
|
k,
|
|
)
|
|
}
|
|
}
|
|
}
|
|
|
|
// collects storage metrics for MinIO server in Prometheus specific format
|
|
// and sends to given channel
|
|
func storageMetricsPrometheus(ch chan<- prometheus.Metric) {
|
|
objLayer := newObjectLayerFn()
|
|
// Service not initialized yet
|
|
if objLayer == nil {
|
|
return
|
|
}
|
|
|
|
server := getLocalServerProperty(globalEndpoints, &http.Request{
|
|
Host: globalLocalNodeName,
|
|
}, true)
|
|
|
|
onlineDisks, offlineDisks := getOnlineOfflineDisksStats(server.Disks)
|
|
totalDisks := offlineDisks.Merge(onlineDisks)
|
|
|
|
// Report total capacity
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "capacity_raw", "total"),
|
|
"Total capacity online in the cluster",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(GetTotalCapacity(server.Disks)),
|
|
)
|
|
|
|
// Report total capacity free
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "capacity_raw_free", "total"),
|
|
"Total free capacity online in the cluster",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(GetTotalCapacityFree(server.Disks)),
|
|
)
|
|
|
|
sinfo := objLayer.StorageInfo(GlobalContext, true)
|
|
|
|
// Report total usable capacity
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "capacity_usable", "total"),
|
|
"Total usable capacity online in the cluster",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(GetTotalUsableCapacity(server.Disks, sinfo)),
|
|
)
|
|
|
|
// Report total usable capacity free
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "capacity_usable_free", "total"),
|
|
"Total free usable capacity online in the cluster",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(GetTotalUsableCapacityFree(server.Disks, sinfo)),
|
|
)
|
|
|
|
// MinIO Offline Disks per node
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "disks", "offline"),
|
|
"Total number of offline drives in current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(offlineDisks.Sum()),
|
|
)
|
|
|
|
// MinIO Total Disks per node
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(minioNamespace, "drives", "total"),
|
|
"Total number of drives for current MinIO server instance",
|
|
nil, nil),
|
|
prometheus.GaugeValue,
|
|
float64(totalDisks.Sum()),
|
|
)
|
|
|
|
for _, disk := range server.Disks {
|
|
// Total disk usage by the disk
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(diskNamespace, "storage", "used"),
|
|
"Total disk storage used on the drive",
|
|
[]string{"disk"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(disk.UsedSpace),
|
|
disk.DrivePath,
|
|
)
|
|
|
|
// Total available space in the disk
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(diskNamespace, "storage", "available"),
|
|
"Total available space left on the drive",
|
|
[]string{"disk"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(disk.AvailableSpace),
|
|
disk.DrivePath,
|
|
)
|
|
|
|
// Total storage space of the disk
|
|
ch <- prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(diskNamespace, "storage", "total"),
|
|
"Total space on the drive",
|
|
[]string{"disk"}, nil),
|
|
prometheus.GaugeValue,
|
|
float64(disk.TotalSpace),
|
|
disk.DrivePath,
|
|
)
|
|
}
|
|
}
|
|
|
|
func metricsHandler() http.Handler {
|
|
registry := prometheus.NewRegistry()
|
|
|
|
logger.CriticalIf(GlobalContext, registry.Register(minioVersionInfo))
|
|
|
|
logger.CriticalIf(GlobalContext, registry.Register(newMinioCollector()))
|
|
|
|
gatherers := prometheus.Gatherers{
|
|
prometheus.DefaultGatherer,
|
|
registry,
|
|
}
|
|
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
|
if ok {
|
|
tc.FuncName = "handler.MetricsLegacy"
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
mfs, err := gatherers.Gather()
|
|
if err != nil {
|
|
if len(mfs) == 0 {
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), err), r.URL)
|
|
return
|
|
}
|
|
}
|
|
|
|
contentType := expfmt.Negotiate(r.Header)
|
|
w.Header().Set("Content-Type", string(contentType))
|
|
|
|
enc := expfmt.NewEncoder(w, contentType)
|
|
for _, mf := range mfs {
|
|
if err := enc.Encode(mf); err != nil {
|
|
// client may disconnect for any reasons
|
|
// we do not have to log this.
|
|
return
|
|
}
|
|
}
|
|
if closer, ok := enc.(expfmt.Closer); ok {
|
|
closer.Close()
|
|
}
|
|
})
|
|
}
|
|
|
|
// NoAuthMiddleware no auth middle ware.
|
|
func NoAuthMiddleware(h http.Handler) http.Handler {
|
|
return h
|
|
}
|
|
|
|
// AuthMiddleware checks if the bearer token is valid and authorized.
|
|
func AuthMiddleware(h http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
tc, ok := r.Context().Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
|
|
|
|
claims, groups, owner, authErr := metricsRequestAuthenticate(r)
|
|
if authErr != nil || (claims != nil && !claims.VerifyIssuer("prometheus", true)) {
|
|
if ok {
|
|
tc.FuncName = "handler.MetricsAuth"
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL)
|
|
return
|
|
}
|
|
|
|
cred := auth.Credentials{
|
|
AccessKey: claims.AccessKey,
|
|
Claims: claims.Map(),
|
|
Groups: groups,
|
|
}
|
|
|
|
// For authenticated users apply IAM policy.
|
|
if !globalIAMSys.IsAllowed(policy.Args{
|
|
AccountName: cred.AccessKey,
|
|
Groups: cred.Groups,
|
|
Action: policy.PrometheusAdminAction,
|
|
ConditionValues: getConditionValues(r, "", cred),
|
|
IsOwner: owner,
|
|
Claims: cred.Claims,
|
|
}) {
|
|
if ok {
|
|
tc.FuncName = "handler.MetricsAuth"
|
|
tc.ResponseRecorder.LogErrBody = true
|
|
}
|
|
|
|
writeErrorResponseJSON(r.Context(), w, toAdminAPIErr(r.Context(), errAuthentication), r.URL)
|
|
return
|
|
}
|
|
h.ServeHTTP(w, r)
|
|
})
|
|
}
|