minio/minio (mirror of https://github.com/minio/minio.git)
commit 114fab4c70 (parent c2edbfae55)
export cluster health as prometheus metrics (#17741)

Summary: Health() now reports every erasure set that falls below write quorum instead of only the first one, the cluster health handlers surface write quorum, storage-class defaults, and healing-drive counts as response headers, and two new Prometheus gauges (minio_cluster_write_quorum, minio_cluster_health_status) are exported.
@@ -2060,9 +2060,13 @@ type HealthOptions struct {
 // additionally with any specific heuristic information which
 // was queried
 type HealthResult struct {
 	Healthy        bool
 	HealingDrives  int
-	PoolID, SetID  int
+	UnhealthyPools []struct {
+		Maintenance   bool
+		PoolID, SetID int
+		WriteQuorum   int
+	}
 	WriteQuorum   int
 	UsingDefaults bool
 }
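With this change a degraded cluster reports every erasure set that is below write quorum, not just a single pool/set pair. Below is a minimal, self-contained sketch of how a caller might walk the reworked result; the struct is copied from the hunk above so the snippet stands alone, and the reporting helper is hypothetical.

```go
package main

import "fmt"

// HealthResult is reproduced from the hunk above so the sketch compiles on its own.
type HealthResult struct {
	Healthy        bool
	HealingDrives  int
	UnhealthyPools []struct {
		Maintenance   bool
		PoolID, SetID int
		WriteQuorum   int
	}
	WriteQuorum   int
	UsingDefaults bool
}

// reportHealth is a hypothetical consumer: it prints every unhealthy set,
// not just the first failing pool/set pair the old struct could carry.
func reportHealth(r HealthResult) {
	if r.Healthy {
		fmt.Printf("cluster healthy, write quorum %d\n", r.WriteQuorum)
		return
	}
	for _, p := range r.UnhealthyPools {
		fmt.Printf("pool %d, set %d below write quorum %d (maintenance=%v)\n",
			p.PoolID, p.SetID, p.WriteQuorum, p.Maintenance)
	}
}

func main() {
	reportHealth(HealthResult{Healthy: true, WriteQuorum: 4})
}
```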
@@ -2164,24 +2168,6 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 		usingDefaults = true
 	}
 
-	for poolIdx := range erasureSetUpCount {
-		for setIdx := range erasureSetUpCount[poolIdx] {
-			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
-				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
-					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
-						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				return HealthResult{
-					Healthy:       false,
-					HealingDrives: len(aggHealStateResult.HealDisks),
-					PoolID:        poolIdx,
-					SetID:         setIdx,
-					WriteQuorum:   poolWriteQuorums[poolIdx],
-					UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-				}
-			}
-		}
-	}
-
 	var maximumWriteQuorum int
 	for _, writeQuorum := range poolWriteQuorums {
 		if maximumWriteQuorum == 0 {
@@ -2192,6 +2178,35 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 		}
 	}
 
+	result := HealthResult{
+		HealingDrives: len(aggHealStateResult.HealDisks),
+		WriteQuorum:   maximumWriteQuorum,
+		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
+	}
+
+	for poolIdx := range erasureSetUpCount {
+		for setIdx := range erasureSetUpCount[poolIdx] {
+			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
+				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
+				result.UnhealthyPools = append(result.UnhealthyPools, struct {
+					Maintenance                bool
+					PoolID, SetID, WriteQuorum int
+				}{
+					Maintenance: opts.Maintenance,
+					SetID:       setIdx,
+					PoolID:      poolIdx,
+					WriteQuorum: poolWriteQuorums[poolIdx],
+				})
+			}
+		}
+		if len(result.UnhealthyPools) > 0 {
+			// We have unhealthy pools return error.
+			return result
+		}
+	}
+
 	// when maintenance is not specified we don't have
 	// to look at the healing side of the code.
 	if !opts.Maintenance {
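The health check now collects every (pool, set) pair that falls below write quorum into result.UnhealthyPools before returning, rather than bailing out at the first failure. Here is a simplified, self-contained sketch of that collection pattern with made-up up-counts and quorums; unlike the code above it scans all pools before reporting and omits logging and the maintenance flag.

```go
package main

import "fmt"

type unhealthySet struct {
	PoolID, SetID, WriteQuorum int
}

// findUnhealthy mirrors the aggregation idea: compare each set's online-drive
// count against its pool's write quorum and collect every set that falls short.
func findUnhealthy(upCounts [][]int, writeQuorums []int) []unhealthySet {
	var unhealthy []unhealthySet
	for poolIdx := range upCounts {
		for setIdx, up := range upCounts[poolIdx] {
			if up < writeQuorums[poolIdx] {
				unhealthy = append(unhealthy, unhealthySet{
					PoolID:      poolIdx,
					SetID:       setIdx,
					WriteQuorum: writeQuorums[poolIdx],
				})
			}
		}
	}
	return unhealthy
}

func main() {
	// Pool 0 has two sets below its write quorum of 3; pool 1 is healthy.
	upCounts := [][]int{{2, 1, 4}, {5, 5}}
	writeQuorums := []int{3, 3}
	for _, u := range findUnhealthy(upCounts, writeQuorums) {
		fmt.Printf("pool %d, set %d below write quorum %d\n", u.PoolID, u.SetID, u.WriteQuorum)
	}
}
```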
@@ -28,22 +28,17 @@ import (
 
 const unavailable = "offline"
 
-func isServerNotInitialized() bool {
-	return newObjectLayerFn() == nil
-}
-
 // ClusterCheckHandler returns if the server is ready for requests.
 func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 	ctx := newContext(r, w, "ClusterCheckHandler")
 
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 
-	objLayer := newObjectLayerFn()
-
 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 	defer cancel()
 
@@ -52,16 +47,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 		DeploymentType: r.Form.Get("deployment-type"),
 	}
 	result := objLayer.Health(ctx, opts)
-	if result.WriteQuorum > 0 {
-		w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum))
-	}
+	w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum))
 	w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
+	// return how many drives are being healed if any
+	if result.HealingDrives > 0 {
+		w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
+	}
 	if !result.Healthy {
-		// return how many drives are being healed if any
-		if result.HealingDrives > 0 {
-			w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
-		}
 		// As a maintenance call we are purposefully asked to be taken
 		// down, this is for orchestrators to know if we can safely
 		// take this server down, return appropriate error.
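ClusterCheckHandler now always returns the write-quorum and storage-class-defaults headers, and reports healing drives whether or not the cluster is healthy. A hedged usage sketch for probing that endpoint from the outside; the /minio/health/cluster path and the literal header names are assumptions based on MinIO's documented health checks and the xhttp constants referenced above.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	client := &http.Client{Timeout: 5 * time.Second}

	// Assumed endpoint for the cluster health (readiness) check.
	resp, err := client.Get("http://localhost:9000/minio/health/cluster")
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}
	defer resp.Body.Close()

	// 200 indicates a healthy cluster; 503 indicates it is not, or that a
	// maintenance check decided the node cannot be taken down safely.
	fmt.Println("status:", resp.StatusCode)
	fmt.Println("write quorum:", resp.Header.Get("x-minio-write-quorum"))
	fmt.Println("healing drives:", resp.Header.Get("x-minio-healing-drives"))
	fmt.Println("storage-class defaults:", resp.Header.Get("x-minio-storage-class-defaults"))
}
```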
@@ -79,14 +71,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
 	ctx := newContext(r, w, "ClusterReadCheckHandler")
 
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 
-	objLayer := newObjectLayerFn()
-
 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 	defer cancel()
 
@@ -106,17 +97,17 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
 
 // LivenessCheckHandler - Checks if the process is up. Always returns success.
 func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) {
-	peerCall := r.Header.Get("x-minio-from-peer") != ""
-	if peerCall {
-		return
-	}
-
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		// Service not initialized yet
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 	}
 
+	peerCall := r.Header.Get(xhttp.MinIOPeerCall) != ""
+	if peerCall {
+		return
+	}
+
 	if int(globalHTTPStats.loadRequestsInQueue()) > globalAPIConfig.getRequestsPoolCapacity() {
 		apiErr := getAPIError(ErrBusy)
 		switch r.Method {
@@ -54,6 +54,7 @@ func init() {
 		getClusterTierMetrics(),
 		getClusterUsageMetrics(),
 		getKMSMetrics(),
+		getClusterHealthMetrics(),
 	}
 
 	peerMetricsGroups = []*MetricsGroup{
@@ -2642,6 +2643,63 @@ func getLocalDriveStorageMetrics() *MetricsGroup {
 	return mg
 }
 
+func getClusterWriteQuorumMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "write",
+		Name:      "quorum",
+		Help:      "Maximum write quorum across all pools and sets",
+		Type:      gaugeMetric,
+	}
+}
+
+func getClusterHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "status",
+		Help:      "Get current cluster health status",
+		Type:      gaugeMetric,
+	}
+}
+
+func getClusterHealthMetrics() *MetricsGroup {
+	mg := &MetricsGroup{
+		cacheInterval: 10 * time.Second,
+	}
+	mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
+		objLayer := newObjectLayerFn()
+		// Service not initialized yet
+		if objLayer == nil {
+			return
+		}
+
+		metrics = make([]Metric, 0, 2)
+
+		opts := HealthOptions{}
+		result := objLayer.Health(ctx, opts)
+
+		metrics = append(metrics, Metric{
+			Description: getClusterWriteQuorumMD(),
+			Value:       float64(result.WriteQuorum),
+		})
+
+		health := 1
+		if !result.Healthy {
+			health = 0
+		}
+
+		metrics = append(metrics, Metric{
+			Description: getClusterHealthStatusMD(),
+			Value:       float64(health),
+		})
+
+		return
+	})
+
+	return mg
+}
+
 func getClusterStorageMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 1 * time.Minute,
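The new metrics group caches its read callback for 10 seconds and maps the health result onto two gauges: the maximum write quorum and a 0/1 health status. A tiny hedged sketch of that mapping, with the exposition lines the gauges would back shown as comments (metric names taken from the descriptions above).

```go
package main

import "fmt"

// healthGauge converts the boolean health result into the 0/1 gauge value
// exported as minio_cluster_health_status.
func healthGauge(healthy bool) float64 {
	if healthy {
		return 1
	}
	return 0
}

func main() {
	// A healthy cluster with a maximum write quorum of 4 would surface roughly as:
	//   minio_cluster_write_quorum 4
	//   minio_cluster_health_status 1
	fmt.Println("healthy:", healthGauge(true), "unhealthy:", healthGauge(false))
}
```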
@@ -138,8 +138,8 @@ func isServerResolvable(endpoint Endpoint, timeout time.Duration) error {
 	if err != nil {
 		return err
 	}
-	req.Header.Set("x-minio-from-peer", "true")
+	// Indicate that the liveness check for a peer call
+	req.Header.Set(xhttp.MinIOPeerCall, "true")
 
 	resp, err := httpClient.Do(req)
 	if err != nil {
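The raw "x-minio-from-peer" string is replaced by the new xhttp.MinIOPeerCall constant (added at the end of this commit), and LivenessCheckHandler returns early when it sees it. A hedged sketch of that peer-style liveness probe; the /minio/health/live path is an assumption about where the handler is mounted.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

func main() {
	req, err := http.NewRequest(http.MethodGet, "http://peer:9000/minio/health/live", nil)
	if err != nil {
		panic(err)
	}
	// Same header value the commit centralizes as xhttp.MinIOPeerCall.
	req.Header.Set("x-minio-from-peer", "true")

	client := &http.Client{Timeout: 2 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("peer unreachable:", err)
		return
	}
	resp.Body.Close()

	// An uninitialized object layer is signalled via a header, not a failure status.
	fmt.Println("liveness:", resp.Status, "server-status:", resp.Header.Get("x-minio-server-status"))
}
```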
@@ -40,6 +40,8 @@ These metrics can be obtained from any MinIO server once per collection.
 | `minio_cluster_kms_uptime`          | The time the KMS has been up and running in seconds. |
 | `minio_cluster_nodes_offline_total` | Total number of MinIO nodes offline. |
 | `minio_cluster_nodes_online_total`  | Total number of MinIO nodes online. |
+| `minio_cluster_write_quorum`        | Maximum write quorum across all pools and sets |
+| `minio_cluster_health_status`       | Get current cluster health status |
 | `minio_heal_objects_errors_total`   | Objects for which healing failed in current self healing run. |
 | `minio_heal_objects_heal_total`     | Objects healed in current self healing run. |
 | `minio_heal_objects_total`          | Objects scanned in current self healing run. |
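For a quick check that the two new gauges are exported, here is a hedged Go sketch that scrapes the documented cluster metrics endpoint and prints just those lines (credentials or bearer-token auth, if configured, are omitted for brevity).

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/cluster")
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}
	defer resp.Body.Close()

	// Keep only the newly documented gauges from the exposition output.
	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		line := sc.Text()
		if strings.HasPrefix(line, "minio_cluster_write_quorum") ||
			strings.HasPrefix(line, "minio_cluster_health_status") {
			fmt.Println(line)
		}
	}
}
```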
@@ -152,6 +152,9 @@ const (
 	// Deployment id.
 	MinioDeploymentID = "x-minio-deployment-id"
 
+	// Peer call
+	MinIOPeerCall = "x-minio-from-peer"
+
 	// Server-Status
 	MinIOServerStatus = "x-minio-server-status"
 