export cluster health as prometheus metrics (#17741)

commit 114fab4c70 (parent c2edbfae55)
Author: Harshavardhana, 2023-07-28 01:16:53 -07:00 (committed by GitHub)
6 changed files with 117 additions and 48 deletions


@@ -2062,8 +2062,12 @@ type HealthOptions struct {
 type HealthResult struct {
 	Healthy       bool
 	HealingDrives int
+	UnhealthyPools []struct {
+		Maintenance   bool
 		PoolID, SetID int
 		WriteQuorum   int
+	}
+	WriteQuorum   int
 	UsingDefaults bool
 }
@@ -2164,24 +2168,6 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 		usingDefaults = true
 	}
 
-	for poolIdx := range erasureSetUpCount {
-		for setIdx := range erasureSetUpCount[poolIdx] {
-			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
-				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
-					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
-						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				return HealthResult{
-					Healthy:       false,
-					HealingDrives: len(aggHealStateResult.HealDisks),
-					PoolID:        poolIdx,
-					SetID:         setIdx,
-					WriteQuorum:   poolWriteQuorums[poolIdx],
-					UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-				}
-			}
-		}
-	}
-
 	var maximumWriteQuorum int
 	for _, writeQuorum := range poolWriteQuorums {
 		if maximumWriteQuorum == 0 {
@@ -2192,6 +2178,35 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 		}
 	}
 
+	result := HealthResult{
+		HealingDrives: len(aggHealStateResult.HealDisks),
+		WriteQuorum:   maximumWriteQuorum,
+		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
+	}
+
+	for poolIdx := range erasureSetUpCount {
+		for setIdx := range erasureSetUpCount[poolIdx] {
+			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
+				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
+				result.UnhealthyPools = append(result.UnhealthyPools, struct {
+					Maintenance                bool
+					PoolID, SetID, WriteQuorum int
+				}{
+					Maintenance: opts.Maintenance,
+					SetID:       setIdx,
+					PoolID:      poolIdx,
+					WriteQuorum: poolWriteQuorums[poolIdx],
+				})
+			}
+		}
+		if len(result.UnhealthyPools) > 0 {
+			// We have unhealthy pools return error.
+			return result
+		}
+	}
+
 	// when maintenance is not specified we don't have
 	// to look at the healing side of the code.
 	if !opts.Maintenance {
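
Note: the net effect of this hunk is that Health() now collects every pool/set that has lost write quorum into UnhealthyPools, instead of returning on the first failure. A minimal caller-side sketch of consuming the new field (logUnhealthyPools is a hypothetical helper, not part of this commit; it assumes the cmd package types above and a `log` import):

// logUnhealthyPools is a hypothetical helper illustrating how a caller could
// consume the new UnhealthyPools slice; it is not part of this commit.
func logUnhealthyPools(res HealthResult) {
	for _, p := range res.UnhealthyPools {
		// p is the anonymous struct element declared in HealthResult above.
		log.Printf("write quorum lost: pool=%d set=%d expected=%d maintenance=%v",
			p.PoolID, p.SetID, p.WriteQuorum, p.Maintenance)
	}
}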


@@ -28,22 +28,17 @@ import (
 
 const unavailable = "offline"
 
-func isServerNotInitialized() bool {
-	return newObjectLayerFn() == nil
-}
-
 // ClusterCheckHandler returns if the server is ready for requests.
 func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 	ctx := newContext(r, w, "ClusterCheckHandler")
 
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 
-	objLayer := newObjectLayerFn()
-
 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 	defer cancel()
@@ -52,16 +47,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 		DeploymentType: r.Form.Get("deployment-type"),
 	}
 	result := objLayer.Health(ctx, opts)
-	if result.WriteQuorum > 0 {
-		w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum))
-	}
+	w.Header().Set(xhttp.MinIOWriteQuorum, strconv.Itoa(result.WriteQuorum))
 	w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
+	// return how many drives are being healed if any
+	if result.HealingDrives > 0 {
+		w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
+	}
 	if !result.Healthy {
-		// return how many drives are being healed if any
-		if result.HealingDrives > 0 {
-			w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
-		}
 		// As a maintenance call we are purposefully asked to be taken
 		// down, this is for orchestrators to know if we can safely
 		// take this server down, return appropriate error.
@@ -79,14 +71,13 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
 	ctx := newContext(r, w, "ClusterReadCheckHandler")
 
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 
-	objLayer := newObjectLayerFn()
-
 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 	defer cancel()
@@ -106,17 +97,17 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
 // LivenessCheckHandler - Checks if the process is up. Always returns success.
 func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) {
-	peerCall := r.Header.Get("x-minio-from-peer") != ""
-
-	if peerCall {
-		return
-	}
-
-	if isServerNotInitialized() {
+	objLayer := newObjectLayerFn()
+	if objLayer == nil {
 		// Service not initialized yet
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 	}
+
+	peerCall := r.Header.Get(xhttp.MinIOPeerCall) != ""
+	if peerCall {
+		return
+	}
+
 	if int(globalHTTPStats.loadRequestsInQueue()) > globalAPIConfig.getRequestsPoolCapacity() {
 		apiErr := getAPIError(ErrBusy)
 		switch r.Method {
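
Note: orchestrators consume these handlers via HTTP status codes and response headers. A hedged probe sketch follows; the `/minio/health/cluster` path and the `x-minio-write-quorum` / `x-minio-server-status` header names are assumptions based on MinIO's documented healthcheck API, not definitions made by this commit:

package main

import (
	"fmt"
	"net/http"
)

// Hypothetical external probe, not part of this commit.
func main() {
	resp, err := http.Get("http://localhost:9000/minio/health/cluster")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// After this change, write quorum is reported even when the cluster is healthy.
	fmt.Println("write quorum:", resp.Header.Get("x-minio-write-quorum"))
	if resp.StatusCode != http.StatusOK {
		fmt.Println("cluster unhealthy:", resp.Status,
			"server status:", resp.Header.Get("x-minio-server-status"))
	}
}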


@@ -54,6 +54,7 @@ func init() {
 		getClusterTierMetrics(),
 		getClusterUsageMetrics(),
 		getKMSMetrics(),
+		getClusterHealthMetrics(),
 	}
 
 	peerMetricsGroups = []*MetricsGroup{
@@ -2642,6 +2643,63 @@ func getLocalDriveStorageMetrics() *MetricsGroup {
 	return mg
 }
 
+func getClusterWriteQuorumMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "write",
+		Name:      "quorum",
+		Help:      "Maximum write quorum across all pools and sets",
+		Type:      gaugeMetric,
+	}
+}
+
+func getClusterHealthStatusMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "status",
+		Help:      "Get current cluster health status",
+		Type:      gaugeMetric,
+	}
+}
+
+func getClusterHealthMetrics() *MetricsGroup {
+	mg := &MetricsGroup{
+		cacheInterval: 10 * time.Second,
+	}
+	mg.RegisterRead(func(ctx context.Context) (metrics []Metric) {
+		objLayer := newObjectLayerFn()
+
+		// Service not initialized yet
+		if objLayer == nil {
+			return
+		}
+
+		metrics = make([]Metric, 0, 2)
+
+		opts := HealthOptions{}
+		result := objLayer.Health(ctx, opts)
+
+		metrics = append(metrics, Metric{
+			Description: getClusterWriteQuorumMD(),
+			Value:       float64(result.WriteQuorum),
+		})
+
+		health := 1
+		if !result.Healthy {
+			health = 0
+		}
+
+		metrics = append(metrics, Metric{
+			Description: getClusterHealthStatusMD(),
+			Value:       float64(health),
+		})
+
+		return
+	})
+	return mg
+}
+
 func getClusterStorageMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 1 * time.Minute,
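
Note: to verify the new gauges end to end, one can scrape the cluster metrics endpoint directly. A minimal sketch, assuming the documented `/minio/v2/metrics/cluster` path and an unauthenticated scrape (e.g. `MINIO_PROMETHEUS_AUTH_TYPE=public`); the sample values in the comment are illustrative:

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

// Expected exposition lines (illustrative values):
//   minio_cluster_health_status 1
//   minio_cluster_write_quorum 4
func main() {
	resp, err := http.Get("http://localhost:9000/minio/v2/metrics/cluster")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	for _, line := range strings.Split(string(body), "\n") {
		// Print only the two gauges added by this commit.
		if strings.HasPrefix(line, "minio_cluster_health_status") ||
			strings.HasPrefix(line, "minio_cluster_write_quorum") {
			fmt.Println(line)
		}
	}
}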


@@ -138,8 +138,8 @@ func isServerResolvable(endpoint Endpoint, timeout time.Duration) error {
 	if err != nil {
 		return err
 	}
-
-	req.Header.Set("x-minio-from-peer", "true")
+	// Indicate that the liveness check for a peer call
+	req.Header.Set(xhttp.MinIOPeerCall, "true")
 
 	resp, err := httpClient.Do(req)
 	if err != nil {


@@ -40,6 +40,8 @@ These metrics can be obtained from any MinIO server once per collection.
 | `minio_cluster_kms_uptime`          | The time the KMS has been up and running in seconds.          |
 | `minio_cluster_nodes_offline_total` | Total number of MinIO nodes offline.                          |
 | `minio_cluster_nodes_online_total`  | Total number of MinIO nodes online.                           |
+| `minio_cluster_write_quorum`        | Maximum write quorum across all pools and sets                |
+| `minio_cluster_health_status`       | Get current cluster health status                             |
 | `minio_heal_objects_errors_total`   | Objects for which healing failed in current self healing run. |
 | `minio_heal_objects_heal_total`     | Objects healed in current self healing run.                   |
 | `minio_heal_objects_total`          | Objects scanned in current self healing run.                  |
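
Note: once scraped, these rows can drive alerting. A sketch using the Prometheus Go client to look for unhealthy clusters; the Prometheus address and the query are assumptions for illustration, not part of this commit:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Assumes a Prometheus server at this address scraping MinIO.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// A non-empty result means at least one deployment reports unhealthy (gauge == 0).
	result, warnings, err := v1.NewAPI(client).Query(ctx,
		"minio_cluster_health_status == 0", time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println("unhealthy deployments:", result)
}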


@@ -152,6 +152,9 @@ const (
 	// Deployment id.
 	MinioDeploymentID = "x-minio-deployment-id"
 
+	// Peer call
+	MinIOPeerCall = "x-minio-from-peer"
+
 	// Server-Status
 	MinIOServerStatus = "x-minio-server-status"