prom: Add drive failure tolerance per erasure set (#18424)

Anis Eleuch 2023-11-13 00:59:48 -08:00 committed by GitHub
parent 4598827dcb
commit fe63664164
2 changed files with 45 additions and 31 deletions


@@ -2255,9 +2255,10 @@ type HealthOptions struct
 type HealthResult struct {
 	Healthy       bool
 	HealingDrives int
-	UnhealthyPools []struct {
+	ESHealth      []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		HealthyDrives int
 		WriteQuorum   int
 	}
 	WriteQuorum   int
@@ -2372,51 +2373,41 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 	}
 
 	result := HealthResult{
-		HealingDrives: len(aggHealStateResult.HealDisks),
+		Healthy:       true,
 		WriteQuorum:   maximumWriteQuorum,
 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 	}
 
 	for poolIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[poolIdx] {
-			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
-				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
-					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
-						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.UnhealthyPools = append(result.UnhealthyPools, struct {
-					Maintenance                bool
-					PoolID, SetID, WriteQuorum int
-				}{
-					Maintenance: opts.Maintenance,
-					SetID:       setIdx,
-					PoolID:      poolIdx,
-					WriteQuorum: poolWriteQuorums[poolIdx],
-				})
+			result.ESHealth = append(result.ESHealth, struct {
+				Maintenance                bool
+				PoolID, SetID              int
+				HealthyDrives, WriteQuorum int
+			}{
+				Maintenance:   opts.Maintenance,
+				SetID:         setIdx,
+				PoolID:        poolIdx,
+				HealthyDrives: erasureSetUpCount[poolIdx][setIdx],
+				WriteQuorum:   poolWriteQuorums[poolIdx],
+			})
+
+			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
+				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
+				result.Healthy = false
 			}
 		}
 	}
-	if len(result.UnhealthyPools) > 0 {
-		// We have unhealthy pools return error.
-		return result
-	}
 
-	// when maintenance is not specified we don't have
-	// to look at the healing side of the code.
-	if !opts.Maintenance {
-		return HealthResult{
-			Healthy:       true,
-			WriteQuorum:   maximumWriteQuorum,
-			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-		}
+	if opts.Maintenance {
+		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0
+		result.HealingDrives = len(aggHealStateResult.HealDisks)
 	}
 
-	return HealthResult{
-		Healthy:       len(aggHealStateResult.HealDisks) == 0,
-		HealingDrives: len(aggHealStateResult.HealDisks),
-		WriteQuorum:   maximumWriteQuorum,
-		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-	}
+	return result
 }
 
 // PutObjectMetadata - replace or add tags to an existing object
 func (z *erasureServerPools) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
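To make the intent of the new ESHealth entries concrete, here is a minimal, self-contained sketch (not part of this commit) of the tolerance arithmetic that the metrics change below exports: HealthyDrives minus WriteQuorum is how many more drives an erasure set can lose before writes drop below quorum. The esHealth struct and the values are hypothetical stand-ins for what Health() populates.

package main

import "fmt"

// esHealth mirrors the fields of the ESHealth entries added above
// (an illustrative copy, not the actual type from this commit).
type esHealth struct {
	PoolID, SetID int
	HealthyDrives int
	WriteQuorum   int
}

func main() {
	// Assumed example values: pool 0 / set 0 has 14 healthy drives against a
	// write quorum of 12, so it can tolerate 2 more drive failures.
	sets := []esHealth{
		{PoolID: 0, SetID: 0, HealthyDrives: 14, WriteQuorum: 12},
		{PoolID: 0, SetID: 1, HealthyDrives: 12, WriteQuorum: 12},
	}
	for _, h := range sets {
		// Same computation as the exported gauge below: 0 means the set is
		// exactly at write quorum, a negative value means quorum is lost.
		fmt.Printf("pool=%d set=%d tolerance=%d\n",
			h.PoolID, h.SetID, h.HealthyDrives-h.WriteQuorum)
	}
}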


@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"net/http"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }
 
+func getClusterErasureSetToleranceMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_tolerance",
+		Help:      "Get erasure set tolerance status",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterHealthMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 10 * time.Second,
@@ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup {
 			Value:       float64(health),
 		})
 
+		for _, h := range result.ESHealth {
+			labels := map[string]string{
+				"pool": strconv.Itoa(h.PoolID),
+				"set":  strconv.Itoa(h.SetID),
+			}
+			metrics = append(metrics, Metric{
+				Description:    getClusterErasureSetToleranceMD(),
+				VariableLabels: labels,
+				Value:          float64(h.HealthyDrives - h.WriteQuorum),
+			})
+		}
+
 		return
 	})
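For reference, assuming clusterMetricNamespace resolves to the minio_cluster prefix used by the other cluster metrics in this file, the new gauge would surface in the Prometheus scrape output roughly as follows; the pool/set label values, the sample values, and the exact label set are illustrative only:

# HELP minio_cluster_health_erasure_set_tolerance Get erasure set tolerance status
# TYPE minio_cluster_health_erasure_set_tolerance gauge
minio_cluster_health_erasure_set_tolerance{pool="0",set="0"} 2
minio_cluster_health_erasure_set_tolerance{pool="0",set="1"} 0

A value of 0 means the set is exactly at write quorum, and a negative value means write quorum has been lost, which is also the condition under which the Health() change above flips result.Healthy to false.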