prom: Add drive failure tolerance per erasure set (#18424)

Anis Eleuch 2023-11-13 00:59:48 -08:00 committed by GitHub
parent 4598827dcb
commit fe63664164
2 changed files with 45 additions and 31 deletions


@@ -2255,9 +2255,10 @@ type HealthOptions struct
 type HealthResult struct {
 	Healthy       bool
 	HealingDrives int
-	UnhealthyPools []struct {
+	ESHealth      []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		HealthyDrives int
 		WriteQuorum   int
 	}
 	WriteQuorum   int
@@ -2372,51 +2373,41 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 	}
 
 	result := HealthResult{
-		HealingDrives: len(aggHealStateResult.HealDisks),
+		Healthy:       true,
 		WriteQuorum:   maximumWriteQuorum,
 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 	}
 
 	for poolIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[poolIdx] {
-			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
-				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
-					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
-						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.UnhealthyPools = append(result.UnhealthyPools, struct {
-					Maintenance                bool
-					PoolID, SetID, WriteQuorum int
-				}{
-					Maintenance: opts.Maintenance,
-					SetID:       setIdx,
-					PoolID:      poolIdx,
-					WriteQuorum: poolWriteQuorums[poolIdx],
-				})
+			result.ESHealth = append(result.ESHealth, struct {
+				Maintenance                bool
+				PoolID, SetID              int
+				HealthyDrives, WriteQuorum int
+			}{
+				Maintenance:   opts.Maintenance,
+				SetID:         setIdx,
+				PoolID:        poolIdx,
+				HealthyDrives: erasureSetUpCount[poolIdx][setIdx],
+				WriteQuorum:   poolWriteQuorums[poolIdx],
+			})
+
+			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
+				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
+				result.Healthy = false
 			}
 		}
 	}
-	if len(result.UnhealthyPools) > 0 {
-		// We have unhealthy pools return error.
-		return result
-	}
 
-	// when maintenance is not specified we don't have
-	// to look at the healing side of the code.
-	if !opts.Maintenance {
-		return HealthResult{
-			Healthy:       true,
-			WriteQuorum:   maximumWriteQuorum,
-			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-		}
+	if opts.Maintenance {
+		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0
+		result.HealingDrives = len(aggHealStateResult.HealDisks)
 	}
 
-	return HealthResult{
-		Healthy:       len(aggHealStateResult.HealDisks) == 0,
-		HealingDrives: len(aggHealStateResult.HealDisks),
-		WriteQuorum:   maximumWriteQuorum,
-		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-	}
+	return result
 }
 
 // PutObjectMetadata - replace or add tags to an existing object
 func (z *erasureServerPools) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
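To make the intent of the new ESHealth entries concrete, here is a minimal, self-contained sketch (not part of this commit) of the tolerance arithmetic that the metrics change below exports: HealthyDrives minus WriteQuorum is how many more drives an erasure set can lose before writes drop below quorum. The esHealth struct and the values are hypothetical stand-ins for what Health() populates.

package main

import "fmt"

// esHealth mirrors the fields of the ESHealth entries added above
// (an illustrative copy, not the actual type from this commit).
type esHealth struct {
	PoolID, SetID int
	HealthyDrives int
	WriteQuorum   int
}

func main() {
	// Assumed example values: pool 0 / set 0 has 14 healthy drives against a
	// write quorum of 12, so it can tolerate 2 more drive failures.
	sets := []esHealth{
		{PoolID: 0, SetID: 0, HealthyDrives: 14, WriteQuorum: 12},
		{PoolID: 0, SetID: 1, HealthyDrives: 12, WriteQuorum: 12},
	}
	for _, h := range sets {
		// Same computation as the exported gauge below: 0 means the set is
		// exactly at write quorum, a negative value means quorum is lost.
		fmt.Printf("pool=%d set=%d tolerance=%d\n",
			h.PoolID, h.SetID, h.HealthyDrives-h.WriteQuorum)
	}
}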


@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"net/http"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }
 
+func getClusterErasureSetToleranceMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_tolerance",
+		Help:      "Get erasure set tolerance status",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterHealthMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 10 * time.Second,
@@ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup {
 			Value:       float64(health),
 		})
 
+		for _, h := range result.ESHealth {
+			labels := map[string]string{
+				"pool": strconv.Itoa(h.PoolID),
+				"set":  strconv.Itoa(h.SetID),
+			}
+			metrics = append(metrics, Metric{
+				Description:    getClusterErasureSetToleranceMD(),
+				VariableLabels: labels,
+				Value:          float64(h.HealthyDrives - h.WriteQuorum),
+			})
+		}
+
 		return
 	})
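For reference, assuming clusterMetricNamespace resolves to the minio_cluster prefix used by the other cluster metrics in this file, the new gauge would surface in the Prometheus scrape output roughly as follows; the pool/set label values, the sample values, and the exact label set are illustrative only:

# HELP minio_cluster_health_erasure_set_tolerance Get erasure set tolerance status
# TYPE minio_cluster_health_erasure_set_tolerance gauge
minio_cluster_health_erasure_set_tolerance{pool="0",set="0"} 2
minio_cluster_health_erasure_set_tolerance{pool="0",set="1"} 0

A value of 0 means the set is exactly at write quorum, and a negative value means write quorum has been lost, which is also the condition under which the Health() change above flips result.Healthy to false.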