mirror of
https://github.com/minio/minio.git
synced 2024-12-24 06:05:55 -05:00
prom: Add drive failure tolerance per erasure set (#18424)
This commit is contained in:
parent
4598827dcb
commit
fe63664164
@ -2253,11 +2253,12 @@ type HealthOptions struct {
|
||||
// additionally with any specific heuristic information which
|
||||
// was queried
|
||||
type HealthResult struct {
|
||||
Healthy bool
|
||||
HealingDrives int
|
||||
UnhealthyPools []struct {
|
||||
Healthy bool
|
||||
HealingDrives int
|
||||
ESHealth []struct {
|
||||
Maintenance bool
|
||||
PoolID, SetID int
|
||||
HealthyDrives int
|
||||
WriteQuorum int
|
||||
}
|
||||
WriteQuorum int
|
||||
@ -2372,50 +2373,40 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
|
||||
}
|
||||
|
||||
result := HealthResult{
|
||||
HealingDrives: len(aggHealStateResult.HealDisks),
|
||||
Healthy: true,
|
||||
WriteQuorum: maximumWriteQuorum,
|
||||
UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
|
||||
}
|
||||
|
||||
for poolIdx := range erasureSetUpCount {
|
||||
for setIdx := range erasureSetUpCount[poolIdx] {
|
||||
result.ESHealth = append(result.ESHealth, struct {
|
||||
Maintenance bool
|
||||
PoolID, SetID int
|
||||
HealthyDrives, WriteQuorum int
|
||||
}{
|
||||
Maintenance: opts.Maintenance,
|
||||
SetID: setIdx,
|
||||
PoolID: poolIdx,
|
||||
HealthyDrives: erasureSetUpCount[poolIdx][setIdx],
|
||||
WriteQuorum: poolWriteQuorums[poolIdx],
|
||||
})
|
||||
|
||||
if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
|
||||
logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]))
|
||||
result.UnhealthyPools = append(result.UnhealthyPools, struct {
|
||||
Maintenance bool
|
||||
PoolID, SetID, WriteQuorum int
|
||||
}{
|
||||
Maintenance: opts.Maintenance,
|
||||
SetID: setIdx,
|
||||
PoolID: poolIdx,
|
||||
WriteQuorum: poolWriteQuorums[poolIdx],
|
||||
})
|
||||
result.Healthy = false
|
||||
}
|
||||
}
|
||||
if len(result.UnhealthyPools) > 0 {
|
||||
// We have unhealthy pools return error.
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
// when maintenance is not specified we don't have
|
||||
// to look at the healing side of the code.
|
||||
if !opts.Maintenance {
|
||||
return HealthResult{
|
||||
Healthy: true,
|
||||
WriteQuorum: maximumWriteQuorum,
|
||||
UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
|
||||
}
|
||||
if opts.Maintenance {
|
||||
result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0
|
||||
result.HealingDrives = len(aggHealStateResult.HealDisks)
|
||||
}
|
||||
|
||||
return HealthResult{
|
||||
Healthy: len(aggHealStateResult.HealDisks) == 0,
|
||||
HealingDrives: len(aggHealStateResult.HealDisks),
|
||||
WriteQuorum: maximumWriteQuorum,
|
||||
UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// PutObjectMetadata - replace or add tags to an existing object
|
||||
|
@ -22,6 +22,7 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
@ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription {
|
||||
}
|
||||
}
|
||||
|
||||
// getClusterErasureSetToleranceMD returns the MetricDescription for the
// "erasure_set_tolerance" gauge, declared under the "health" subsystem of
// clusterMetricNamespace.
//
// In the getClusterHealthMetrics hunk visible in this diff, the description
// is attached to one Metric per ESHealth entry, labelled with "pool" and
// "set", whose value is HealthyDrives - WriteQuorum for that erasure set.
// NOTE(review): presumably this value is the number of further drive
// failures the set can absorb before write quorum is lost — confirm against
// the full Health implementation.
func getClusterErasureSetToleranceMD() MetricDescription {
|
||||
return MetricDescription{
|
||||
Namespace: clusterMetricNamespace,
|
||||
Subsystem: "health",
|
||||
Name: "erasure_set_tolerance",
|
||||
Help: "Get erasure set tolerance status",
|
||||
Type: gaugeMetric,
|
||||
}
|
||||
}
|
||||
|
||||
func getClusterHealthMetrics() *MetricsGroup {
|
||||
mg := &MetricsGroup{
|
||||
cacheInterval: 10 * time.Second,
|
||||
@ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup {
|
||||
Value: float64(health),
|
||||
})
|
||||
|
||||
for _, h := range result.ESHealth {
|
||||
labels := map[string]string{
|
||||
"pool": strconv.Itoa(h.PoolID),
|
||||
"set": strconv.Itoa(h.SetID),
|
||||
}
|
||||
metrics = append(metrics, Metric{
|
||||
Description: getClusterErasureSetToleranceMD(),
|
||||
VariableLabels: labels,
|
||||
Value: float64(h.HealthyDrives - h.WriteQuorum),
|
||||
})
|
||||
}
|
||||
|
||||
return
|
||||
})
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user