Mirror of https://github.com/minio/minio.git
prom: Add drive failure tolerance per erasure set (#18424)
Commit: fe63664164
Parent: 4598827dcb
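Summary of the change: the HealthResult.UnhealthyPools field is replaced by ESHealth, which records an entry for every erasure set (pool ID, set ID, healthy drive count, write quorum) rather than only for unhealthy pools. Health() no longer returns early once an unhealthy pool is found; it keeps aggregating and simply flips result.Healthy to false, and it only factors healing drives into the result when maintenance is requested. On the Prometheus side, a new cluster gauge, erasure_set_tolerance, is emitted per pool/set with the value HealthyDrives - WriteQuorum, i.e. how many additional drive failures each erasure set can absorb before writes lose quorum.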
@@ -2253,11 +2253,12 @@ type HealthOptions struct {
 // additionally with any specific heuristic information which
 // was queried
 type HealthResult struct {
 	Healthy       bool
 	HealingDrives int
-	UnhealthyPools []struct {
+	ESHealth      []struct {
 		Maintenance   bool
 		PoolID, SetID int
+		HealthyDrives int
 		WriteQuorum   int
 	}
 	WriteQuorum int
@@ -2372,50 +2373,40 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult {
 	}
 
 	result := HealthResult{
-		HealingDrives: len(aggHealStateResult.HealDisks),
+		Healthy:       true,
 		WriteQuorum:   maximumWriteQuorum,
 		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
 	}
 
 	for poolIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[poolIdx] {
+			result.ESHealth = append(result.ESHealth, struct {
+				Maintenance                bool
+				PoolID, SetID              int
+				HealthyDrives, WriteQuorum int
+			}{
+				Maintenance:   opts.Maintenance,
+				SetID:         setIdx,
+				PoolID:        poolIdx,
+				HealthyDrives: erasureSetUpCount[poolIdx][setIdx],
+				WriteQuorum:   poolWriteQuorums[poolIdx],
+			})
+
 			if erasureSetUpCount[poolIdx][setIdx] < poolWriteQuorums[poolIdx] {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
 					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
 						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
-				result.UnhealthyPools = append(result.UnhealthyPools, struct {
-					Maintenance                bool
-					PoolID, SetID, WriteQuorum int
-				}{
-					Maintenance: opts.Maintenance,
-					SetID:       setIdx,
-					PoolID:      poolIdx,
-					WriteQuorum: poolWriteQuorums[poolIdx],
-				})
+				result.Healthy = false
 			}
 		}
-		if len(result.UnhealthyPools) > 0 {
-			// We have unhealthy pools return error.
-			return result
-		}
 	}
 
-	// when maintenance is not specified we don't have
-	// to look at the healing side of the code.
-	if !opts.Maintenance {
-		return HealthResult{
-			Healthy:       true,
-			WriteQuorum:   maximumWriteQuorum,
-			UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-		}
+	if opts.Maintenance {
+		result.Healthy = result.Healthy && len(aggHealStateResult.HealDisks) == 0
+		result.HealingDrives = len(aggHealStateResult.HealDisks)
 	}
 
-	return HealthResult{
-		Healthy:       len(aggHealStateResult.HealDisks) == 0,
-		HealingDrives: len(aggHealStateResult.HealDisks),
-		WriteQuorum:   maximumWriteQuorum,
-		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
-	}
+	return result
 }
 
 // PutObjectMetadata - replace or add tags to an existing object
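Outside the diff, a minimal sketch of how a caller of Health() might read the new per-set data. The esHealth struct and summarize helper below are hypothetical stand-ins that mirror the anonymous struct stored in HealthResult.ESHealth; only the field names and the tolerance arithmetic come from the change above.

package main

import "fmt"

// esHealth mirrors the anonymous struct appended to HealthResult.ESHealth.
type esHealth struct {
	Maintenance                bool
	PoolID, SetID              int
	HealthyDrives, WriteQuorum int
}

// summarize prints the drive-failure tolerance of each erasure set:
// HealthyDrives - WriteQuorum is how many more drives the set can lose
// before writes drop below quorum; a negative value means write quorum
// is already lost for that set.
func summarize(sets []esHealth) {
	for _, h := range sets {
		tolerance := h.HealthyDrives - h.WriteQuorum
		fmt.Printf("pool=%d set=%d healthy=%d quorum=%d tolerance=%d\n",
			h.PoolID, h.SetID, h.HealthyDrives, h.WriteQuorum, tolerance)
	}
}

func main() {
	summarize([]esHealth{
		{PoolID: 0, SetID: 0, HealthyDrives: 4, WriteQuorum: 3}, // one spare drive failure left
		{PoolID: 0, SetID: 1, HealthyDrives: 2, WriteQuorum: 3}, // below write quorum
	})
}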
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"net/http"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -3187,6 +3188,16 @@ func getClusterHealthStatusMD() MetricDescription {
 	}
 }
 
+func getClusterErasureSetToleranceMD() MetricDescription {
+	return MetricDescription{
+		Namespace: clusterMetricNamespace,
+		Subsystem: "health",
+		Name:      "erasure_set_tolerance",
+		Help:      "Get erasure set tolerance status",
+		Type:      gaugeMetric,
+	}
+}
+
 func getClusterHealthMetrics() *MetricsGroup {
 	mg := &MetricsGroup{
 		cacheInterval: 10 * time.Second,
@@ -3218,6 +3229,18 @@ func getClusterHealthMetrics() *MetricsGroup {
 			Value:       float64(health),
 		})
 
+		for _, h := range result.ESHealth {
+			labels := map[string]string{
+				"pool": strconv.Itoa(h.PoolID),
+				"set":  strconv.Itoa(h.SetID),
+			}
+			metrics = append(metrics, Metric{
+				Description:    getClusterErasureSetToleranceMD(),
+				VariableLabels: labels,
+				Value:          float64(h.HealthyDrives - h.WriteQuorum),
+			})
+		}
+
 		return
 	})
 
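With the MetricDescription above, the new gauge should surface on the cluster metrics endpoint as one series per pool/set pair, presumably named minio_cluster_health_erasure_set_tolerance (assuming clusterMetricNamespace renders as "minio_cluster"), labeled with pool and set. A value of zero or more means the set still has write-quorum headroom; a negative value means the set has already lost write quorum.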