mirror of
https://github.com/minio/minio.git
synced 2025-11-20 01:50:24 -05:00
Cluster healthcheck improvements (#10408)
- do not fail the healthcheck if heal status was not obtained from one of the nodes, if many nodes fail then report this as a catastrophic error. - add "x-minio-write-quorum" value to match the write tolerance supported by server. - admin info now states if a drive is healing where madmin.Disk.Healing is set to true and madmin.Disk.State is "ok"
This commit is contained in:
@@ -2045,17 +2045,18 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
|
||||
|
||||
reqInfo := (&logger.ReqInfo{}).AppendTags("maintenance", strconv.FormatBool(opts.Maintenance))
|
||||
|
||||
parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
|
||||
diskCount := z.SetDriveCount()
|
||||
if parityDrives == 0 {
|
||||
parityDrives = getDefaultParityBlocks(diskCount)
|
||||
}
|
||||
dataDrives := diskCount - parityDrives
|
||||
writeQuorum := dataDrives
|
||||
if dataDrives == parityDrives {
|
||||
writeQuorum++
|
||||
}
|
||||
|
||||
for zoneIdx := range erasureSetUpCount {
|
||||
parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
|
||||
diskCount := z.zones[zoneIdx].setDriveCount
|
||||
if parityDrives == 0 {
|
||||
parityDrives = getDefaultParityBlocks(diskCount)
|
||||
}
|
||||
dataDrives := diskCount - parityDrives
|
||||
writeQuorum := dataDrives
|
||||
if dataDrives == parityDrives {
|
||||
writeQuorum++
|
||||
}
|
||||
for setIdx := range erasureSetUpCount[zoneIdx] {
|
||||
if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
|
||||
logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
@@ -2075,14 +2076,15 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
|
||||
// to look at the healing side of the code.
|
||||
if !opts.Maintenance {
|
||||
return HealthResult{
|
||||
Healthy: true,
|
||||
Healthy: true,
|
||||
WriteQuorum: writeQuorum,
|
||||
}
|
||||
}
|
||||
|
||||
// check if local disks are being healed, if they are being healed
|
||||
// we need to tell healthy status as 'false' so that this server
|
||||
// is not taken down for maintenance
|
||||
aggHealStateResult, err := getAggregatedBackgroundHealState(ctx, true)
|
||||
aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
|
||||
if err != nil {
|
||||
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
|
||||
return HealthResult{
|
||||
@@ -2094,11 +2096,10 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
|
||||
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
|
||||
}
|
||||
|
||||
healthy := len(aggHealStateResult.HealDisks) == 0
|
||||
|
||||
return HealthResult{
|
||||
Healthy: healthy,
|
||||
Healthy: len(aggHealStateResult.HealDisks) == 0,
|
||||
HealingDrives: len(aggHealStateResult.HealDisks),
|
||||
WriteQuorum: writeQuorum,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user