Cluster healthcheck improvements (#10408)

- Do not fail the healthcheck if the heal status could not be obtained from one of the nodes; if many nodes fail, report this as a catastrophic error.
- Add the "x-minio-write-quorum" value to match the write tolerance supported by the server.
- Admin info now states if a drive is healing, where madmin.Disk.Healing is set to true and madmin.Disk.State is "ok".
2025-11-20 01:50:24 -05:00 · 2020-09-02 22:54:56 -07:00
parent 650dccfa9e
commit 8a291e1dc0
5 changed files with 49 additions and 25 deletions
--- a/cmd/erasure-zones.go
+++ b/cmd/erasure-zones.go
@@ -2045,17 +2045,18 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes

 	reqInfo := (&logger.ReqInfo{}).AppendTags("maintenance", strconv.FormatBool(opts.Maintenance))

+	parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
+	diskCount := z.SetDriveCount()
+	if parityDrives == 0 {
+		parityDrives = getDefaultParityBlocks(diskCount)
+	}
+	dataDrives := diskCount - parityDrives
+	writeQuorum := dataDrives
+	if dataDrives == parityDrives {
+		writeQuorum++
+	}
+
 	for zoneIdx := range erasureSetUpCount {
-		parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
-		diskCount := z.zones[zoneIdx].setDriveCount
-		if parityDrives == 0 {
-			parityDrives = getDefaultParityBlocks(diskCount)
-		}
-		dataDrives := diskCount - parityDrives
-		writeQuorum := dataDrives
-		if dataDrives == parityDrives {
-			writeQuorum++
-		}
 		for setIdx := range erasureSetUpCount[zoneIdx] {
 			if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
@@ -2075,14 +2076,15 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 	// to look at the healing side of the code.
 	if !opts.Maintenance {
 		return HealthResult{
-			Healthy: true,
+			Healthy:     true,
+			WriteQuorum: writeQuorum,
 		}
 	}

 	// check if local disks are being healed, if they are being healed
 	// we need to tell healthy status as 'false' so that this server
 	// is not taken down for maintenance
-	aggHealStateResult, err := getAggregatedBackgroundHealState(ctx, true)
+	aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
 	if err != nil {
 		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
 		return HealthResult{
@@ -2094,11 +2096,10 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
 	}

-	healthy := len(aggHealStateResult.HealDisks) == 0
-
 	return HealthResult{
-		Healthy:       healthy,
+		Healthy:       len(aggHealStateResult.HealDisks) == 0,
 		HealingDrives: len(aggHealStateResult.HealDisks),
+		WriteQuorum:   writeQuorum,
 	}
 }