re-attach offline drive after new drive replacement (#10416)

Drive healing was inconsistent when one of the drives was offline while another drive was being replaced with a new one. This change ensures that the offline drive can be added back into the mix by healing it again.
2025-11-20 01:50:24 -05:00 · 2020-09-04 17:09:02 -07:00
parent eb19c8af40
commit b0e1d4ce78
7 changed files with 166 additions and 146 deletions
--- a/cmd/erasure-zones.go
+++ b/cmd/erasure-zones.go
@@ -2056,6 +2056,25 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 		writeQuorum++
 	}

+	var aggHealStateResult madmin.BgHealState
+	if opts.Maintenance {
+		// check if local disks are being healed, if they are being healed
+		// we need to tell healthy status as 'false' so that this server
+		// is not taken down for maintenance
+		var err error
+		aggHealStateResult, err = getAggregatedBackgroundHealState(ctx)
+		if err != nil {
+			logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
+			return HealthResult{
+				Healthy: false,
+			}
+		}
+
+		if len(aggHealStateResult.HealDisks) > 0 {
+			logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
+		}
+	}
+
 	for zoneIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[zoneIdx] {
 			if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
@@ -2063,10 +2082,11 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 					fmt.Errorf("Write quorum may be lost on zone: %d, set: %d, expected write quorum: %d",
 						zoneIdx, setIdx, writeQuorum))
 				return HealthResult{
-					Healthy:     false,
-					ZoneID:      zoneIdx,
-					SetID:       setIdx,
-					WriteQuorum: writeQuorum,
+					Healthy:       false,
+					HealingDrives: len(aggHealStateResult.HealDisks),
+					ZoneID:        zoneIdx,
+					SetID:         setIdx,
+					WriteQuorum:   writeQuorum,
 				}
 			}
 		}
@@ -2081,21 +2101,6 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 		}
 	}

-	// check if local disks are being healed, if they are being healed
-	// we need to tell healthy status as 'false' so that this server
-	// is not taken down for maintenance
-	aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
-	if err != nil {
-		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
-		return HealthResult{
-			Healthy: false,
-		}
-	}
-
-	if len(aggHealStateResult.HealDisks) > 0 {
-		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
-	}
-
 	return HealthResult{
 		Healthy:       len(aggHealStateResult.HealDisks) == 0,
 		HealingDrives: len(aggHealStateResult.HealDisks),