heal: Remove .healing.bin when all ES drives are healing (#19846)

In the very rare case when all drives in an erasure set need to be healed,
remove .healing.bin from all drives; otherwise healing gets stuck in an
endless retry loop.

Also, fix a test that sometimes fails because it did not wait for all
drives to come online and finish healing before checking cluster status.
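
For context, a drive under healing carries a .healing.bin marker that is
removed only when healing completes successfully; while the marker is
present, the drive keeps getting picked up by the background healer. A
minimal sketch of that marker life cycle (hypothetical names, not MinIO's
actual code):

package heal

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// healDrive is a hypothetical sketch, not MinIO's implementation: the
// marker file stays on the drive until healing fully succeeds, and any
// drive that still carries the marker is re-queued on the next pass.
func healDrive(drive string, queuedBuckets []string) error {
	marker := filepath.Join(drive, ".healing.bin")
	if _, err := os.Stat(marker); errors.Is(err, os.ErrNotExist) {
		return nil // drive is not healing
	}

	// ... heal all objects on the drive ...

	if len(queuedBuckets) > 0 {
		// If this check can never pass — as when every drive in the
		// erasure set is healing at once — the marker is never removed
		// and the drive is endlessly re-queued.
		return fmt.Errorf("not all buckets were healed: %v", queuedBuckets)
	}
	return os.Remove(marker) // healing done: drop the marker
}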
Anis Eleuch 2024-05-31 15:48:50 +01:00 committed by GitHub
parent 8f93e81afb
commit 1277ad69a6
3 changed files with 43 additions and 18 deletions


@@ -38,8 +38,40 @@ function start_minio_3_node() {
 	disown $pid3
 
 	export MC_HOST_myminio="http://minio:minio123@127.0.0.1:$((start_port + 1))"
 	/tmp/mc ready myminio
+
+	# Wait for all drives to be online and formatted
+	while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].state | select(. != "ok")' | wc -l) -gt 0 ]; do sleep 1; done
+	# Wait for all drives to be healed
+	while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].healing | select(. != null) | select(. == true)' | wc -l) -gt 0 ]; do sleep 1; done
+
+	# Wait for Status: in MinIO output
+	while true; do
+		rv=$(check_online)
+		if [ "$rv" != "1" ]; then
+			# success
+			break
+		fi
+
+		# Check if we should retry
+		retry=$((retry + 1))
+		if [ $retry -le 20 ]; then
+			sleep 5
+			continue
+		fi
+
+		# Failure
+		for i in $(seq 1 3); do
+			echo "server$i log:"
+			cat "${WORK_DIR}/dist-minio-server$i.log"
+		done
+		pkill -9 minio
+		echo "FAILED"
+		purge "$WORK_DIR"
+		exit 1
+	done
+
 
 	if ! ps -p $pid1 1>&2 >/dev/null; then
 		echo "server1 log:"
 		cat "${WORK_DIR}/dist-minio-server1.log"
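
The two wait loops added above poll `mc admin info --json` through jq until
no drive reports a state other than "ok" and no drive is still healing. For
reference, the same poll expressed in Go against the admin API — a rough
sketch assuming the madmin-go client's ServerInfo call and its Disk.State
and Disk.Healing fields:

package heal

import (
	"context"
	"time"

	"github.com/minio/madmin-go/v3"
)

// waitDrivesReady blocks until every drive reports state "ok" and no drive
// is still healing — the same condition the script checks with mc and jq.
func waitDrivesReady(ctx context.Context, adm *madmin.AdminClient) error {
	for {
		info, err := adm.ServerInfo(ctx)
		if err != nil {
			return err
		}
		ready := true
		for _, srv := range info.Servers {
			for _, d := range srv.Disks {
				if d.State != "ok" || d.Healing {
					ready = false
				}
			}
		}
		if ready {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}
}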
@@ -90,7 +122,7 @@ function check_online() {
 }
 
 function purge() {
-	rm -rf "$1"
+	echo rm -rf "$1"
 }
function __init__() {
@@ -117,18 +149,6 @@ function perform_test() {
 	set -x
 	start_minio_3_node $2
 
-	rv=$(check_online)
-	if [ "$rv" == "1" ]; then
-		for i in $(seq 1 3); do
-			echo "server$i log:"
-			cat "${WORK_DIR}/dist-minio-server$i.log"
-		done
-		pkill -9 minio
-		echo "FAILED"
-		purge "$WORK_DIR"
-		exit 1
-	fi
-
 }
function main() {


@@ -453,10 +453,6 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
 
 	healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
 
-	if len(tracker.QueuedBuckets) > 0 {
-		return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
-	}
-
 	if serverDebugLog {
 		tracker.printTo(os.Stdout)
 		fmt.Printf("\n")


@@ -530,7 +530,16 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 	tracker.setObject("")
 	tracker.setBucket("")
 
-	return retErr
+	if retErr != nil {
+		return retErr
+	}
+
+	// Last sanity check
+	if len(tracker.QueuedBuckets) > 0 {
+		return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
+	}
+
+	return nil
 }
 
 func healBucket(bucket string, scan madmin.HealScanMode) error {
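
Taken together, the two Go hunks move the final "queued buckets" sanity
check out of healFreshDisk and into healErasureSet, so a nil return from
healErasureSet now means every queued bucket was actually healed. A
condensed sketch of the resulting caller-side contract (hypothetical helper
signatures; in MinIO the marker is removed through the healing tracker):

package heal

// afterHeal sketches the post-commit contract: the set-level healer owns
// the sanity check, and the caller removes .healing.bin only once healing
// genuinely succeeded.
func afterHeal(healErasureSet func() error, removeTracker func() error) error {
	if err := healErasureSet(); err != nil {
		// Covers heal errors and "not all buckets were healed: ...".
		return err
	}
	// Success: drop .healing.bin so the drive is not re-queued.
	return removeTracker()
}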