heal: Remove .healing.bin when all ES drives are healing (#19846)

In the very rare case when all drives in an erasure set need to be healed,
remove .healing.bin from all drives; otherwise healing gets stuck in an
endless retry loop.

Also, fix a test that sometimes fails because it did not wait for all
drives to come online and finish healing before checking cluster status.
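
For context, a drive under healing carries a .healing.bin marker that is
removed only when healing completes successfully; while the marker is
present, the drive keeps getting picked up by the background healer. A
minimal sketch of that marker life cycle (hypothetical names, not MinIO's
actual code):

package heal

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// healDrive is a hypothetical sketch, not MinIO's implementation: the
// marker file stays on the drive until healing fully succeeds, and any
// drive that still carries the marker is re-queued on the next pass.
func healDrive(drive string, queuedBuckets []string) error {
	marker := filepath.Join(drive, ".healing.bin")
	if _, err := os.Stat(marker); errors.Is(err, os.ErrNotExist) {
		return nil // drive is not healing
	}

	// ... heal all objects on the drive ...

	if len(queuedBuckets) > 0 {
		// If this check can never pass — as when every drive in the
		// erasure set is healing at once — the marker is never removed
		// and the drive is endlessly re-queued.
		return fmt.Errorf("not all buckets were healed: %v", queuedBuckets)
	}
	return os.Remove(marker) // healing done: drop the marker
}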
Anis Eleuch 2024-05-31 15:48:50 +01:00 committed by GitHub
parent 8f93e81afb
commit 1277ad69a6
3 changed files with 43 additions and 18 deletions


@@ -38,8 +38,40 @@ function start_minio_3_node() {
 	disown $pid3
 
 	export MC_HOST_myminio="http://minio:minio123@127.0.0.1:$((start_port + 1))"
 	/tmp/mc ready myminio
+
+	# Wait for all drives to be online and formatted
+	while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].state | select(. != "ok")' | wc -l) -gt 0 ]; do sleep 1; done
+	# Wait for all drives to be healed
+	while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].healing | select(. != null) | select(. == true)' | wc -l) -gt 0 ]; do sleep 1; done
+
+	# Wait for Status: in MinIO output
+	while true; do
+		rv=$(check_online)
+		if [ "$rv" != "1" ]; then
+			# success
+			break
+		fi
+
+		# Check if we should retry
+		retry=$((retry + 1))
+		if [ $retry -le 20 ]; then
+			sleep 5
+			continue
+		fi
+
+		# Failure
+		for i in $(seq 1 3); do
+			echo "server$i log:"
+			cat "${WORK_DIR}/dist-minio-server$i.log"
+		done
+		pkill -9 minio
+		echo "FAILED"
+		purge "$WORK_DIR"
+		exit 1
+	done
+
 
 	if ! ps -p $pid1 1>&2 >/dev/null; then
 		echo "server1 log:"
 		cat "${WORK_DIR}/dist-minio-server1.log"
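
The two wait loops added above poll `mc admin info --json` through jq until
no drive reports a state other than "ok" and no drive is still healing. For
reference, the same poll expressed in Go against the admin API — a rough
sketch assuming the madmin-go client's ServerInfo call and its Disk.State
and Disk.Healing fields:

package heal

import (
	"context"
	"time"

	"github.com/minio/madmin-go/v3"
)

// waitDrivesReady blocks until every drive reports state "ok" and no drive
// is still healing — the same condition the script checks with mc and jq.
func waitDrivesReady(ctx context.Context, adm *madmin.AdminClient) error {
	for {
		info, err := adm.ServerInfo(ctx)
		if err != nil {
			return err
		}
		ready := true
		for _, srv := range info.Servers {
			for _, d := range srv.Disks {
				if d.State != "ok" || d.Healing {
					ready = false
				}
			}
		}
		if ready {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}
}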
@@ -90,7 +122,7 @@ function check_online() {
 }
 
 function purge() {
-	rm -rf "$1"
+	echo rm -rf "$1"
 }
function __init__() {
@@ -117,18 +149,6 @@ function perform_test() {
 	set -x
 	start_minio_3_node $2
 
-	rv=$(check_online)
-	if [ "$rv" == "1" ]; then
-		for i in $(seq 1 3); do
-			echo "server$i log:"
-			cat "${WORK_DIR}/dist-minio-server$i.log"
-		done
-		pkill -9 minio
-		echo "FAILED"
-		purge "$WORK_DIR"
-		exit 1
-	fi
-
 }
function main() {


@@ -453,10 +453,6 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
 
 	healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
 
-	if len(tracker.QueuedBuckets) > 0 {
-		return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
-	}
-
 	if serverDebugLog {
 		tracker.printTo(os.Stdout)
 		fmt.Printf("\n")


@@ -530,7 +530,16 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 	tracker.setObject("")
 	tracker.setBucket("")
 
-	return retErr
+	if retErr != nil {
+		return retErr
+	}
+
+	// Last sanity check
+	if len(tracker.QueuedBuckets) > 0 {
+		return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
+	}
+
+	return nil
 }
 
 func healBucket(bucket string, scan madmin.HealScanMode) error {
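
Taken together, the two Go hunks move the final "queued buckets" sanity
check out of healFreshDisk and into healErasureSet, so a nil return from
healErasureSet now means every queued bucket was actually healed. A
condensed sketch of the resulting caller-side contract (hypothetical helper
signatures; in MinIO the marker is removed through the healing tracker):

package heal

// afterHeal sketches the post-commit contract: the set-level healer owns
// the sanity check, and the caller removes .healing.bin only once healing
// genuinely succeeded.
func afterHeal(healErasureSet func() error, removeTracker func() error) error {
	if err := healErasureSet(); err != nil {
		// Covers heal errors and "not all buckets were healed: ...".
		return err
	}
	// Success: drop .healing.bin so the drive is not re-queued.
	return removeTracker()
}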