mirror of
https://github.com/minio/minio.git
synced 2025-01-11 15:03:22 -05:00
heal: Remove .healing.bin when all ES drives are healing (#19846)
In the very rare case when all drives in an erasure set need to be healed, remove .healing.bin from all drives; otherwise healing will be stuck in a loop. Also, fix a unit test that sometimes fails due to an incorrect test.
This commit is contained in:
parent
8f93e81afb
commit
1277ad69a6
@ -38,8 +38,40 @@ function start_minio_3_node() {
|
||||
disown $pid3
|
||||
|
||||
export MC_HOST_myminio="http://minio:minio123@127.0.0.1:$((start_port + 1))"
|
||||
|
||||
/tmp/mc ready myminio
|
||||
|
||||
# Wait for all drives to be online and formatted
|
||||
while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].state | select(. != "ok")' | wc -l) -gt 0 ]; do sleep 1; done
|
||||
# Wait for all drives to be healed
|
||||
while [ $(/tmp/mc admin info --json myminio | jq '.info.servers[].drives[].healing | select(. != null) | select(. == true)' | wc -l) -gt 0 ]; do sleep 1; done
|
||||
|
||||
# Wait for Status: in MinIO output
|
||||
while true; do
|
||||
rv=$(check_online)
|
||||
if [ "$rv" != "1" ]; then
|
||||
# success
|
||||
break
|
||||
fi
|
||||
|
||||
# Check if we should retry
|
||||
retry=$((retry + 1))
|
||||
if [ $retry -le 20 ]; then
|
||||
sleep 5
|
||||
continue
|
||||
fi
|
||||
|
||||
# Failure
|
||||
for i in $(seq 1 3); do
|
||||
echo "server$i log:"
|
||||
cat "${WORK_DIR}/dist-minio-server$i.log"
|
||||
done
|
||||
pkill -9 minio
|
||||
echo "FAILED"
|
||||
purge "$WORK_DIR"
|
||||
exit 1
|
||||
done
|
||||
|
||||
if ! ps -p $pid1 1>&2 >/dev/null; then
|
||||
echo "server1 log:"
|
||||
cat "${WORK_DIR}/dist-minio-server1.log"
|
||||
@ -90,7 +122,7 @@ function check_online() {
|
||||
}
|
||||
|
||||
function purge() {
|
||||
rm -rf "$1"
|
||||
echo rm -rf "$1"
|
||||
}
|
||||
|
||||
function __init__() {
|
||||
@ -117,18 +149,6 @@ function perform_test() {
|
||||
|
||||
set -x
|
||||
start_minio_3_node $2
|
||||
|
||||
rv=$(check_online)
|
||||
if [ "$rv" == "1" ]; then
|
||||
for i in $(seq 1 3); do
|
||||
echo "server$i log:"
|
||||
cat "${WORK_DIR}/dist-minio-server$i.log"
|
||||
done
|
||||
pkill -9 minio
|
||||
echo "FAILED"
|
||||
purge "$WORK_DIR"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
function main() {
|
||||
|
@ -453,10 +453,6 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
|
||||
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||
|
||||
if len(tracker.QueuedBuckets) > 0 {
|
||||
return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
|
||||
}
|
||||
|
||||
if serverDebugLog {
|
||||
tracker.printTo(os.Stdout)
|
||||
fmt.Printf("\n")
|
||||
|
@ -530,7 +530,16 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
tracker.setObject("")
|
||||
tracker.setBucket("")
|
||||
|
||||
return retErr
|
||||
if retErr != nil {
|
||||
return retErr
|
||||
}
|
||||
|
||||
// Last sanity check
|
||||
if len(tracker.QueuedBuckets) > 0 {
|
||||
return fmt.Errorf("not all buckets were healed: %v", tracker.QueuedBuckets)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func healBucket(bucket string, scan madmin.HealScanMode) error {
|
||||
|
Loading…
Reference in New Issue
Block a user