skip inconsistent shards if possible (#13945)

data shards were wrong due to a healing bug
reported in #13803 mainly with unaligned object
sizes.

This PR is an attempt to automatically avoid
these shards, with available information about
the `xl.meta` and actually disk mtime.
This commit is contained in:
Harshavardhana
2021-12-21 10:08:26 -08:00
committed by GitHub
parent 6fbf4f96b6
commit 0e3037631f
17 changed files with 548 additions and 129 deletions

152
buildscripts/unaligned-healing.sh Executable file
View File

@@ -0,0 +1,152 @@
#!/bin/bash -e
#
set -E
set -o pipefail
set -x
if [ ! -x "$PWD/minio" ]; then
echo "minio executable binary not found in current directory"
exit 1
fi
WORK_DIR="$PWD/.verify-$RANDOM"
MINIO_CONFIG_DIR="$WORK_DIR/.minio"
MINIO_OLD=( "$PWD/minio.RELEASE.2021-11-24T23-19-33Z" --config-dir "$MINIO_CONFIG_DIR" server )
MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server )
function download_old_release() {
if [ ! -f minio.RELEASE.2021-11-24T23-19-33Z ]; then
curl --silent -O https://dl.minio.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2021-11-24T23-19-33Z
chmod a+x minio.RELEASE.2021-11-24T23-19-33Z
fi
}
function start_minio_16drive() {
start_port=$1
export MINIO_ROOT_USER=minio
export MINIO_ROOT_PASSWORD=minio123
export MC_HOST_minio="http://minio:minio123@127.0.0.1:${start_port}/"
unset MINIO_KMS_AUTO_ENCRYPTION # do not auto-encrypt objects
MC_BUILD_DIR="mc-$RANDOM"
if ! git clone --quiet https://github.com/minio/mc "$MC_BUILD_DIR"; then
echo "failed to download https://github.com/minio/mc"
purge "${MC_BUILD_DIR}"
exit 1
fi
(cd "${MC_BUILD_DIR}" && go build -o "$WORK_DIR/mc")
# remove mc source.
purge "${MC_BUILD_DIR}"
"${MINIO_OLD[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 &
pid=$!
disown $pid
sleep 30
if ! ps -p ${pid} 1>&2 >/dev/null; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
shred --iterations=1 --size=5241856 - 1>"${WORK_DIR}/unaligned" 2>/dev/null
"${WORK_DIR}/mc" mb minio/healing-shard-bucket --quiet
"${WORK_DIR}/mc" cp \
"${WORK_DIR}/unaligned" \
minio/healing-shard-bucket/unaligned \
--disable-multipart --quiet
## "unaligned" object name gets consistently distributed
## to disks in following distribution order
##
## NOTE: if you change the name make sure to change the
## distribution order present here
##
## [15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
## make sure to remove the "last" data shard
rm -rf "${WORK_DIR}/xl14/healing-shard-bucket/unaligned"
sleep 10
## Heal the shard
"${WORK_DIR}/mc" admin heal --quiet --recursive minio/healing-shard-bucket
## then remove any other data shard let's pick first disk
## - 1st data shard.
rm -rf "${WORK_DIR}/xl3/healing-shard-bucket/unaligned"
sleep 10
## Heal the shard
"${WORK_DIR}/mc" admin heal --quiet --recursive minio/healing-shard-bucket
go build ./docs/debugging/s3-check-md5/
if ! ./s3-check-md5 \
-debug \
-access-key minio \
-secret-key minio123 \
-endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep CORRUPTED; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
pkill minio
sleep 3
"${MINIO[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 &
pid=$!
disown $pid
sleep 30
if ! ps -p ${pid} 1>&2 >/dev/null; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if ! ./s3-check-md5 \
-debug \
-access-key minio \
-secret-key minio123 \
-endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
mkdir inspects
(cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-shard-bucket/unaligned/**)
"${WORK_DIR}/mc" mb play/inspects
"${WORK_DIR}/mc" mirror inspects play/inspects
purge "$WORK_DIR"
exit 1
fi
pkill minio
sleep 3
}
function main() {
download_old_release
start_port=$(shuf -i 10000-65000 -n 1)
start_minio_16drive ${start_port}
}
function purge()
{
rm -rf "$1"
}
( main "$@" )
rv=$?
purge "$WORK_DIR"
exit "$rv"

View File

@@ -21,51 +21,70 @@ function start_minio_3_node() {
start_port=$2
args=""
for i in $(seq 1 3); do
args="$args http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/"
args="$args http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/"
done
"${MINIO[@]}" --address ":$[$start_port+1]" $args > "${WORK_DIR}/dist-minio-server1.log" 2>&1 &
disown $!
pid1=$!
disown ${pid1}
"${MINIO[@]}" --address ":$[$start_port+2]" $args > "${WORK_DIR}/dist-minio-server2.log" 2>&1 &
disown $!
pid2=$!
disown $pid2
"${MINIO[@]}" --address ":$[$start_port+3]" $args > "${WORK_DIR}/dist-minio-server3.log" 2>&1 &
disown $!
pid3=$!
disown $pid3
sleep "$1"
if [ "$(pgrep -c minio)" -ne 3 ]; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
if ! ps -p $pid1 1>&2 > /dev/null; then
echo "server1 log:"
cat "${WORK_DIR}/dist-minio-server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if ! ps -p $pid2 1>&2 > /dev/null; then
echo "server2 log:"
cat "${WORK_DIR}/dist-minio-server2.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if ! ps -p $pid3 1>&2 > /dev/null; then
echo "server3 log:"
cat "${WORK_DIR}/dist-minio-server3.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
if ! pkill minio; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
sleep 1;
if pgrep minio; then
# forcibly killing, to proceed further properly.
if ! pkill -9 minio; then
echo "no minio process running anymore, proceed."
fi
# forcibly killing, to proceed further properly.
if ! pkill -9 minio; then
echo "no minio process running anymore, proceed."
fi
fi
}
function check_online() {
if grep -q 'Unable to initialize sub-systems' ${WORK_DIR}/dist-minio-*.log; then
echo "1"
echo "1"
fi
}
@@ -96,14 +115,14 @@ function perform_test() {
rv=$(check_online)
if [ "$rv" == "1" ]; then
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
pkill -9 minio
echo "FAILED"
purge "$WORK_DIR"
exit 1
for i in $(seq 1 3); do
echo "server$i log:"
cat "${WORK_DIR}/dist-minio-server$i.log"
done
pkill -9 minio
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
}