From 0e3037631fc54e54a6e672762e0fd4c6e11e7e3d Mon Sep 17 00:00:00 2001
From: Harshavardhana
Date: Tue, 21 Dec 2021 10:08:26 -0800
Subject: [PATCH] skip inconsistent shards if possible (#13945)

Data shards were wrong due to a healing bug,
reported in #13803, seen mainly with unaligned
object sizes.

This PR is an attempt to automatically avoid
these shards, using the available information
in `xl.meta` and the actual on-disk mtime.
---
 .github/workflows/go-healing.yml              |  48 ++++++
 .github/workflows/go.yml                      |   3 +-
 .gitignore                                    |   1 +
 Makefile                                      |   1 +
 buildscripts/unaligned-healing.sh             | 152 ++++++++++++++++++
 buildscripts/verify-healing.sh                |  83 ++++++----
 cmd/erasure-healing-common.go                 |  76 +++++++--
 cmd/erasure-healing.go                        |  45 ++++--
 cmd/erasure-healing_test.go                   |  28 ++--
 cmd/erasure-metadata.go                       |  11 ++
 cmd/erasure-object.go                         |  42 ++++-
 cmd/storage-datatypes.go                      |  19 +++
 cmd/storage-datatypes_gen.go                  |  34 ++--
 cmd/storage-rest-common.go                    |   2 +-
 cmd/xl-storage.go                             |  72 ++++++---
 .../replication/setup_3site_replication.sh    |  24 +--
 docs/debugging/s3-check-md5/main.go           |  36 ++++-
 17 files changed, 548 insertions(+), 129 deletions(-)
 create mode 100644 .github/workflows/go-healing.yml
 create mode 100755 buildscripts/unaligned-healing.sh

diff --git a/.github/workflows/go-healing.yml b/.github/workflows/go-healing.yml
new file mode 100644
index 000000000..c033e4253
--- /dev/null
+++ b/.github/workflows/go-healing.yml
@@ -0,0 +1,48 @@
+name: Healing Functional Tests
+
+on:
+  pull_request:
+    branches:
+      - master
+
+# This ensures that previous jobs for the PR are canceled when the PR is
+# updated.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    name: Go ${{ matrix.go-version }} on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        go-version: [1.17.x]
+        os: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-go@v2
+        with:
+          go-version: ${{ matrix.go-version }}
+      - uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/go-build
+            ~/go/pkg/mod
+          key: ${{ runner.os }}-${{ matrix.go-version }}-go-${{ hashFiles('**/go.sum') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.go-version }}-go-
+      - name: Build on ${{ matrix.os }}
+        if: matrix.os == 'ubuntu-latest'
+        env:
+          CGO_ENABLED: 0
+          GO111MODULE: on
+          MINIO_KMS_KES_CERT_FILE: /home/runner/work/minio/minio/.github/workflows/root.cert
+          MINIO_KMS_KES_KEY_FILE: /home/runner/work/minio/minio/.github/workflows/root.key
+          MINIO_KMS_KES_ENDPOINT: "https://play.min.io:7373"
+          MINIO_KMS_KES_KEY_NAME: "my-minio-key"
+          MINIO_KMS_AUTO_ENCRYPTION: on
+        run: |
+          sudo sysctl net.ipv6.conf.all.disable_ipv6=0
+          sudo sysctl net.ipv6.conf.default.disable_ipv6=0
+          make verify-healing
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 1104cbbbd..b4c1a2a2d 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -13,7 +13,7 @@
 
 jobs:
   build:
-    name: Go ${{ matrix.go-version }} on ${{ matrix.os }}
+    name: Go ${{ matrix.go-version }} on ${{ matrix.os }} - healing
     runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -46,4 +46,3 @@ jobs:
         sudo sysctl net.ipv6.conf.all.disable_ipv6=0
         sudo sysctl net.ipv6.conf.default.disable_ipv6=0
         make verify
-        make verify-healing
diff --git a/.gitignore b/.gitignore
index c575c2e42..2dfe6b51d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,4 @@ hash-set
 minio.RELEASE*
 mc
 nancy
+inspects/*
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 56cc2c691..371277d07 100644
--- a/Makefile
+++ b/Makefile
@@ 
-73,6 +73,7 @@ verify-healing: ## verify healing and replacing disks with minio binary @echo "Verify healing build with race" @GO111MODULE=on CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/verify-healing.sh) + @(env bash $(PWD)/buildscripts/unaligned-healing.sh) build: checks ## builds minio to $(PWD) @echo "Building minio binary to './minio'" diff --git a/buildscripts/unaligned-healing.sh b/buildscripts/unaligned-healing.sh new file mode 100755 index 000000000..5249d0183 --- /dev/null +++ b/buildscripts/unaligned-healing.sh @@ -0,0 +1,152 @@ +#!/bin/bash -e +# + +set -E +set -o pipefail +set -x + +if [ ! -x "$PWD/minio" ]; then + echo "minio executable binary not found in current directory" + exit 1 +fi + +WORK_DIR="$PWD/.verify-$RANDOM" +MINIO_CONFIG_DIR="$WORK_DIR/.minio" +MINIO_OLD=( "$PWD/minio.RELEASE.2021-11-24T23-19-33Z" --config-dir "$MINIO_CONFIG_DIR" server ) +MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server ) + +function download_old_release() { + if [ ! -f minio.RELEASE.2021-11-24T23-19-33Z ]; then + curl --silent -O https://dl.minio.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2021-11-24T23-19-33Z + chmod a+x minio.RELEASE.2021-11-24T23-19-33Z + fi +} + +function start_minio_16drive() { + start_port=$1 + + export MINIO_ROOT_USER=minio + export MINIO_ROOT_PASSWORD=minio123 + export MC_HOST_minio="http://minio:minio123@127.0.0.1:${start_port}/" + unset MINIO_KMS_AUTO_ENCRYPTION # do not auto-encrypt objects + + MC_BUILD_DIR="mc-$RANDOM" + if ! git clone --quiet https://github.com/minio/mc "$MC_BUILD_DIR"; then + echo "failed to download https://github.com/minio/mc" + purge "${MC_BUILD_DIR}" + exit 1 + fi + + (cd "${MC_BUILD_DIR}" && go build -o "$WORK_DIR/mc") + + # remove mc source. + purge "${MC_BUILD_DIR}" + + "${MINIO_OLD[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 & + pid=$! + disown $pid + sleep 30 + + if ! ps -p ${pid} 1>&2 >/dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + shred --iterations=1 --size=5241856 - 1>"${WORK_DIR}/unaligned" 2>/dev/null + "${WORK_DIR}/mc" mb minio/healing-shard-bucket --quiet + "${WORK_DIR}/mc" cp \ + "${WORK_DIR}/unaligned" \ + minio/healing-shard-bucket/unaligned \ + --disable-multipart --quiet + + ## "unaligned" object name gets consistently distributed + ## to disks in following distribution order + ## + ## NOTE: if you change the name make sure to change the + ## distribution order present here + ## + ## [15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] + + ## make sure to remove the "last" data shard + rm -rf "${WORK_DIR}/xl14/healing-shard-bucket/unaligned" + sleep 10 + ## Heal the shard + "${WORK_DIR}/mc" admin heal --quiet --recursive minio/healing-shard-bucket + ## then remove any other data shard let's pick first disk + ## - 1st data shard. + rm -rf "${WORK_DIR}/xl3/healing-shard-bucket/unaligned" + sleep 10 + ## Heal the shard + "${WORK_DIR}/mc" admin heal --quiet --recursive minio/healing-shard-bucket + + go build ./docs/debugging/s3-check-md5/ + if ! 
./s3-check-md5 \ + -debug \ + -access-key minio \ + -secret-key minio123 \ + -endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep CORRUPTED; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + pkill minio + sleep 3 + + "${MINIO[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 & + pid=$! + disown $pid + sleep 30 + + if ! ps -p ${pid} 1>&2 >/dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + if ! ./s3-check-md5 \ + -debug \ + -access-key minio \ + -secret-key minio123 \ + -endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + mkdir inspects + (cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-shard-bucket/unaligned/**) + + "${WORK_DIR}/mc" mb play/inspects + "${WORK_DIR}/mc" mirror inspects play/inspects + + purge "$WORK_DIR" + exit 1 + fi + + pkill minio + sleep 3 +} + +function main() { + download_old_release + + start_port=$(shuf -i 10000-65000 -n 1) + + start_minio_16drive ${start_port} +} + +function purge() +{ + rm -rf "$1" +} + +( main "$@" ) +rv=$? +purge "$WORK_DIR" +exit "$rv" diff --git a/buildscripts/verify-healing.sh b/buildscripts/verify-healing.sh index 8a72e808a..ed9de73e0 100755 --- a/buildscripts/verify-healing.sh +++ b/buildscripts/verify-healing.sh @@ -21,51 +21,70 @@ function start_minio_3_node() { start_port=$2 args="" for i in $(seq 1 3); do - args="$args http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/" + args="$args http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/1/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/2/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/3/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/4/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/5/ http://127.0.0.1:$[$start_port+$i]${WORK_DIR}/$i/6/" done "${MINIO[@]}" --address ":$[$start_port+1]" $args > "${WORK_DIR}/dist-minio-server1.log" 2>&1 & - disown $! + pid1=$! + disown ${pid1} "${MINIO[@]}" --address ":$[$start_port+2]" $args > "${WORK_DIR}/dist-minio-server2.log" 2>&1 & - disown $! + pid2=$! + disown $pid2 "${MINIO[@]}" --address ":$[$start_port+3]" $args > "${WORK_DIR}/dist-minio-server3.log" 2>&1 & - disown $! + pid3=$! + disown $pid3 sleep "$1" - if [ "$(pgrep -c minio)" -ne 3 ]; then - for i in $(seq 1 3); do - echo "server$i log:" - cat "${WORK_DIR}/dist-minio-server$i.log" - done - echo "FAILED" - purge "$WORK_DIR" - exit 1 + + if ! ps -p $pid1 1>&2 > /dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/dist-minio-server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 fi + + if ! ps -p $pid2 1>&2 > /dev/null; then + echo "server2 log:" + cat "${WORK_DIR}/dist-minio-server2.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + if ! ps -p $pid3 1>&2 > /dev/null; then + echo "server3 log:" + cat "${WORK_DIR}/dist-minio-server3.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + if ! 
pkill minio; then
-	for i in $(seq 1 3); do
-		echo "server$i log:"
-		cat "${WORK_DIR}/dist-minio-server$i.log"
-	done
-	echo "FAILED"
-	purge "$WORK_DIR"
-	exit 1
+	for i in $(seq 1 3); do
+		echo "server$i log:"
+		cat "${WORK_DIR}/dist-minio-server$i.log"
+	done
+	echo "FAILED"
+	purge "$WORK_DIR"
+	exit 1
 	fi

 	sleep 1;
 	if pgrep minio; then
-		# forcibly killing, to proceed further properly.
-		if ! pkill -9 minio; then
-			echo "no minio process running anymore, proceed."
-		fi
+		# forcibly killing, to proceed further properly.
+		if ! pkill -9 minio; then
+			echo "no minio process running anymore, proceed."
+		fi
 	fi
 }

 function check_online() {
 	if grep -q 'Unable to initialize sub-systems' ${WORK_DIR}/dist-minio-*.log; then
-		echo "1"
+		echo "1"
 	fi
 }

@@ -96,14 +115,14 @@ function perform_test() {

 	rv=$(check_online)
 	if [ "$rv" == "1" ]; then
-		for i in $(seq 1 3); do
-			echo "server$i log:"
-			cat "${WORK_DIR}/dist-minio-server$i.log"
-		done
-		pkill -9 minio
-		echo "FAILED"
-		purge "$WORK_DIR"
-		exit 1
+		for i in $(seq 1 3); do
+			echo "server$i log:"
+			cat "${WORK_DIR}/dist-minio-server$i.log"
+		done
+		pkill -9 minio
+		echo "FAILED"
+		purge "$WORK_DIR"
+		exit 1
 	fi
 }

diff --git a/cmd/erasure-healing-common.go b/cmd/erasure-healing-common.go
index 0f64c8d44..aac49bc72 100644
--- a/cmd/erasure-healing-common.go
+++ b/cmd/erasure-healing-common.go
@@ -26,29 +26,59 @@ import (
 )

 // commonTime returns a maximally occurring time from a list of time.
-func commonTime(modTimes []time.Time) (modTime time.Time) {
-	timeOccurenceMap := make(map[int64]int, len(modTimes))
+func commonTimeAndOccurence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
+	timeOccurenceMap := make(map[int64]int, len(times))
+	groupNano := group.Nanoseconds()
 	// Ignore the uuid sentinel and count the rest.
-	for _, t := range modTimes {
+	for _, t := range times {
 		if t.Equal(timeSentinel) {
 			continue
 		}
-		timeOccurenceMap[t.UnixNano()]++
+		nano := t.UnixNano()
+		if group > 0 {
+			for k := range timeOccurenceMap {
+				if k == nano {
+					// We add ourselves later
+					continue
+				}
+				diff := k - nano
+				if diff < 0 {
+					diff = -diff
+				}
+				// We are within the limit
+				if diff < groupNano {
+					timeOccurenceMap[k]++
+				}
+			}
+		}
+		// Add ourselves...
+		timeOccurenceMap[nano]++
 	}

-	var maxima int // Counter for remembering max occurrence of elements.
+	maxima = 0 // Counter for remembering max occurrence of elements.
+	latest := int64(0)

 	// Find the common cardinality from previously collected
 	// occurrences of elements.
 	for nano, count := range timeOccurenceMap {
-		t := time.Unix(0, nano).UTC()
-		if count > maxima || (count == maxima && t.After(modTime)) {
+		if count < maxima {
+			continue
+		}
+
+		// We are at or above maxima
+		if count > maxima || nano > latest {
 			maxima = count
-			modTime = t
+			latest = nano
 		}
 	}

-	// Return the collected common modTime.
+	// Return the collected common max time, with maxima
+	return time.Unix(0, latest).UTC(), maxima
+}
+
+// commonTime returns a maximally occurring time from a list of times.
+func commonTime(modTimes []time.Time) (modTime time.Time) {
+	modTime, _ = commonTimeAndOccurence(modTimes, 0)
 	return modTime
 }
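A note on the grouping logic introduced above: besides counting itself once, each incoming time also bumps the count of every already-recorded time that falls within the grouping window, so a cluster of nearly identical mtimes outweighs a lone outlier. The sketch below is illustrative only and not part of this patch; groupedMaxima is a hypothetical stand-in for commonTimeAndOccurence:

package main

import (
	"fmt"
	"time"
)

// groupedMaxima returns the time with the highest reinforced count,
// along with that count. Times within the group window bump the
// counts of times recorded before them, mirroring the patch logic.
func groupedMaxima(times []time.Time, group time.Duration) (time.Time, int) {
	counts := make(map[int64]int, len(times))
	for _, t := range times {
		nano := t.UnixNano()
		for seen := range counts {
			if seen == nano {
				continue // counted once below
			}
			diff := seen - nano
			if diff < 0 {
				diff = -diff
			}
			if diff < group.Nanoseconds() {
				counts[seen]++ // reinforce nearby, already-seen times
			}
		}
		counts[nano]++
	}
	var maxima int
	var latest int64
	for nano, count := range counts {
		if count > maxima || (count == maxima && nano > latest) {
			maxima, latest = count, nano
		}
	}
	return time.Unix(0, latest).UTC(), maxima
}

func main() {
	base := time.Date(2021, 12, 21, 10, 0, 0, 0, time.UTC)
	mtimes := []time.Time{base, base.Add(2 * time.Second), base.Add(3 * time.Second), base.Add(time.Hour)}
	commonMTime, count := groupedMaxima(mtimes, 5*time.Second)
	fmt.Println(commonMTime, count) // 2021-12-21 10:00:00 +0000 UTC 3
}

With a 5-second window, the three clustered mtimes reinforce each other to a count of 3, while the hour-late outlier stays at 1.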
@@ -88,6 +118,19 @@ func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks
 	}
 }

+// listObjectDiskMtimes extracts the list of disk mtimes from a slice of
+// FileInfo, skipping entries whose metadata is not valid.
+func listObjectDiskMtimes(partsMetadata []FileInfo) (diskMTimes []time.Time) {
+	diskMTimes = bootModtimes(len(partsMetadata))
+	for index, metadata := range partsMetadata {
+		if metadata.IsValid() {
+			// Once the file is found, record the mtime saved on disk.
+			diskMTimes[index] = metadata.DiskMTime
+		}
+	}
+	return diskMTimes
+}
+
 // Notes:
 // There are 5 possible states a disk could be in,
 // 1. __online__ - has the latest copy of xl.meta - returned by listOnlineDisks
@@ -185,6 +228,13 @@ func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetad
 	errs []error, latestMeta FileInfo,
 	bucket, object string, scanMode madmin.HealScanMode) ([]StorageAPI, []error) {

+	var diskMTime time.Time
+	delta := 5 * time.Second
+	if !latestMeta.DataShardFixed() {
+		diskMTime = pickValidDiskTimeWithQuorum(partsMetadata,
+			latestMeta.Erasure.DataBlocks)
+	}
+
 	availableDisks := make([]StorageAPI, len(onlineDisks))
 	dataErrs := make([]error, len(onlineDisks))
 	inconsistent := 0
@@ -289,6 +339,14 @@ func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetad
 			}
 		}

+		if !diskMTime.Equal(timeSentinel) && !diskMTime.IsZero() {
+			if !partsMetadata[i].AcceptableDelta(diskMTime, delta) {
+				// not within acceptable delta, skip.
+				partsMetadata[i] = FileInfo{}
+				continue
+			}
+		}
+
 		if dataErrs[i] == nil {
 			// All parts verified, mark it as all data available.
 			availableDisks[i] = onlineDisk
diff --git a/cmd/erasure-healing.go b/cmd/erasure-healing.go
index 7f687aa71..b858d02a0 100644
--- a/cmd/erasure-healing.go
+++ b/cmd/erasure-healing.go
@@ -24,12 +24,38 @@ import (
 	"fmt"
 	"io"
 	"sync"
+	"time"

 	"github.com/minio/madmin-go"
 	"github.com/minio/minio/internal/logger"
 	"github.com/minio/minio/internal/sync/errgroup"
 )

+const reservedMetadataPrefixLowerDataShardFix = ReservedMetadataPrefixLower + "data-shard-fix"
+
+// AcceptableDelta returns 'true' if fi.DiskMTime is within the
+// given acceptable delta of maxTime.
+//
+// This code is primarily used for heuristic detection of
+// incorrect shards, as per https://github.com/minio/minio/pull/13803
+//
+// This check is only active if we can find a maximally
+// occurring disk mtime that is roughly the same across
+// the quorum, allowing us to skip the shards that we
+// suspect are wrong.
+func (fi FileInfo) AcceptableDelta(maxTime time.Time, delta time.Duration) bool {
+	diff := maxTime.Sub(fi.DiskMTime)
+	if diff < 0 {
+		diff = -diff
+	}
+	return diff < delta
+}
+
+// DataShardFixed - data shard fixed?
+func (fi FileInfo) DataShardFixed() bool {
+	return fi.Metadata[reservedMetadataPrefixLowerDataShardFix] == "true"
+}
+
 // Heals a bucket if it doesn't exist on one of the disks, additionally
 // also heals the missing entries for bucket metadata files
 // `policy.json, notification.xml, listeners.json`.
@@ -224,29 +250,22 @@ func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta File
 		return true
 	}
 	if erErr == nil {
+		if meta.XLV1 {
+			// Legacy objects always need healing;
+			// check this first.
+			return true
+		}
 		if !meta.Deleted && !meta.IsRemote() {
 			// If xl.meta was read fine but there may be problem with the part.N files.
if IsErr(dataErr, []error{ @@ -234,19 +265,7 @@ func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta File return true } } - if !latestMeta.MetadataEquals(meta) { - return true - } - if !latestMeta.TransitionInfoEquals(meta) { - return true - } - if !latestMeta.ReplicationInfoEquals(meta) { - return true - } - if !latestMeta.ModTime.Equal(meta.ModTime) { - return true - } - if meta.XLV1 { + if !latestMeta.Equals(meta) { return true } } diff --git a/cmd/erasure-healing_test.go b/cmd/erasure-healing_test.go index a31ff179a..e83db0be4 100644 --- a/cmd/erasure-healing_test.go +++ b/cmd/erasure-healing_test.go @@ -93,7 +93,7 @@ func TestHealing(t *testing.T) { } // After heal the meta file should be as expected. - if !reflect.DeepEqual(fileInfoPreHeal, fileInfoPostHeal) { + if !fileInfoPreHeal.Equals(fileInfoPostHeal) { t.Fatal("HealObject failed") } @@ -122,7 +122,7 @@ func TestHealing(t *testing.T) { } // After heal the meta file should be as expected. - if !reflect.DeepEqual(fileInfoPreHeal, fileInfoPostHeal) { + if !fileInfoPreHeal.Equals(fileInfoPostHeal) { t.Fatal("HealObject failed") } @@ -699,7 +699,8 @@ func TestHealLastDataShard(t *testing.T) { t.Fatalf("Failed to make a bucket - %v", err) } - _, err = obj.PutObject(ctx, bucket, object, mustGetPutObjReader(t, bytes.NewReader(data), int64(len(data)), "", ""), opts) + _, err = obj.PutObject(ctx, bucket, object, + mustGetPutObjReader(t, bytes.NewReader(data), int64(len(data)), "", ""), opts) if err != nil { t.Fatal(err) @@ -724,7 +725,9 @@ func TestHealLastDataShard(t *testing.T) { if err != nil { t.Fatalf("Failed to delete a file - %v", err) } - _, err = obj.HealObject(ctx, bucket, object, "", madmin.HealOpts{ScanMode: madmin.HealNormalScan}) + _, err = obj.HealObject(ctx, bucket, object, "", madmin.HealOpts{ + ScanMode: madmin.HealNormalScan, + }) if err != nil { t.Fatal(err) } @@ -738,21 +741,23 @@ func TestHealLastDataShard(t *testing.T) { firstHealedH := sha256.New() _, err = io.Copy(firstHealedH, firstGr) if err != nil { - t.Fatal() + t.Fatal(err) } firstHealedDataSha256 := firstHealedH.Sum(nil) if !bytes.Equal(actualSha256, firstHealedDataSha256) { - t.Fatal("object healed wrong") + t.Fatalf("object healed wrong, expected %x, got %x", + actualSha256, firstHealedDataSha256) } // remove another data shard - err = removeAll(pathJoin(shuffledDisks[1].String(), bucket, object)) - if err != nil { + if err = removeAll(pathJoin(shuffledDisks[1].String(), bucket, object)); err != nil { t.Fatalf("Failed to delete a file - %v", err) } - _, err = obj.HealObject(ctx, bucket, object, "", madmin.HealOpts{ScanMode: madmin.HealNormalScan}) + _, err = obj.HealObject(ctx, bucket, object, "", madmin.HealOpts{ + ScanMode: madmin.HealNormalScan, + }) if err != nil { t.Fatal(err) } @@ -766,12 +771,13 @@ func TestHealLastDataShard(t *testing.T) { secondHealedH := sha256.New() _, err = io.Copy(secondHealedH, secondGr) if err != nil { - t.Fatal() + t.Fatal(err) } secondHealedDataSha256 := secondHealedH.Sum(nil) if !bytes.Equal(actualSha256, secondHealedDataSha256) { - t.Fatal("object healed wrong") + t.Fatalf("object healed wrong, expected %x, got %x", + actualSha256, secondHealedDataSha256) } }) } diff --git a/cmd/erasure-metadata.go b/cmd/erasure-metadata.go index 4efe4b49f..a3fff78c0 100644 --- a/cmd/erasure-metadata.go +++ b/cmd/erasure-metadata.go @@ -351,6 +351,17 @@ func findFileInfoInQuorum(ctx context.Context, metaArr []FileInfo, modTime time. 
return FileInfo{}, errErasureReadQuorum
 }

+func pickValidDiskTimeWithQuorum(metaArr []FileInfo, quorum int) time.Time {
+	diskMTimes := listObjectDiskMtimes(metaArr)
+
+	diskMTime, diskMaxima := commonTimeAndOccurence(diskMTimes, 5*time.Second)
+	if diskMaxima >= quorum {
+		return diskMTime
+	}
+
+	return timeSentinel
+}
+
 // pickValidFileInfo - picks one valid FileInfo content and returns from a
 // slice of FileInfo.
 func pickValidFileInfo(ctx context.Context, metaArr []FileInfo, modTime time.Time, quorum int) (FileInfo, error) {
diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go
index 5733a94db..fcde3dbe4 100644
--- a/cmd/erasure-object.go
+++ b/cmd/erasure-object.go
@@ -28,6 +28,7 @@ import (
 	"strconv"
 	"strings"
 	"sync"
+	"time"

 	"github.com/minio/madmin-go"
 	"github.com/minio/minio-go/v7/pkg/tags"
@@ -179,6 +180,31 @@ func (er erasureObjects) GetObjectNInfo(ctx context.Context, bucket, object stri
 		return nil, toObjectErr(err, bucket, object)
 	}

+	if !fi.DataShardFixed() {
+		diskMTime := pickValidDiskTimeWithQuorum(metaArr, fi.Erasure.DataBlocks)
+		delta := 5 * time.Second
+		if !diskMTime.Equal(timeSentinel) && !diskMTime.IsZero() {
+			for index := range onlineDisks {
+				if onlineDisks[index] == OfflineDisk {
+					continue
+				}
+				if !metaArr[index].IsValid() {
+					continue
+				}
+				if !metaArr[index].AcceptableDelta(diskMTime, delta) {
+					// If the disk mtime mismatches, it is considered outdated
+					// https://github.com/minio/minio/pull/13803
+					//
+					// This check is only active if we can find a maximally
+					// occurring disk mtime that is roughly the same across
+					// the quorum, allowing us to skip the shards that we
+					// suspect are wrong.
+					onlineDisks[index] = OfflineDisk
+				}
+			}
+		}
+	}
+
 	objInfo := fi.ToObjectInfo(bucket, object)
 	if objInfo.DeleteMarker {
 		if opts.VersionID == "" {
@@ -1444,14 +1470,16 @@ func (er erasureObjects) addPartial(bucket, object, versionID string, size int64
 }

 func (er erasureObjects) PutObjectMetadata(ctx context.Context, bucket, object string, opts ObjectOptions) (ObjectInfo, error) {
-	// Lock the object before updating tags.
-	lk := er.NewNSLock(bucket, object)
-	lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
-	if err != nil {
-		return ObjectInfo{}, err
+	if !opts.NoLock {
+		// Lock the object before updating metadata.
+		lk := er.NewNSLock(bucket, object)
+		lkctx, err := lk.GetLock(ctx, globalOperationTimeout)
+		if err != nil {
+			return ObjectInfo{}, err
+		}
+		ctx = lkctx.Context()
+		defer lk.Unlock(lkctx.Cancel)
 	}
-	ctx = lkctx.Context()
-	defer lk.Unlock(lkctx.Cancel)

 	disks := er.getDisks()
diff --git a/cmd/storage-datatypes.go b/cmd/storage-datatypes.go
index 708ad1a9b..af85153d4 100644
--- a/cmd/storage-datatypes.go
+++ b/cmd/storage-datatypes.go
@@ -184,6 +184,25 @@ type FileInfo struct {
 	// no other caller must set this value other than multi-object delete call.
 	// usage in other calls in undefined please avoid.
Idx int `msg:"i"` + + // DiskMTime indicates the mtime of the xl.meta on disk + // This is mainly used for detecting a particular issue + // reported in https://github.com/minio/minio/pull/13803 + DiskMTime time.Time `msg:"dmt"` +} + +// Equals checks if fi(FileInfo) matches ofi(FileInfo) +func (fi FileInfo) Equals(ofi FileInfo) (ok bool) { + if !fi.MetadataEquals(ofi) { + return false + } + if !fi.ReplicationInfoEquals(ofi) { + return false + } + if !fi.TransitionInfoEquals(ofi) { + return false + } + return fi.ModTime.Equal(ofi.ModTime) } // GetDataDir returns an expected dataDir given FileInfo diff --git a/cmd/storage-datatypes_gen.go b/cmd/storage-datatypes_gen.go index 5b9f4a3bc..317b2c1d0 100644 --- a/cmd/storage-datatypes_gen.go +++ b/cmd/storage-datatypes_gen.go @@ -550,8 +550,8 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) { err = msgp.WrapError(err) return } - if zb0001 != 25 { - err = msgp.ArrayError{Wanted: 25, Got: zb0001} + if zb0001 != 26 { + err = msgp.ArrayError{Wanted: 26, Got: zb0001} return } z.Volume, err = dc.ReadString() @@ -716,13 +716,18 @@ func (z *FileInfo) DecodeMsg(dc *msgp.Reader) (err error) { err = msgp.WrapError(err, "Idx") return } + z.DiskMTime, err = dc.ReadTime() + if err != nil { + err = msgp.WrapError(err, "DiskMTime") + return + } return } // EncodeMsg implements msgp.Encodable func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) { - // array header, size 25 - err = en.Append(0xdc, 0x0, 0x19) + // array header, size 26 + err = en.Append(0xdc, 0x0, 0x1a) if err != nil { return } @@ -870,14 +875,19 @@ func (z *FileInfo) EncodeMsg(en *msgp.Writer) (err error) { err = msgp.WrapError(err, "Idx") return } + err = en.WriteTime(z.DiskMTime) + if err != nil { + err = msgp.WrapError(err, "DiskMTime") + return + } return } // MarshalMsg implements msgp.Marshaler func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.Require(b, z.Msgsize()) - // array header, size 25 - o = append(o, 0xdc, 0x0, 0x19) + // array header, size 26 + o = append(o, 0xdc, 0x0, 0x1a) o = msgp.AppendString(o, z.Volume) o = msgp.AppendString(o, z.Name) o = msgp.AppendString(o, z.VersionID) @@ -922,6 +932,7 @@ func (z *FileInfo) MarshalMsg(b []byte) (o []byte, err error) { o = msgp.AppendTime(o, z.SuccessorModTime) o = msgp.AppendBool(o, z.Fresh) o = msgp.AppendInt(o, z.Idx) + o = msgp.AppendTime(o, z.DiskMTime) return } @@ -933,8 +944,8 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) { err = msgp.WrapError(err) return } - if zb0001 != 25 { - err = msgp.ArrayError{Wanted: 25, Got: zb0001} + if zb0001 != 26 { + err = msgp.ArrayError{Wanted: 26, Got: zb0001} return } z.Volume, bts, err = msgp.ReadStringBytes(bts) @@ -1099,6 +1110,11 @@ func (z *FileInfo) UnmarshalMsg(bts []byte) (o []byte, err error) { err = msgp.WrapError(err, "Idx") return } + z.DiskMTime, bts, err = msgp.ReadTimeBytes(bts) + if err != nil { + err = msgp.WrapError(err, "DiskMTime") + return + } o = bts return } @@ -1116,7 +1132,7 @@ func (z *FileInfo) Msgsize() (s int) { for za0003 := range z.Parts { s += z.Parts[za0003].Msgsize() } - s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + s += z.Erasure.Msgsize() + msgp.BoolSize + z.ReplicationState.Msgsize() + msgp.BytesPrefixSize + len(z.Data) + msgp.IntSize + msgp.TimeSize + msgp.BoolSize + msgp.IntSize + msgp.TimeSize return } diff --git a/cmd/storage-rest-common.go b/cmd/storage-rest-common.go 
index 2bd6e803d..737da66bf 100644
--- a/cmd/storage-rest-common.go
+++ b/cmd/storage-rest-common.go
@@ -18,7 +18,7 @@
 package cmd

 const (
-	storageRESTVersion       = "v42" // Added FreeVersions to FileInfoVersions
+	storageRESTVersion       = "v43" // Added DiskMTime field for FileInfo
 	storageRESTVersionPrefix = SlashSeparator + storageRESTVersion
 	storageRESTPrefix        = minioReservedBucketPath + "/storage"
 )
diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go
index 592f7d58e..4832d7bf9 100644
--- a/cmd/xl-storage.go
+++ b/cmd/xl-storage.go
@@ -375,32 +375,39 @@ func (s *xlStorage) Healing() *healingTracker {
 	return &h
 }

-func (s *xlStorage) readMetadata(ctx context.Context, itemPath string) ([]byte, error) {
+// readMetadataWithDMTime reads the metadata and returns the on-disk mtime of xl.meta
+func (s *xlStorage) readMetadataWithDMTime(ctx context.Context, itemPath string) ([]byte, time.Time, error) {
 	if contextCanceled(ctx) {
-		return nil, ctx.Err()
+		return nil, time.Time{}, ctx.Err()
 	}

 	if err := checkPathLength(itemPath); err != nil {
-		return nil, err
+		return nil, time.Time{}, err
 	}

 	f, err := OpenFile(itemPath, readMode, 0)
 	if err != nil {
-		return nil, err
+		return nil, time.Time{}, err
 	}
 	defer f.Close()
 	stat, err := f.Stat()
 	if err != nil {
-		return nil, err
+		return nil, time.Time{}, err
 	}
 	if stat.IsDir() {
-		return nil, &os.PathError{
+		return nil, time.Time{}, &os.PathError{
 			Op:   "open",
 			Path: itemPath,
 			Err:  syscall.EISDIR,
 		}
 	}
-	return readXLMetaNoData(f, stat.Size())
+	buf, err := readXLMetaNoData(f, stat.Size())
+	return buf, stat.ModTime().UTC(), err
+}
+
+func (s *xlStorage) readMetadata(ctx context.Context, itemPath string) ([]byte, error) {
+	buf, _, err := s.readMetadataWithDMTime(ctx, itemPath)
+	return buf, err
 }

 func (s *xlStorage) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry) (dataUsageCache, error) {
@@ -1214,11 +1221,18 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str
 	if err != nil {
 		return fi, err
 	}
+	// Validate file path length, before reading.
+	filePath := pathJoin(volumeDir, path)
+	if err = checkPathLength(filePath); err != nil {
+		return fi, err
+	}

+	var buf []byte
+	var dmTime time.Time
 	if readData {
-		buf, err = s.ReadAll(ctx, volume, pathJoin(path, xlStorageFormatFile))
+		buf, dmTime, err = s.readAllData(ctx, volumeDir, pathJoin(filePath, xlStorageFormatFile))
 	} else {
-		buf, err = s.readMetadata(ctx, pathJoin(volumeDir, path, xlStorageFormatFile))
+		buf, dmTime, err = s.readMetadataWithDMTime(ctx, pathJoin(filePath, xlStorageFormatFile))
 		if err != nil {
 			if osIsNotExist(err) {
 				if aerr := Access(volumeDir); aerr != nil && osIsNotExist(aerr) {
@@ -1231,7 +1245,7 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str

 	if err != nil {
 		if err == errFileNotFound {
-			buf, err = s.ReadAll(ctx, volume, pathJoin(path, xlStorageFormatFileV1))
+			buf, dmTime, err = s.readAllData(ctx, volumeDir, pathJoin(filePath, xlStorageFormatFileV1))
 			if err != nil {
 				if err == errFileNotFound {
 					if versionID != "" {
@@ -1257,6 +1271,7 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str
 	if err != nil {
 		return fi, err
 	}
+	fi.DiskMTime = dmTime

 	if len(fi.Data) == 0 {
 		// We did not read inline data, so we have no references.
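The xl-storage changes above all follow one pattern: whenever xl.meta is read, the file's mtime is captured from the same handle and travels back with the payload, eventually landing in FileInfo.DiskMTime. A minimal sketch of that pattern, illustrative only and not part of this patch (the real readAllData additionally handles O_DIRECT, directory checks, and storage-specific error mapping; "xl.meta" below is just a placeholder path):

package main

import (
	"fmt"
	"io"
	"os"
	"time"
)

// readAllWithMTime reads a file and captures its mtime from the same
// open handle, so the payload and the timestamp describe one state.
func readAllWithMTime(path string) ([]byte, time.Time, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, time.Time{}, err
	}
	defer f.Close()

	st, err := f.Stat()
	if err != nil {
		return nil, time.Time{}, err
	}

	buf, err := io.ReadAll(f)
	// Normalize to UTC, as the patch does, so mtimes reported by
	// different servers compare cleanly.
	return buf, st.ModTime().UTC(), err
}

func main() {
	buf, mtime, err := readAllWithMTime("xl.meta") // hypothetical path
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Printf("read %d bytes, on-disk mtime %s\n", len(buf), mtime)
}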
@@ -1299,7 +1314,7 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str len(fi.Parts) == 1 { partPath := fmt.Sprintf("part.%d", fi.Parts[0].Number) dataPath := pathJoin(volumeDir, path, fi.DataDir, partPath) - fi.Data, err = s.readAllData(volumeDir, dataPath) + fi.Data, _, err = s.readAllData(ctx, volumeDir, dataPath) if err != nil { return FileInfo{}, err } @@ -1309,38 +1324,42 @@ func (s *xlStorage) ReadVersion(ctx context.Context, volume, path, versionID str return fi, nil } -func (s *xlStorage) readAllData(volumeDir string, filePath string) (buf []byte, err error) { +func (s *xlStorage) readAllData(ctx context.Context, volumeDir string, filePath string) (buf []byte, dmTime time.Time, err error) { + if contextCanceled(ctx) { + return nil, time.Time{}, ctx.Err() + } + f, err := OpenFileDirectIO(filePath, readMode, 0666) if err != nil { if osIsNotExist(err) { // Check if the object doesn't exist because its bucket // is missing in order to return the correct error. if err = Access(volumeDir); err != nil && osIsNotExist(err) { - return nil, errVolumeNotFound + return nil, dmTime, errVolumeNotFound } - return nil, errFileNotFound + return nil, dmTime, errFileNotFound } else if osIsPermission(err) { - return nil, errFileAccessDenied + return nil, dmTime, errFileAccessDenied } else if isSysErrNotDir(err) || isSysErrIsDir(err) { - return nil, errFileNotFound + return nil, dmTime, errFileNotFound } else if isSysErrHandleInvalid(err) { // This case is special and needs to be handled for windows. - return nil, errFileNotFound + return nil, dmTime, errFileNotFound } else if isSysErrIO(err) { - return nil, errFaultyDisk + return nil, dmTime, errFaultyDisk } else if isSysErrTooManyFiles(err) { - return nil, errTooManyOpenFiles + return nil, dmTime, errTooManyOpenFiles } else if isSysErrInvalidArg(err) { st, _ := Lstat(filePath) if st != nil && st.IsDir() { // Linux returns InvalidArg for directory O_DIRECT // we need to keep this fallback code to return correct // errors upwards. - return nil, errFileNotFound + return nil, dmTime, errFileNotFound } - return nil, errUnsupportedDisk + return nil, dmTime, errUnsupportedDisk } - return nil, err + return nil, dmTime, err } r := &xioutil.ODirectReader{ File: f, @@ -1352,10 +1371,10 @@ func (s *xlStorage) readAllData(volumeDir string, filePath string) (buf []byte, stat, err := f.Stat() if err != nil { buf, err = ioutil.ReadAll(r) - return buf, osErrToFileErr(err) + return buf, dmTime, osErrToFileErr(err) } if stat.IsDir() { - return nil, errFileNotFound + return nil, dmTime, errFileNotFound } // Read into appropriate buffer. @@ -1369,7 +1388,7 @@ func (s *xlStorage) readAllData(volumeDir string, filePath string) (buf []byte, // Read file... _, err = io.ReadFull(r, buf) - return buf, osErrToFileErr(err) + return buf, stat.ModTime().UTC(), osErrToFileErr(err) } // ReadAll reads from r until an error or EOF and returns the data it read. @@ -1390,7 +1409,8 @@ func (s *xlStorage) ReadAll(ctx context.Context, volume string, path string) (bu return nil, err } - return s.readAllData(volumeDir, filePath) + buf, _, err = s.readAllData(ctx, volumeDir, filePath) + return buf, err } // ReadFile reads exactly len(buf) bytes into buf. 
It returns the diff --git a/docs/bucket/replication/setup_3site_replication.sh b/docs/bucket/replication/setup_3site_replication.sh index acf15c93e..5b0b801f2 100755 --- a/docs/bucket/replication/setup_3site_replication.sh +++ b/docs/bucket/replication/setup_3site_replication.sh @@ -240,18 +240,18 @@ head -c 221227088 200M sleep 10 echo "Verifying ETag for all objects" -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9001/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9002/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9003/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9004/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9005/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9006/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9001/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9002/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9003/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9004/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9005/ -bucket bucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9006/ -bucket bucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9001/ -bucket olockbucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9002/ -bucket olockbucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9003/ -bucket olockbucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9004/ -bucket olockbucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9005/ -bucket olockbucket -./s3-check-md5 -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9006/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9001/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9002/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9003/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9004/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9005/ -bucket olockbucket +./s3-check-md5 -versions -access-key minio -secret-key minio123 -endpoint http://127.0.0.1:9006/ -bucket olockbucket catch diff --git a/docs/debugging/s3-check-md5/main.go b/docs/debugging/s3-check-md5/main.go index e1a7849e9..30bdbaee8 100644 --- a/docs/debugging/s3-check-md5/main.go +++ b/docs/debugging/s3-check-md5/main.go @@ -25,6 +25,7 @@ import ( "io" "log" "net/url" + "os" "strconv" "strings" @@ -35,6 +36,8 @@ import ( var ( endpoint, accessKey, secretKey string bucket, prefix string + debug bool + versions bool ) // getMD5Sum returns MD5 sum of given data. 
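Background for the checker changes below: an S3-style multipart ETag is the MD5 of the concatenated per-part MD5 digests, suffixed with "-N" where N is the number of parts, and that is the value the tool recomputes and compares against the listed ETag. A short illustrative sketch, not part of this patch:

package main

import (
	"crypto/md5"
	"fmt"
)

// multipartETag derives an S3-style multipart ETag from per-part MD5s:
// the MD5 of the concatenated digests, suffixed with the part count.
func multipartETag(partsMD5Sum [][]byte) string {
	var all []byte
	for _, sum := range partsMD5Sum {
		all = append(all, sum...)
	}
	return fmt.Sprintf("%x-%d", md5.Sum(all), len(partsMD5Sum))
}

func main() {
	part1 := md5.Sum([]byte("part one data"))
	part2 := md5.Sum([]byte("part two data"))
	fmt.Println(multipartETag([][]byte{part1[:], part2[:]}))
}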
@@ -50,6 +53,8 @@ func main() {
 	flag.StringVar(&secretKey, "secret-key", "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG", "S3 Secret Key")
 	flag.StringVar(&bucket, "bucket", "", "Select a specific bucket")
 	flag.StringVar(&prefix, "prefix", "", "Select a prefix")
+	flag.BoolVar(&debug, "debug", false, "Prints HTTP network calls to S3 endpoint")
+	flag.BoolVar(&versions, "versions", false, "Verify all versions")
 	flag.Parse()

 	if endpoint == "" {
@@ -81,7 +86,9 @@ func main() {
 		log.Fatalln()
 	}

-	// s3Client.TraceOn(os.Stderr)
+	if debug {
+		s3Client.TraceOn(os.Stderr)
+	}

 	var buckets []string
 	if bucket != "" {
@@ -100,7 +107,8 @@ func main() {
 		opts := minio.ListObjectsOptions{
 			Recursive:    true,
 			Prefix:       prefix,
-			WithVersions: true,
+			WithVersions: versions,
+			WithMetadata: true,
 		}

 		// List all objects from a bucket-name with a matching prefix.
@@ -110,6 +118,15 @@ func main() {
 				continue
 			}
 			if object.IsDeleteMarker {
+				log.Println("DELETE marker skipping object:", object.Key)
+				continue
+			}
+			if _, ok := object.UserMetadata["X-Amz-Server-Side-Encryption-Customer-Algorithm"]; ok {
+				log.Println("Objects encrypted with SSE-C do not have md5sum as ETag:", object.Key)
+				continue
+			}
+			if v, ok := object.UserMetadata["X-Amz-Server-Side-Encryption"]; ok && v == "aws:kms" {
+				log.Println("Objects encrypted with SSE-KMS do not have md5sum as ETag:", object.Key)
 				continue
 			}
 			parts := 1
@@ -132,10 +149,13 @@ func main() {
 		}

 		var partsMD5Sum [][]byte
-		for p := 1; p <= parts; p++ {
-			obj, err := s3Client.GetObject(context.Background(), bucket, object.Key,
-				minio.GetObjectOptions{VersionID: object.VersionID, PartNumber: p})
+		for p := 1; p <= parts; p++ {
+			opts := minio.GetObjectOptions{
+				VersionID:  object.VersionID,
+				PartNumber: p,
+			}
+			obj, err := s3Client.GetObject(context.Background(), bucket, object.Key, opts)
 			if err != nil {
 				log.Println("GET", bucket, object.Key, object.VersionID, "=>", err)
 				continue
 			}
@@ -149,7 +169,6 @@ func main() {
 		}

 		corrupted := false
-
 		if !multipart {
 			md5sum := fmt.Sprintf("%x", partsMD5Sum[0])
 			if md5sum != object.ETag {
@@ -169,7 +188,7 @@ func main() {
 		if corrupted {
 			log.Println("CORRUPTED object:", bucket, object.Key, object.VersionID)
 		} else {
-			log.Println("INTACT", bucket, object.Key, object.VersionID)
+			log.Println("INTACT object:", bucket, object.Key, object.VersionID)
 		}
 	}
 }
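Taken together, the read-path heuristic introduced by this patch works as follows: collect DiskMTime from every disk's xl.meta, look for a maximally occurring mtime backed by at least a data-blocks quorum (pickValidDiskTimeWithQuorum), and treat any disk whose mtime strays beyond the 5-second delta as outdated, skipping it for reads and letting heal rewrite it later. Below is a reduced, self-contained sketch of that decision; skipOutdated is a hypothetical name, and the real code also groups mtimes within a window and skips the check entirely once the data-shard-fix marker is set:

package main

import (
	"fmt"
	"time"
)

const acceptableDelta = 5 * time.Second

// skipOutdated flags each disk whose xl.meta mtime strays too far from
// the mtime agreed on by at least `quorum` disks.
func skipOutdated(diskMTimes []time.Time, quorum int) []bool {
	counts := make(map[time.Time]int)
	for _, t := range diskMTimes {
		counts[t]++
	}
	var common time.Time
	var maxima int
	for t, n := range counts {
		if n > maxima {
			common, maxima = t, n
		}
	}
	skip := make([]bool, len(diskMTimes))
	if maxima < quorum {
		return skip // no quorum mtime found; skip nothing
	}
	for i, t := range diskMTimes {
		d := common.Sub(t)
		if d < 0 {
			d = -d
		}
		skip[i] = d >= acceptableDelta
	}
	return skip
}

func main() {
	now := time.Now().UTC()
	mtimes := []time.Time{now, now, now, now.Add(2 * time.Minute)}
	fmt.Println(skipOutdated(mtimes, 3)) // [false false false true]
}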