From a6e0ec4e6f8ccb95df3a10c89ff6a88f13ea295e Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Tue, 2 Aug 2022 23:10:22 -0700 Subject: [PATCH] Add support converting non-inlined to inlined (#15444) This is a feature to allow for inode compaction on large clusters that use a lot of small files spread across a large hierarchy. --- .github/workflows/go-healing.yml | 1 + Makefile | 9 +- buildscripts/heal-manual.go | 87 ++++++++++++++++++ buildscripts/rewrite-old-new.sh | 151 +++++++++++++++++++++++++++++++ cmd/erasure-healing.go | 53 +++++++---- cmd/erasure-server-pool.go | 4 +- 6 files changed, 286 insertions(+), 19 deletions(-) create mode 100644 buildscripts/heal-manual.go create mode 100755 buildscripts/rewrite-old-new.sh diff --git a/.github/workflows/go-healing.yml b/.github/workflows/go-healing.yml index 1ae485bd1..2e9984d65 100644 --- a/.github/workflows/go-healing.yml +++ b/.github/workflows/go-healing.yml @@ -49,3 +49,4 @@ jobs: make verify-healing make verify-healing-inconsistent-versions make verify-healing-with-root-disks + make verify-healing-with-rewrite diff --git a/Makefile b/Makefile index 4fa66a8cc..c7578d1a5 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ checks: ## check dependencies @(env bash $(PWD)/buildscripts/checkdeps.sh) help: ## print this help - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-40s\033[0m %s\n", $$1, $$2}' getdeps: ## fetch necessary dependencies @mkdir -p ${GOPATH}/bin @@ -90,11 +90,16 @@ verify-healing: ## verify healing and replacing disks with minio binary @(env bash $(PWD)/buildscripts/verify-healing.sh) @(env bash $(PWD)/buildscripts/unaligned-healing.sh) -verify-healing-with-root-disks: +verify-healing-with-root-disks: ## verify healing root disks @echo "Verify healing with root disks" @GORACE=history_size=7 CGO_ENABLED=1 go build -race 
-tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/verify-healing-with-root-disks.sh) +verify-healing-with-rewrite: ## verify healing to rewrite old xl.meta -> new xl.meta + @echo "Verify healing with rewrite" + @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null + @(env bash $(PWD)/buildscripts/rewrite-old-new.sh) + verify-healing-inconsistent-versions: ## verify resolving inconsistent versions @echo "Verify resolving inconsistent versions build with race" @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null diff --git a/buildscripts/heal-manual.go b/buildscripts/heal-manual.go new file mode 100644 index 000000000..a95bb5f85 --- /dev/null +++ b/buildscripts/heal-manual.go @@ -0,0 +1,87 @@ +//go:build ignore +// +build ignore + +// +// MinIO Object Storage (c) 2022 MinIO, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "time" + + "github.com/minio/madmin-go" +) + +func main() { + // Usage: go run heal-manual.go ENDPOINT ACCESS-KEY SECRET-KEY; the three + // values are read from os.Args[1..3] without validation. + + // API requests are secure (HTTPS) if secure=true and insecure (HTTP) otherwise. + // New returns an MinIO Admin client object. 
+ madmClnt, err := madmin.New(os.Args[1], os.Args[2], os.Args[3], false) + if err != nil { + log.Fatalln(err) + } + + opts := madmin.HealOpts{ + Recursive: true, // recursively heal all objects at 'prefix' + Remove: true, // remove content that has lost quorum and not recoverable + Recreate: true, // rewrite all old non-inlined xl.meta to new xl.meta + ScanMode: madmin.HealNormalScan, // by default do not do 'deep' scanning + } + + start, _, err := madmClnt.Heal(context.Background(), "healing-rewrite-bucket", "", opts, "", false, false) + if err != nil { + log.Fatalln(err) + } + fmt.Println("Healstart sequence ===") + enc := json.NewEncoder(os.Stdout) + if err = enc.Encode(&start); err != nil { + log.Fatalln(err) + } + + fmt.Println() + for { + _, status, err := madmClnt.Heal(context.Background(), "healing-rewrite-bucket", "", opts, start.ClientToken, false, false) + if err != nil { + // Stop on a failed poll rather than inspecting status and retrying blindly. + log.Fatalln(err) + } + if status.Summary == "stopped" { + fmt.Println("Healstatus on items ===") + fmt.Println("Heal failed with", status.FailureDetail) + break + } + if status.Summary == "finished" { + fmt.Println("Healstatus on items ===") + } + for _, item := range status.Items { + if err = enc.Encode(&item); err != nil { + log.Fatalln(err) + } + } + if status.Summary == "finished" { + break + } + + time.Sleep(time.Second) + } +} diff --git a/buildscripts/rewrite-old-new.sh b/buildscripts/rewrite-old-new.sh new file mode 100755 index 000000000..6cc015519 --- /dev/null +++ b/buildscripts/rewrite-old-new.sh @@ -0,0 +1,151 @@ +#!/bin/bash -e + +set -E +set -o pipefail +set -x + +WORK_DIR="$PWD/.verify-$RANDOM" +MINIO_CONFIG_DIR="$WORK_DIR/.minio" +MINIO_OLD=( "$PWD/minio.RELEASE.2020-10-28T08-16-50Z" --config-dir "$MINIO_CONFIG_DIR" server ) +MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server ) + +if [ ! 
-x "$PWD/minio" ]; then + echo "minio executable binary not found in current directory" + exit 1 +fi + +function download_old_release() { + if [ ! -f minio.RELEASE.2020-10-28T08-16-50Z ]; then + curl --silent --show-error --fail -O https://dl.minio.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2020-10-28T08-16-50Z || exit 1 + chmod a+x minio.RELEASE.2020-10-28T08-16-50Z + fi +} + +function verify_rewrite() { + start_port=$1 + + export MINIO_ACCESS_KEY=minio + export MINIO_SECRET_KEY=minio123 + export MC_HOST_minio="http://minio:minio123@127.0.0.1:${start_port}/" + unset MINIO_KMS_AUTO_ENCRYPTION # do not auto-encrypt objects + export MINIO_CI_CD=1 + + MC_BUILD_DIR="mc-$RANDOM" + if ! git clone --quiet https://github.com/minio/mc "$MC_BUILD_DIR"; then + echo "failed to download https://github.com/minio/mc" + purge "${MC_BUILD_DIR}" + exit 1 + fi + + (cd "${MC_BUILD_DIR}" && go build -o "$WORK_DIR/mc") + + # remove mc source. + purge "${MC_BUILD_DIR}" + + "${MINIO_OLD[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 & + pid=$! + disown $pid + sleep 10 + + if ! ps -p ${pid} 1>&2 >/dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + "${WORK_DIR}/mc" mb minio/healing-rewrite-bucket --quiet --with-lock + "${WORK_DIR}/mc" cp \ + buildscripts/verify-build.sh \ + minio/healing-rewrite-bucket/ \ + --disable-multipart --quiet + + "${WORK_DIR}/mc" cp \ + buildscripts/verify-build.sh \ + minio/healing-rewrite-bucket/ \ + --disable-multipart --quiet + + "${WORK_DIR}/mc" cp \ + buildscripts/verify-build.sh \ + minio/healing-rewrite-bucket/ \ + --disable-multipart --quiet + + kill ${pid} + sleep 3 + + "${MINIO[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 & + pid=$! + disown $pid + sleep 10 + + if ! 
ps -p ${pid} 1>&2 >/dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + go build ./docs/debugging/s3-check-md5/ + if ! ./s3-check-md5 \ + -debug \ + -versions \ + -access-key minio \ + -secret-key minio123 \ + -endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + mkdir -p inspects + (cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-rewrite-bucket/verify-build.sh/**) + + "${WORK_DIR}/mc" mb play/inspects + "${WORK_DIR}/mc" mirror inspects play/inspects + + purge "$WORK_DIR" + exit 1 + fi + + go run ./buildscripts/heal-manual.go "127.0.0.1:${start_port}" "minio" "minio123" + sleep 1 + + if ! ./s3-check-md5 \ + -debug \ + -versions \ + -access-key minio \ + -secret-key minio123 \ + -endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then + echo "server1 log:" + cat "${WORK_DIR}/server1.log" + echo "FAILED" + mkdir -p inspects + (cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-rewrite-bucket/verify-build.sh/**) + + "${WORK_DIR}/mc" mb play/inspects + "${WORK_DIR}/mc" mirror inspects play/inspects + + purge "$WORK_DIR" + exit 1 + fi + + kill ${pid} +} + +function main() { + download_old_release + + start_port=$(shuf -i 10000-65000 -n 1) + + verify_rewrite ${start_port} +} + +function purge() +{ + rm -rf "$1" +} + +( main "$@" ) +rv=$? +purge "$WORK_DIR" +exit "$rv" diff --git a/cmd/erasure-healing.go b/cmd/erasure-healing.go index 43c307898..fcb07b28f 100644 --- a/cmd/erasure-healing.go +++ b/cmd/erasure-healing.go @@ -243,7 +243,7 @@ func listAllBuckets(ctx context.Context, storageDisks []StorageAPI, healBuckets // Only heal on disks where we are sure that healing is needed. We can expand // this list as and when we figure out more errors can be added to this list safely. 
-func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo) bool { +func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo, doinline bool) bool { switch { case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound): return true @@ -256,6 +256,10 @@ func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta File // always check first. return true } + if doinline { + // convert small files to 'inline' + return true + } if !meta.Deleted && !meta.IsRemote() { // If xl.meta was read fine but there may be problem with the part.N files. if IsErr(dataErr, []error{ @@ -342,6 +346,26 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s availableDisks, dataErrs, diskMTime := disksWithAllParts(ctx, onlineDisks, partsMetadata, errs, latestMeta, bucket, object, scanMode) + var erasure Erasure + var recreate bool + if !latestMeta.Deleted && !latestMeta.IsRemote() { + // Initialize erasure coding + erasure, err = NewErasure(ctx, latestMeta.Erasure.DataBlocks, + latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize) + if err != nil { + return result, err + } + + // Is only 'true' if the opts.Recreate is true and + // the object shardSize < smallFileThreshold do not + // set this to 'true' arbitrarily and must be only + // 'true' with caller ask. + recreate = (opts.Recreate && + !latestMeta.InlineData() && + len(latestMeta.Parts) == 1 && + erasure.ShardFileSize(latestMeta.Parts[0].ActualSize) < smallFileThreshold) + } + // Loop to find number of disks with valid data, per-drive // data state and a list of outdated disks on which data needs // to be healed. 
@@ -368,7 +392,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s driveState = madmin.DriveStateCorrupt } - if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta) { + if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta, recreate) { outDatedDisks[i] = storageDisks[i] disksToHealCount++ result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{ @@ -422,7 +446,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s return result, nil } - if !latestMeta.XLV1 && !latestMeta.Deleted && disksToHealCount > latestMeta.Erasure.ParityBlocks { + if !latestMeta.XLV1 && !latestMeta.Deleted && !recreate && disksToHealCount > latestMeta.Erasure.ParityBlocks { // When disk to heal count is greater than parity blocks we should simply error out. err := fmt.Errorf("more disks are expected to heal than parity, returned errors: %v (dataErrs %v) -> %s/%s(%s)", errs, dataErrs, bucket, object, versionID) logger.LogIf(ctx, err) @@ -478,18 +502,9 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s } var inlineBuffers []*bytes.Buffer - if latestMeta.InlineData() { - inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks)) - } - if !latestMeta.Deleted && !latestMeta.IsRemote() { - // Heal each part. erasureHealFile() will write the healed - // part to .minio/tmp/uuid/ which needs to be renamed later to - // the final location. - erasure, err := NewErasure(ctx, latestMeta.Erasure.DataBlocks, - latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize) - if err != nil { - return result, err + if latestMeta.InlineData() || recreate { + inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks)) } erasureInfo := latestMeta.Erasure @@ -525,6 +540,10 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize()) } } + + // Heal each part. 
erasure.Heal() will write the healed + // part to .minio/tmp/uuid/ which needs to be renamed + // later to the final location. err = erasure.Heal(ctx, writers, readers, partSize) closeBitrotReaders(readers) closeBitrotWriters(writers) @@ -556,6 +575,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s }) if len(inlineBuffers) > 0 && inlineBuffers[i] != nil { partsMetadata[i].Data = inlineBuffers[i].Bytes() + partsMetadata[i].SetInlineData() } else { partsMetadata[i].Data = nil } @@ -587,8 +607,9 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s return result, err } - // Remove any remaining parts from outdated disks from before transition. - if partsMetadata[i].IsRemote() { + // - Remove any parts from healed disks after its been inlined. + // - Remove any remaining parts from outdated disks from before transition. + if recreate || partsMetadata[i].IsRemote() { rmDataDir := partsMetadata[i].DataDir disk.DeleteVol(ctx, pathJoin(bucket, encodeDirObject(object), rmDataDir), true) } diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go index cdf21f23f..190806cca 100644 --- a/cmd/erasure-server-pool.go +++ b/cmd/erasure-server-pool.go @@ -1769,7 +1769,9 @@ func (z *erasureServerPools) HealBucket(ctx context.Context, bucket string, opts } // Attempt heal on the bucket metadata, ignore any failures - defer z.HealObject(ctx, minioMetaBucket, pathJoin(bucketMetaPrefix, bucket, bucketMetadataFile), "", opts) + hopts := opts + hopts.Recreate = false + defer z.HealObject(ctx, minioMetaBucket, pathJoin(bucketMetaPrefix, bucket, bucketMetadataFile), "", hopts) for _, pool := range z.serverPools { result, err := pool.HealBucket(ctx, bucket, opts)