Add support converting non-inlined to inlined (#15444)

This is a feature to allow for inode compaction on
large clusters that use a lot of small files spread
across a large heirarchy.
This commit is contained in:
Harshavardhana 2022-08-02 23:10:22 -07:00 committed by GitHub
parent e956369c4e
commit a6e0ec4e6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 286 additions and 19 deletions

View File

@ -49,3 +49,4 @@ jobs:
make verify-healing make verify-healing
make verify-healing-inconsistent-versions make verify-healing-inconsistent-versions
make verify-healing-with-root-disks make verify-healing-with-root-disks
make verify-healing-with-rewrite

View File

@ -15,7 +15,7 @@ checks: ## check dependencies
@(env bash $(PWD)/buildscripts/checkdeps.sh) @(env bash $(PWD)/buildscripts/checkdeps.sh)
help: ## print this help help: ## print this help
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' @grep -E '^[a-zA-Z_-]+:.*?## .*$$' Makefile | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-40s\033[0m %s\n", $$1, $$2}'
getdeps: ## fetch necessary dependencies getdeps: ## fetch necessary dependencies
@mkdir -p ${GOPATH}/bin @mkdir -p ${GOPATH}/bin
@ -90,11 +90,16 @@ verify-healing: ## verify healing and replacing disks with minio binary
@(env bash $(PWD)/buildscripts/verify-healing.sh) @(env bash $(PWD)/buildscripts/verify-healing.sh)
@(env bash $(PWD)/buildscripts/unaligned-healing.sh) @(env bash $(PWD)/buildscripts/unaligned-healing.sh)
verify-healing-with-root-disks: verify-healing-with-root-disks: ## verify healing root disks
@echo "Verify healing with root disks" @echo "Verify healing with root disks"
@GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null
@(env bash $(PWD)/buildscripts/verify-healing-with-root-disks.sh) @(env bash $(PWD)/buildscripts/verify-healing-with-root-disks.sh)
verify-healing-with-rewrite: ## verify healing to rewrite old xl.meta -> new xl.meta
@echo "Verify healing with rewrite"
@GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null
@(env bash $(PWD)/buildscripts/rewrite-old-new.sh)
verify-healing-inconsistent-versions: ## verify resolving inconsistent versions verify-healing-inconsistent-versions: ## verify resolving inconsistent versions
@echo "Verify resolving inconsistent versions build with race" @echo "Verify resolving inconsistent versions build with race"
@GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null

View File

@ -0,0 +1,87 @@
//go:build ignore
// +build ignore
//
// MinIO Object Storage (c) 2022 MinIO, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package main
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"time"
"github.com/minio/madmin-go"
)
func main() {
// Note: YOUR-ACCESSKEYID, YOUR-SECRETACCESSKEY are
// dummy values, please replace them with original values.
// API requests are secure (HTTPS) if secure=true and insecure (HTTP) otherwise.
// New returns an MinIO Admin client object.
madmClnt, err := madmin.New(os.Args[1], os.Args[2], os.Args[3], false)
if err != nil {
log.Fatalln(err)
}
opts := madmin.HealOpts{
Recursive: true, // recursively heal all objects at 'prefix'
Remove: true, // remove content that has lost quorum and not recoverable
Recreate: true, // rewrite all old non-inlined xl.meta to new xl.meta
ScanMode: madmin.HealNormalScan, // by default do not do 'deep' scanning
}
start, _, err := madmClnt.Heal(context.Background(), "healing-rewrite-bucket", "", opts, "", false, false)
if err != nil {
log.Fatalln(err)
}
fmt.Println("Healstart sequence ===")
enc := json.NewEncoder(os.Stdout)
if err = enc.Encode(&start); err != nil {
log.Fatalln(err)
}
fmt.Println()
for {
_, status, err := madmClnt.Heal(context.Background(), "healing-rewrite-bucket", "", opts, start.ClientToken, false, false)
if status.Summary == "finished" {
fmt.Println("Healstatus on items ===")
for _, item := range status.Items {
if err = enc.Encode(&item); err != nil {
log.Fatalln(err)
}
}
break
}
if status.Summary == "stopped" {
fmt.Println("Healstatus on items ===")
fmt.Println("Heal failed with", status.FailureDetail)
break
}
for _, item := range status.Items {
if err = enc.Encode(&item); err != nil {
log.Fatalln(err)
}
}
time.Sleep(time.Second)
}
}

151
buildscripts/rewrite-old-new.sh Executable file
View File

@ -0,0 +1,151 @@
#!/bin/bash -e
set -E
set -o pipefail
set -x
WORK_DIR="$PWD/.verify-$RANDOM"
MINIO_CONFIG_DIR="$WORK_DIR/.minio"
MINIO_OLD=( "$PWD/minio.RELEASE.2020-10-28T08-16-50Z" --config-dir "$MINIO_CONFIG_DIR" server )
MINIO=( "$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server )
if [ ! -x "$PWD/minio" ]; then
echo "minio executable binary not found in current directory"
exit 1
fi
function download_old_release() {
if [ ! -f minio.RELEASE.2020-10-28T08-16-50Z ]; then
curl --silent -O https://dl.minio.io/server/minio/release/linux-amd64/archive/minio.RELEASE.2020-10-28T08-16-50Z
chmod a+x minio.RELEASE.2020-10-28T08-16-50Z
fi
}
function verify_rewrite() {
start_port=$1
export MINIO_ACCESS_KEY=minio
export MINIO_SECRET_KEY=minio123
export MC_HOST_minio="http://minio:minio123@127.0.0.1:${start_port}/"
unset MINIO_KMS_AUTO_ENCRYPTION # do not auto-encrypt objects
export MINIO_CI_CD=1
MC_BUILD_DIR="mc-$RANDOM"
if ! git clone --quiet https://github.com/minio/mc "$MC_BUILD_DIR"; then
echo "failed to download https://github.com/minio/mc"
purge "${MC_BUILD_DIR}"
exit 1
fi
(cd "${MC_BUILD_DIR}" && go build -o "$WORK_DIR/mc")
# remove mc source.
purge "${MC_BUILD_DIR}"
"${MINIO_OLD[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 &
pid=$!
disown $pid
sleep 10
if ! ps -p ${pid} 1>&2 >/dev/null; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
"${WORK_DIR}/mc" mb minio/healing-rewrite-bucket --quiet --with-lock
"${WORK_DIR}/mc" cp \
buildscripts/verify-build.sh \
minio/healing-rewrite-bucket/ \
--disable-multipart --quiet
"${WORK_DIR}/mc" cp \
buildscripts/verify-build.sh \
minio/healing-rewrite-bucket/ \
--disable-multipart --quiet
"${WORK_DIR}/mc" cp \
buildscripts/verify-build.sh \
minio/healing-rewrite-bucket/ \
--disable-multipart --quiet
kill ${pid}
sleep 3
"${MINIO[@]}" --address ":$start_port" "${WORK_DIR}/xl{1...16}" > "${WORK_DIR}/server1.log" 2>&1 &
pid=$!
disown $pid
sleep 10
if ! ps -p ${pid} 1>&2 >/dev/null; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
purge "$WORK_DIR"
exit 1
fi
go build ./docs/debugging/s3-check-md5/
if ! ./s3-check-md5 \
-debug \
-versions \
-access-key minio \
-secret-key minio123 \
-endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
mkdir -p inspects
(cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-rewrite-bucket/verify-build.sh/**)
"${WORK_DIR}/mc" mb play/inspects
"${WORK_DIR}/mc" mirror inspects play/inspects
purge "$WORK_DIR"
exit 1
fi
go run ./buildscripts/heal-manual.go "127.0.0.1:${start_port}" "minio" "minio123"
sleep 1
if ! ./s3-check-md5 \
-debug \
-versions \
-access-key minio \
-secret-key minio123 \
-endpoint http://127.0.0.1:${start_port}/ 2>&1 | grep INTACT; then
echo "server1 log:"
cat "${WORK_DIR}/server1.log"
echo "FAILED"
mkdir -p inspects
(cd inspects; "${WORK_DIR}/mc" admin inspect minio/healing-rewrite-bucket/verify-build.sh/**)
"${WORK_DIR}/mc" mb play/inspects
"${WORK_DIR}/mc" mirror inspects play/inspects
purge "$WORK_DIR"
exit 1
fi
kill ${pid}
}
function main() {
download_old_release
start_port=$(shuf -i 10000-65000 -n 1)
verify_rewrite ${start_port}
}
function purge()
{
rm -rf "$1"
}
( main "$@" )
rv=$?
purge "$WORK_DIR"
exit "$rv"

View File

@ -243,7 +243,7 @@ func listAllBuckets(ctx context.Context, storageDisks []StorageAPI, healBuckets
// Only heal on disks where we are sure that healing is needed. We can expand // Only heal on disks where we are sure that healing is needed. We can expand
// this list as and when we figure out more errors can be added to this list safely. // this list as and when we figure out more errors can be added to this list safely.
func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo) bool { func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta FileInfo, doinline bool) bool {
switch { switch {
case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound): case errors.Is(erErr, errFileNotFound) || errors.Is(erErr, errFileVersionNotFound):
return true return true
@ -256,6 +256,10 @@ func shouldHealObjectOnDisk(erErr, dataErr error, meta FileInfo, latestMeta File
// always check first. // always check first.
return true return true
} }
if doinline {
// convert small files to 'inline'
return true
}
if !meta.Deleted && !meta.IsRemote() { if !meta.Deleted && !meta.IsRemote() {
// If xl.meta was read fine but there may be problem with the part.N files. // If xl.meta was read fine but there may be problem with the part.N files.
if IsErr(dataErr, []error{ if IsErr(dataErr, []error{
@ -342,6 +346,26 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
availableDisks, dataErrs, diskMTime := disksWithAllParts(ctx, onlineDisks, partsMetadata, availableDisks, dataErrs, diskMTime := disksWithAllParts(ctx, onlineDisks, partsMetadata,
errs, latestMeta, bucket, object, scanMode) errs, latestMeta, bucket, object, scanMode)
var erasure Erasure
var recreate bool
if !latestMeta.Deleted && !latestMeta.IsRemote() {
// Initialize erasure coding
erasure, err = NewErasure(ctx, latestMeta.Erasure.DataBlocks,
latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
if err != nil {
return result, err
}
// Is only 'true' if the opts.Recreate is true and
// the object shardSize < smallFileThreshold do not
// set this to 'true' arbitrarily and must be only
// 'true' with caller ask.
recreate = (opts.Recreate &&
!latestMeta.InlineData() &&
len(latestMeta.Parts) == 1 &&
erasure.ShardFileSize(latestMeta.Parts[0].ActualSize) < smallFileThreshold)
}
// Loop to find number of disks with valid data, per-drive // Loop to find number of disks with valid data, per-drive
// data state and a list of outdated disks on which data needs // data state and a list of outdated disks on which data needs
// to be healed. // to be healed.
@ -368,7 +392,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
driveState = madmin.DriveStateCorrupt driveState = madmin.DriveStateCorrupt
} }
if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta) { if shouldHealObjectOnDisk(errs[i], dataErrs[i], partsMetadata[i], latestMeta, recreate) {
outDatedDisks[i] = storageDisks[i] outDatedDisks[i] = storageDisks[i]
disksToHealCount++ disksToHealCount++
result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{ result.Before.Drives = append(result.Before.Drives, madmin.HealDriveInfo{
@ -422,7 +446,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
return result, nil return result, nil
} }
if !latestMeta.XLV1 && !latestMeta.Deleted && disksToHealCount > latestMeta.Erasure.ParityBlocks { if !latestMeta.XLV1 && !latestMeta.Deleted && !recreate && disksToHealCount > latestMeta.Erasure.ParityBlocks {
// When disk to heal count is greater than parity blocks we should simply error out. // When disk to heal count is greater than parity blocks we should simply error out.
err := fmt.Errorf("more disks are expected to heal than parity, returned errors: %v (dataErrs %v) -> %s/%s(%s)", errs, dataErrs, bucket, object, versionID) err := fmt.Errorf("more disks are expected to heal than parity, returned errors: %v (dataErrs %v) -> %s/%s(%s)", errs, dataErrs, bucket, object, versionID)
logger.LogIf(ctx, err) logger.LogIf(ctx, err)
@ -478,18 +502,9 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
} }
var inlineBuffers []*bytes.Buffer var inlineBuffers []*bytes.Buffer
if latestMeta.InlineData() {
inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks))
}
if !latestMeta.Deleted && !latestMeta.IsRemote() { if !latestMeta.Deleted && !latestMeta.IsRemote() {
// Heal each part. erasureHealFile() will write the healed if latestMeta.InlineData() || recreate {
// part to .minio/tmp/uuid/ which needs to be renamed later to inlineBuffers = make([]*bytes.Buffer, len(outDatedDisks))
// the final location.
erasure, err := NewErasure(ctx, latestMeta.Erasure.DataBlocks,
latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
if err != nil {
return result, err
} }
erasureInfo := latestMeta.Erasure erasureInfo := latestMeta.Erasure
@ -525,6 +540,10 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize()) tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
} }
} }
// Heal each part. erasure.Heal() will write the healed
// part to .minio/tmp/uuid/ which needs to be renamed
// later to the final location.
err = erasure.Heal(ctx, writers, readers, partSize) err = erasure.Heal(ctx, writers, readers, partSize)
closeBitrotReaders(readers) closeBitrotReaders(readers)
closeBitrotWriters(writers) closeBitrotWriters(writers)
@ -556,6 +575,7 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
}) })
if len(inlineBuffers) > 0 && inlineBuffers[i] != nil { if len(inlineBuffers) > 0 && inlineBuffers[i] != nil {
partsMetadata[i].Data = inlineBuffers[i].Bytes() partsMetadata[i].Data = inlineBuffers[i].Bytes()
partsMetadata[i].SetInlineData()
} else { } else {
partsMetadata[i].Data = nil partsMetadata[i].Data = nil
} }
@ -587,8 +607,9 @@ func (er erasureObjects) healObject(ctx context.Context, bucket string, object s
return result, err return result, err
} }
// Remove any remaining parts from outdated disks from before transition. // - Remove any parts from healed disks after its been inlined.
if partsMetadata[i].IsRemote() { // - Remove any remaining parts from outdated disks from before transition.
if recreate || partsMetadata[i].IsRemote() {
rmDataDir := partsMetadata[i].DataDir rmDataDir := partsMetadata[i].DataDir
disk.DeleteVol(ctx, pathJoin(bucket, encodeDirObject(object), rmDataDir), true) disk.DeleteVol(ctx, pathJoin(bucket, encodeDirObject(object), rmDataDir), true)
} }

View File

@ -1769,7 +1769,9 @@ func (z *erasureServerPools) HealBucket(ctx context.Context, bucket string, opts
} }
// Attempt heal on the bucket metadata, ignore any failures // Attempt heal on the bucket metadata, ignore any failures
defer z.HealObject(ctx, minioMetaBucket, pathJoin(bucketMetaPrefix, bucket, bucketMetadataFile), "", opts) hopts := opts
hopts.Recreate = false
defer z.HealObject(ctx, minioMetaBucket, pathJoin(bucketMetaPrefix, bucket, bucketMetadataFile), "", hopts)
for _, pool := range z.serverPools { for _, pool := range z.serverPools {
result, err := pool.HealBucket(ctx, bucket, opts) result, err := pool.HealBucket(ctx, bucket, opts)