From d1e775313d858e05b5d69aae74cc5e4427c9b3ae Mon Sep 17 00:00:00 2001
From: Poorna
Date: Thu, 16 Mar 2023 07:48:05 -0700
Subject: [PATCH] support decommissioning of tiered objects (#16751)

---
 cmd/erasure-object.go            | 43 ++++++++++++++++++++++++
 cmd/erasure-server-pool-decom.go | 47 +++++++++++++-------------
 cmd/erasure-server-pool.go       | 24 ++++++++++++++
 cmd/erasure-sets.go              |  6 ++++
 cmd/object-api-interface.go      |  1 +
 docs/distributed/decom.sh        | 57 ++++++++++++++++++++++++++++++++
 6 files changed, 155 insertions(+), 23 deletions(-)

diff --git a/cmd/erasure-object.go b/cmd/erasure-object.go
index 8b579b4aa..067ee2a5b 100644
--- a/cmd/erasure-object.go
+++ b/cmd/erasure-object.go
@@ -2089,3 +2089,46 @@ func (er erasureObjects) restoreTransitionedObject(ctx context.Context, bucket s
 	})
 	return setRestoreHeaderFn(oi, err)
 }
+
+// DecomTieredObject - moves tiered object to another pool during decommissioning.
+func (er erasureObjects) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error {
+	if opts.UserDefined == nil {
+		opts.UserDefined = make(map[string]string)
+	}
+	// overlay Erasure info for this set of disks
+	storageDisks := er.getDisks()
+	// Get parity and data drive count based on storage class metadata
+	parityDrives := globalStorageClass.GetParityForSC(opts.UserDefined[xhttp.AmzStorageClass])
+	if parityDrives < 0 {
+		parityDrives = er.defaultParityCount
+	}
+	dataDrives := len(storageDisks) - parityDrives
+
+	// we now know the number of blocks this object needs for data and parity.
+	// writeQuorum is dataBlocks + 1
+	writeQuorum := dataDrives
+	if dataDrives == parityDrives {
+		writeQuorum++
+	}
+
+	// Initialize parts metadata
+	partsMetadata := make([]FileInfo, len(storageDisks))
+
+	fi2 := newFileInfo(pathJoin(bucket, object), dataDrives, parityDrives)
+	fi.Erasure = fi2.Erasure
+	// Initialize erasure metadata.
+	for index := range partsMetadata {
+		partsMetadata[index] = fi
+		partsMetadata[index].Erasure.Index = index + 1
+	}
+
+	// Order disks according to erasure distribution
+	var onlineDisks []StorageAPI
+	onlineDisks, partsMetadata = shuffleDisksAndPartsMetadata(storageDisks, partsMetadata, fi)
+
+	if _, err := writeUniqueFileInfo(ctx, onlineDisks, bucket, object, partsMetadata, writeQuorum); err != nil {
+		return toObjectErr(err, bucket, object)
+	}
+
+	return nil
+}
diff --git a/cmd/erasure-server-pool-decom.go b/cmd/erasure-server-pool-decom.go
index 8689ad813..bbc0a3194 100644
--- a/cmd/erasure-server-pool-decom.go
+++ b/cmd/erasure-server-pool-decom.go
@@ -736,14 +736,16 @@ func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool
 		objInfo := fi.ToObjectInfo(bucket, object, versioned)
 
 		evt := evalActionFromLifecycle(ctx, *lc, lr, objInfo)
-		if evt.Action.Delete() {
+		switch {
+		case evt.Action.DeleteRestored(): // if restored copy has expired, delete it synchronously
+			applyExpiryOnTransitionedObject(ctx, z, objInfo, evt.Action.DeleteRestored())
+			return false
+		case evt.Action.Delete():
 			globalExpiryState.enqueueByDays(objInfo, evt.Action.DeleteRestored(), evt.Action.DeleteVersioned())
-			if !evt.Action.DeleteRestored() {
-				return true
-			}
+			return true
+		default:
+			return false
 		}
-
-		return false
 	}
 
 	decommissionEntry := func(entry metaCacheEntry) {
@@ -767,12 +769,6 @@ func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool
 
 			var decommissionedCount int
 			for _, version := range fivs.Versions {
-				// TODO: Skip transitioned objects for now.
-				if version.IsRemote() {
-					logger.LogIf(ctx, fmt.Errorf("found %s/%s transitioned object, transitioned object won't be decommissioned", bi.Name, version.Name))
-					continue
-				}
-
 				// Apply lifecycle rules on the objects that are expired.
 				if filterLifecycle(bi.Name, version.Name, version) {
 					logger.LogIf(ctx, fmt.Errorf("found %s/%s (%s) expired object based on ILM rules, skipping and scheduled for deletion", bi.Name, version.Name, version.VersionID))
@@ -821,6 +817,22 @@ func (z *erasureServerPools) decommissionPool(ctx context.Context, idx int, pool
 			var failure, ignore bool
 			// gr.Close() is ensured by decommissionObject().
 			for try := 0; try < 3; try++ {
+				if version.IsRemote() {
+					stopFn := globalDecommissionMetrics.log(decomMetricDecommissionObject, idx, bi.Name, version.Name, version.VersionID)
+					if err := z.DecomTieredObject(ctx, bi.Name, version.Name, version, ObjectOptions{
+						VersionID:   version.VersionID,
+						MTime:       version.ModTime,
+						UserDefined: version.Metadata,
+					}); err != nil {
+						stopFn(err)
+						failure = true
+						logger.LogIf(ctx, err)
+						continue
+					}
+					stopFn(nil)
+					failure = false
+					break
+				}
 				gr, err := set.GetObjectNInfo(ctx,
 					bi.Name,
 					encodeDirObject(version.Name),
@@ -1268,17 +1280,6 @@ func (z *erasureServerPools) StartDecommission(ctx context.Context, indices ...i
 		z.HealBucket(ctx, bucket.Name, madmin.HealOpts{})
 	}
 
-	// TODO: Support decommissioning transition tiers.
-	for _, bucket := range decomBuckets {
-		if lc, err := globalLifecycleSys.Get(bucket.Name); err == nil {
-			if lc.HasTransition() {
-				return decomError{
-					Err: fmt.Sprintf("Bucket is part of transitioned tier %s: decommission is not allowed in Tier'd setups", bucket.Name),
-				}
-			}
-		}
-	}
-
 	// Create .minio.sys/conifg, .minio.sys/buckets paths if missing,
 	// this code is present to avoid any missing meta buckets on other
 	// pools.
diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index b2f4c385e..33273a520 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -2348,3 +2348,27 @@ func (z *erasureServerPools) CheckAbandonedParts(ctx context.Context, bucket, ob
 	}
 	return nil
 }
+
+// DecomTieredObject - moves tiered object to another pool during decommissioning.
+func (z *erasureServerPools) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error {
+	object = encodeDirObject(object)
+	if z.SinglePool() {
+		return fmt.Errorf("error decommissioning %s/%s", bucket, object)
+	}
+	if !opts.NoLock {
+		ns := z.NewNSLock(bucket, object)
+		lkctx, err := ns.GetLock(ctx, globalOperationTimeout)
+		if err != nil {
+			return err
+		}
+		ctx = lkctx.Context()
+		defer ns.Unlock(lkctx)
+		opts.NoLock = true
+	}
+	idx, err := z.getPoolIdxNoLock(ctx, bucket, object, fi.Size)
+	if err != nil {
+		return err
+	}
+
+	return z.serverPools[idx].DecomTieredObject(ctx, bucket, object, fi, opts)
+}
diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index b204cc4fa..56a4765cc 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -1215,6 +1215,12 @@ func (s *erasureSets) PutObjectMetadata(ctx context.Context, bucket, object stri
 	return er.PutObjectMetadata(ctx, bucket, object, opts)
 }
 
+// DecomTieredObject - moves tiered object to another pool during decommissioning.
+func (s *erasureSets) DecomTieredObject(ctx context.Context, bucket, object string, fi FileInfo, opts ObjectOptions) error {
+	er := s.getHashedSet(object)
+	return er.DecomTieredObject(ctx, bucket, object, fi, opts)
+}
+
 // PutObjectTags - replace or add tags to an existing object
 func (s *erasureSets) PutObjectTags(ctx context.Context, bucket, object string, tags string, opts ObjectOptions) (ObjectInfo, error) {
 	er := s.getHashedSet(object)
diff --git a/cmd/object-api-interface.go b/cmd/object-api-interface.go
index 910aea211..3f68c465b 100644
--- a/cmd/object-api-interface.go
+++ b/cmd/object-api-interface.go
@@ -254,6 +254,7 @@
 
 	// Metadata operations
 	PutObjectMetadata(context.Context, string, string, ObjectOptions) (ObjectInfo, error)
+	DecomTieredObject(context.Context, string, string, FileInfo, ObjectOptions) error
 
 	// ObjectTagging operations
 	PutObjectTags(context.Context, string, string, string, ObjectOptions) (ObjectInfo, error)
diff --git a/docs/distributed/decom.sh b/docs/distributed/decom.sh
index 75f1ff23a..71cc2a1a8 100755
--- a/docs/distributed/decom.sh
+++ b/docs/distributed/decom.sh
@@ -6,6 +6,7 @@ fi
 
 pkill minio
 rm -rf /tmp/xl
+rm -rf /tmp/xltier
 
 if [ ! -f ./mc ]; then
 	wget --quiet -O mc https://dl.minio.io/client/mc/release/linux-amd64/mc && \
@@ -45,6 +46,23 @@ expected_checksum=$(./mc cat internal/dsync/drwmutex.go | md5sum)
 user_count=$(./mc admin user list myminio/ | wc -l)
 policy_count=$(./mc admin policy list myminio/ | wc -l)
 
+## create a warm tier instance
+(minio server /tmp/xltier/{1...4}/disk{0...1} --address :9001 2>&1 >/dev/null)&
+sleep 2
+export MC_HOST_mytier="http://minioadmin:minioadmin@localhost:9001/"
+
+./mc mb -l myminio/bucket2
+./mc mb -l mytier/tiered
+## create a tier and set up ilm policy to tier immediately
+./mc admin tier add minio myminio TIER1 --endpoint http://localhost:9001 --access-key minioadmin --secret-key minioadmin --bucket tiered --prefix prefix5/
+./mc ilm add myminio/bucket2 --transition-days 0 --transition-tier TIER1
+## mirror some content to bucket2 and capture versions tiered
+./mc mirror internal myminio/bucket2/ --quiet >/dev/null
+./mc ls -r myminio/bucket2/ > bucket2_ns.txt
+./mc ls -r --versions myminio/bucket2/ > bucket2_ns_versions.txt
+sleep 2
+./mc ls -r --versions mytier/tiered/ > tiered_ns_versions.txt
+
 kill $pid
 (minio server /tmp/xl/{1...10}/disk{0...1} /tmp/xl/{11...30}/disk{0...3} 2>&1 >/tmp/expanded.log) &
 pid=$!
@@ -134,4 +152,43 @@ if [ "${expected_checksum}" != "${got_checksum}" ]; then
 	exit 1
 fi
 
+# after decommissioning, compare listings in bucket2 and tiered
+./mc version info myminio/bucket2 | grep -q "versioning is enabled"
+ret=$?
+if [ $ret -ne 0 ]; then
+	echo "BUG: expected versioning enabled after decommission on bucket2"
+	exit 1
+fi
+
+./mc ls -r myminio/bucket2 > decommissioned_bucket2_ns.txt
+./mc ls -r --versions myminio/bucket2 > decommissioned_bucket2_ns_versions.txt
+./mc ls -r --versions mytier/tiered/ > tiered_ns_versions2.txt
+
+out=$(diff -qpruN bucket2_ns.txt decommissioned_bucket2_ns.txt)
+ret=$?
+if [ $ret -ne 0 ]; then
+	echo "BUG: expected no missing entries after decommission in bucket2: $out"
+	exit 1
+fi
+
+out=$(diff -qpruN bucket2_ns_versions.txt decommissioned_bucket2_ns_versions.txt)
+ret=$?
+if [ $ret -ne 0 ]; then
+	echo "BUG: expected no missing entries after decommission in bucket2 versions: $out"
+	exit 1
+fi
+
+out=$(diff -qpruN tiered_ns_versions.txt tiered_ns_versions2.txt)
+ret=$?
+if [ $ret -ne 0 ]; then
+	echo "BUG: expected no missing entries after decommission in warm tier: $out"
+	exit 1
+fi
+
+got_checksum=$(./mc cat myminio/bucket2/dsync/drwmutex.go | md5sum)
+if [ "${expected_checksum}" != "${got_checksum}" ]; then
+	echo "BUG: decommission failed on tiered objects: expected ${expected_checksum} got ${got_checksum}"
+	exit 1
+fi
+
 kill $pid
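
Note on the approach taken by this patch: a tiered (remote) version needs no data movement during decommission, so decommissionPool calls DecomTieredObject to rewrite only the erasure-coded metadata (via writeUniqueFileInfo) on the destination pool, while the object data stays in the warm tier; non-remote versions still take the streaming path through GetObjectNInfo. The standalone Go sketch below only illustrates that branching with simplified stand-in types; fileInfo, decomObject, and decomTieredObject are hypothetical placeholders for this example, not MinIO APIs.

package main

import "fmt"

// fileInfo is a simplified stand-in for MinIO's FileInfo; the real type
// carries erasure, version, and tiering state in cmd/.
type fileInfo struct {
	Name      string
	VersionID string
	IsRemote  bool // true when the object data lives in a warm tier
}

// decomObject stands in for the full read-back/re-write path
// (GetObjectNInfo + decommissionObject in the patch).
func decomObject(fi fileInfo) error {
	fmt.Printf("streaming %s (%s) to the target pool\n", fi.Name, fi.VersionID)
	return nil
}

// decomTieredObject stands in for DecomTieredObject: only the object's
// metadata is rewritten on the target pool, the data stays in the tier.
func decomTieredObject(fi fileInfo) error {
	fmt.Printf("rewriting metadata for tiered %s (%s) on the target pool\n", fi.Name, fi.VersionID)
	return nil
}

func main() {
	versions := []fileInfo{
		{Name: "prefix5/a.txt", VersionID: "v1", IsRemote: true},
		{Name: "local/b.txt", VersionID: "v1", IsRemote: false},
	}
	for _, fi := range versions {
		var err error
		if fi.IsRemote {
			err = decomTieredObject(fi) // tiered: metadata-only move
		} else {
			err = decomObject(fi) // in-pool data: full copy
		}
		if err != nil {
			fmt.Println("decommission failed:", err)
		}
	}
}

Keeping the tiered branch metadata-only is what makes decommission traffic for transitioned objects proportional to metadata size rather than object size, which is why DecomTieredObject ends at writeUniqueFileInfo instead of re-reading object data.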