From 4a1edfd9aad05da2db591aed28ef2599c6e04268 Mon Sep 17 00:00:00 2001 From: Krishnan Parthasarathi Date: Thu, 25 Jul 2024 14:02:50 -0700 Subject: [PATCH] Different read quorum for tiered objects (#20115) For a non-tiered object, MinIO requires that EcM (# of data blocks) of xl.meta agree, corresponding to the number of data blocks needed to read this object. On the other hand, tiered objects have metadata in the hot tier and data in the warm tier. The data and its integrity are offloaded to the warm tier. This allows us to reduce the read quorum from EcM (typically > N/2, where N is the erasure stripe width) to N/2 + 1. The simple majority of metadata ensures consensus on what the object is and where it is located. --- cmd/erasure-metadata.go | 8 ++- cmd/erasure-metadata_test.go | 123 +++++++++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/cmd/erasure-metadata.go b/cmd/erasure-metadata.go index 41f276e24..4b025ae4b 100644 --- a/cmd/erasure-metadata.go +++ b/cmd/erasure-metadata.go @@ -26,6 +26,7 @@ import ( "time" "github.com/minio/minio/internal/amztime" + "github.com/minio/minio/internal/bucket/lifecycle" "github.com/minio/minio/internal/bucket/replication" "github.com/minio/minio/internal/crypto" "github.com/minio/minio/internal/hash/sha256" @@ -456,6 +457,7 @@ func commonParity(parities []int, defaultParityCount int) int { } func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) { + totalShards := len(partsMetadata) parities = make([]int, len(partsMetadata)) for index, metadata := range partsMetadata { if errs[index] != nil { @@ -466,9 +468,13 @@ func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) parities[index] = -1 continue } + //nolint:gocritic // Delete marker or zero byte objects take highest parity. 
if metadata.Deleted || metadata.Size == 0 { - parities[index] = len(partsMetadata) / 2 + parities[index] = totalShards / 2 + } else if metadata.TransitionStatus == lifecycle.TransitionComplete { + // For tiered objects, read quorum is N/2+1 to ensure simple majority on xl.meta. It is not equal to EcM because the data integrity is entrusted with the warm tier. + parities[index] = totalShards - (totalShards/2 + 1) } else { parities[index] = metadata.Erasure.ParityBlocks } diff --git a/cmd/erasure-metadata_test.go b/cmd/erasure-metadata_test.go index ebb0a99d5..1fc2f5d04 100644 --- a/cmd/erasure-metadata_test.go +++ b/cmd/erasure-metadata_test.go @@ -19,6 +19,8 @@ package cmd import ( "context" + "fmt" + "slices" "strconv" "testing" "time" @@ -359,3 +361,124 @@ func TestSkipTierFreeVersion(t *testing.T) { t.Fatal("Expected SkipTierFreeVersion to be set on FileInfo but wasn't") } } + +func TestListObjectParities(t *testing.T) { + mkMetaArr := func(N, parity, agree int) []FileInfo { + fi := newFileInfo("obj-1", N-parity, parity) + fi.TransitionTier = "WARM-TIER" + fi.TransitionedObjName = mustGetUUID() + fi.TransitionStatus = "complete" + fi.Size = 1 << 20 + + metaArr := make([]FileInfo, N) + for i := range N { + fi.Erasure.Index = i + 1 + metaArr[i] = fi + if i < agree { + continue + } + metaArr[i].TransitionTier, metaArr[i].TransitionedObjName = "", "" + metaArr[i].TransitionStatus = "" + } + return metaArr + } + mkParities := func(N, agreedParity, disagreedParity, agree int) []int { + ps := make([]int, N) + for i := range N { + if i < agree { + ps[i] = agreedParity + continue + } + ps[i] = disagreedParity // disagree + } + return ps + } + + mkTest := func(N, parity, agree int) (res struct { + metaArr []FileInfo + errs []error + parities []int + parity int + }, + ) { + res.metaArr = mkMetaArr(N, parity, agree) + res.parities = mkParities(N, N-(N/2+1), parity, agree) + res.errs = make([]error, N) + if agree >= N/2+1 { // simple majority consensus + res.parity = N - 
(N/2 + 1) + } else { + res.parity = -1 + } + return res + } + + nonTieredTest := func(N, parity, agree int) (res struct { + metaArr []FileInfo + errs []error + parities []int + parity int + }, + ) { + fi := newFileInfo("obj-1", N-parity, parity) + fi.Size = 1 << 20 + metaArr := make([]FileInfo, N) + parities := make([]int, N) + for i := range N { + fi.Erasure.Index = i + 1 + metaArr[i] = fi + parities[i] = parity + if i < agree { + continue + } + metaArr[i].Erasure.Index = 0 // creates invalid fi on remaining drives + parities[i] = -1 // invalid fi are assigned parity -1 + } + res.metaArr = metaArr + res.parities = parities + res.errs = make([]error, N) + if agree >= N-parity { + res.parity = parity + } else { + res.parity = -1 + } + + return res + } + tests := []struct { + metaArr []FileInfo + errs []error + parities []int + parity int + }{ + // More than simple majority consensus + mkTest(15, 3, 11), + // No simple majority consensus + mkTest(15, 3, 7), + // Exact simple majority consensus + mkTest(15, 3, 8), + // More than simple majority consensus + mkTest(16, 4, 11), + // No simple majority consensus + mkTest(16, 4, 8), + // Exact simple majority consensus + mkTest(16, 4, 9), + // non-tiered object require read quorum of EcM + nonTieredTest(15, 3, 12), + // non-tiered object with fewer than EcM in consensus + nonTieredTest(15, 3, 11), + // non-tiered object require read quorum of EcM + nonTieredTest(16, 4, 12), + // non-tiered object with fewer than EcM in consensus + nonTieredTest(16, 4, 11), + } + for i, test := range tests { + t.Run(fmt.Sprintf("Test %d", i+1), func(t *testing.T) { + if got := listObjectParities(test.metaArr, test.errs); !slices.Equal(got, test.parities) { + t.Fatalf("Expected parities %v but got %v", test.parities, got) + } + if got := commonParity(test.parities, len(test.metaArr)/2); got != test.parity { + t.Fatalf("Expected common parity %v but got %v", test.parity, got) + } + }) + } +}