Different read quorum for tiered objects (#20115)

For a non-tiered object, MinIO requires that EcM (the number of data blocks)
copies of xl.meta agree, corresponding to the number of data blocks needed to
read this object.

On the other hand, tiered objects have metadata in the hot tier and data in the
warm tier. The data and its integrity are offloaded to the warm tier. This
allows us to reduce the read quorum from EcM (typically > N/2, where N is the
erasure stripe width) to N/2 + 1. The simple majority of metadata
ensures consensus on what the object is and where it is
located.
This commit is contained in:
Krishnan Parthasarathi 2024-07-25 14:02:50 -07:00 committed by GitHub
parent b7f319b62a
commit 4a1edfd9aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 130 additions and 1 deletions

View File

@ -26,6 +26,7 @@ import (
"time" "time"
"github.com/minio/minio/internal/amztime" "github.com/minio/minio/internal/amztime"
"github.com/minio/minio/internal/bucket/lifecycle"
"github.com/minio/minio/internal/bucket/replication" "github.com/minio/minio/internal/bucket/replication"
"github.com/minio/minio/internal/crypto" "github.com/minio/minio/internal/crypto"
"github.com/minio/minio/internal/hash/sha256" "github.com/minio/minio/internal/hash/sha256"
@ -456,6 +457,7 @@ func commonParity(parities []int, defaultParityCount int) int {
} }
func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) { func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) {
totalShards := len(partsMetadata)
parities = make([]int, len(partsMetadata)) parities = make([]int, len(partsMetadata))
for index, metadata := range partsMetadata { for index, metadata := range partsMetadata {
if errs[index] != nil { if errs[index] != nil {
@ -466,9 +468,13 @@ func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int)
parities[index] = -1 parities[index] = -1
continue continue
} }
//nolint:gocritic
// Delete marker or zero byte objects take highest parity. // Delete marker or zero byte objects take highest parity.
if metadata.Deleted || metadata.Size == 0 { if metadata.Deleted || metadata.Size == 0 {
parities[index] = len(partsMetadata) / 2 parities[index] = totalShards / 2
} else if metadata.TransitionStatus == lifecycle.TransitionComplete {
// For tiered objects, read quorum is N/2+1 to ensure simple majority on xl.meta. It is not equal to EcM because the data integrity is entrusted with the warm tier.
parities[index] = totalShards - (totalShards/2 + 1)
} else { } else {
parities[index] = metadata.Erasure.ParityBlocks parities[index] = metadata.Erasure.ParityBlocks
} }

View File

@ -19,6 +19,8 @@ package cmd
import ( import (
"context" "context"
"fmt"
"slices"
"strconv" "strconv"
"testing" "testing"
"time" "time"
@ -359,3 +361,124 @@ func TestSkipTierFreeVersion(t *testing.T) {
t.Fatal("Expected SkipTierFreeVersion to be set on FileInfo but wasn't") t.Fatal("Expected SkipTierFreeVersion to be set on FileInfo but wasn't")
} }
} }
// TestListObjectParities checks the per-drive parities reported by
// listObjectParities, and the parity commonParity derives from them, for
// both tiered and non-tiered objects.
//
// Each fixture describes an erasure set of a given width where only the first
// `agree` drives hold the "expected" metadata; the remaining drives hold
// metadata that differs (tiered case) or is invalid (non-tiered case).
func TestListObjectParities(t *testing.T) {
	// fixture bundles the inputs handed to listObjectParities together with
	// the per-drive parities and the common parity the test expects back.
	type fixture struct {
		metaArr  []FileInfo
		errs     []error
		parities []int
		parity   int
	}

	// tiered builds a fixture for a transitioned object. The first `agree`
	// drives carry the transition fields; the rest have them cleared, so
	// those drives are expected to report the erasure parity instead.
	tiered := func(width, parity, agree int) fixture {
		majority := width/2 + 1
		tieredParity := width - majority

		base := newFileInfo("obj-1", width-parity, parity)
		base.TransitionTier = "WARM-TIER"
		base.TransitionedObjName = mustGetUUID()
		base.TransitionStatus = "complete"
		base.Size = 1 << 20

		fx := fixture{
			metaArr:  make([]FileInfo, width),
			errs:     make([]error, width),
			parities: make([]int, width),
			parity:   -1, // expected when no simple majority agrees
		}
		for drive := 0; drive < width; drive++ {
			base.Erasure.Index = drive + 1
			fx.metaArr[drive] = base
			if drive < agree {
				fx.parities[drive] = tieredParity
				continue
			}
			// Disagreeing drive: strip the tiering information.
			fx.metaArr[drive].TransitionTier = ""
			fx.metaArr[drive].TransitionedObjName = ""
			fx.metaArr[drive].TransitionStatus = ""
			fx.parities[drive] = parity
		}
		if agree >= majority { // simple majority consensus
			fx.parity = tieredParity
		}
		return fx
	}

	// nonTiered builds a fixture for a regular object. The first `agree`
	// drives hold usable metadata; the rest get Erasure.Index 0, which the
	// test expects to be reported as parity -1.
	nonTiered := func(width, parity, agree int) fixture {
		base := newFileInfo("obj-1", width-parity, parity)
		base.Size = 1 << 20

		fx := fixture{
			metaArr:  make([]FileInfo, width),
			errs:     make([]error, width),
			parities: make([]int, width),
			parity:   -1, // expected when fewer than EcM drives agree
		}
		for drive := 0; drive < width; drive++ {
			base.Erasure.Index = drive + 1
			fx.metaArr[drive] = base
			fx.parities[drive] = parity
			if drive >= agree {
				fx.metaArr[drive].Erasure.Index = 0 // creates invalid fi on remaining drives
				fx.parities[drive] = -1             // invalid fi are assigned parity -1
			}
		}
		if agree >= width-parity {
			fx.parity = parity
		}
		return fx
	}

	fixtures := []fixture{
		// Tiered, 15 drives: above, below and exactly at simple majority.
		tiered(15, 3, 11),
		tiered(15, 3, 7),
		tiered(15, 3, 8),
		// Tiered, 16 drives: above, below and exactly at simple majority.
		tiered(16, 4, 11),
		tiered(16, 4, 8),
		tiered(16, 4, 9),
		// Non-tiered objects require a read quorum of EcM: exactly EcM in
		// agreement, then one fewer than EcM.
		nonTiered(15, 3, 12),
		nonTiered(15, 3, 11),
		nonTiered(16, 4, 12),
		nonTiered(16, 4, 11),
	}

	for idx, fx := range fixtures {
		name := fmt.Sprintf("Test %d", idx+1)
		t.Run(name, func(t *testing.T) {
			gotParities := listObjectParities(fx.metaArr, fx.errs)
			if !slices.Equal(gotParities, fx.parities) {
				t.Fatalf("Expected parities %v but got %v", fx.parities, gotParities)
			}

			gotParity := commonParity(fx.parities, len(fx.metaArr)/2)
			if gotParity != fx.parity {
				t.Fatalf("Expected common parity %v but got %v", fx.parity, gotParity)
			}
		})
	}
}