mirror of
https://github.com/minio/minio.git
synced 2024-12-25 06:35:56 -05:00
Different read quorum for tiered objects (#20115)
For a non-tiered object, MinIO requires that EcM (the number of data blocks) copies of xl.meta agree, which corresponds to the number of data blocks needed to read the object. Tiered objects, on the other hand, keep their metadata in the hot tier and their data in the warm tier, so the data and its integrity are offloaded to the warm tier. This allows us to reduce the read quorum from EcM (typically > N/2, where N is the erasure stripe width) to N/2 + 1. A simple majority of the metadata copies ensures consensus on what the object is and where it is located.
This commit is contained in:
parent
b7f319b62a
commit
4a1edfd9aa
@ -26,6 +26,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/minio/minio/internal/amztime"
|
||||
"github.com/minio/minio/internal/bucket/lifecycle"
|
||||
"github.com/minio/minio/internal/bucket/replication"
|
||||
"github.com/minio/minio/internal/crypto"
|
||||
"github.com/minio/minio/internal/hash/sha256"
|
||||
@ -456,6 +457,7 @@ func commonParity(parities []int, defaultParityCount int) int {
|
||||
}
|
||||
|
||||
func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int) {
|
||||
totalShards := len(partsMetadata)
|
||||
parities = make([]int, len(partsMetadata))
|
||||
for index, metadata := range partsMetadata {
|
||||
if errs[index] != nil {
|
||||
@ -466,9 +468,13 @@ func listObjectParities(partsMetadata []FileInfo, errs []error) (parities []int)
|
||||
parities[index] = -1
|
||||
continue
|
||||
}
|
||||
//nolint:gocritic
|
||||
// Delete marker or zero byte objects take highest parity.
|
||||
if metadata.Deleted || metadata.Size == 0 {
|
||||
parities[index] = len(partsMetadata) / 2
|
||||
parities[index] = totalShards / 2
|
||||
} else if metadata.TransitionStatus == lifecycle.TransitionComplete {
|
||||
// For tiered objects, read quorum is N/2+1 to ensure simple majority on xl.meta. It is not equal to EcM because the data integrity is entrusted with the warm tier.
|
||||
parities[index] = totalShards - (totalShards/2 + 1)
|
||||
} else {
|
||||
parities[index] = metadata.Erasure.ParityBlocks
|
||||
}
|
||||
|
@ -19,6 +19,8 @@ package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
@ -359,3 +361,124 @@ func TestSkipTierFreeVersion(t *testing.T) {
|
||||
t.Fatal("Expected SkipTierFreeVersion to be set on FileInfo but wasn't")
|
||||
}
|
||||
}
|
||||
|
||||
func TestListObjectParities(t *testing.T) {
|
||||
mkMetaArr := func(N, parity, agree int) []FileInfo {
|
||||
fi := newFileInfo("obj-1", N-parity, parity)
|
||||
fi.TransitionTier = "WARM-TIER"
|
||||
fi.TransitionedObjName = mustGetUUID()
|
||||
fi.TransitionStatus = "complete"
|
||||
fi.Size = 1 << 20
|
||||
|
||||
metaArr := make([]FileInfo, N)
|
||||
for i := range N {
|
||||
fi.Erasure.Index = i + 1
|
||||
metaArr[i] = fi
|
||||
if i < agree {
|
||||
continue
|
||||
}
|
||||
metaArr[i].TransitionTier, metaArr[i].TransitionedObjName = "", ""
|
||||
metaArr[i].TransitionStatus = ""
|
||||
}
|
||||
return metaArr
|
||||
}
|
||||
mkParities := func(N, agreedParity, disagreedParity, agree int) []int {
|
||||
ps := make([]int, N)
|
||||
for i := range N {
|
||||
if i < agree {
|
||||
ps[i] = agreedParity
|
||||
continue
|
||||
}
|
||||
ps[i] = disagreedParity // disagree
|
||||
}
|
||||
return ps
|
||||
}
|
||||
|
||||
mkTest := func(N, parity, agree int) (res struct {
|
||||
metaArr []FileInfo
|
||||
errs []error
|
||||
parities []int
|
||||
parity int
|
||||
},
|
||||
) {
|
||||
res.metaArr = mkMetaArr(N, parity, agree)
|
||||
res.parities = mkParities(N, N-(N/2+1), parity, agree)
|
||||
res.errs = make([]error, N)
|
||||
if agree >= N/2+1 { // simple majority consensus
|
||||
res.parity = N - (N/2 + 1)
|
||||
} else {
|
||||
res.parity = -1
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
nonTieredTest := func(N, parity, agree int) (res struct {
|
||||
metaArr []FileInfo
|
||||
errs []error
|
||||
parities []int
|
||||
parity int
|
||||
},
|
||||
) {
|
||||
fi := newFileInfo("obj-1", N-parity, parity)
|
||||
fi.Size = 1 << 20
|
||||
metaArr := make([]FileInfo, N)
|
||||
parities := make([]int, N)
|
||||
for i := range N {
|
||||
fi.Erasure.Index = i + 1
|
||||
metaArr[i] = fi
|
||||
parities[i] = parity
|
||||
if i < agree {
|
||||
continue
|
||||
}
|
||||
metaArr[i].Erasure.Index = 0 // creates invalid fi on remaining drives
|
||||
parities[i] = -1 // invalid fi are assigned parity -1
|
||||
}
|
||||
res.metaArr = metaArr
|
||||
res.parities = parities
|
||||
res.errs = make([]error, N)
|
||||
if agree >= N-parity {
|
||||
res.parity = parity
|
||||
} else {
|
||||
res.parity = -1
|
||||
}
|
||||
|
||||
return res
|
||||
}
|
||||
tests := []struct {
|
||||
metaArr []FileInfo
|
||||
errs []error
|
||||
parities []int
|
||||
parity int
|
||||
}{
|
||||
// More than simple majority consensus
|
||||
mkTest(15, 3, 11),
|
||||
// No simple majority consensus
|
||||
mkTest(15, 3, 7),
|
||||
// Exact simple majority consensus
|
||||
mkTest(15, 3, 8),
|
||||
// More than simple majority consensus
|
||||
mkTest(16, 4, 11),
|
||||
// No simple majority consensus
|
||||
mkTest(16, 4, 8),
|
||||
// Exact simple majority consensus
|
||||
mkTest(16, 4, 9),
|
||||
// non-tiered object require read quorum of EcM
|
||||
nonTieredTest(15, 3, 12),
|
||||
// non-tiered object with fewer than EcM in consensus
|
||||
nonTieredTest(15, 3, 11),
|
||||
// non-tiered object require read quorum of EcM
|
||||
nonTieredTest(16, 4, 12),
|
||||
// non-tiered object with fewer than EcM in consensus
|
||||
nonTieredTest(16, 4, 11),
|
||||
}
|
||||
for i, test := range tests {
|
||||
t.Run(fmt.Sprintf("Test %d", i+1), func(t *testing.T) {
|
||||
if got := listObjectParities(test.metaArr, test.errs); !slices.Equal(got, test.parities) {
|
||||
t.Fatalf("Expected parities %v but got %v", test.parities, got)
|
||||
}
|
||||
if got := commonParity(test.parities, len(test.metaArr)/2); got != test.parity {
|
||||
t.Fatalf("Expected common parity %v but got %v", test.parity, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user