mirror of
https://github.com/minio/minio.git
synced 2025-04-04 03:40:30 -04:00
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read from the local disk, even if the disk contains a parity shard. In default setup there is a 50% chance that at least one shard that otherwise would have been fetched remotely will be read locally instead. It basically trades RPC call overhead for reed-solomon. On distributed localhost this seems to be fairly break-even, with a very small gain in throughput and latency. However on networked servers this should be a bigger 1MB objects, before: ``` Operation: GET. Concurrency: 32. Hosts: 4. Requests considered: 76257: * Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms * First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms Throughput: * Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST) ``` After: ``` Operation: GET. Concurrency: 32. Hosts: 4. Requests considered: 78845: * Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms * First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms Throughput: * Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST) ``` Bonus fix: Only ask for heal once on an object.
This commit is contained in:
parent
95814359bd
commit
4a007e3767
@ -31,22 +31,59 @@ var errHealRequired = errors.New("heal required")
|
|||||||
// Reads in parallel from readers.
|
// Reads in parallel from readers.
|
||||||
type parallelReader struct {
|
type parallelReader struct {
|
||||||
readers []io.ReaderAt
|
readers []io.ReaderAt
|
||||||
|
orgReaders []io.ReaderAt
|
||||||
dataBlocks int
|
dataBlocks int
|
||||||
offset int64
|
offset int64
|
||||||
shardSize int64
|
shardSize int64
|
||||||
shardFileSize int64
|
shardFileSize int64
|
||||||
buf [][]byte
|
buf [][]byte
|
||||||
|
readerToBuf []int
|
||||||
}
|
}
|
||||||
|
|
||||||
// newParallelReader returns parallelReader.
|
// newParallelReader returns parallelReader.
|
||||||
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
|
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
|
||||||
|
r2b := make([]int, len(readers))
|
||||||
|
for i := range r2b {
|
||||||
|
r2b[i] = i
|
||||||
|
}
|
||||||
return ¶llelReader{
|
return ¶llelReader{
|
||||||
readers,
|
readers: readers,
|
||||||
e.dataBlocks,
|
orgReaders: readers,
|
||||||
(offset / e.blockSize) * e.ShardSize(),
|
dataBlocks: e.dataBlocks,
|
||||||
e.ShardSize(),
|
offset: (offset / e.blockSize) * e.ShardSize(),
|
||||||
e.ShardFileSize(totalLength),
|
shardSize: e.ShardSize(),
|
||||||
make([][]byte, len(readers)),
|
shardFileSize: e.ShardFileSize(totalLength),
|
||||||
|
buf: make([][]byte, len(readers)),
|
||||||
|
readerToBuf: r2b,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// preferReaders can mark readers as preferred.
|
||||||
|
// These will be chosen before others.
|
||||||
|
func (p *parallelReader) preferReaders(prefer []bool) {
|
||||||
|
if len(prefer) != len(p.orgReaders) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Copy so we don't change our input.
|
||||||
|
tmp := make([]io.ReaderAt, len(p.orgReaders))
|
||||||
|
copy(tmp, p.orgReaders)
|
||||||
|
p.readers = tmp
|
||||||
|
// next is the next non-preferred index.
|
||||||
|
next := 0
|
||||||
|
for i, ok := range prefer {
|
||||||
|
if !ok || p.readers[i] == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if i == next {
|
||||||
|
next++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Move reader with index i to index next.
|
||||||
|
// Do this by swapping next and i
|
||||||
|
p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
|
||||||
|
p.readerToBuf[next] = i
|
||||||
|
p.readerToBuf[i] = next
|
||||||
|
next++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,7 +91,7 @@ func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int
|
|||||||
func (p *parallelReader) canDecode(buf [][]byte) bool {
|
func (p *parallelReader) canDecode(buf [][]byte) bool {
|
||||||
bufCount := 0
|
bufCount := 0
|
||||||
for _, b := range buf {
|
for _, b := range buf {
|
||||||
if b != nil {
|
if len(b) > 0 {
|
||||||
bufCount++
|
bufCount++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -62,13 +99,23 @@ func (p *parallelReader) canDecode(buf [][]byte) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
|
// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
|
||||||
func (p *parallelReader) Read() ([][]byte, error) {
|
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
|
||||||
newBuf := make([][]byte, len(p.readers))
|
newBuf := dst
|
||||||
|
if len(dst) != len(p.readers) {
|
||||||
|
newBuf = make([][]byte, len(p.readers))
|
||||||
|
} else {
|
||||||
|
for i := range newBuf {
|
||||||
|
newBuf[i] = newBuf[i][:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
var newBufLK sync.RWMutex
|
var newBufLK sync.RWMutex
|
||||||
|
|
||||||
if p.offset+p.shardSize > p.shardFileSize {
|
if p.offset+p.shardSize > p.shardFileSize {
|
||||||
p.shardSize = p.shardFileSize - p.offset
|
p.shardSize = p.shardFileSize - p.offset
|
||||||
}
|
}
|
||||||
|
if p.shardSize == 0 {
|
||||||
|
return newBuf, nil
|
||||||
|
}
|
||||||
|
|
||||||
readTriggerCh := make(chan bool, len(p.readers))
|
readTriggerCh := make(chan bool, len(p.readers))
|
||||||
for i := 0; i < p.dataBlocks; i++ {
|
for i := 0; i < p.dataBlocks; i++ {
|
||||||
@ -104,26 +151,30 @@ func (p *parallelReader) Read() ([][]byte, error) {
|
|||||||
readTriggerCh <- true
|
readTriggerCh <- true
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if p.buf[i] == nil {
|
bufIdx := p.readerToBuf[i]
|
||||||
|
if p.buf[bufIdx] == nil {
|
||||||
// Reading first time on this disk, hence the buffer needs to be allocated.
|
// Reading first time on this disk, hence the buffer needs to be allocated.
|
||||||
// Subsequent reads will re-use this buffer.
|
// Subsequent reads will re-use this buffer.
|
||||||
p.buf[i] = make([]byte, p.shardSize)
|
p.buf[bufIdx] = make([]byte, p.shardSize)
|
||||||
}
|
}
|
||||||
// For the last shard, the shardsize might be less than previous shard sizes.
|
// For the last shard, the shardsize might be less than previous shard sizes.
|
||||||
// Hence the following statement ensures that the buffer size is reset to the right size.
|
// Hence the following statement ensures that the buffer size is reset to the right size.
|
||||||
p.buf[i] = p.buf[i][:p.shardSize]
|
p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
|
||||||
_, err := disk.ReadAt(p.buf[i], p.offset)
|
_, err := disk.ReadAt(p.buf[bufIdx], p.offset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if _, ok := err.(*errHashMismatch); ok {
|
if _, ok := err.(*errHashMismatch); ok {
|
||||||
atomic.StoreInt32(&healRequired, 1)
|
atomic.StoreInt32(&healRequired, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This will be communicated upstream.
|
||||||
|
p.orgReaders[bufIdx] = nil
|
||||||
p.readers[i] = nil
|
p.readers[i] = nil
|
||||||
// Since ReadAt returned error, trigger another read.
|
// Since ReadAt returned error, trigger another read.
|
||||||
readTriggerCh <- true
|
readTriggerCh <- true
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
newBufLK.Lock()
|
newBufLK.Lock()
|
||||||
newBuf[i] = p.buf[i]
|
newBuf[bufIdx] = p.buf[bufIdx]
|
||||||
newBufLK.Unlock()
|
newBufLK.Unlock()
|
||||||
// Since ReadAt returned success, there is no need to trigger another read.
|
// Since ReadAt returned success, there is no need to trigger another read.
|
||||||
readTriggerCh <- false
|
readTriggerCh <- false
|
||||||
@ -156,8 +207,9 @@ func (err *errDecodeHealRequired) Unwrap() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
|
// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
|
||||||
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64) error {
|
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
|
||||||
healRequired, err := e.decode(ctx, writer, readers, offset, length, totalLength)
|
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) error {
|
||||||
|
healRequired, err := e.decode(ctx, writer, readers, offset, length, totalLength, prefer)
|
||||||
if healRequired {
|
if healRequired {
|
||||||
return &errDecodeHealRequired{err}
|
return &errDecodeHealRequired{err}
|
||||||
}
|
}
|
||||||
@ -166,7 +218,7 @@ func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.Read
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
|
// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
|
||||||
func (e Erasure) decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64) (bool, error) {
|
func (e Erasure) decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (bool, error) {
|
||||||
if offset < 0 || length < 0 {
|
if offset < 0 || length < 0 {
|
||||||
logger.LogIf(ctx, errInvalidArgument)
|
logger.LogIf(ctx, errInvalidArgument)
|
||||||
return false, errInvalidArgument
|
return false, errInvalidArgument
|
||||||
@ -180,12 +232,16 @@ func (e Erasure) decode(ctx context.Context, writer io.Writer, readers []io.Read
|
|||||||
}
|
}
|
||||||
|
|
||||||
reader := newParallelReader(readers, e, offset, totalLength)
|
reader := newParallelReader(readers, e, offset, totalLength)
|
||||||
|
if len(prefer) == len(readers) {
|
||||||
|
reader.preferReaders(prefer)
|
||||||
|
}
|
||||||
|
|
||||||
startBlock := offset / e.blockSize
|
startBlock := offset / e.blockSize
|
||||||
endBlock := (offset + length) / e.blockSize
|
endBlock := (offset + length) / e.blockSize
|
||||||
|
|
||||||
var healRequired bool
|
var healRequired bool
|
||||||
var bytesWritten int64
|
var bytesWritten int64
|
||||||
|
var bufs [][]byte
|
||||||
for block := startBlock; block <= endBlock; block++ {
|
for block := startBlock; block <= endBlock; block++ {
|
||||||
var blockOffset, blockLength int64
|
var blockOffset, blockLength int64
|
||||||
switch {
|
switch {
|
||||||
@ -205,9 +261,11 @@ func (e Erasure) decode(ctx context.Context, writer io.Writer, readers []io.Read
|
|||||||
if blockLength == 0 {
|
if blockLength == 0 {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
bufs, err := reader.Read()
|
var err error
|
||||||
|
bufs, err = reader.Read(bufs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if errors.Is(err, errHealRequired) {
|
if errors.Is(err, errHealRequired) {
|
||||||
|
// errHealRequired is only returned if there are be enough data for reconstruction.
|
||||||
healRequired = true
|
healRequired = true
|
||||||
} else {
|
} else {
|
||||||
return healRequired, err
|
return healRequired, err
|
||||||
|
@ -138,7 +138,7 @@ func TestErasureDecode(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
writer := bytes.NewBuffer(nil)
|
writer := bytes.NewBuffer(nil)
|
||||||
err = erasure.Decode(context.Background(), writer, bitrotReaders, test.offset, test.length, test.data)
|
err = erasure.Decode(context.Background(), writer, bitrotReaders, test.offset, test.length, test.data, nil)
|
||||||
closeBitrotReaders(bitrotReaders)
|
closeBitrotReaders(bitrotReaders)
|
||||||
if err != nil && !test.shouldFail {
|
if err != nil && !test.shouldFail {
|
||||||
t.Errorf("Test %d: should pass but failed with: %v", i, err)
|
t.Errorf("Test %d: should pass but failed with: %v", i, err)
|
||||||
@ -181,7 +181,7 @@ func TestErasureDecode(t *testing.T) {
|
|||||||
bitrotReaders[0] = nil
|
bitrotReaders[0] = nil
|
||||||
}
|
}
|
||||||
writer.Reset()
|
writer.Reset()
|
||||||
err = erasure.Decode(context.Background(), writer, bitrotReaders, test.offset, test.length, test.data)
|
err = erasure.Decode(context.Background(), writer, bitrotReaders, test.offset, test.length, test.data, nil)
|
||||||
closeBitrotReaders(bitrotReaders)
|
closeBitrotReaders(bitrotReaders)
|
||||||
if err != nil && !test.shouldFailQuorum {
|
if err != nil && !test.shouldFailQuorum {
|
||||||
t.Errorf("Test %d: should pass but failed with: %v", i, err)
|
t.Errorf("Test %d: should pass but failed with: %v", i, err)
|
||||||
@ -191,7 +191,7 @@ func TestErasureDecode(t *testing.T) {
|
|||||||
}
|
}
|
||||||
if !test.shouldFailQuorum {
|
if !test.shouldFailQuorum {
|
||||||
if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) {
|
if content := writer.Bytes(); !bytes.Equal(content, data[test.offset:test.offset+test.length]) {
|
||||||
t.Errorf("Test %d: read retruns wrong file content", i)
|
t.Errorf("Test %d: read returns wrong file content", i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -271,7 +271,7 @@ func TestErasureDecodeRandomOffsetLength(t *testing.T) {
|
|||||||
tillOffset := erasure.ShardFileTillOffset(offset, readLen, length)
|
tillOffset := erasure.ShardFileTillOffset(offset, readLen, length)
|
||||||
bitrotReaders[index] = newStreamingBitrotReader(disk, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
|
bitrotReaders[index] = newStreamingBitrotReader(disk, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
|
||||||
}
|
}
|
||||||
err = erasure.Decode(context.Background(), buf, bitrotReaders, offset, readLen, length)
|
err = erasure.Decode(context.Background(), buf, bitrotReaders, offset, readLen, length, nil)
|
||||||
closeBitrotReaders(bitrotReaders)
|
closeBitrotReaders(bitrotReaders)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err, offset, readLen)
|
t.Fatal(err, offset, readLen)
|
||||||
@ -333,7 +333,7 @@ func benchmarkErasureDecode(data, parity, dataDown, parityDown int, size int64,
|
|||||||
tillOffset := erasure.ShardFileTillOffset(0, size, size)
|
tillOffset := erasure.ShardFileTillOffset(0, size, size)
|
||||||
bitrotReaders[index] = newStreamingBitrotReader(disk, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
|
bitrotReaders[index] = newStreamingBitrotReader(disk, "testbucket", "object", tillOffset, DefaultBitrotAlgorithm, erasure.ShardSize())
|
||||||
}
|
}
|
||||||
if err = erasure.Decode(context.Background(), bytes.NewBuffer(content[:0]), bitrotReaders, 0, size, size); err != nil {
|
if err = erasure.Decode(context.Background(), bytes.NewBuffer(content[:0]), bitrotReaders, 0, size, size, nil); err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
closeBitrotReaders(bitrotReaders)
|
closeBitrotReaders(bitrotReaders)
|
||||||
|
@ -28,7 +28,7 @@ import (
|
|||||||
func (e Erasure) Heal(ctx context.Context, readers []io.ReaderAt, writers []io.Writer, size int64) error {
|
func (e Erasure) Heal(ctx context.Context, readers []io.ReaderAt, writers []io.Writer, size int64) error {
|
||||||
r, w := io.Pipe()
|
r, w := io.Pipe()
|
||||||
go func() {
|
go func() {
|
||||||
if err := e.Decode(ctx, w, readers, 0, size, size); err != nil {
|
if err := e.Decode(ctx, w, readers, 0, size, size, nil); err != nil {
|
||||||
w.CloseWithError(err)
|
w.CloseWithError(err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -87,14 +87,15 @@ func (e *Erasure) EncodeData(ctx context.Context, data []byte) ([][]byte, error)
|
|||||||
// It only decodes the data blocks but does not verify them.
|
// It only decodes the data blocks but does not verify them.
|
||||||
// It returns an error if the decoding failed.
|
// It returns an error if the decoding failed.
|
||||||
func (e *Erasure) DecodeDataBlocks(data [][]byte) error {
|
func (e *Erasure) DecodeDataBlocks(data [][]byte) error {
|
||||||
needsReconstruction := false
|
var isZero = 0
|
||||||
for _, b := range data[:e.dataBlocks] {
|
for _, b := range data[:] {
|
||||||
if b == nil {
|
if len(b) == 0 {
|
||||||
needsReconstruction = true
|
isZero++
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !needsReconstruction {
|
if isZero == 0 || isZero == len(data) {
|
||||||
|
// If all are zero, payload is 0 bytes.
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return e.encoder().ReconstructData(data)
|
return e.encoder().ReconstructData(data)
|
||||||
|
@ -265,6 +265,7 @@ func (xl xlObjects) getObject(ctx context.Context, bucket, object string, startO
|
|||||||
return toObjectErr(err, bucket, object)
|
return toObjectErr(err, bucket, object)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var healOnce sync.Once
|
||||||
for ; partIndex <= lastPartIndex; partIndex++ {
|
for ; partIndex <= lastPartIndex; partIndex++ {
|
||||||
if length == totalBytesRead {
|
if length == totalBytesRead {
|
||||||
break
|
break
|
||||||
@ -284,6 +285,7 @@ func (xl xlObjects) getObject(ctx context.Context, bucket, object string, startO
|
|||||||
tillOffset := erasure.ShardFileTillOffset(partOffset, partLength, partSize)
|
tillOffset := erasure.ShardFileTillOffset(partOffset, partLength, partSize)
|
||||||
// Get the checksums of the current part.
|
// Get the checksums of the current part.
|
||||||
readers := make([]io.ReaderAt, len(onlineDisks))
|
readers := make([]io.ReaderAt, len(onlineDisks))
|
||||||
|
prefer := make([]bool, len(onlineDisks))
|
||||||
for index, disk := range onlineDisks {
|
for index, disk := range onlineDisks {
|
||||||
if disk == OfflineDisk {
|
if disk == OfflineDisk {
|
||||||
continue
|
continue
|
||||||
@ -292,14 +294,19 @@ func (xl xlObjects) getObject(ctx context.Context, bucket, object string, startO
|
|||||||
partPath := pathJoin(object, fmt.Sprintf("part.%d", partNumber))
|
partPath := pathJoin(object, fmt.Sprintf("part.%d", partNumber))
|
||||||
readers[index] = newBitrotReader(disk, bucket, partPath, tillOffset,
|
readers[index] = newBitrotReader(disk, bucket, partPath, tillOffset,
|
||||||
checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize())
|
checksumInfo.Algorithm, checksumInfo.Hash, erasure.ShardSize())
|
||||||
|
|
||||||
|
// Prefer local disks
|
||||||
|
prefer[index] = disk.Hostname() == ""
|
||||||
}
|
}
|
||||||
err := erasure.Decode(ctx, writer, readers, partOffset, partLength, partSize)
|
err := erasure.Decode(ctx, writer, readers, partOffset, partLength, partSize, prefer)
|
||||||
// Note: we should not be defer'ing the following closeBitrotReaders() call as we are inside a for loop i.e if we use defer, we would accumulate a lot of open files by the time
|
// Note: we should not be defer'ing the following closeBitrotReaders() call as we are inside a for loop i.e if we use defer, we would accumulate a lot of open files by the time
|
||||||
// we return from this function.
|
// we return from this function.
|
||||||
closeBitrotReaders(readers)
|
closeBitrotReaders(readers)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if decodeHealErr, ok := err.(*errDecodeHealRequired); ok {
|
if decodeHealErr, ok := err.(*errDecodeHealRequired); ok {
|
||||||
go deepHealObject(pathJoin(bucket, object))
|
healOnce.Do(func() {
|
||||||
|
go deepHealObject(pathJoin(bucket, object))
|
||||||
|
})
|
||||||
err = decodeHealErr.err
|
err = decodeHealErr.err
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user