// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	xioutil "github.com/minio/minio/internal/ioutil"
)

// Reads in parallel from readers.
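// Readers may be reordered by preferReaders; readerToBuf maps a reader's current
// index back to its original shard slot so decoded data lands in the right buffer.
// stashBuffer, when set, is pool-backed memory that is released again in Done().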
type parallelReader struct {
	readers       []io.ReaderAt
	orgReaders    []io.ReaderAt
	dataBlocks    int
	offset        int64
	shardSize     int64
	shardFileSize int64
	buf           [][]byte
	readerToBuf   []int
	stashBuffer   []byte
}

// newParallelReader returns parallelReader.
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
	r2b := make([]int, len(readers))
	for i := range r2b {
		r2b[i] = i
	}
	bufs := make([][]byte, len(readers))
	shardSize := int(e.ShardSize())
	var b []byte

	// We should always have enough capacity, but older objects may be bigger;
	// we do not need a stash buffer for them.
	if globalBytePoolCap.Load().WidthCap() >= len(readers)*shardSize {
		// Fill buffers
		b = globalBytePoolCap.Load().Get()
		// Seed the buffers.
		for i := range bufs {
			bufs[i] = b[i*shardSize : (i+1)*shardSize]
		}
	}

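	// The initial offset is aligned to the start of the erasure block that
	// contains the requested byte offset, expressed in shard-local coordinates.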
	return &parallelReader{
		readers:       readers,
		orgReaders:    readers,
		dataBlocks:    e.dataBlocks,
		offset:        (offset / e.blockSize) * e.ShardSize(),
		shardSize:     e.ShardSize(),
		shardFileSize: e.ShardFileSize(totalLength),
		buf:           make([][]byte, len(readers)),
		readerToBuf:   r2b,
		stashBuffer:   b,
	}
}

// Done will release any resources used by the parallelReader.
func (p *parallelReader) Done() {
	if p.stashBuffer != nil {
		globalBytePoolCap.Load().Put(p.stashBuffer)
		p.stashBuffer = nil
	}
}

// preferReaders can mark readers as preferred.
// These will be chosen before others.
func (p *parallelReader) preferReaders(prefer []bool) {
	if len(prefer) != len(p.orgReaders) {
		return
	}
	// Copy so we don't change our input.
	tmp := make([]io.ReaderAt, len(p.orgReaders))
	copy(tmp, p.orgReaders)
	p.readers = tmp

	// next is the next non-preferred index.
	next := 0
	for i, ok := range prefer {
		if !ok || p.readers[i] == nil {
			continue
		}
		if i == next {
			next++
			continue
		}

		// Move reader with index i to index next.
		// Do this by swapping next and i
		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
		p.readerToBuf[next] = i
		p.readerToBuf[i] = next
		next++
	}
}

// Returns if buf can be erasure decoded.
func (p *parallelReader) canDecode(buf [][]byte) bool {
	bufCount := 0
	for _, b := range buf {
		if len(b) > 0 {
			bufCount++
		}
	}
	return bufCount >= p.dataBlocks
}

// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
	newBuf := dst
	if len(dst) != len(p.readers) {
		newBuf = make([][]byte, len(p.readers))
	} else {
		for i := range newBuf {
			newBuf[i] = newBuf[i][:0]
		}
	}
	var newBufLK sync.RWMutex

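	// The final shard of a file may be shorter than ShardSize; clamp the read
	// size so we never read past the end of the shard file.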
	if p.offset+p.shardSize > p.shardFileSize {
		p.shardSize = p.shardFileSize - p.offset
	}
	if p.shardSize == 0 {
		return newBuf, nil
	}

	readTriggerCh := make(chan bool, len(p.readers))
	defer xioutil.SafeClose(readTriggerCh) // close the channel upon return

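	// Only p.dataBlocks reads are triggered up front; additional (parity) reads
	// are triggered lazily below, whenever a reader is nil or a read fails.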
	for i := 0; i < p.dataBlocks; i++ {
		// Setup read triggers for p.dataBlocks number of reads so that it reads in parallel.
		readTriggerCh <- true
	}

	disksNotFound := int32(0)
	bitrotHeal := int32(0)       // Atomic bool flag.
	missingPartsHeal := int32(0) // Atomic bool flag.
	readerIndex := 0
	var wg sync.WaitGroup
	// if readTrigger is true, it implies next disk.ReadAt() should be tried
	// if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need
	// to try reading the next disk.
	for readTrigger := range readTriggerCh {
		newBufLK.RLock()
		canDecode := p.canDecode(newBuf)
		newBufLK.RUnlock()
		if canDecode {
			break
		}
		if readerIndex == len(p.readers) {
			break
		}
		if !readTrigger {
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			rr := p.readers[i]
			if rr == nil {
				// Since reader is nil, trigger another read.
				readTriggerCh <- true
				return
			}
			bufIdx := p.readerToBuf[i]
			if p.buf[bufIdx] == nil {
				// Reading first time on this disk, hence the buffer needs to be allocated.
				// Subsequent reads will reuse this buffer.
				p.buf[bufIdx] = make([]byte, p.shardSize)
			}
			// For the last shard, the shard size might be less than previous shard sizes.
			// Hence the following statement ensures that the buffer size is reset to the right size.
			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
			if err != nil {
				switch {
				case errors.Is(err, errFileNotFound):
					atomic.StoreInt32(&missingPartsHeal, 1)
				case errors.Is(err, errFileCorrupt):
					atomic.StoreInt32(&bitrotHeal, 1)
				case errors.Is(err, errDiskNotFound):
					atomic.AddInt32(&disksNotFound, 1)
				}

				// This will be communicated upstream.
				p.orgReaders[bufIdx] = nil
				if br, ok := p.readers[i].(io.Closer); ok {
					br.Close()
				}
				p.readers[i] = nil

				// Since ReadAt returned error, trigger another read.
				readTriggerCh <- true
				return
			}
			newBufLK.Lock()
			newBuf[bufIdx] = p.buf[bufIdx][:n]
			newBufLK.Unlock()
			// Since ReadAt returned success, there is no need to trigger another read.
			readTriggerCh <- false
		}(readerIndex)
		readerIndex++
	}
	wg.Wait()

	if p.canDecode(newBuf) {
		p.offset += p.shardSize
		if missingPartsHeal == 1 {
			return newBuf, errFileNotFound
		} else if bitrotHeal == 1 {
			return newBuf, errFileCorrupt
		}
		return newBuf, nil
	}

	// If we cannot decode, just return read quorum error.
	return nil, fmt.Errorf("%w (offline-disks=%d/%d)", errErasureReadQuorum, disksNotFound, len(p.readers))
}

// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
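//
// A typical (illustrative) call decodes a byte range of an erasure-coded object
// into an io.Writer, with one reader per erasure shard (nil entries are allowed
// for missing shards):
//
//	written, err := e.Decode(ctx, writer, readers, offset, length, totalLength, nil)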
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
	if offset < 0 || length < 0 {
		return -1, errInvalidArgument
	}

	if offset+length > totalLength {
		return -1, errInvalidArgument
	}

	if length == 0 {
		return 0, nil
	}

	reader := newParallelReader(readers, e, offset, totalLength)
	if len(prefer) == len(readers) {
		reader.preferReaders(prefer)
	}
	defer reader.Done()

	startBlock := offset / e.blockSize
	endBlock := (offset + length) / e.blockSize

	var bytesWritten int64
	var bufs [][]byte
	for block := startBlock; block <= endBlock; block++ {
		var blockOffset, blockLength int64
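		// Within each erasure block, compute which byte range is requested:
		// the first block may start mid-block, the last block may end mid-block,
		// and any block in between is consumed whole.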
		switch {
		case startBlock == endBlock:
			blockOffset = offset % e.blockSize
			blockLength = length
		case block == startBlock:
			blockOffset = offset % e.blockSize
			blockLength = e.blockSize - blockOffset
		case block == endBlock:
			blockOffset = 0
			blockLength = (offset + length) % e.blockSize
		default:
			blockOffset = 0
			blockLength = e.blockSize
		}
		if blockLength == 0 {
			break
		}

		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set only if there is enough data for reconstruction,
			// and only for expected errors; also set once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			// For all errors that cannot be reconstructed fail the read operation.
			return -1, err
		}

		if err = e.DecodeDataBlocks(bufs); err != nil {
			return -1, err
		}

		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
		if err != nil {
			return -1, err
		}

		bytesWritten += n
	}

	if bytesWritten != length {
		return bytesWritten, errLessData
	}

	return bytesWritten, derr
}

// Heal reads from readers, reconstructs shards and writes the data to the writers.
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64, prefer []bool) (derr error) {
	if len(writers) != e.parityBlocks+e.dataBlocks {
		return errInvalidArgument
	}

	reader := newParallelReader(readers, e, 0, totalLength)
	if len(readers) == len(prefer) {
		reader.preferReaders(prefer)
	}
	defer reader.Done()

	startBlock := int64(0)
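	// Round the block count up so a trailing partial block is healed as well.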
	endBlock := totalLength / e.blockSize
	if totalLength%e.blockSize != 0 {
		endBlock++
	}

	var bufs [][]byte
	for block := startBlock; block < endBlock; block++ {
		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			return err
		}

		if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
			return err
		}

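		// Write the reconstructed data and parity shards to every healing target.
		// writeQuorum is 1 here, so (assuming multiWriter's usual quorum semantics)
		// healing continues as long as at least one destination writer succeeds.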
		w := multiWriter{
			writers:     writers,
			writeQuorum: 1,
			errs:        make([]error, len(writers)),
		}

		if err = w.Write(ctx, bufs); err != nil {
			return err
		}
	}

	return derr
}