2021-04-18 15:41:13 -04:00
|
|
|
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
|
|
//
|
|
|
|
// This file is part of MinIO Object Storage stack
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2016-03-28 00:52:38 -04:00
|
|
|
|
2016-08-18 19:23:42 -04:00
|
|
|
package cmd
|
2016-03-28 00:52:38 -04:00
|
|
|
|
2016-05-31 23:23:31 -04:00
|
|
|
import (
|
2018-04-05 18:04:40 -04:00
|
|
|
"context"
|
2020-04-01 15:14:00 -04:00
|
|
|
"errors"
|
2023-05-25 12:39:06 -04:00
|
|
|
"fmt"
|
2016-06-19 16:35:26 -04:00
|
|
|
"io"
|
2019-01-18 10:48:25 -05:00
|
|
|
"sync"
|
2020-04-01 15:14:00 -04:00
|
|
|
"sync/atomic"
|
2017-11-25 14:58:29 -05:00
|
|
|
|
2021-06-01 17:59:40 -04:00
|
|
|
"github.com/minio/minio/internal/logger"
|
2016-05-31 23:23:31 -04:00
|
|
|
)
|
2016-03-28 00:52:38 -04:00
|
|
|
|
2019-01-17 07:58:18 -05:00
|
|
|
// parallelReader pulls erasure-coded shards from a set of readers, issuing
// the individual reads concurrently.
type parallelReader struct {
	// readers is the working list that Read walks in order; preferReaders
	// may replace it with a reordered copy.
	readers []io.ReaderAt
	// orgReaders is the list exactly as it was handed to newParallelReader.
	// Read sets an entry to nil when reading from it fails.
	orgReaders []io.ReaderAt
	// dataBlocks is the number of non-empty shards canDecode requires.
	dataBlocks int
	// offset is the position passed to ReadAt on the next Read call,
	// shardSize is the number of bytes requested from each reader per call,
	// and shardFileSize is the limit offset+shardSize is clamped to.
	offset, shardSize, shardFileSize int64
	// buf holds one read buffer per shard, allocated on first use and
	// reused by later Read calls.
	buf [][]byte
	// readerToBuf maps a position in readers to the index used for buf,
	// orgReaders and the slice returned by Read.
	readerToBuf []int
}
|
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
// newParallelReader returns parallelReader.
|
2019-01-17 07:58:18 -05:00
|
|
|
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
r2b := make([]int, len(readers))
|
|
|
|
for i := range r2b {
|
|
|
|
r2b[i] = i
|
|
|
|
}
|
2018-08-06 18:14:08 -04:00
|
|
|
return ¶llelReader{
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
readers: readers,
|
|
|
|
orgReaders: readers,
|
|
|
|
dataBlocks: e.dataBlocks,
|
|
|
|
offset: (offset / e.blockSize) * e.ShardSize(),
|
|
|
|
shardSize: e.ShardSize(),
|
|
|
|
shardFileSize: e.ShardFileSize(totalLength),
|
|
|
|
buf: make([][]byte, len(readers)),
|
|
|
|
readerToBuf: r2b,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// preferReaders can mark readers as preferred.
|
|
|
|
// These will be chosen before others.
|
|
|
|
func (p *parallelReader) preferReaders(prefer []bool) {
|
|
|
|
if len(prefer) != len(p.orgReaders) {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Copy so we don't change our input.
|
|
|
|
tmp := make([]io.ReaderAt, len(p.orgReaders))
|
|
|
|
copy(tmp, p.orgReaders)
|
|
|
|
p.readers = tmp
|
|
|
|
// next is the next non-preferred index.
|
|
|
|
next := 0
|
|
|
|
for i, ok := range prefer {
|
|
|
|
if !ok || p.readers[i] == nil {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if i == next {
|
|
|
|
next++
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// Move reader with index i to index next.
|
|
|
|
// Do this by swapping next and i
|
|
|
|
p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
|
|
|
|
p.readerToBuf[next] = i
|
|
|
|
p.readerToBuf[i] = next
|
|
|
|
next++
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
// Returns if buf can be erasure decoded.
|
|
|
|
func (p *parallelReader) canDecode(buf [][]byte) bool {
|
|
|
|
bufCount := 0
|
|
|
|
for _, b := range buf {
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
if len(b) > 0 {
|
2018-08-06 18:14:08 -04:00
|
|
|
bufCount++
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
|
|
|
}
|
2018-08-06 18:14:08 -04:00
|
|
|
return bufCount >= p.dataBlocks
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
|
|
|
|
2019-01-17 07:58:18 -05:00
|
|
|
// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
|
|
|
|
newBuf := dst
|
|
|
|
if len(dst) != len(p.readers) {
|
|
|
|
newBuf = make([][]byte, len(p.readers))
|
|
|
|
} else {
|
|
|
|
for i := range newBuf {
|
|
|
|
newBuf[i] = newBuf[i][:0]
|
|
|
|
}
|
|
|
|
}
|
2019-01-18 10:48:25 -05:00
|
|
|
var newBufLK sync.RWMutex
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
if p.offset+p.shardSize > p.shardFileSize {
|
|
|
|
p.shardSize = p.shardFileSize - p.offset
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
if p.shardSize == 0 {
|
|
|
|
return newBuf, nil
|
|
|
|
}
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2019-01-18 10:48:25 -05:00
|
|
|
readTriggerCh := make(chan bool, len(p.readers))
|
2021-06-29 11:47:15 -04:00
|
|
|
defer close(readTriggerCh) // close the channel upon return
|
|
|
|
|
2019-01-18 10:48:25 -05:00
|
|
|
for i := 0; i < p.dataBlocks; i++ {
|
|
|
|
// Setup read triggers for p.dataBlocks number of reads so that it reads in parallel.
|
|
|
|
readTriggerCh <- true
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
|
|
|
|
2023-05-25 12:39:06 -04:00
|
|
|
disksNotFound := int32(0)
|
2021-01-27 13:21:14 -05:00
|
|
|
bitrotHeal := int32(0) // Atomic bool flag.
|
|
|
|
missingPartsHeal := int32(0) // Atomic bool flag.
|
2019-01-18 10:48:25 -05:00
|
|
|
readerIndex := 0
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
// if readTrigger is true, it implies next disk.ReadAt() should be tried
|
|
|
|
// if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need
|
|
|
|
// to try reading the next disk.
|
|
|
|
for readTrigger := range readTriggerCh {
|
|
|
|
newBufLK.RLock()
|
|
|
|
canDecode := p.canDecode(newBuf)
|
|
|
|
newBufLK.RUnlock()
|
|
|
|
if canDecode {
|
|
|
|
break
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2019-01-18 10:48:25 -05:00
|
|
|
if readerIndex == len(p.readers) {
|
2018-08-06 18:14:08 -04:00
|
|
|
break
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
2019-01-18 10:48:25 -05:00
|
|
|
if !readTrigger {
|
2018-08-06 18:14:08 -04:00
|
|
|
continue
|
2016-06-24 21:00:34 -04:00
|
|
|
}
|
2019-01-18 10:48:25 -05:00
|
|
|
wg.Add(1)
|
|
|
|
go func(i int) {
|
|
|
|
defer wg.Done()
|
2020-06-26 02:20:12 -04:00
|
|
|
rr := p.readers[i]
|
|
|
|
if rr == nil {
|
|
|
|
// Since reader is nil, trigger another read.
|
2019-01-18 10:48:25 -05:00
|
|
|
readTriggerCh <- true
|
|
|
|
return
|
2018-03-04 17:16:45 -05:00
|
|
|
}
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
bufIdx := p.readerToBuf[i]
|
|
|
|
if p.buf[bufIdx] == nil {
|
2019-01-18 10:48:25 -05:00
|
|
|
// Reading first time on this disk, hence the buffer needs to be allocated.
|
2024-01-18 02:03:17 -05:00
|
|
|
// Subsequent reads will reuse this buffer.
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
p.buf[bufIdx] = make([]byte, p.shardSize)
|
2019-01-18 10:48:25 -05:00
|
|
|
}
|
|
|
|
// For the last shard, the shardsize might be less than previous shard sizes.
|
|
|
|
// Hence the following statement ensures that the buffer size is reset to the right size.
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
|
2021-03-15 23:03:13 -04:00
|
|
|
n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
|
2019-01-18 10:48:25 -05:00
|
|
|
if err != nil {
|
2023-05-25 12:39:06 -04:00
|
|
|
switch {
|
|
|
|
case errors.Is(err, errFileNotFound):
|
2021-01-27 13:21:14 -05:00
|
|
|
atomic.StoreInt32(&missingPartsHeal, 1)
|
2023-05-25 12:39:06 -04:00
|
|
|
case errors.Is(err, errFileCorrupt):
|
2021-01-27 13:21:14 -05:00
|
|
|
atomic.StoreInt32(&bitrotHeal, 1)
|
2023-05-25 12:39:06 -04:00
|
|
|
case errors.Is(err, errDiskNotFound):
|
|
|
|
atomic.AddInt32(&disksNotFound, 1)
|
2020-04-01 15:14:00 -04:00
|
|
|
}
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
|
|
|
|
// This will be communicated upstream.
|
|
|
|
p.orgReaders[bufIdx] = nil
|
2019-01-18 10:48:25 -05:00
|
|
|
p.readers[i] = nil
|
2021-01-02 13:35:57 -05:00
|
|
|
|
2019-01-18 10:48:25 -05:00
|
|
|
// Since ReadAt returned error, trigger another read.
|
|
|
|
readTriggerCh <- true
|
|
|
|
return
|
|
|
|
}
|
|
|
|
newBufLK.Lock()
|
2021-03-15 23:03:13 -04:00
|
|
|
newBuf[bufIdx] = p.buf[bufIdx][:n]
|
2019-01-18 10:48:25 -05:00
|
|
|
newBufLK.Unlock()
|
|
|
|
// Since ReadAt returned success, there is no need to trigger another read.
|
|
|
|
readTriggerCh <- false
|
|
|
|
}(readerIndex)
|
|
|
|
readerIndex++
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
if p.canDecode(newBuf) {
|
|
|
|
p.offset += p.shardSize
|
2023-05-25 12:39:06 -04:00
|
|
|
if missingPartsHeal == 1 {
|
2021-01-27 13:21:14 -05:00
|
|
|
return newBuf, errFileNotFound
|
2023-05-25 12:39:06 -04:00
|
|
|
} else if bitrotHeal == 1 {
|
2021-01-27 13:21:14 -05:00
|
|
|
return newBuf, errFileCorrupt
|
2020-04-01 15:14:00 -04:00
|
|
|
}
|
2019-01-18 10:48:25 -05:00
|
|
|
return newBuf, nil
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2021-05-14 19:50:47 -04:00
|
|
|
// If we cannot decode, just return read quorum error.
|
2023-05-25 12:39:06 -04:00
|
|
|
return nil, fmt.Errorf("%w (offline-disks=%d/%d)", errErasureReadQuorum, disksNotFound, len(p.readers))
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2018-08-24 02:35:37 -04:00
|
|
|
// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
|
2021-01-27 13:21:14 -05:00
|
|
|
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
|
2018-08-06 18:14:08 -04:00
|
|
|
if offset < 0 || length < 0 {
|
|
|
|
logger.LogIf(ctx, errInvalidArgument)
|
2021-01-27 13:21:14 -05:00
|
|
|
return -1, errInvalidArgument
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
|
|
|
if offset+length > totalLength {
|
|
|
|
logger.LogIf(ctx, errInvalidArgument)
|
2021-01-27 13:21:14 -05:00
|
|
|
return -1, errInvalidArgument
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2021-01-27 13:21:14 -05:00
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
if length == 0 {
|
2021-01-27 13:21:14 -05:00
|
|
|
return 0, nil
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2018-03-04 17:16:45 -05:00
|
|
|
|
2019-01-17 07:58:18 -05:00
|
|
|
reader := newParallelReader(readers, e, offset, totalLength)
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
if len(prefer) == len(readers) {
|
|
|
|
reader.preferReaders(prefer)
|
|
|
|
}
|
2018-08-06 18:14:08 -04:00
|
|
|
|
2018-08-24 02:35:37 -04:00
|
|
|
startBlock := offset / e.blockSize
|
|
|
|
endBlock := (offset + length) / e.blockSize
|
2018-08-06 18:14:08 -04:00
|
|
|
|
|
|
|
var bytesWritten int64
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
var bufs [][]byte
|
2018-08-06 18:14:08 -04:00
|
|
|
for block := startBlock; block <= endBlock; block++ {
|
|
|
|
var blockOffset, blockLength int64
|
|
|
|
switch {
|
|
|
|
case startBlock == endBlock:
|
2018-08-24 02:35:37 -04:00
|
|
|
blockOffset = offset % e.blockSize
|
2018-08-06 18:14:08 -04:00
|
|
|
blockLength = length
|
|
|
|
case block == startBlock:
|
2018-08-24 02:35:37 -04:00
|
|
|
blockOffset = offset % e.blockSize
|
|
|
|
blockLength = e.blockSize - blockOffset
|
2018-08-06 18:14:08 -04:00
|
|
|
case block == endBlock:
|
|
|
|
blockOffset = 0
|
2018-08-24 02:35:37 -04:00
|
|
|
blockLength = (offset + length) % e.blockSize
|
2018-08-06 18:14:08 -04:00
|
|
|
default:
|
|
|
|
blockOffset = 0
|
2018-08-24 02:35:37 -04:00
|
|
|
blockLength = e.blockSize
|
2016-06-24 21:00:34 -04:00
|
|
|
}
|
2018-08-06 18:14:08 -04:00
|
|
|
if blockLength == 0 {
|
|
|
|
break
|
|
|
|
}
|
2021-01-27 13:21:14 -05:00
|
|
|
|
Prefer local disks when fetching data blocks (#9563)
If the requested server is part of the set this will always read
from the local disk, even if the disk contains a parity shard.
In default setup there is a 50% chance that at least
one shard that otherwise would have been fetched remotely
will be read locally instead.
It basically trades RPC call overhead for reed-solomon.
On distributed localhost this seems to be fairly break-even,
with a very small gain in throughput and latency.
However on networked servers this should be a bigger
1MB objects, before:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 76257:
* Avg: 25ms 50%: 24ms 90%: 32ms 99%: 42ms Fastest: 7ms Slowest: 67ms
* First Byte: Average: 23ms, Median: 22ms, Best: 5ms, Worst: 65ms
Throughput:
* Average: 1213.68 MiB/s, 1272.63 obj/s (59.948s, starting 14:45:44 CEST)
```
After:
```
Operation: GET. Concurrency: 32. Hosts: 4.
Requests considered: 78845:
* Avg: 24ms 50%: 24ms 90%: 31ms 99%: 39ms Fastest: 8ms Slowest: 62ms
* First Byte: Average: 22ms, Median: 21ms, Best: 6ms, Worst: 57ms
Throughput:
* Average: 1255.11 MiB/s, 1316.08 obj/s (59.938s, starting 14:43:58 CEST)
```
Bonus fix: Only ask for heal once on an object.
2020-05-26 19:47:23 -04:00
|
|
|
var err error
|
|
|
|
bufs, err = reader.Read(bufs)
|
2021-01-27 13:21:14 -05:00
|
|
|
if len(bufs) > 0 {
|
|
|
|
// Set only if there are be enough data for reconstruction.
|
|
|
|
// and only for expected errors, also set once.
|
|
|
|
if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
|
|
|
|
if derr == nil {
|
|
|
|
derr = err
|
|
|
|
}
|
2020-04-01 15:14:00 -04:00
|
|
|
}
|
2021-01-27 13:21:14 -05:00
|
|
|
} else if err != nil {
|
|
|
|
// For all errors that cannot be reconstructed fail the read operation.
|
|
|
|
return -1, err
|
2017-08-14 21:08:42 -04:00
|
|
|
}
|
2020-11-12 15:12:09 -05:00
|
|
|
|
2018-08-24 02:35:37 -04:00
|
|
|
if err = e.DecodeDataBlocks(bufs); err != nil {
|
2018-08-06 18:14:08 -04:00
|
|
|
logger.LogIf(ctx, err)
|
2021-01-27 13:21:14 -05:00
|
|
|
return -1, err
|
2016-06-27 16:24:55 -04:00
|
|
|
}
|
2020-11-12 15:12:09 -05:00
|
|
|
|
2018-08-24 02:35:37 -04:00
|
|
|
n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
|
2018-08-06 18:14:08 -04:00
|
|
|
if err != nil {
|
2021-01-27 13:21:14 -05:00
|
|
|
return -1, err
|
2018-08-06 18:14:08 -04:00
|
|
|
}
|
2021-01-27 13:21:14 -05:00
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
bytesWritten += n
|
|
|
|
}
|
2021-01-27 13:21:14 -05:00
|
|
|
|
2018-08-06 18:14:08 -04:00
|
|
|
if bytesWritten != length {
|
|
|
|
logger.LogIf(ctx, errLessData)
|
2021-01-27 13:21:14 -05:00
|
|
|
return bytesWritten, errLessData
|
2016-06-27 16:24:55 -04:00
|
|
|
}
|
2020-04-01 15:14:00 -04:00
|
|
|
|
2021-01-27 13:21:14 -05:00
|
|
|
return bytesWritten, derr
|
2016-06-27 16:24:55 -04:00
|
|
|
}
|
2021-12-03 12:26:30 -05:00
|
|
|
|
|
|
|
// Heal reads from readers, reconstruct shards and writes the data to the writers.
|
2023-08-03 05:18:18 -04:00
|
|
|
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64, prefer []bool) (derr error) {
|
2021-12-03 12:26:30 -05:00
|
|
|
if len(writers) != e.parityBlocks+e.dataBlocks {
|
|
|
|
return errInvalidArgument
|
|
|
|
}
|
|
|
|
|
|
|
|
reader := newParallelReader(readers, e, 0, totalLength)
|
2023-08-03 05:18:18 -04:00
|
|
|
if len(readers) == len(prefer) {
|
|
|
|
reader.preferReaders(prefer)
|
|
|
|
}
|
2021-12-03 12:26:30 -05:00
|
|
|
|
|
|
|
startBlock := int64(0)
|
|
|
|
endBlock := totalLength / e.blockSize
|
|
|
|
if totalLength%e.blockSize != 0 {
|
|
|
|
endBlock++
|
|
|
|
}
|
|
|
|
|
|
|
|
var bufs [][]byte
|
|
|
|
for block := startBlock; block < endBlock; block++ {
|
|
|
|
var err error
|
|
|
|
bufs, err = reader.Read(bufs)
|
|
|
|
if len(bufs) > 0 {
|
|
|
|
if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
|
|
|
|
if derr == nil {
|
|
|
|
derr = err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
|
2023-06-24 23:29:13 -04:00
|
|
|
logger.LogOnceIf(ctx, err, "erasure-heal-decode")
|
2021-12-03 12:26:30 -05:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
w := parallelWriter{
|
|
|
|
writers: writers,
|
|
|
|
writeQuorum: 1,
|
|
|
|
errs: make([]error, len(writers)),
|
|
|
|
}
|
|
|
|
|
2022-01-12 21:49:01 -05:00
|
|
|
if err = w.Write(ctx, bufs); err != nil {
|
2023-06-24 23:29:13 -04:00
|
|
|
logger.LogOnceIf(ctx, err, "erasure-heal-write")
|
2021-12-03 12:26:30 -05:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return derr
|
|
|
|
}
|