2021-04-18 12:41:13 -07:00
|
|
|
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
|
|
//
|
|
|
|
// This file is part of MinIO Object Storage stack
|
|
|
|
//
|
|
|
|
// This program is free software: you can redistribute it and/or modify
|
|
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
|
|
// (at your option) any later version.
|
|
|
|
//
|
|
|
|
// This program is distributed in the hope that it will be useful
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
// GNU Affero General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2016-08-18 00:06:33 +05:30
|
|
|
|
2016-08-18 16:23:42 -07:00
|
|
|
package cmd
|
2016-08-18 00:06:33 +05:30
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
2018-04-05 15:04:40 -07:00
|
|
|
"context"
|
2016-08-18 00:06:33 +05:30
|
|
|
"crypto/rand"
|
2017-08-14 18:08:42 -07:00
|
|
|
"io"
|
2019-01-17 04:58:18 -08:00
|
|
|
"os"
|
2016-08-18 00:06:33 +05:30
|
|
|
"testing"
|
|
|
|
)
|
|
|
|
|
2018-08-23 23:35:37 -07:00
|
|
|
var erasureHealTests = []struct {
|
2017-09-20 22:20:27 +05:30
|
|
|
dataBlocks, disks int
|
|
|
|
|
|
|
|
// number of offline disks is also number of staleDisks for
|
|
|
|
// erasure reconstruction in this test
|
|
|
|
offDisks int
|
|
|
|
|
|
|
|
// bad disks are online disks which return errors
|
|
|
|
badDisks, badStaleDisks int
|
|
|
|
|
|
|
|
blocksize, size int64
|
|
|
|
algorithm BitrotAlgorithm
|
|
|
|
shouldFail bool
|
2017-08-14 18:08:42 -07:00
|
|
|
}{
|
[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
2021-03-06 14:09:34 -08:00
|
|
|
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 0
|
|
|
|
{dataBlocks: 3, disks: 6, offDisks: 2, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 1
|
|
|
|
{dataBlocks: 4, disks: 8, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 2
|
|
|
|
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 3
|
|
|
|
{dataBlocks: 6, disks: 12, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: SHA256, shouldFail: false}, // 4
|
|
|
|
{dataBlocks: 7, disks: 14, offDisks: 4, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 5
|
|
|
|
{dataBlocks: 8, disks: 16, offDisks: 6, badDisks: 1, badStaleDisks: 1, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 6
|
2017-09-20 22:20:27 +05:30
|
|
|
{dataBlocks: 7, disks: 14, offDisks: 2, badDisks: 3, badStaleDisks: 0, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 7
|
|
|
|
{dataBlocks: 6, disks: 12, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(oneMiByte - 1), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 8
|
|
|
|
{dataBlocks: 5, disks: 10, offDisks: 3, badDisks: 0, badStaleDisks: 3, blocksize: int64(oneMiByte / 2), size: oneMiByte, algorithm: SHA256, shouldFail: true}, // 9
|
[feat]: change erasure coding default block size from 10MiB to 1MiB (#11721)
major performance improvements in range GETs to avoid large
read amplification when ranges are tiny and random
```
-------------------
Operation: GET
Operations: 142014 -> 339421
Duration: 4m50s -> 4m56s
* Average: +139.41% (+1177.3 MiB/s) throughput, +139.11% (+658.4) obj/s
* Fastest: +125.24% (+1207.4 MiB/s) throughput, +132.32% (+612.9) obj/s
* 50% Median: +139.06% (+1175.7 MiB/s) throughput, +133.46% (+660.9) obj/s
* Slowest: +203.40% (+1267.9 MiB/s) throughput, +198.59% (+753.5) obj/s
```
TTFB from 10MiB BlockSize
```
* First Access TTFB: Avg: 81ms, Median: 61ms, Best: 20ms, Worst: 2.056s
```
TTFB from 1MiB BlockSize
```
* First Access TTFB: Avg: 22ms, Median: 21ms, Best: 8ms, Worst: 91ms
```
Full object reads however do see a slight change which won't be
noticeable in real world, so not doing any comparisons
TTFB still had improvements with full object reads with 1MiB
```
* First Access TTFB: Avg: 68ms, Median: 35ms, Best: 11ms, Worst: 1.16s
```
v/s
TTFB with 10MiB
```
* First Access TTFB: Avg: 388ms, Median: 98ms, Best: 20ms, Worst: 4.156s
```
This change should affect all new uploads, previous uploads should
continue to work with business as usual. But dramatic improvements can
be seen with these changes.
2021-03-06 14:09:34 -08:00
|
|
|
{dataBlocks: 4, disks: 8, offDisks: 1, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 10
|
|
|
|
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 1, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 11
|
|
|
|
{dataBlocks: 6, disks: 12, offDisks: 8, badDisks: 3, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 12
|
|
|
|
{dataBlocks: 7, disks: 14, offDisks: 3, badDisks: 4, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 13
|
|
|
|
{dataBlocks: 7, disks: 14, offDisks: 6, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 14
|
|
|
|
{dataBlocks: 8, disks: 16, offDisks: 4, badDisks: 5, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: true}, // 15
|
|
|
|
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 16
|
|
|
|
{dataBlocks: 12, disks: 16, offDisks: 2, badDisks: 1, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: DefaultBitrotAlgorithm, shouldFail: false}, // 17
|
|
|
|
{dataBlocks: 6, disks: 8, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte, algorithm: BLAKE2b512, shouldFail: false}, // 18
|
|
|
|
{dataBlocks: 2, disks: 4, offDisks: 1, badDisks: 0, badStaleDisks: 0, blocksize: int64(blockSizeV2), size: oneMiByte * 64, algorithm: SHA256, shouldFail: false}, // 19
|
2017-08-14 18:08:42 -07:00
|
|
|
}
|
2016-08-18 00:06:33 +05:30
|
|
|
|
2018-08-23 23:35:37 -07:00
|
|
|
func TestErasureHeal(t *testing.T) {
|
|
|
|
for i, test := range erasureHealTests {
|
2017-09-20 22:20:27 +05:30
|
|
|
if test.offDisks < test.badStaleDisks {
|
|
|
|
// test case sanity check
|
|
|
|
t.Fatalf("Test %d: Bad test case - number of stale disks cannot be less than number of badstale disks", i)
|
|
|
|
}
|
|
|
|
|
|
|
|
// create some test data
|
2022-07-26 03:37:26 +08:00
|
|
|
setup, err := newErasureTestSetup(t, test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
|
2016-08-18 00:06:33 +05:30
|
|
|
if err != nil {
|
2020-06-12 20:04:01 -07:00
|
|
|
t.Fatalf("Test %d: failed to setup Erasure environment: %v", i, err)
|
2016-08-18 00:06:33 +05:30
|
|
|
}
|
2018-08-06 15:14:08 -07:00
|
|
|
disks := setup.disks
|
2018-08-23 23:35:37 -07:00
|
|
|
erasure, err := NewErasure(context.Background(), test.dataBlocks, test.disks-test.dataBlocks, test.blocksize)
|
2017-08-14 18:08:42 -07:00
|
|
|
if err != nil {
|
|
|
|
t.Fatalf("Test %d: failed to create ErasureStorage: %v", i, err)
|
2016-08-18 00:06:33 +05:30
|
|
|
}
|
2017-08-14 18:08:42 -07:00
|
|
|
data := make([]byte, test.size)
|
|
|
|
if _, err = io.ReadFull(rand.Reader, data); err != nil {
|
|
|
|
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
|
2016-08-18 00:06:33 +05:30
|
|
|
}
|
2017-08-14 18:08:42 -07:00
|
|
|
buffer := make([]byte, test.blocksize, 2*test.blocksize)
|
2019-01-17 04:58:18 -08:00
|
|
|
writers := make([]io.Writer, len(disks))
|
2018-08-06 15:14:08 -07:00
|
|
|
for i, disk := range disks {
|
2021-05-17 17:32:28 +02:00
|
|
|
writers[i] = newBitrotWriter(disk, "testbucket", "testobject", erasure.ShardFileSize(test.size), test.algorithm, erasure.ShardSize())
|
2018-08-06 15:14:08 -07:00
|
|
|
}
|
2018-08-23 23:35:37 -07:00
|
|
|
_, err = erasure.Encode(context.Background(), bytes.NewReader(data), writers, buffer, erasure.dataBlocks+1)
|
2019-01-17 04:58:18 -08:00
|
|
|
closeBitrotWriters(writers)
|
2016-08-18 00:06:33 +05:30
|
|
|
if err != nil {
|
2017-08-14 18:08:42 -07:00
|
|
|
t.Fatalf("Test %d: failed to create random test data: %v", i, err)
|
2016-08-18 00:06:33 +05:30
|
|
|
}
|
|
|
|
|
2019-01-17 04:58:18 -08:00
|
|
|
readers := make([]io.ReaderAt, len(disks))
|
2018-08-06 15:14:08 -07:00
|
|
|
for i, disk := range disks {
|
2019-01-17 04:58:18 -08:00
|
|
|
shardFilesize := erasure.ShardFileSize(test.size)
|
2021-01-07 19:27:31 -08:00
|
|
|
readers[i] = newBitrotReader(disk, nil, "testbucket", "testobject", shardFilesize, test.algorithm, bitrotWriterSum(writers[i]), erasure.ShardSize())
|
2018-08-06 15:14:08 -07:00
|
|
|
}
|
|
|
|
|
2017-09-20 22:20:27 +05:30
|
|
|
// setup stale disks for the test case
|
2018-08-06 15:14:08 -07:00
|
|
|
staleDisks := make([]StorageAPI, len(disks))
|
|
|
|
copy(staleDisks, disks)
|
|
|
|
for j := 0; j < len(staleDisks); j++ {
|
2017-09-20 22:20:27 +05:30
|
|
|
if j < test.offDisks {
|
2018-08-06 15:14:08 -07:00
|
|
|
readers[j] = nil
|
2017-09-20 22:20:27 +05:30
|
|
|
} else {
|
|
|
|
staleDisks[j] = nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for j := 0; j < test.badDisks; j++ {
|
2019-01-17 04:58:18 -08:00
|
|
|
switch r := readers[test.offDisks+j].(type) {
|
|
|
|
case *streamingBitrotReader:
|
|
|
|
r.disk = badDisk{nil}
|
|
|
|
case *wholeBitrotReader:
|
|
|
|
r.disk = badDisk{nil}
|
|
|
|
}
|
2017-09-20 22:20:27 +05:30
|
|
|
}
|
|
|
|
for j := 0; j < test.badStaleDisks; j++ {
|
|
|
|
staleDisks[j] = badDisk{nil}
|
|
|
|
}
|
|
|
|
|
2019-01-17 04:58:18 -08:00
|
|
|
staleWriters := make([]io.Writer, len(staleDisks))
|
2018-08-06 15:14:08 -07:00
|
|
|
for i, disk := range staleDisks {
|
|
|
|
if disk == nil {
|
|
|
|
continue
|
|
|
|
}
|
2019-01-17 04:58:18 -08:00
|
|
|
os.Remove(pathJoin(disk.String(), "testbucket", "testobject"))
|
2021-05-17 17:32:28 +02:00
|
|
|
staleWriters[i] = newBitrotWriter(disk, "testbucket", "testobject", erasure.ShardFileSize(test.size), test.algorithm, erasure.ShardSize())
|
2018-08-06 15:14:08 -07:00
|
|
|
}
|
|
|
|
|
2019-01-17 04:58:18 -08:00
|
|
|
// test case setup is complete - now call Heal()
|
2021-12-04 01:26:30 +08:00
|
|
|
err = erasure.Heal(context.Background(), staleWriters, readers, test.size)
|
2019-01-17 04:58:18 -08:00
|
|
|
closeBitrotReaders(readers)
|
|
|
|
closeBitrotWriters(staleWriters)
|
2017-08-14 18:08:42 -07:00
|
|
|
if err != nil && !test.shouldFail {
|
|
|
|
t.Errorf("Test %d: should pass but it failed with: %v", i, err)
|
|
|
|
}
|
|
|
|
if err == nil && test.shouldFail {
|
|
|
|
t.Errorf("Test %d: should fail but it passed", i)
|
|
|
|
}
|
|
|
|
if err == nil {
|
2017-09-20 22:20:27 +05:30
|
|
|
// Verify that checksums of staleDisks
|
|
|
|
// match expected values
|
2018-08-06 15:14:08 -07:00
|
|
|
for i := range staleWriters {
|
|
|
|
if staleWriters[i] == nil {
|
2017-09-20 22:20:27 +05:30
|
|
|
continue
|
2017-08-14 18:08:42 -07:00
|
|
|
}
|
2019-01-17 04:58:18 -08:00
|
|
|
if !bytes.Equal(bitrotWriterSum(staleWriters[i]), bitrotWriterSum(writers[i])) {
|
2017-08-14 18:08:42 -07:00
|
|
|
t.Errorf("Test %d: heal returned different bitrot checksums", i)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-08-18 00:06:33 +05:30
|
|
|
}
|
|
|
|
}
|