Automatically set goroutines based on shardSize (#5346)

Update the reedsolomon library to enable automatically
setting the number of goroutines based on the input shard size,
since the shard size is effectively constant in Minio for
objects > 10MiB (the default block size).

klauspost reported a performance improvement of around 15-20%
on older CPUs, such as those with AVX or SSE3 support.

```
name                  old speed      new speed      delta
Encode10x2x10000-8    5.45GB/s ± 1%  6.22GB/s ± 1%  +14.20%    (p=0.000 n=9+9)
Encode100x20x10000-8  1.44GB/s ± 1%  1.64GB/s ± 1%  +13.77%  (p=0.000 n=10+10)
Encode17x3x1M-8       10.0GB/s ± 5%  12.0GB/s ± 1%  +19.88%  (p=0.000 n=10+10)
Encode10x4x16M-8      7.81GB/s ± 5%  8.56GB/s ± 5%   +9.58%   (p=0.000 n=10+9)
Encode5x2x1M-8        15.3GB/s ± 2%  19.6GB/s ± 2%  +28.57%   (p=0.000 n=9+10)
Encode10x2x1M-8       12.2GB/s ± 5%  15.0GB/s ± 5%  +22.45%  (p=0.000 n=10+10)
Encode10x4x1M-8       7.84GB/s ± 1%  9.03GB/s ± 1%  +15.19%    (p=0.000 n=9+9)
Encode50x20x1M-8      1.73GB/s ± 4%  2.09GB/s ± 4%  +20.59%   (p=0.000 n=10+9)
Encode17x3x16M-8      10.6GB/s ± 1%  11.7GB/s ± 4%  +10.12%   (p=0.000 n=8+10)
```
This commit is contained in:
Harshavardhana
2018-01-03 13:47:22 -08:00
committed by kannappanr
parent b1fb550d5c
commit c0721164be
13 changed files with 151 additions and 47 deletions

View File

@@ -15,7 +15,10 @@ import (
"bytes"
"errors"
"io"
"runtime"
"sync"
"github.com/klauspost/cpuid"
)
// Encoder is an interface to encode Reed-Solomon parity sets for your data.
@@ -239,6 +242,33 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
if err != nil {
return nil, err
}
if r.o.shardSize > 0 {
cacheSize := cpuid.CPU.Cache.L2
if cacheSize <= 0 {
// Set to 128K if undetectable.
cacheSize = 128 << 10
}
p := runtime.NumCPU()
// 1 input + parity must fit in cache, and we add one more to be safer.
shards := 1 + parityShards
g := (r.o.shardSize * shards) / (cacheSize - (cacheSize >> 4))
if cpuid.CPU.ThreadsPerCore > 1 {
// If multiple threads per core, make sure they don't contend for cache.
g *= cpuid.CPU.ThreadsPerCore
}
g *= 2
if g < p {
g = p
}
// Have g be multiple of p
g += p - 1
g -= g % p
r.o.maxGoroutines = g
}
// Inverted matrices are cached in a tree keyed by the indices
// of the invalid rows of the data to reconstruct.
@@ -431,6 +461,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 16
do = (do + 15) & (^15)
start := 0
for start < byteCount {
if start+do > byteCount {
@@ -490,6 +522,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 16
do = (do + 15) & (^15)
start := 0
for start < byteCount {
if start+do > byteCount {