Automatically set goroutines based on shardSize (#5346)

Update reedsolomon library to enable feature to automatically
set number of go-routines based on the input shard size,
since shard size is sort of a constant in Minio for
objects > 10MiB (default blocksize)

klauspost reported around 15-20% improvement in performance
numbers on older systems such as AVX and SSE3

```
name                  old speed      new speed      delta
Encode10x2x10000-8    5.45GB/s ± 1%  6.22GB/s ± 1%  +14.20%    (p=0.000 n=9+9)
Encode100x20x10000-8  1.44GB/s ± 1%  1.64GB/s ± 1%  +13.77%  (p=0.000 n=10+10)
Encode17x3x1M-8       10.0GB/s ± 5%  12.0GB/s ± 1%  +19.88%  (p=0.000 n=10+10)
Encode10x4x16M-8      7.81GB/s ± 5%  8.56GB/s ± 5%   +9.58%   (p=0.000 n=10+9)
Encode5x2x1M-8        15.3GB/s ± 2%  19.6GB/s ± 2%  +28.57%   (p=0.000 n=9+10)
Encode10x2x1M-8       12.2GB/s ± 5%  15.0GB/s ± 5%  +22.45%  (p=0.000 n=10+10)
Encode10x4x1M-8       7.84GB/s ± 1%  9.03GB/s ± 1%  +15.19%    (p=0.000 n=9+9)
Encode50x20x1M-8      1.73GB/s ± 4%  2.09GB/s ± 4%  +20.59%   (p=0.000 n=10+9)
Encode17x3x16M-8      10.6GB/s ± 1%  11.7GB/s ± 4%  +10.12%   (p=0.000 n=8+10)
```
This commit is contained in:
Harshavardhana
2018-01-03 13:47:22 -08:00
committed by kannappanr
parent b1fb550d5c
commit c0721164be
13 changed files with 151 additions and 47 deletions

View File

@@ -19,8 +19,35 @@ TEXT ·galMulSSSE3Xor(SB), 7, $0
MOVQ out+72(FP), DX // DX: &out
PSHUFB X5, X8 // X8: lomask (unpacked)
SHRQ $4, R9 // len(in) / 16
MOVQ SI, AX
MOVQ DX, BX
ANDQ $15, AX
ANDQ $15, BX
CMPQ R9, $0
JEQ done_xor
ORQ AX, BX
CMPQ BX, $0
JNZ loopback_xor
loopback_xor_aligned:
MOVOA (SI), X0 // in[x]
MOVOA (DX), X4 // out[x]
MOVOA X0, X1 // in[x]
MOVOA X6, X2 // low copy
MOVOA X7, X3 // high copy
PSRLQ $4, X1 // X1: high input
PAND X8, X0 // X0: low input
PAND X8, X1 // X0: high input
PSHUFB X0, X2 // X2: mul low part
PSHUFB X1, X3 // X3: mul high part
PXOR X2, X3 // X3: Result
PXOR X4, X3 // X3: Result xor existing out
MOVOA X3, (DX) // Store
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9
JNZ loopback_xor_aligned
JMP done_xor
loopback_xor:
MOVOU (SI), X0 // in[x]
@@ -57,15 +84,40 @@ TEXT ·galMulSSSE3(SB), 7, $0
MOVQ in_len+56(FP), R9 // R9: len(in)
MOVQ out+72(FP), DX // DX: &out
PSHUFB X5, X8 // X8: lomask (unpacked)
MOVQ SI, AX
MOVQ DX, BX
SHRQ $4, R9 // len(in) / 16
ANDQ $15, AX
ANDQ $15, BX
CMPQ R9, $0
JEQ done
ORQ AX, BX
CMPQ BX, $0
JNZ loopback
loopback_aligned:
MOVOA (SI), X0 // in[x]
MOVOA X0, X1 // in[x]
MOVOA X6, X2 // low copy
MOVOA X7, X3 // high copy
PSRLQ $4, X1 // X1: high input
PAND X8, X0 // X0: low input
PAND X8, X1 // X0: high input
PSHUFB X0, X2 // X2: mul low part
PSHUFB X1, X3 // X3: mul high part
PXOR X2, X3 // X3: Result
MOVOA X3, (DX) // Store
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9
JNZ loopback_aligned
JMP done
loopback:
MOVOU (SI), X0 // in[x]
MOVOU X0, X1 // in[x]
MOVOU X6, X2 // low copy
MOVOU X7, X3 // high copy
MOVOA X6, X2 // low copy
MOVOA X7, X3 // high copy
PSRLQ $4, X1 // X1: high input
PAND X8, X0 // X0: low input
PAND X8, X1 // X0: high input