Update reedsolomon/highwayhash to start using ppc64le support (#7003)

Thanks to @fwessels for the upstream work on reedsolomon and
highwayhash which has resulted in 10x performance improvement
on ppc64 architecture.
This commit is contained in:
Harshavardhana
2018-12-20 09:47:05 -08:00
committed by Nitish Tiwari
parent bc67410548
commit def04f01cf
17 changed files with 460 additions and 14 deletions

View File

@@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon
# Changes
## December 18, 2018
Assembly code for ppc64le has been contributed, this boosts performance by about 10x on this platform.
## November 18, 2017
Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.
@@ -259,6 +263,18 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
| 10 | 2 | 20% | 188 | 1738 | 925% |
| 10 | 4 | 40% | 96 | 839 | 877% |
# Performance on ppc64le
The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture as can been seen below:
```
benchmark old MB/s new MB/s speedup
BenchmarkGalois128K-160 948.87 8878.85 9.36x
BenchmarkGalois1M-160 968.85 9041.92 9.33x
BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x
BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x
```
# asm2plan9s
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
@@ -266,8 +282,10 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
# Links
* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
* [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.

View File

@@ -1,5 +1,6 @@
//+build !noasm
//+build !appengine
//+build !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.

View File

@@ -1,4 +1,4 @@
//+build !noasm !appengine
//+build !noasm !appengine !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.

View File

@@ -1,5 +1,6 @@
//+build !noasm
//+build !appengine
//+build !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.

View File

@@ -1,4 +1,4 @@
//+build !noasm !appengine
//+build !noasm !appengine !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.

View File

@@ -1,5 +1,6 @@
//+build !amd64 noasm appengine
//+build !arm64 noasm appengine
//+build !amd64 noasm appengine gccgo
//+build !arm64 noasm appengine gccgo
//+build !ppc64le noasm appengine gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.

View File

@@ -0,0 +1,67 @@
//+build !noasm
//+build !appengine
//+build !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2018, Minio, Inc.
package reedsolomon
//go:noescape
func galMulPpc(low, high, in, out []byte)
//go:noescape
func galMulPpcXor(low, high, in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulPpc(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] = low[l] ^ high[h]
}
}
func galMulPpcXor(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] ^= low[l] ^ high[h]
}
}
*/
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
done := (len(in) >> 4) << 4
if done > 0 {
galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
}
remain := len(in) - done
if remain > 0 {
mt := mulTable[c]
for i := done; i < len(in); i++ {
out[i] = mt[in[i]]
}
}
}
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
done := (len(in) >> 4) << 4
if done > 0 {
galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
}
remain := len(in) - done
if remain > 0 {
mt := mulTable[c]
for i := done; i < len(in); i++ {
out[i] ^= mt[in[i]]
}
}
}
// slice galois add
func sliceXor(in, out []byte, sse2 bool) {
for n, input := range in {
out[n] ^= input
}
}

View File

@@ -0,0 +1,126 @@
//+build !noasm !appengine !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2018, Minio, Inc.
#include "textflag.h"
#define LOW R3
#define HIGH R4
#define IN R5
#define LEN R6
#define OUT R7
#define CONSTANTS R8
#define OFFSET R9
#define OFFSET1 R10
#define OFFSET2 R11
#define X6 VS34
#define X6_ V2
#define X7 VS35
#define X7_ V3
#define MSG VS36
#define MSG_ V4
#define MSG_HI VS37
#define MSG_HI_ V5
#define RESULT VS38
#define RESULT_ V6
#define ROTATE VS39
#define ROTATE_ V7
#define MASK VS40
#define MASK_ V8
#define FLIP VS41
#define FLIP_ V9
// func galMulPpc(low, high, in, out []byte)
TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96
MOVD low+0(FP), LOW
MOVD high+24(FP), HIGH
MOVD in+48(FP), IN
MOVD in_len+56(FP), LEN
MOVD out+72(FP), OUT
MOVD $16, OFFSET1
MOVD $32, OFFSET2
MOVD $·constants(SB), CONSTANTS
LXVD2X (CONSTANTS)(R0), ROTATE
LXVD2X (CONSTANTS)(OFFSET1), MASK
LXVD2X (CONSTANTS)(OFFSET2), FLIP
LXVD2X (LOW)(R0), X6
LXVD2X (HIGH)(R0), X7
VPERM X6_, V31, FLIP_, X6_
VPERM X7_, V31, FLIP_, X7_
MOVD $0, OFFSET
loop:
LXVD2X (IN)(OFFSET), MSG
VSRB MSG_, ROTATE_, MSG_HI_
VAND MSG_, MASK_, MSG_
VPERM X6_, V31, MSG_, MSG_
VPERM X7_, V31, MSG_HI_, MSG_HI_
VXOR MSG_, MSG_HI_, MSG_
STXVD2X MSG, (OUT)(OFFSET)
ADD $16, OFFSET, OFFSET
CMP LEN, OFFSET
BGT loop
RET
// func galMulPpcXorlow, high, in, out []byte)
TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96
MOVD low+0(FP), LOW
MOVD high+24(FP), HIGH
MOVD in+48(FP), IN
MOVD in_len+56(FP), LEN
MOVD out+72(FP), OUT
MOVD $16, OFFSET1
MOVD $32, OFFSET2
MOVD $·constants(SB), CONSTANTS
LXVD2X (CONSTANTS)(R0), ROTATE
LXVD2X (CONSTANTS)(OFFSET1), MASK
LXVD2X (CONSTANTS)(OFFSET2), FLIP
LXVD2X (LOW)(R0), X6
LXVD2X (HIGH)(R0), X7
VPERM X6_, V31, FLIP_, X6_
VPERM X7_, V31, FLIP_, X7_
MOVD $0, OFFSET
loopXor:
LXVD2X (IN)(OFFSET), MSG
LXVD2X (OUT)(OFFSET), RESULT
VSRB MSG_, ROTATE_, MSG_HI_
VAND MSG_, MASK_, MSG_
VPERM X6_, V31, MSG_, MSG_
VPERM X7_, V31, MSG_HI_, MSG_HI_
VXOR MSG_, MSG_HI_, MSG_
VXOR MSG_, RESULT_, RESULT_
STXVD2X RESULT, (OUT)(OFFSET)
ADD $16, OFFSET, OFFSET
CMP LEN, OFFSET
BGT loopXor
RET
DATA ·constants+0x0(SB)/8, $0x0404040404040404
DATA ·constants+0x8(SB)/8, $0x0404040404040404
DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA ·constants+0x20(SB)/8, $0x0706050403020100
DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL ·constants(SB), 8, $48

View File

@@ -471,12 +471,12 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
wg.Add(1)
go func(start, stop int) {
for c := 0; c < r.DataShards; c++ {
in := inputs[c]
in := inputs[c][start:stop]
for iRow := 0; iRow < outputCount; iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
} else {
galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
}
}
}