Update reedsolomon/highwayhash to start using ppc64le support (#7003)

Thanks to @fwessels for the upstream work on reedsolomon and highwayhash which has resulted in 10x performance improvement on ppc64 architecture.
2025-11-26 04:26:12 -05:00 · 2018-12-20 09:47:05 -08:00
parent bc67410548
commit def04f01cf
17 changed files with 460 additions and 14 deletions
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon

 # Changes

+## December 18, 2018
+
+Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
+
 ## November 18, 2017

 Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number of goroutines to use based on your expected shard size and detected CPU.
@@ -259,6 +263,18 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
 | 10   | 2      | 20%    |           188 |            1738 |       925% |
 | 10   | 4      | 40%    |            96 |             839 |       877% |

+# Performance on ppc64le
+
+The performance for ppc64le has been accelerated. This gives roughly a 10x performance improvement on this architecture, as can be seen below:
+
+```
+benchmark                      old MB/s     new MB/s     speedup
+BenchmarkGalois128K-160        948.87       8878.85      9.36x
+BenchmarkGalois1M-160          968.85       9041.92      9.33x
+BenchmarkGaloisXor128K-160     862.02       7905.00      9.17x
+BenchmarkGaloisXor1M-160       784.60       6296.65      8.03x
+```
+
 # asm2plan9s

 [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
@@ -266,8 +282,10 @@ By exploiting NEON instructions the performance for ARM has been accelerated. Be
 # Links
 * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
 * [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
+* [ocaml-reed-solomon-erasure](https://gitlab.com/darrenldl/ocaml-reed-solomon-erasure). Compatible OCaml implementation.
 * [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
 * [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
+* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
 * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
 * [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
 * [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@@ -1,5 +1,6 @@
 //+build !noasm
 //+build !appengine
+//+build !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@@ -1,4 +1,4 @@
-//+build !noasm !appengine
+//+build !noasm !appengine !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@@ -1,5 +1,6 @@
 //+build !noasm
 //+build !appengine
+//+build !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@@ -1,4 +1,4 @@
-//+build !noasm !appengine
+//+build !noasm !appengine !gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@@ -1,5 +1,6 @@
-//+build !amd64 noasm appengine
-//+build !arm64 noasm appengine
+//+build !amd64 noasm appengine gccgo
+//+build !arm64 noasm appengine gccgo
+//+build !ppc64le noasm appengine gccgo

 // Copyright 2015, Klaus Post, see LICENSE for details.

--- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
@@ -0,0 +1,67 @@
+//+build !noasm
+//+build !appengine
+//+build !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+package reedsolomon
+
+//go:noescape
+func galMulPpc(low, high, in, out []byte) // body: TEXT ·galMulPpc in galois_ppc64le.s
+
+//go:noescape
+func galMulPpcXor(low, high, in, out []byte) // body: TEXT ·galMulPpcXor in galois_ppc64le.s
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulPpc(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] = low[l] ^ high[h]
+	}
+}
+func galMulPpcXor(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] ^= low[l] ^ high[h]
+	}
+}
+*/
+
+// galMulSlice fills out[i] from in[i] using the multiplication tables selected by c.
+// ssse3 and avx2 are not used by this ppc64le implementation.
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
+	done := len(in) &^ 15 // length of the leading whole 16-byte blocks
+	if done > 0 {
+		// The assembly never reads len(out); out[:done] makes a short out
+		// panic here instead of being written past its end.
+		galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out[:done])
+	}
+	for i := done; i < len(in); i++ {
+		out[i] = mulTable[c][in[i]] // tail bytes: one table lookup each
+	}
+}
+
+// galMulSliceXor XORs into out[i] the table value selected by c for in[i].
+// ssse3 and avx2 are not used by this ppc64le implementation.
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
+	done := len(in) &^ 15 // length of the leading whole 16-byte blocks
+	if done > 0 {
+		// The assembly never reads len(out); out[:done] makes a short out
+		// panic here instead of being read and written past its end.
+		galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out[:done])
+	}
+	for i := done; i < len(in); i++ {
+		out[i] ^= mulTable[c][in[i]] // tail bytes: one table lookup each
+	}
+}
+
+// sliceXor XORs every byte of in into out at the same index; sse2 is unused.
+func sliceXor(in, out []byte, sse2 bool) {
+	for i := 0; i < len(in); i++ {
+		out[i] ^= in[i]
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.s
@@ -0,0 +1,126 @@
+//+build !noasm !appengine !gccgo
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2018, Minio, Inc.
+
+#include "textflag.h"
+
+#define LOW       R3
+#define HIGH      R4
+#define IN        R5
+#define LEN       R6
+#define OUT       R7
+#define CONSTANTS R8
+#define OFFSET    R9
+#define OFFSET1   R10
+#define OFFSET2   R11
+// Each vector register below has two names: VSnn is used by the LXVD2X/STXVD2X loads and stores, Vn (the `_` suffix) by the V* operations.
+#define X6        VS34
+#define X6_       V2
+#define X7        VS35
+#define X7_       V3
+#define MSG       VS36
+#define MSG_      V4
+#define MSG_HI    VS37
+#define MSG_HI_   V5
+#define RESULT    VS38
+#define RESULT_   V6
+#define ROTATE    VS39
+#define ROTATE_   V7
+#define MASK      VS40
+#define MASK_     V8
+#define FLIP      VS41
+#define FLIP_     V9
+
+
+// func galMulPpc(low, high, in, out []byte)
+TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96
+    MOVD    low+0(FP), LOW              // base pointer of low
+    MOVD    high+24(FP), HIGH           // base pointer of high
+    MOVD    in+48(FP), IN               // base pointer of in
+    MOVD    in_len+56(FP), LEN          // len(in) is the only length read; len(out) is never consulted
+    MOVD    out+72(FP), OUT             // base pointer of out
+
+    MOVD    $16, OFFSET1                // byte offset of the MASK constant
+    MOVD    $32, OFFSET2                // byte offset of the FLIP constant
+
+    MOVD    $·constants(SB), CONSTANTS
+    LXVD2X  (CONSTANTS)(R0), ROTATE     // 4 in every byte: the per-byte shift count
+    LXVD2X  (CONSTANTS)(OFFSET1), MASK  // 0x0f in every byte: the low-nibble mask
+    LXVD2X  (CONSTANTS)(OFFSET2), FLIP  // byte values 0x00..0x0f, used as a VPERM selector
+
+    LXVD2X  (LOW)(R0), X6               // 16 bytes of the low table
+    LXVD2X  (HIGH)(R0), X7              // 16 bytes of the high table
+    VPERM   X6_, V31, FLIP_, X6_        // permute the table through FLIP (presumably to fix LXVD2X byte order; confirm)
+    VPERM   X7_, V31, FLIP_, X7_        // selector values stay below 16, so V31 is never selected
+
+    MOVD    $0, OFFSET
+
+loop:
+    LXVD2X  (IN)(OFFSET), MSG           // load 16 input bytes
+
+    VSRB    MSG_, ROTATE_, MSG_HI_      // h = input >> 4, per byte
+    VAND    MSG_, MASK_, MSG_           // l = input & 0xf, per byte
+    VPERM   X6_, V31, MSG_, MSG_        // low[l], per byte
+    VPERM   X7_, V31, MSG_HI_, MSG_HI_  // high[h], per byte
+
+    VXOR    MSG_, MSG_HI_, MSG_         // low[l] ^ high[h]
+
+    STXVD2X MSG, (OUT)(OFFSET)          // store 16 result bytes
+
+    ADD     $16, OFFSET, OFFSET
+    CMP     LEN, OFFSET                 // length is tested only after a block, so the first block always runs
+    BGT     loop                        // continue while LEN > OFFSET
+    RET
+
+
+// func galMulPpcXor(low, high, in, out []byte)
+TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96
+    MOVD    low+0(FP), LOW              // base pointer of low
+    MOVD    high+24(FP), HIGH           // base pointer of high
+    MOVD    in+48(FP), IN               // base pointer of in
+    MOVD    in_len+56(FP), LEN          // len(in) is the only length read; len(out) is never consulted
+    MOVD    out+72(FP), OUT             // base pointer of out
+
+    MOVD    $16, OFFSET1                // byte offset of the MASK constant
+    MOVD    $32, OFFSET2                // byte offset of the FLIP constant
+
+    MOVD    $·constants(SB), CONSTANTS
+    LXVD2X  (CONSTANTS)(R0), ROTATE     // 4 in every byte: the per-byte shift count
+    LXVD2X  (CONSTANTS)(OFFSET1), MASK  // 0x0f in every byte: the low-nibble mask
+    LXVD2X  (CONSTANTS)(OFFSET2), FLIP  // byte values 0x00..0x0f, used as a VPERM selector
+
+    LXVD2X  (LOW)(R0), X6               // 16 bytes of the low table
+    LXVD2X  (HIGH)(R0), X7              // 16 bytes of the high table
+    VPERM   X6_, V31, FLIP_, X6_        // permute the table through FLIP (presumably to fix LXVD2X byte order; confirm)
+    VPERM   X7_, V31, FLIP_, X7_        // selector values stay below 16, so V31 is never selected
+
+    MOVD    $0, OFFSET
+
+loopXor:
+    LXVD2X  (IN)(OFFSET), MSG           // load 16 input bytes
+    LXVD2X  (OUT)(OFFSET), RESULT       // load the 16 bytes currently in out
+
+    VSRB    MSG_, ROTATE_, MSG_HI_      // h = input >> 4, per byte
+    VAND    MSG_, MASK_, MSG_           // l = input & 0xf, per byte
+    VPERM   X6_, V31, MSG_, MSG_        // low[l], per byte
+    VPERM   X7_, V31, MSG_HI_, MSG_HI_  // high[h], per byte
+
+    VXOR    MSG_, MSG_HI_, MSG_         // low[l] ^ high[h]
+    VXOR    MSG_, RESULT_, RESULT_      // fold into the existing output: out ^= low[l] ^ high[h]
+
+    STXVD2X RESULT, (OUT)(OFFSET)       // store 16 updated bytes
+
+    ADD     $16, OFFSET, OFFSET
+    CMP     LEN, OFFSET                 // length is tested only after a block, so the first block always runs
+    BGT     loopXor                     // continue while LEN > OFFSET
+    RET
+
+DATA ·constants+0x0(SB)/8, $0x0404040404040404   // bytes 0-15, loaded into ROTATE: 4 in every byte
+DATA ·constants+0x8(SB)/8, $0x0404040404040404
+DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f  // bytes 16-31, loaded into MASK: 0x0f in every byte
+DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f
+DATA ·constants+0x20(SB)/8, $0x0706050403020100  // bytes 32-47, loaded into FLIP: byte values 0x00..0x0f
+DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908
+
+GLOBL ·constants(SB), 8, $48 // 48 bytes in total; flag 8 is RODATA in textflag.h
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@@ -471,12 +471,12 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 		wg.Add(1)
 		go func(start, stop int) {
 			for c := 0; c < r.DataShards; c++ {
-				in := inputs[c]
+				in := inputs[c][start:stop]
 				for iRow := 0; iRow < outputCount; iRow++ {
 					if c == 0 {
-						galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
+						galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
 					} else {
-						galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
+						galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
 					}
 				}
 			}