Updated version of klauspost/reedsolomon with NEON support for ARM (#4865)

This commit is contained in:
Frank Wessels 2017-08-30 09:49:00 -07:00 committed by Dee Koder
parent 6dca044ea8
commit 93f126364e
10 changed files with 1143 additions and 15 deletions

View File

@ -8,7 +8,7 @@
Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
@ -19,11 +19,17 @@ Godoc: https://godoc.org/github.com/klauspost/reedsolomon
# Installation
To get the package use the standard:
```bash
go get github.com/klauspost/reedsolomon
go get -u github.com/klauspost/reedsolomon
```
# Changes
## August 26, 2017
* The [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface now contains an `Update` function contributed by [chenzhongtao](https://github.com/chenzhongtao).
* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, which gives a huge performance boost on this platform.
## July 20, 2017
`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
@ -234,6 +240,16 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
```
# Performance on ARM64 NEON
By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
| Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
|------|--------|--------|--------------:|----------------:|-----------:|
| 5 | 2 | 40% | 189 | 1304 | 588% |
| 10 | 2 | 20% | 188 | 1738 | 925% |
| 10 | 4 | 40% | 96 | 839 | 877% |
# asm2plan9s
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.

File diff suppressed because one or more lines are too long

View File

@ -17,7 +17,10 @@ func galMulAVX2Xor(low, high, in, out []byte)
//go:noescape
func galMulAVX2(low, high, in, out []byte)
// This is what the assembler rountes does in blocks of 16 bytes:
// sSE2XorSlice is implemented in assembly. It XORs in into out
// (out[i] ^= in[i]) for the complete 16-byte blocks of in only; sliceXor
// handles any remaining bytes.
//
//go:noescape
func sSE2XorSlice(in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSSE3(low, high, in, out []byte) {
for n, input := range in {
@ -71,3 +74,18 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
}
}
}
// sliceXor adds in to out in the Galois field, i.e. XORs every byte of in
// into the byte of out at the same index. When sse2 is set the complete
// 16-byte blocks are handled by the assembly routine and only the trailing
// bytes are processed here; otherwise the whole slice is processed in Go.
func sliceXor(in, out []byte, sse2 bool) {
	tail := 0
	if sse2 {
		sSE2XorSlice(in, out)
		// The assembly covered everything up to the last multiple of 16.
		tail = len(in) &^ 15
	}
	for i := tail; i < len(in); i++ {
		out[i] ^= in[i]
	}
}

View File

@ -162,3 +162,25 @@ done_avx2:
BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
RET
// func sSE2XorSlice(in, out []byte)
//
// XORs in into out one 16-byte block at a time (out[i] ^= in[i]). Only the
// len(in)/16 complete blocks are processed; trailing bytes are left for the
// caller. len(out) is never read, so out must be at least as long as the
// processed part of in.
TEXT ·sSE2XorSlice(SB), 7, $0
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), R9 // R9: len(in)
MOVQ out+24(FP), DX // DX: &out
SHRQ $4, R9 // len(in) / 16
// No complete 16-byte block: nothing to do.
CMPQ R9, $0
JEQ done_xor_sse2
loopback_xor_sse2:
MOVOU (SI), X0 // in[x]
MOVOU (DX), X1 // out[x]
PXOR X0, X1
MOVOU X1, (DX)
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
// R9 counts the blocks still to process.
SUBQ $1, R9
JNZ loopback_xor_sse2
done_xor_sse2:
RET

View File

@ -0,0 +1,48 @@
//+build !noasm
//+build !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
package reedsolomon
// galMulNEON is implemented in assembly. It writes the Galois-field product
// of c and each input byte to out, but only for the complete 32-byte blocks
// of in; the caller handles any remaining bytes.
//
//go:noescape
func galMulNEON(c uint64, in, out []byte)
// galMulXorNEON is implemented in assembly. It XORs the Galois-field product
// of c and each input byte into out, but only for the complete 32-byte blocks
// of in; the caller handles any remaining bytes.
//
//go:noescape
func galMulXorNEON(c uint64, in, out []byte)
// galMulSlice stores c*in[i] (Galois-field multiplication) in out[i] for
// every byte of in. The NEON routine processes the complete 32-byte blocks;
// the remaining bytes are looked up in mulTable. The ssse3 and avx2 flags are
// not used on arm64.
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulNEON(uint64(c), in, out)

	// Everything before the last multiple of 32 was handled in assembly.
	tail := len(in) &^ 31
	if tail == len(in) {
		return
	}
	mt := mulTable[c]
	for i := tail; i < len(in); i++ {
		out[i] = mt[in[i]]
	}
}
// galMulSliceXor XORs c*in[i] (Galois-field multiplication) into out[i] for
// every byte of in. The NEON routine processes the complete 32-byte blocks;
// the remaining bytes are looked up in mulTable. The ssse3 and avx2 flags are
// not used on arm64.
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulXorNEON(uint64(c), in, out)

	// Everything before the last multiple of 32 was handled in assembly.
	tail := len(in) &^ 31
	if tail == len(in) {
		return
	}
	mt := mulTable[c]
	for i := tail; i < len(in); i++ {
		out[i] ^= mt[in[i]]
	}
}
// sliceXor adds in to out in the Galois field, i.e. XORs every byte of in
// into the byte of out at the same index. This build has no assembly version
// of the operation, so the sse2 flag is ignored.
func sliceXor(in, out []byte, sse2 bool) {
	for i := 0; i < len(in); i++ {
		out[i] ^= in[i]
	}
}

141
vendor/github.com/klauspost/reedsolomon/galois_arm64.s generated vendored Normal file
View File

@ -0,0 +1,141 @@
//+build !noasm !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
// the opcodes of their Plan9 equivalents
// polynomial multiplication
// Multiplies the bytes of v26 and v27 (low halves with pmull, high halves
// with pmull2) by the bytes of v28, leaving the widened 16-bit products in
// v0, v6, v12 and v18. The routines below load the 32 input bytes into
// v26/v27 and duplicate the coefficient c into every byte of v28 before
// invoking this macro.
#define POLYNOMIAL_MULTIPLICATION \
WORD $0x0e3ce340 \ // pmull v0.8h,v26.8b,v28.8b
WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
WORD $0x0e3ce36c \ // pmull v12.8h,v27.8b,v28.8b
WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
// first reduction
// Narrows the upper byte of every 16-bit product (shrn #8) into v2, v8, v14
// and v20, polynomial-multiplies those bytes by v30, and XORs the result back
// into the products in v0, v6, v12 and v18. The routines below load v30 from
// the first 16 bytes of ·constants (0x1d in every byte).
#define FIRST_REDUCTION \
WORD $0x0f088402 \ // shrn v2.8b, v0.8h, #8
WORD $0x0f0884c8 \ // shrn v8.8b, v6.8h, #8
WORD $0x0f08858e \ // shrn v14.8b, v12.8h, #8
WORD $0x0f088654 \ // shrn v20.8b, v18.8h, #8
WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
WORD $0x6e201c60 \ // eor v0.16b,v3.16b,v0.16b
WORD $0x6e261d26 \ // eor v6.16b,v9.16b,v6.16b
WORD $0x6e2c1dec \ // eor v12.16b,v15.16b,v12.16b
WORD $0x6e321eb2 // eor v18.16b,v21.16b,v18.16b
// second reduction
// Repeats the step on the partially reduced products: their upper bytes are
// narrowed into v4, v10, v16 and v22, XORed with the upper bytes kept from
// the first reduction (v2, v8, v14, v20), polynomial-multiplied by v30 and
// XORed into the products. The four results are left in v0, v1, v2 and v3.
#define SECOND_REDUCTION \
WORD $0x0f088404 \ // shrn v4.8b, v0.8h, #8
WORD $0x0f0884ca \ // shrn v10.8b, v6.8h, #8
WORD $0x0f088590 \ // shrn v16.8b, v12.8h, #8
WORD $0x0f088656 \ // shrn v22.8b, v18.8h, #8
WORD $0x6e241c44 \ // eor v4.16b,v2.16b,v4.16b
WORD $0x6e2a1d0a \ // eor v10.16b,v8.16b,v10.16b
WORD $0x6e301dd0 \ // eor v16.16b,v14.16b,v16.16b
WORD $0x6e361e96 \ // eor v22.16b,v20.16b,v22.16b
WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
WORD $0x6e201ca0 \ // eor v0.16b,v5.16b,v0.16b
WORD $0x6e261d61 \ // eor v1.16b,v11.16b,v6.16b
WORD $0x6e2c1e22 \ // eor v2.16b,v17.16b,v12.16b
WORD $0x6e321ee3 // eor v3.16b,v23.16b,v18.16b
// func galMulNEON(c uint64, in, out []byte)
//
// For every complete 32-byte block of in, multiplies each byte by c using the
// macros above and stores the 32 result bytes at the same offset in out.
// Inputs shorter than 32 bytes are not touched at all, and bytes after the
// last complete block are left for the caller. len(out) is never read.
TEXT ·galMulNEON(SB), 7, $0
MOVD c+0(FP), R0
MOVD in_base+8(FP), R1
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5
// Return immediately when fewer than 32 bytes were supplied.
SUBS $32, R2
BMI complete
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
// Broadcast the coefficient c into every byte of v28.
WORD $0x4e010c1c // dup v28.16b, w0
loop:
// Main loop
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// v31 holds the indices 0x00, 0x02, ... 0x1e, so each tbl gathers the
// even-indexed bytes of a register pair into one 16-byte register.
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Store result
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
// Continue while another complete 32-byte block remains.
SUBS $32, R2
BPL loop
complete:
RET
// func galMulXorNEON(c uint64, in, out []byte)
//
// For every complete 32-byte block of in, multiplies each byte by c using the
// macros above and XORs the 32 result bytes into the bytes already present at
// the same offset in out. Inputs shorter than 32 bytes are not touched at
// all, and bytes after the last complete block are left for the caller.
// len(out) is never read.
TEXT ·galMulXorNEON(SB), 7, $0
MOVD c+0(FP), R0
MOVD in_base+8(FP), R1
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5
// Return immediately when fewer than 32 bytes were supplied.
SUBS $32, R2
BMI completeXor
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
// Broadcast the coefficient c into every byte of v28.
WORD $0x4e010c1c // dup v28.16b, w0
loopXor:
// Main loop
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
// Fetch the current output block; x5 is only advanced by the store below.
WORD $0x4c40a8b8 // ld1 {v24.4s-v25.4s}, [x5]
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// v31 holds the indices 0x00, 0x02, ... 0x1e, so each tbl gathers the
// even-indexed bytes of a register pair into one 16-byte register.
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Xor result and store
WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
// Continue while another complete 32-byte block remains.
SUBS $32, R2
BPL loopXor
completeXor:
RET
// Constants table
// 32 bytes, loaded by the routines above into v30 (bytes 0-15) and v31
// (bytes 16-31).
// generating polynomial is 29 (= 0x1d)
DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
// constant for TBL instruction
// byte indices 0x00, 0x02, ... 0x1e: every even-indexed byte of a 32-byte
// register pair
DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
GLOBL ·constants(SB), 8, $32

View File

@ -1,4 +1,5 @@
//+build !amd64 noasm appengine
//+build !arm64 noasm appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
@ -17,3 +18,10 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
out[n] ^= mt[input]
}
}
// sliceXor adds in to out in the Galois field: each byte of in is XORed into
// the byte of out at the same index. This is the pure Go fallback, so the
// sse2 flag is ignored.
func sliceXor(in, out []byte, sse2 bool) {
	for i := range in {
		out[i] ^= in[i]
	}
}

View File

@ -12,7 +12,7 @@ type Option func(*options)
// options holds the settings that Option functions modify.
type options struct {
// maxGoroutines and minSplitSize control when work is split across
// goroutines and the minimum number of bytes each goroutine is given.
maxGoroutines int
minSplitSize int
// CPU feature switches handed to the Galois-field slice routines.
useAVX2, useSSSE3 bool
useAVX2, useSSSE3, useSSE2 bool
// NOTE(review): presumably set by WithPAR1Matrix — confirm.
usePAR1Matrix bool
}
@ -28,6 +28,7 @@ func init() {
// Detect CPU capabilities.
defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
defaultOptions.useAVX2 = cpuid.CPU.AVX2()
defaultOptions.useSSE2 = cpuid.CPU.SSE2()
}
// WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding.
@ -67,6 +68,12 @@ func withAVX2(enabled bool) Option {
}
}
// withSSE2 returns an Option that overrides whether the SSE2 code path is
// used, replacing the value detected from the CPU at start-up.
func withSSE2(enabled bool) Option {
	apply := func(opts *options) {
		opts.useSSE2 = enabled
	}
	return apply
}
// WithPAR1Matrix causes the encoder to build the matrix how PARv1
// does. Note that the method they use is buggy, and may lead to cases
// where recovery is impossible, even if there are enough parity

View File

@ -64,6 +64,14 @@ type Encoder interface {
// calling the Verify function is likely to fail.
ReconstructData(shards [][]byte) error
// Update is used to change a few data shards and update their parity.
// Input 'newDatashards' contains the changed data shards; entries for data
// shards that did not change are nil.
// Input 'shards' contains the old data shards (a data shard that has not
// changed can be nil) and the old parity shards.
// The new parity shards are written to shards[DataShards:].
// Update is very useful if DataShards is much larger than ParityShards and
// only a few data shards have changed. It is faster than Encode and does not
// need to read all data shards.
Update(shards [][]byte, newDatashards [][]byte) error
// Split a data slice into the number of shards given to the encoder,
// and create empty parity shards.
//
@ -221,7 +229,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
}
// ErrTooFewShards is returned if too few shards were given to
// Encode/Verify/Reconstruct. It will also be returned from Reconstruct
// Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct
// if there were too few shards to reconstruct the missing data.
var ErrTooFewShards = errors.New("too few shards given")
@ -249,6 +257,101 @@ func (r reedSolomon) Encode(shards [][]byte) error {
return nil
}
// ErrInvalidInput is returned if an input parameter of Update is invalid.
var ErrInvalidInput = errors.New("invalid input")
// Update recomputes the parity shards after some of the data shards have
// changed, without needing the data shards that stayed the same.
//
// shards must hold r.Shards entries: the old data shards followed by the old
// parity shards. An old data shard may be nil only when it is not being
// replaced; every parity shard must be present. newDatashards must hold
// r.DataShards entries, with nil marking a data shard that did not change.
//
// On success the parity shards in shards[r.DataShards:] are updated in place.
// As a side effect every old data shard that was replaced is overwritten with
// old XOR new, so it must not be reused as data afterwards.
//
// ErrTooFewShards is returned when either slice has the wrong number of
// entries. ErrInvalidInput is returned when a changed shard has no old
// counterpart, when a parity shard is missing, or when a new shard's length
// differs from the old shard it replaces. Errors reported by checkShards are
// returned unchanged.
func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
	if len(shards) != r.Shards {
		return ErrTooFewShards
	}
	if len(newDatashards) != r.DataShards {
		return ErrTooFewShards
	}

	if err := checkShards(shards, true); err != nil {
		return err
	}
	if err := checkShards(newDatashards, true); err != nil {
		return err
	}

	for i, newShard := range newDatashards {
		if newShard == nil {
			continue
		}
		if shards[i] == nil {
			return ErrInvalidInput
		}
		// The new shard is XORed byte for byte into the old one. A longer
		// new shard would run past the end of the old shard, and a shorter
		// one would leave the parity inconsistent, so reject any mismatch.
		if len(newShard) != len(shards[i]) {
			return ErrInvalidInput
		}
	}
	for _, p := range shards[r.DataShards:] {
		if p == nil {
			return ErrInvalidInput
		}
	}

	byteCount := shardSize(shards)

	// Get the slice of output buffers.
	output := shards[r.DataShards:]

	// Do the coding.
	r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, byteCount)
	return nil
}
// updateParityShards folds the changed data shards into the parity outputs.
// For every data shard c with a non-nil entry in newinputs, the old shard is
// first overwritten with old XOR new, and that difference, multiplied by
// matrixRows[row][c], is XORed into outputs[row] for each of the outputCount
// rows. When more than one goroutine is allowed and byteCount exceeds the
// minimum split size, the work is delegated to updateParityShardsP.
func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
	parallel := r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize
	if parallel {
		r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
		return
	}

	for col := 0; col < r.DataShards; col++ {
		if newinputs[col] == nil {
			// This data shard did not change.
			continue
		}
		// After the XOR the old shard holds the difference between the old
		// and the new data.
		delta := oldinputs[col]
		sliceXor(newinputs[col], delta, r.o.useSSE2)
		for row := 0; row < outputCount; row++ {
			galMulSliceXor(matrixRows[row][col], delta, outputs[row], r.o.useSSSE3, r.o.useAVX2)
		}
	}
}
// updateParityShardsP is the concurrent form of updateParityShards. The byte
// range [0, byteCount) is cut into consecutive chunks of
// byteCount/maxGoroutines bytes (but at least minSplitSize), and one
// goroutine per chunk applies the update to its own sub-range of every shard.
// The call returns once all goroutines have finished.
func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
	chunk := byteCount / r.o.maxGoroutines
	if chunk < r.o.minSplitSize {
		chunk = r.o.minSplitSize
	}

	var wg sync.WaitGroup
	for begin := 0; begin < byteCount; {
		// The final chunk is shortened so it ends exactly at byteCount.
		end := begin + chunk
		if end > byteCount {
			end = byteCount
		}

		wg.Add(1)
		go func(lo, hi int) {
			defer wg.Done()
			for col := 0; col < r.DataShards; col++ {
				if newinputs[col] == nil {
					// This data shard did not change.
					continue
				}
				// After the XOR the old shard holds the difference between
				// the old and the new data for this byte range.
				delta := oldinputs[col][lo:hi]
				sliceXor(newinputs[col][lo:hi], delta, r.o.useSSE2)
				for row := 0; row < outputCount; row++ {
					galMulSliceXor(matrixRows[row][col], delta, outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
				}
			}
		}(begin, end)

		begin = end
	}
	wg.Wait()
}
// Verify returns true if the parity shards contain the right data.
// The data is the same format as Encode. No data is modified.
func (r reedSolomon) Verify(shards [][]byte) (bool, error) {

6
vendor/vendor.json vendored
View File

@ -243,10 +243,10 @@
"revisionTime": "2016-10-16T15:41:25Z"
},
{
"checksumSHA1": "gYAsuckCW3o4veePKZzEHvCcJro=",
"checksumSHA1": "R9saYJznxosfknAq2aPnVKxqI3w=",
"path": "github.com/klauspost/reedsolomon",
"revision": "48a4fd05f1730dd3ef9c3f9e943f6091d063f2c4",
"revisionTime": "2017-07-22T14:16:58Z"
"revision": "87ba8262ab3d167ae4d38e22796312cd2a9d0b19",
"revisionTime": "2017-08-26T09:54:10Z"
},
{
"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",