Updated version of klauspost/reedsolomon with NEON support for ARM (#4865)

This commit is contained in:
Frank Wessels 2017-08-30 09:49:00 -07:00 committed by Dee Koder
parent 6dca044ea8
commit 93f126364e
10 changed files with 1143 additions and 15 deletions

View File

@ -8,7 +8,7 @@
Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go. Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations. This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/). For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
@ -19,11 +19,17 @@ Godoc: https://godoc.org/github.com/klauspost/reedsolomon
# Installation # Installation
To get the package use the standard: To get the package use the standard:
```bash ```bash
go get github.com/klauspost/reedsolomon go get -u github.com/klauspost/reedsolomon
``` ```
# Changes # Changes
## August 26, 2017
* The [`Encoder()`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) now contains an `Update` function contributed by [chenzhongtao](https://github.com/chenzhongtao).
* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, which gives a huge performance boost on this platform.
## July 20, 2017 ## July 20, 2017
`ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added: `ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
@ -186,7 +192,7 @@ There is no buffering or timeouts/retry specified. If you want to add that, you
For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples). For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
#Advanced Options # Advanced Options
You can modify internal options which affects how jobs are split between and processed by goroutines. You can modify internal options which affects how jobs are split between and processed by goroutines.
@ -234,6 +240,16 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
``` ```
# Performance on ARM64 NEON
By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
| Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
|------|--------|--------|--------------:|----------------:|-----------:|
| 5 | 2 | 40% | 189 | 1304 | 588% |
| 10 | 2 | 20% | 188 | 1738 | 925% |
| 10 | 4 | 40% | 96 | 839 | 877% |
# asm2plan9s # asm2plan9s
[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents. [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.

File diff suppressed because one or more lines are too long

View File

@ -17,7 +17,10 @@ func galMulAVX2Xor(low, high, in, out []byte)
//go:noescape //go:noescape
func galMulAVX2(low, high, in, out []byte) func galMulAVX2(low, high, in, out []byte)
// This is what the assembler rountes does in blocks of 16 bytes: //go:noescape
func sSE2XorSlice(in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/* /*
func galMulSSSE3(low, high, in, out []byte) { func galMulSSSE3(low, high, in, out []byte) {
for n, input := range in { for n, input := range in {
@ -71,3 +74,18 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
} }
} }
} }
// sliceXor XORs every byte of in into the byte of out at the same index
// (addition in the Galois field). When sse2 is set, the assembly routine
// handles all complete 16-byte blocks and the loop below finishes the tail;
// otherwise the loop covers the whole slice.
func sliceXor(in, out []byte, sse2 bool) {
	tail := 0
	if sse2 {
		sSE2XorSlice(in, out)
		// Index of the first byte past the last complete 16-byte block.
		tail = len(in) &^ 15
	}
	for ; tail < len(in); tail++ {
		out[tail] ^= in[tail]
	}
}

View File

@ -162,3 +162,25 @@ done_avx2:
BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
RET RET
// func sSE2XorSlice(in, out []byte)
//
// XORs in into out, 16 bytes at a time: out[i] ^= in[i] for every complete
// 16-byte block of in. The trailing len(in) % 16 bytes are not touched here;
// the Go caller finishes them. Only len(in) is consulted, so out must be at
// least as long as in.
// Flag value 7 is NOPROF|DUPOK|NOSPLIT (see textflag.h); frame size is 0.
TEXT ·sSE2XorSlice(SB), 7, $0
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), R9 // R9: len(in)
MOVQ out+24(FP), DX // DX: &out
SHRQ $4, R9 // len(in) / 16
CMPQ R9, $0 // no complete 16-byte block: nothing to do
JEQ done_xor_sse2
loopback_xor_sse2:
MOVOU (SI), X0 // in[x]
MOVOU (DX), X1 // out[x]
PXOR X0, X1 // X1 = out[x] ^ in[x]
MOVOU X1, (DX) // store the result back into out
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9 // one block fewer remaining
JNZ loopback_xor_sse2
done_xor_sse2:
RET

View File

@ -0,0 +1,48 @@
//+build !noasm
//+build !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
package reedsolomon
// galMulNEON is implemented in assembly (galois_arm64.s). It works on in
// in whole 32-byte blocks, writing the result for each block to the same
// offsets of out; a trailing partial block (len(in) % 32 bytes) is not
// processed and must be handled by the caller.
//go:noescape
func galMulNEON(c uint64, in, out []byte)
// galMulXorNEON is implemented in assembly (galois_arm64.s). It works on in
// in whole 32-byte blocks and XORs the result for each block into the
// existing contents of out at the same offsets; a trailing partial block
// (len(in) % 32 bytes) is not processed and must be handled by the caller.
//go:noescape
func galMulXorNEON(c uint64, in, out []byte)
// galMulSlice stores c multiplied by every byte of in into out. The NEON
// routine covers all complete 32-byte blocks; the remaining tail bytes are
// looked up in mulTable. The ssse3 and avx2 arguments are not used on this
// platform.
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulNEON(uint64(c), in, out)

	// Index of the first byte past the last complete 32-byte block.
	tail := len(in) &^ 31
	if tail == len(in) {
		return
	}
	table := mulTable[c]
	for i, v := range in[tail:] {
		out[tail+i] = table[v]
	}
}
// galMulSliceXor XORs c multiplied by every byte of in into out. The NEON
// routine covers all complete 32-byte blocks; the remaining tail bytes are
// looked up in mulTable. The ssse3 and avx2 arguments are not used on this
// platform.
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
	galMulXorNEON(uint64(c), in, out)

	// Bytes below this index were already handled by the assembly.
	handled := len(in) &^ 31
	if len(in)-handled <= 0 {
		return
	}
	row := mulTable[c]
	for pos := handled; pos < len(in); pos++ {
		out[pos] = out[pos] ^ row[in[pos]]
	}
}
// sliceXor XORs every byte of in into the byte of out at the same index
// (addition in the Galois field). The sse2 argument is not used here.
func sliceXor(in, out []byte, sse2 bool) {
	for i := 0; i < len(in); i++ {
		out[i] = out[i] ^ in[i]
	}
}

141
vendor/github.com/klauspost/reedsolomon/galois_arm64.s generated vendored Normal file
View File

@ -0,0 +1,141 @@
//+build !noasm !appengine
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2017, Minio, Inc.
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
// the opcodes of their Plan9 equivalents
// polynomial multiplication
//
// Multiplies the 32 input bytes held in v26/v27 by the byte replicated in
// v28, using the polynomial multiply instructions shown in the trailing
// comments (pmull for the low 8 bytes of each register, pmull2 for the high
// 8 bytes). The 16-bit products are left in v0, v6, v12 and v18.
#define POLYNOMIAL_MULTIPLICATION \
WORD $0x0e3ce340 \ // pmull v0.8h,v26.8b,v28.8b
WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
WORD $0x0e3ce36c \ // pmull v12.8h,v27.8b,v28.8b
WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
// first reduction
//
// Takes the upper byte of each 16-bit product in v0, v6, v12, v18 (shrn #8
// into v2, v8, v14, v20), polynomial-multiplies those bytes by v30 (which
// the callers load from the constants table, i.e. the 0x1d bytes) and XORs
// the result back into v0, v6, v12, v18.
#define FIRST_REDUCTION \
WORD $0x0f088402 \ // shrn v2.8b, v0.8h, #8
WORD $0x0f0884c8 \ // shrn v8.8b, v6.8h, #8
WORD $0x0f08858e \ // shrn v14.8b, v12.8h, #8
WORD $0x0f088654 \ // shrn v20.8b, v18.8h, #8
WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
WORD $0x6e201c60 \ // eor v0.16b,v3.16b,v0.16b
WORD $0x6e261d26 \ // eor v6.16b,v9.16b,v6.16b
WORD $0x6e2c1dec \ // eor v12.16b,v15.16b,v12.16b
WORD $0x6e321eb2 // eor v18.16b,v21.16b,v18.16b
// second reduction
//
// Repeats the reduction step on the values left by FIRST_REDUCTION: the
// upper bytes of v0, v6, v12, v18 are extracted (shrn #8 into v4, v10, v16,
// v22), XORed with the upper bytes saved by the first step (v2, v8, v14,
// v20), polynomial-multiplied by v30 and XORed with v0, v6, v12, v18.
// The four results are written to v0, v1, v2 and v3 respectively, which is
// where the callers' tbl instructions expect them.
#define SECOND_REDUCTION \
WORD $0x0f088404 \ // shrn v4.8b, v0.8h, #8
WORD $0x0f0884ca \ // shrn v10.8b, v6.8h, #8
WORD $0x0f088590 \ // shrn v16.8b, v12.8h, #8
WORD $0x0f088656 \ // shrn v22.8b, v18.8h, #8
WORD $0x6e241c44 \ // eor v4.16b,v2.16b,v4.16b
WORD $0x6e2a1d0a \ // eor v10.16b,v8.16b,v10.16b
WORD $0x6e301dd0 \ // eor v16.16b,v14.16b,v16.16b
WORD $0x6e361e96 \ // eor v22.16b,v20.16b,v22.16b
WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
WORD $0x6e201ca0 \ // eor v0.16b,v5.16b,v0.16b
WORD $0x6e261d61 \ // eor v1.16b,v11.16b,v6.16b
WORD $0x6e2c1e22 \ // eor v2.16b,v17.16b,v12.16b
WORD $0x6e321ee3 // eor v3.16b,v23.16b,v18.16b
// func galMulNEON(c uint64, in, out []byte)
//
// Processes in in blocks of 32 bytes: each block is multiplied by c
// (polynomial multiplication followed by two reduction steps with the 0x1d
// constant) and the 32 result bytes are stored to out. If len(in) < 32 the
// routine returns immediately; a trailing partial block is not processed.
// Only len(in) is consulted, so out must be at least as long as in.
TEXT ·galMulNEON(SB), 7, $0
MOVD c+0(FP), R0 // R0: multiplier c
MOVD in_base+8(FP), R1 // R1: &in[0]
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5 // R5: &out[0]
SUBS $32, R2 // R2 = len(in) - 32; negative means no complete block
BMI complete
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
WORD $0x4e010c1c // dup v28.16b, w0
loop:
// Main loop
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// (v31 holds the even indices 0x00..0x1e, so each tbl gathers the low byte
// of every 16-bit lane from a register pair into one 16-byte register)
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Store result
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
SUBS $32, R2 // another complete block remains while the result is >= 0
BPL loop
complete:
RET
// func galMulXorNEON(c uint64, in, out []byte)
//
// Same computation as galMulNEON, except that the 32 result bytes of each
// block are XORed with the bytes already present in out before being stored.
// If len(in) < 32 the routine returns immediately; a trailing partial block
// is not processed. Only len(in) is consulted, so out must be at least as
// long as in.
TEXT ·galMulXorNEON(SB), 7, $0
MOVD c+0(FP), R0 // R0: multiplier c
MOVD in_base+8(FP), R1 // R1: &in[0]
MOVD in_len+16(FP), R2 // length of message
MOVD out_base+32(FP), R5 // R5: &out[0]
SUBS $32, R2 // R2 = len(in) - 32; negative means no complete block
BMI completeXor
// Load constants table pointer
MOVD $·constants(SB), R3
// and load constants into v30 & v31
WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
WORD $0x4e010c1c // dup v28.16b, w0
loopXor:
// Main loop
WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32
// current output block; x5 is not advanced until the store below
WORD $0x4c40a8b8 // ld1 {v24.4s-v25.4s}, [x5]
POLYNOMIAL_MULTIPLICATION
FIRST_REDUCTION
SECOND_REDUCTION
// combine results
// (v31 holds the even indices 0x00..0x1e, so each tbl gathers the low byte
// of every 16-bit lane from a register pair into one 16-byte register)
WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
// Xor result and store
WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32
SUBS $32, R2 // another complete block remains while the result is >= 0
BPL loopXor
completeXor:
RET
// Constants table
// 32 bytes, loaded as a pair into v30 (first 16 bytes) and v31 (last 16
// bytes) by galMulNEON and galMulXorNEON.
// generating polynomial is 29 (= 0x1d)
DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
// constant for TBL instruction
// (byte indices 0x00, 0x02, ..., 0x1e: every even byte of a 32-byte table)
DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
// Flag value 8 is RODATA (see textflag.h).
GLOBL ·constants(SB), 8, $32

View File

@ -1,4 +1,5 @@
//+build !amd64 noasm appengine //+build !amd64 noasm appengine
//+build !arm64 noasm appengine
// Copyright 2015, Klaus Post, see LICENSE for details. // Copyright 2015, Klaus Post, see LICENSE for details.
@ -17,3 +18,10 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
out[n] ^= mt[input] out[n] ^= mt[input]
} }
} }
// sliceXor adds in to out in the Galois field, i.e. XORs each byte of in
// into the byte of out at the same position. The sse2 flag is ignored by
// this pure Go implementation.
func sliceXor(in, out []byte, sse2 bool) {
	for pos := range in {
		out[pos] = in[pos] ^ out[pos]
	}
}

View File

@ -10,10 +10,10 @@ import (
type Option func(*options) type Option func(*options)
type options struct { type options struct {
maxGoroutines int maxGoroutines int
minSplitSize int minSplitSize int
useAVX2, useSSSE3 bool useAVX2, useSSSE3, useSSE2 bool
usePAR1Matrix bool usePAR1Matrix bool
} }
var defaultOptions = options{ var defaultOptions = options{
@ -28,6 +28,7 @@ func init() {
// Detect CPU capabilities. // Detect CPU capabilities.
defaultOptions.useSSSE3 = cpuid.CPU.SSSE3() defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
defaultOptions.useAVX2 = cpuid.CPU.AVX2() defaultOptions.useAVX2 = cpuid.CPU.AVX2()
defaultOptions.useSSE2 = cpuid.CPU.SSE2()
} }
// WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding. // WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding.
@ -67,6 +68,12 @@ func withAVX2(enabled bool) Option {
} }
} }
// withSSE2 returns an Option that sets the useSSE2 field of the options
// to enabled.
func withSSE2(enabled bool) Option {
	setSSE2 := func(opts *options) {
		opts.useSSE2 = enabled
	}
	return setSSE2
}
// WithPAR1Matrix causes the encoder to build the matrix how PARv1 // WithPAR1Matrix causes the encoder to build the matrix how PARv1
// does. Note that the method they use is buggy, and may lead to cases // does. Note that the method they use is buggy, and may lead to cases
// where recovery is impossible, even if there are enough parity // where recovery is impossible, even if there are enough parity

View File

@ -64,6 +64,14 @@ type Encoder interface {
// calling the Verify function is likely to fail. // calling the Verify function is likely to fail.
ReconstructData(shards [][]byte) error ReconstructData(shards [][]byte) error
// Update recomputes the parity shards after a few data shards have changed,
// without re-encoding all of the data.
// Input 'newDatashards' contains the data shards that changed; entries for
// unchanged data shards are nil.
// Input 'shards' contains the old data shards (an old data shard may be nil
// if that shard did not change) followed by the old parity shards, all of
// which must be present.
// The new parity shards are written to shards[DataShards:].
// Update is most useful when DataShards is much larger than ParityShards and
// only a few data shards changed: it is faster than Encode and does not need
// to read all data shards.
Update(shards [][]byte, newDatashards [][]byte) error
// Split a data slice into the number of shards given to the encoder, // Split a data slice into the number of shards given to the encoder,
// and create empty parity shards. // and create empty parity shards.
// //
@ -221,7 +229,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
} }
// ErrTooFewShards is returned if too few shards where given to // ErrTooFewShards is returned if too few shards where given to
// Encode/Verify/Reconstruct. It will also be returned from Reconstruct // Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct
// if there were too few shards to reconstruct the missing data. // if there were too few shards to reconstruct the missing data.
var ErrTooFewShards = errors.New("too few shards given") var ErrTooFewShards = errors.New("too few shards given")
@ -249,6 +257,101 @@ func (r reedSolomon) Encode(shards [][]byte) error {
return nil return nil
} }
// ErrInvalidInput is returned by Update when its input is invalid: a new
// data shard is supplied without the matching old data shard, or a parity
// shard is missing.
var ErrInvalidInput = errors.New("invalid input")
// Update recomputes the parity shards in shards[DataShards:] after some data
// shards have changed.
//
// shards holds the old data shards followed by the old parity shards and
// must have exactly r.Shards entries. newDatashards must have exactly
// r.DataShards entries; a nil entry means "this data shard did not change".
//
// Side effect: for every changed data shard, the old shard in shards is
// overwritten with (old XOR new), which is the delta fed into the parity
// update. It does not end up holding the new data.
//
// Errors:
//   - ErrTooFewShards if either slice has the wrong number of entries.
//   - whatever checkShards reports for either slice.
//   - ErrInvalidInput if a new data shard is given without its old data
//     shard, if a new data shard differs in length from the old shard it
//     replaces, or if any parity shard is nil.
func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
	if len(shards) != r.Shards {
		return ErrTooFewShards
	}

	if len(newDatashards) != r.DataShards {
		return ErrTooFewShards
	}

	err := checkShards(shards, true)
	if err != nil {
		return err
	}

	err = checkShards(newDatashards, true)
	if err != nil {
		return err
	}

	for i := range newDatashards {
		if newDatashards[i] == nil {
			continue
		}
		if shards[i] == nil {
			return ErrInvalidInput
		}
		// The XOR and multiply kernels take their loop count from one slice
		// and index the other without a bounds check in the assembly paths.
		// A length mismatch would therefore write past the old shard, panic
		// when slicing, or silently produce wrong parity.
		if len(newDatashards[i]) != len(shards[i]) {
			return ErrInvalidInput
		}
	}
	for _, p := range shards[r.DataShards:] {
		if p == nil {
			return ErrInvalidInput
		}
	}

	size := shardSize(shards)

	// Get the slice of output buffers.
	output := shards[r.DataShards:]

	// Do the coding.
	r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, size)
	return nil
}
// updateParityShards folds the changes in newinputs into the parity shards
// in outputs. Work is handed to updateParityShardsP when more than one
// goroutine is allowed and byteCount exceeds the minimum split size;
// otherwise it is done on the calling goroutine.
//
// For every non-nil new input, the matching old input is overwritten with
// (old XOR new), and that delta, multiplied by the matrix coefficient for
// the row/column, is XORed into each of the first outputCount outputs.
func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
	parallel := r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize
	if parallel {
		r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
		return
	}

	for col := 0; col < r.DataShards; col++ {
		fresh := newinputs[col]
		if fresh != nil {
			// The old shard is modified in place and becomes the delta.
			delta := oldinputs[col]
			sliceXor(fresh, delta, r.o.useSSE2)
			for row := 0; row < outputCount; row++ {
				galMulSliceXor(matrixRows[row][col], delta, outputs[row], r.o.useSSSE3, r.o.useAVX2)
			}
		}
	}
}
// updateParityShardsP does the same work as updateParityShards, but splits
// the byte range [0, byteCount) into consecutive chunks and processes each
// chunk in its own goroutine, returning once all of them have finished.
//
// The chunk size is byteCount / maxGoroutines, raised to minSplitSize when
// it would be smaller; the last chunk is shortened to end at byteCount.
func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
	chunk := byteCount / r.o.maxGoroutines
	if chunk < r.o.minSplitSize {
		chunk = r.o.minSplitSize
	}

	var pending sync.WaitGroup
	for begin := 0; begin < byteCount; begin += chunk {
		if begin+chunk > byteCount {
			chunk = byteCount - begin
		}
		pending.Add(1)
		go func(lo, hi int) {
			for col := 0; col < r.DataShards; col++ {
				fresh := newinputs[col]
				if fresh != nil {
					// The old shard is modified in place and becomes the delta.
					delta := oldinputs[col]
					sliceXor(fresh[lo:hi], delta[lo:hi], r.o.useSSE2)
					for row := 0; row < outputCount; row++ {
						galMulSliceXor(matrixRows[row][col], delta[lo:hi], outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
					}
				}
			}
			pending.Done()
		}(begin, begin+chunk)
	}
	pending.Wait()
}
// Verify returns true if the parity shards contain the right data. // Verify returns true if the parity shards contain the right data.
// The data is the same format as Encode. No data is modified. // The data is the same format as Encode. No data is modified.
func (r reedSolomon) Verify(shards [][]byte) (bool, error) { func (r reedSolomon) Verify(shards [][]byte) (bool, error) {

6
vendor/vendor.json vendored
View File

@ -243,10 +243,10 @@
"revisionTime": "2016-10-16T15:41:25Z" "revisionTime": "2016-10-16T15:41:25Z"
}, },
{ {
"checksumSHA1": "gYAsuckCW3o4veePKZzEHvCcJro=", "checksumSHA1": "R9saYJznxosfknAq2aPnVKxqI3w=",
"path": "github.com/klauspost/reedsolomon", "path": "github.com/klauspost/reedsolomon",
"revision": "48a4fd05f1730dd3ef9c3f9e943f6091d063f2c4", "revision": "87ba8262ab3d167ae4d38e22796312cd2a9d0b19",
"revisionTime": "2017-07-22T14:16:58Z" "revisionTime": "2017-08-26T09:54:10Z"
}, },
{ {
"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=", "checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",