mirror of
https://github.com/minio/minio.git
synced 2025-01-25 21:53:16 -05:00
Vendor the upstream changes with Avx512 (#7225)
Thanks to @fwessels we have Avx512 support with 4x improvement
This commit is contained in:
parent
fef5416b3c
commit
118270d76f
24
vendor/github.com/klauspost/reedsolomon/README.md
generated
vendored
24
vendor/github.com/klauspost/reedsolomon/README.md
generated
vendored
@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon
|
||||
|
||||
# Changes
|
||||
|
||||
## February 8, 2019
|
||||
|
||||
AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2. See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details.
|
||||
|
||||
## December 18, 2018
|
||||
|
||||
Assembly code for ppc64le has been contributed; this boosts performance by about 10x on this platform.
|
||||
@ -253,6 +257,25 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
|
||||
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
|
||||
```
|
||||
|
||||
# Performance on AVX512
|
||||
|
||||
The performance on AVX512 has been accelerated for Intel CPUs. This gives speedups on a per-core basis of up to 4x compared to AVX2, as can be seen in the following table:
|
||||
|
||||
```
|
||||
$ benchcmp avx2.txt avx512.txt
|
||||
benchmark AVX2 MB/s AVX512 MB/s speedup
|
||||
BenchmarkEncode8x8x1M-72 1681.35 4125.64 2.45x
|
||||
BenchmarkEncode8x4x8M-72 1529.36 5507.97 3.60x
|
||||
BenchmarkEncode8x8x8M-72 791.16 2952.29 3.73x
|
||||
BenchmarkEncode8x8x32M-72 573.26 2168.61 3.78x
|
||||
BenchmarkEncode12x4x12M-72 1234.41 4912.37 3.98x
|
||||
BenchmarkEncode16x4x16M-72 1189.59 5138.01 4.32x
|
||||
BenchmarkEncode24x8x24M-72 690.68 2583.70 3.74x
|
||||
BenchmarkEncode24x8x48M-72 674.20 2643.31 3.92x
|
||||
```
|
||||
|
||||
This speedup has been achieved by computing multiple parity blocks in parallel as opposed to one after the other. In doing so it is possible to minimize the memory bandwidth required for loading all data shards. At the same time the calculations are performed in the 512-bit wide ZMM registers and the surplus of ZMM registers (32 in total) is used to keep more data around (most notably the matrix coefficients).
|
||||
|
||||
# Performance on ARM64 NEON
|
||||
|
||||
By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
|
||||
@ -287,7 +310,6 @@ BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x
|
||||
* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
|
||||
* [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation.
|
||||
* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
|
||||
* [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
|
||||
* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
|
||||
|
||||
# License
|
||||
|
184
vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go
generated
vendored
Normal file
184
vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go
generated
vendored
Normal file
@ -0,0 +1,184 @@
|
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
//+build !gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
// Copyright 2019, Minio, Inc.
|
||||
|
||||
package reedsolomon
|
||||
|
||||
// _galMulAVX512Parallel82 is the assembly kernel (galoisAvx512_amd64.s) that
// produces 2 output rows from up to 8 input rows. It works on len(in[0])/64
// blocks of 64 bytes and leaves any shorter tail of the shards untouched.
// matrix carries the lookup tables laid out by setupMatrix82. With addTo false
// the outputs are overwritten; with addTo true the products are XORed into the
// bytes already present in out. out must hold at least 2 slices, since the
// kernel reads both slice headers unconditionally.
//go:noescape
|
||||
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
|
||||
|
||||
// _galMulAVX512Parallel84 is the assembly kernel (galoisAvx512_amd64.s) that
// produces 4 output rows from up to 8 input rows, following the same scheme as
// _galMulAVX512Parallel82: len(in[0])/64 blocks of 64 bytes, lookup tables
// taken from matrix (as laid out by setupMatrix84), and addTo selecting between
// overwriting out and XORing into it. out must hold at least 4 slices.
//go:noescape
|
||||
func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
|
||||
|
||||
// Geometry of the AVX512 kernels and of the coefficient blocks handed to them.
const (
	// dimIn is the number of input rows processed simultaneously.
	dimIn = 8

	// dimOut82 is the number of output rows processed simultaneously by the x2 routine.
	dimOut82 = 2
	// dimOut84 is the number of output rows processed simultaneously by the x4 routine.
	dimOut84 = 4

	// matrixSize82 is the size in bytes of the coefficient block passed into
	// the x2 routine: one 16+16 byte table pair per (input, output) combination.
	matrixSize82 = dimIn * dimOut82 * (16 + 16)
	// matrixSize84 is the size in bytes of the coefficient block passed into
	// the x4 routine: one 16+16 byte table pair per (input, output) combination.
	matrixSize84 = dimIn * dimOut84 * (16 + 16)
)
|
||||
|
||||
// Construct block of matrix coefficients for 2 outputs rows in parallel
|
||||
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
|
||||
offset := 0
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
coeff := matrixRows[iRow][c]
|
||||
copy(matrix[offset*32:], mulTableLow[coeff][:])
|
||||
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
|
||||
} else {
|
||||
// coefficients not used for this input shard (so null out)
|
||||
v := matrix[offset*32 : offset*32+32]
|
||||
for i := range v {
|
||||
v[i] = 0
|
||||
}
|
||||
}
|
||||
offset += dimIn
|
||||
if offset >= dimIn*dimOut82 {
|
||||
offset -= dimIn*dimOut82 - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Construct block of matrix coefficients for 4 outputs rows in parallel
|
||||
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
|
||||
offset := 0
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
coeff := matrixRows[iRow][c]
|
||||
copy(matrix[offset*32:], mulTableLow[coeff][:])
|
||||
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
|
||||
} else {
|
||||
// coefficients not used for this input shard (so null out)
|
||||
v := matrix[offset*32 : offset*32+32]
|
||||
for i := range v {
|
||||
v[i] = 0
|
||||
}
|
||||
}
|
||||
offset += dimIn
|
||||
if offset >= dimIn*dimOut84 {
|
||||
offset -= dimIn*dimOut84 - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Invoke AVX512 routine for 2 output rows in parallel
|
||||
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
|
||||
done := len(in[0])
|
||||
if done == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
inputEnd := inputOffset + dimIn
|
||||
if inputEnd > len(in) {
|
||||
inputEnd = len(in)
|
||||
}
|
||||
outputEnd := outputOffset + dimOut82
|
||||
if outputEnd > len(out) {
|
||||
outputEnd = len(out)
|
||||
}
|
||||
|
||||
matrix82 := [matrixSize82]byte{}
|
||||
setupMatrix82(matrixRows, inputOffset, outputOffset, &matrix82)
|
||||
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||
_galMulAVX512Parallel82(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix82, addTo)
|
||||
|
||||
done = (done >> 6) << 6
|
||||
if len(in[0])-done == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
for i := done; i < len(in[0]); i++ {
|
||||
if c == 0 { // only set value for first input column
|
||||
out[iRow][i] = mt[in[c][i]]
|
||||
} else { // and add for all others
|
||||
out[iRow][i] ^= mt[in[c][i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Invoke AVX512 routine for 4 output rows in parallel
|
||||
func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
|
||||
done := len(in[0])
|
||||
if done == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
inputEnd := inputOffset + dimIn
|
||||
if inputEnd > len(in) {
|
||||
inputEnd = len(in)
|
||||
}
|
||||
outputEnd := outputOffset + dimOut84
|
||||
if outputEnd > len(out) {
|
||||
outputEnd = len(out)
|
||||
}
|
||||
|
||||
matrix84 := [matrixSize84]byte{}
|
||||
setupMatrix84(matrixRows, inputOffset, outputOffset, &matrix84)
|
||||
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||
_galMulAVX512Parallel84(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix84, addTo)
|
||||
|
||||
done = (done >> 6) << 6
|
||||
if len(in[0])-done == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
for i := done; i < len(in[0]); i++ {
|
||||
if c == 0 { // only set value for first input column
|
||||
out[iRow][i] = mt[in[c][i]]
|
||||
} else { // and add for all others
|
||||
out[iRow][i] ^= mt[in[c][i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the same as codeSomeShards, but taking advantage of
|
||||
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
||||
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
outputRow := 0
|
||||
// First process (multiple) batches of 4 output rows in parallel
|
||||
for ; outputRow+dimOut84 <= len(outputs); outputRow += dimOut84 {
|
||||
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||
galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow)
|
||||
}
|
||||
}
|
||||
// Then process a (single) batch of 2 output rows in parallel
|
||||
if outputRow+dimOut82 <= len(outputs) {
|
||||
// fmt.Println(outputRow, len(outputs))
|
||||
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||
galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow)
|
||||
}
|
||||
outputRow += dimOut82
|
||||
}
|
||||
// Lastly, we may have a single output row left (for uneven parity)
|
||||
if outputRow < len(outputs) {
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
if c == 0 {
|
||||
galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
||||
} else {
|
||||
galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
590
vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.s
generated
vendored
Normal file
590
vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.s
generated
vendored
Normal file
@ -0,0 +1,590 @@
|
||||
//+build !noasm !appengine !gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
// Copyright 2019, Minio, Inc.
|
||||
|
||||
//
|
||||
// Process 2 output rows in parallel from a total of 8 input rows
// Works on len(in[0])/64 blocks of 64 bytes; a shorter tail of the shards is not touched here.
// The matrix is read as 8 ZMM registers: ZMM16-ZMM19 hold the tables for out[0] and
// ZMM20-ZMM23 those for out[1]. Each register covers two inputs, 128-bit lanes 0/1 being
// the low/high nibble tables of the even input and lanes 2/3 those of the odd input.
// VSHUFI64x2 with immediate 0x00/0x55/0xaa/0xff replicates lane 0/1/2/3 across a register.
// When addTo is false the outputs start from zero, otherwise from their current contents.
|
||||
//
|
||||
// func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
|
||||
TEXT ·_galMulAVX512Parallel82(SB), 7, $0
|
||||
MOVQ in+0(FP), SI // SI: &in[0] (first slice header)
|
||||
MOVQ 8(SI), R9 // R9: len(in[0])
|
||||
SHRQ $6, R9 // R9: len(in[0]) / 64 (number of 64-byte blocks)
|
||||
TESTQ R9, R9
|
||||
JZ done_avx512_parallel82 // less than one full block: nothing to do
|
||||
|
||||
MOVQ matrix+48(FP), SI // SI: matrix
|
||||
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
|
||||
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
|
||||
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
|
||||
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
|
||||
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
|
||||
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
|
||||
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
|
||||
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
|
||||
|
||||
MOVQ $15, BX // 0x0f: nibble mask
|
||||
MOVQ BX, X5
|
||||
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5 ; ZMM2: 0x0f in every byte
|
||||
|
||||
MOVB addTo+56(FP), AX // low byte of AX: addTo
|
||||
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
|
||||
WORD $0xf749; BYTE $0xe0 // mul r8 ; rax = rax * -1 (rdx is overwritten, it is reloaded below)
|
||||
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax ; low 8 mask bits: all set when addTo, all clear otherwise
|
||||
MOVQ in+0(FP), SI // SI: &in[0]
|
||||
MOVQ in_len+8(FP), AX // number of inputs
|
||||
XORQ R11, R11 // R11: byte offset into the input shards
|
||||
MOVQ out+24(FP), DX // DX: &out[0] (first slice header)
|
||||
MOVQ 24(DX), CX // CX: &out[1][0]
|
||||
MOVQ (DX), DX // DX: &out[0][0]
|
||||
|
||||
loopback_avx512_parallel82:
|
||||
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
|
||||
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
|
||||
|
||||
MOVQ (SI), BX // BX: &in[0][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
|
||||
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
|
||||
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $1
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 24(SI), BX // BX: &in[1][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
|
||||
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
|
||||
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $2
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 48(SI), BX // BX: &in[2][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
|
||||
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
|
||||
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $3
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 72(SI), BX // BX: &in[3][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
|
||||
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
|
||||
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $4
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 96(SI), BX // BX: &in[4][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
|
||||
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
|
||||
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $5
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 120(SI), BX // BX: &in[5][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
|
||||
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
|
||||
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $6
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 144(SI), BX // BX: &in[6][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
|
||||
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
|
||||
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
CMPQ AX, $7
|
||||
JE skip_avx512_parallel82
|
||||
|
||||
MOVQ 168(SI), BX // BX: &in[7][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
|
||||
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
|
||||
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
skip_avx512_parallel82:
|
||||
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
|
||||
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
|
||||
|
||||
ADDQ $64, R11 // input offset += 64 (shared by all input shards)
|
||||
|
||||
ADDQ $64, DX // out+=64
|
||||
ADDQ $64, CX // out2+=64
|
||||
|
||||
SUBQ $1, R9 // one 64-byte block done
|
||||
JNZ loopback_avx512_parallel82
|
||||
|
||||
done_avx512_parallel82:
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
//
|
||||
// Process 4 output rows in parallel from a total of 8 input rows
|
||||
//
|
||||
// func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
|
||||
TEXT ·_galMulAVX512Parallel84(SB), 7, $0
|
||||
MOVQ in+0(FP), SI //
|
||||
MOVQ 8(SI), R9 // R9: len(in)
|
||||
SHRQ $6, R9 // len(in) / 64
|
||||
TESTQ R9, R9
|
||||
JZ done_avx512_parallel84
|
||||
|
||||
MOVQ matrix+48(FP), SI
|
||||
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
|
||||
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
|
||||
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
|
||||
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
|
||||
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
|
||||
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
|
||||
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
|
||||
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
|
||||
LONG $0x48fe6162; WORD $0x466f; BYTE $0x08 // VMOVDQU64 ZMM24, 0x200[rsi]
|
||||
LONG $0x48fe6162; WORD $0x4e6f; BYTE $0x09 // VMOVDQU64 ZMM25, 0x240[rsi]
|
||||
LONG $0x48fe6162; WORD $0x566f; BYTE $0x0a // VMOVDQU64 ZMM26, 0x280[rsi]
|
||||
LONG $0x48fe6162; WORD $0x5e6f; BYTE $0x0b // VMOVDQU64 ZMM27, 0x2c0[rsi]
|
||||
LONG $0x48fe6162; WORD $0x666f; BYTE $0x0c // VMOVDQU64 ZMM28, 0x300[rsi]
|
||||
LONG $0x48fe6162; WORD $0x6e6f; BYTE $0x0d // VMOVDQU64 ZMM29, 0x340[rsi]
|
||||
LONG $0x48fe6162; WORD $0x766f; BYTE $0x0e // VMOVDQU64 ZMM30, 0x380[rsi]
|
||||
LONG $0x48fe6162; WORD $0x7e6f; BYTE $0x0f // VMOVDQU64 ZMM31, 0x3c0[rsi]
|
||||
|
||||
MOVQ $15, BX
|
||||
MOVQ BX, X5
|
||||
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
|
||||
|
||||
MOVB addTo+56(FP), AX
|
||||
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
|
||||
WORD $0xf749; BYTE $0xe0 // mul r8
|
||||
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
|
||||
MOVQ in+0(FP), SI // SI: &in
|
||||
MOVQ in_len+8(FP), AX // number of inputs
|
||||
XORQ R11, R11
|
||||
MOVQ out+24(FP), DX
|
||||
MOVQ 24(DX), CX // CX: &out[1][0]
|
||||
MOVQ 48(DX), R10 // R10: &out[2][0]
|
||||
MOVQ 72(DX), R12 // R12: &out[3][0]
|
||||
MOVQ (DX), DX // DX: &out[0][0]
|
||||
|
||||
loopback_avx512_parallel84:
|
||||
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
|
||||
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
|
||||
LONG $0xc9fed162; WORD $0x326f // VMOVDQU64 ZMM6{k1}{z}, [r10]
|
||||
LONG $0xc9fed162; WORD $0x3c6f; BYTE $0x24 // VMOVDQU64 ZMM7{k1}{z}, [r12]
|
||||
|
||||
MOVQ (SI), BX // BX: &in[0][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
|
||||
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
|
||||
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40bd1362; WORD $0xd043; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0x00
|
||||
LONG $0x40bd1362; WORD $0xd843; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0x55
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x409d1362; WORD $0xc443; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0x00
|
||||
LONG $0x409d1362; WORD $0xcc43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0x55
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $1
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 24(SI), BX // BX: &in[1][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
|
||||
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
|
||||
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40bd1362; WORD $0xd043; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0xaa
|
||||
LONG $0x40bd1362; WORD $0xd843; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0xff
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x409d1362; WORD $0xc443; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0xaa
|
||||
LONG $0x409d1362; WORD $0xcc43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0xff
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $2
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 48(SI), BX // BX: &in[2][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
|
||||
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
|
||||
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40b51362; WORD $0xd143; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0x00
|
||||
LONG $0x40b51362; WORD $0xd943; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0x55
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x40951362; WORD $0xc543; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0x00
|
||||
LONG $0x40951362; WORD $0xcd43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0x55
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $3
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 72(SI), BX // BX: &in[3][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
|
||||
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
|
||||
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40b51362; WORD $0xd143; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0xaa
|
||||
LONG $0x40b51362; WORD $0xd943; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0xff
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x40951362; WORD $0xc543; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0xaa
|
||||
LONG $0x40951362; WORD $0xcd43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0xff
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $4
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 96(SI), BX // BX: &in[4][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
|
||||
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
|
||||
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40ad1362; WORD $0xd243; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0x00
|
||||
LONG $0x40ad1362; WORD $0xda43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0x55
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x408d1362; WORD $0xc643; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0x00
|
||||
LONG $0x408d1362; WORD $0xce43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0x55
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $5
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 120(SI), BX // BX: &in[5][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
|
||||
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
|
||||
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40ad1362; WORD $0xd243; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0xaa
|
||||
LONG $0x40ad1362; WORD $0xda43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0xff
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x408d1362; WORD $0xc643; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0xaa
|
||||
LONG $0x408d1362; WORD $0xce43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0xff
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $6
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 144(SI), BX // BX: &in[6][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
|
||||
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
|
||||
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40a51362; WORD $0xd343; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0x00
|
||||
LONG $0x40a51362; WORD $0xdb43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0x55
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x40851362; WORD $0xc743; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0x00
|
||||
LONG $0x40851362; WORD $0xcf43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0x55
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
CMPQ AX, $7
|
||||
JE skip_avx512_parallel84
|
||||
|
||||
MOVQ 168(SI), BX // BX: &in[7][0]
|
||||
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
|
||||
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
|
||||
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
|
||||
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
|
||||
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
|
||||
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
|
||||
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
|
||||
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
|
||||
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
|
||||
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
|
||||
|
||||
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
|
||||
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
|
||||
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
|
||||
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
|
||||
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
|
||||
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
|
||||
|
||||
LONG $0x40a51362; WORD $0xd343; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0xaa
|
||||
LONG $0x40a51362; WORD $0xdb43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0xff
|
||||
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
|
||||
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
|
||||
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
|
||||
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
|
||||
|
||||
LONG $0x40851362; WORD $0xc743; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0xaa
|
||||
LONG $0x40851362; WORD $0xcf43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0xff
|
||||
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
|
||||
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
|
||||
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
|
||||
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
|
||||
|
||||
skip_avx512_parallel84:
|
||||
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
|
||||
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
|
||||
LONG $0x48fed162; WORD $0x327f // VMOVDQU64 [r10], ZMM6
|
||||
LONG $0x48fed162; WORD $0x3c7f; BYTE $0x24 // VMOVDQU64 [r12], ZMM7
|
||||
|
||||
ADDQ $64, R11 // in4+=64
|
||||
|
||||
ADDQ $64, DX // out+=64
|
||||
ADDQ $64, CX // out2+=64
|
||||
ADDQ $64, R10 // out3+=64
|
||||
ADDQ $64, R12 // out4+=64
|
||||
|
||||
SUBQ $1, R9
|
||||
JNZ loopback_avx512_parallel84
|
||||
|
||||
done_avx512_parallel84:
|
||||
VZEROUPPER
|
||||
RET
|
12
vendor/github.com/klauspost/reedsolomon/galois_amd64.go
generated
vendored
12
vendor/github.com/klauspost/reedsolomon/galois_amd64.go
generated
vendored
@ -40,12 +40,12 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
|
||||
}
|
||||
*/
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSlice(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
if avx2 {
|
||||
if o.useAVX2 {
|
||||
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
} else if ssse3 {
|
||||
} else if o.useSSSE3 {
|
||||
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
@ -58,12 +58,12 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
if avx2 {
|
||||
if o.useAVX2 {
|
||||
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
} else if ssse3 {
|
||||
} else if o.useSSSE3 {
|
||||
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
|
7
vendor/github.com/klauspost/reedsolomon/galois_arm64.go
generated
vendored
7
vendor/github.com/klauspost/reedsolomon/galois_arm64.go
generated
vendored
@ -13,7 +13,7 @@ func galMulNEON(c uint64, in, out []byte)
|
||||
//go:noescape
|
||||
func galMulXorNEON(c uint64, in, out []byte)
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSlice(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
galMulNEON(uint64(c), in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
@ -27,7 +27,7 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
galMulXorNEON(uint64(c), in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
@ -47,3 +47,6 @@ func sliceXor(in, out []byte, sse2 bool) {
|
||||
out[n] ^= input
|
||||
}
|
||||
}
|
||||
|
||||
// codeSomeShardsAvx512 is the arm64 placeholder for the AVX512 encoding path.
// The body is empty: no parity is computed and outputs is left untouched.
// It exists so that the call made from codeSomeShards (taken when
// r.o.useAVX512 is set) compiles on this platform.
// NOTE(review): presumably useAVX512 is never true on arm64; if it were
// forced on, encoding would silently produce no output — confirm.
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
}
|
||||
|
7
vendor/github.com/klauspost/reedsolomon/galois_noasm.go
generated
vendored
7
vendor/github.com/klauspost/reedsolomon/galois_noasm.go
generated
vendored
@ -6,14 +6,14 @@
|
||||
|
||||
package reedsolomon
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSlice(c byte, in, out []byte, o *options) {
|
||||
mt := mulTable[c]
|
||||
for n, input := range in {
|
||||
out[n] = mt[input]
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||
mt := mulTable[c]
|
||||
for n, input := range in {
|
||||
out[n] ^= mt[input]
|
||||
@ -26,3 +26,6 @@ func sliceXor(in, out []byte, sse2 bool) {
|
||||
out[n] ^= input
|
||||
}
|
||||
}
|
||||
|
||||
// codeSomeShardsAvx512 is the no-assembly placeholder for the AVX512 encoding
// path. The body is empty: no parity is computed and outputs is left
// untouched. It exists so that the call made from codeSomeShards (taken when
// r.o.useAVX512 is set) compiles in this build configuration.
// NOTE(review): presumably useAVX512 is never true in noasm builds; if it
// were forced on, encoding would silently produce no output — confirm.
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
}
|
||||
|
7
vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
generated
vendored
7
vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go
generated
vendored
@ -31,7 +31,7 @@ func galMulPpcXor(low, high, in, out []byte) {
|
||||
}
|
||||
*/
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSlice(c byte, in, out []byte, o *options) {
|
||||
done := (len(in) >> 4) << 4
|
||||
if done > 0 {
|
||||
galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
|
||||
@ -45,7 +45,7 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
|
||||
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||
done := (len(in) >> 4) << 4
|
||||
if done > 0 {
|
||||
galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
|
||||
@ -65,3 +65,6 @@ func sliceXor(in, out []byte, sse2 bool) {
|
||||
out[n] ^= input
|
||||
}
|
||||
}
|
||||
|
||||
// codeSomeShardsAvx512 is the ppc64le placeholder for the AVX512 encoding
// path. The body is empty: no parity is computed and outputs is left
// untouched. It exists so that the call made from codeSomeShards (taken when
// r.o.useAVX512 is set) compiles on this platform.
// NOTE(review): presumably useAVX512 is never true on ppc64le; if it were
// forced on, encoding would silently produce no output — confirm.
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
}
|
||||
|
11
vendor/github.com/klauspost/reedsolomon/options.go
generated
vendored
11
vendor/github.com/klauspost/reedsolomon/options.go
generated
vendored
@ -12,7 +12,7 @@ type Option func(*options)
|
||||
type options struct {
|
||||
maxGoroutines int
|
||||
minSplitSize int
|
||||
useAVX2, useSSSE3, useSSE2 bool
|
||||
useAVX512, useAVX2, useSSSE3, useSSE2 bool
|
||||
usePAR1Matrix bool
|
||||
useCauchy bool
|
||||
shardSize int
|
||||
@ -29,8 +29,9 @@ func init() {
|
||||
}
|
||||
// Detect CPU capabilities.
|
||||
defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
|
||||
defaultOptions.useAVX2 = cpuid.CPU.AVX2()
|
||||
defaultOptions.useSSE2 = cpuid.CPU.SSE2()
|
||||
defaultOptions.useAVX2 = cpuid.CPU.AVX2()
|
||||
defaultOptions.useAVX512 = cpuid.CPU.AVX512F() && cpuid.CPU.AVX512BW()
|
||||
}
|
||||
|
||||
// WithMaxGoroutines is the maximum number of goroutines for encoding & decoding.
|
||||
@ -88,6 +89,12 @@ func withSSE2(enabled bool) Option {
|
||||
}
|
||||
}
|
||||
|
||||
// withAVX512 returns an Option that sets the useAVX512 field of the options
// it is applied to, enabling or disabling the AVX512 code path explicitly.
func withAVX512(enabled bool) Option {
|
||||
return func(o *options) {
|
||||
o.useAVX512 = enabled
|
||||
}
|
||||
}
|
||||
|
||||
// WithPAR1Matrix causes the encoder to build the matrix how PARv1
|
||||
// does. Note that the method they use is buggy, and may lead to cases
|
||||
// where recovery is impossible, even if there are enough parity
|
||||
|
21
vendor/github.com/klauspost/reedsolomon/reedsolomon.go
generated
vendored
21
vendor/github.com/klauspost/reedsolomon/reedsolomon.go
generated
vendored
@ -372,7 +372,7 @@ func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, output
|
||||
// oldinputs data will be changed
|
||||
sliceXor(in, oldin, r.o.useSSE2)
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], &r.o)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -399,7 +399,7 @@ func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outpu
|
||||
// oldinputs data will be changed
|
||||
sliceXor(in[start:stop], oldin[start:stop], r.o.useSSE2)
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], &r.o)
|
||||
}
|
||||
}
|
||||
wg.Done()
|
||||
@ -437,7 +437,10 @@ func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
|
||||
// number of matrix rows used, is determined by
|
||||
// outputCount, which is the number of outputs to compute.
|
||||
func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
|
||||
if r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2 {
|
||||
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount)
|
||||
return
|
||||
} else if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
|
||||
r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
|
||||
return
|
||||
}
|
||||
@ -445,9 +448,9 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
|
||||
in := inputs[c]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
if c == 0 {
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow], &r.o)
|
||||
} else {
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -474,9 +477,9 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
||||
in := inputs[c][start:stop]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
if c == 0 {
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
||||
} else {
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -501,7 +504,7 @@ func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outpu
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
|
||||
}
|
||||
}
|
||||
|
||||
@ -545,7 +548,7 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
|
||||
mu.RUnlock()
|
||||
in := inputs[c][start : start+do]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
|
||||
}
|
||||
}
|
||||
|
||||
|
6
vendor/vendor.json
vendored
6
vendor/vendor.json
vendored
@ -544,10 +544,10 @@
|
||||
"revisionTime": "2018-06-06T15:09:39Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "KiQa3vguztElzJkoqeIGHlfLFJA=",
|
||||
"checksumSHA1": "JzX1Hslj6KPshEfXSPgG4NpHUgk=",
|
||||
"path": "github.com/klauspost/reedsolomon",
|
||||
"revision": "8885f3a1c73882e6f11b766242c69a1eb8f44b28",
|
||||
"revisionTime": "2018-12-18T19:39:59Z"
|
||||
"revision": "2b210cf0866da6ba2a449223993cf7c971f444e1",
|
||||
"revisionTime": "2019-02-10T21:49:25Z"
|
||||
},
|
||||
{
|
||||
"checksumSHA1": "xxLSo5tKtXc7jGrR70yoEfza8Cw=",
|
||||
|
Loading…
x
Reference in New Issue
Block a user