minio/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.s
Harshavardhana 118270d76f Vendor the upstream changes with Avx512 (#7225)
Thanks to @fwessels we have Avx512 support with 4x improvement
2019-02-12 17:32:27 +05:30

591 lines
32 KiB
ArmAsm

//+build !noasm !appengine !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
//
// Process 2 output rows in parallel from a total of 8 input rows
//
// func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
TEXT ·_galMulAVX512Parallel82(SB), 7, $0
MOVQ in+0(FP), SI //
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel82
MOVQ matrix+48(FP), SI
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
MOVQ $15, BX
MOVQ BX, X5
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
MOVB addTo+56(FP), AX
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
WORD $0xf749; BYTE $0xe0 // mul r8
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel82:
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
MOVQ (SI), BX // BX: &in[0][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $1
JE skip_avx512_parallel82
MOVQ 24(SI), BX // BX: &in[1][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $2
JE skip_avx512_parallel82
MOVQ 48(SI), BX // BX: &in[2][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $3
JE skip_avx512_parallel82
MOVQ 72(SI), BX // BX: &in[3][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $4
JE skip_avx512_parallel82
MOVQ 96(SI), BX // BX: &in[4][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $5
JE skip_avx512_parallel82
MOVQ 120(SI), BX // BX: &in[5][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $6
JE skip_avx512_parallel82
MOVQ 144(SI), BX // BX: &in[6][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $7
JE skip_avx512_parallel82
MOVQ 168(SI), BX // BX: &in[7][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
skip_avx512_parallel82:
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
ADDQ $64, CX // out2+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel82
done_avx512_parallel82:
VZEROUPPER
RET
//
// Process 4 output rows in parallel from a total of 8 input rows
//
// func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
TEXT ·_galMulAVX512Parallel84(SB), 7, $0
MOVQ in+0(FP), SI //
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel84
MOVQ matrix+48(FP), SI
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
LONG $0x48fe6162; WORD $0x466f; BYTE $0x08 // VMOVDQU64 ZMM24, 0x200[rsi]
LONG $0x48fe6162; WORD $0x4e6f; BYTE $0x09 // VMOVDQU64 ZMM25, 0x240[rsi]
LONG $0x48fe6162; WORD $0x566f; BYTE $0x0a // VMOVDQU64 ZMM26, 0x280[rsi]
LONG $0x48fe6162; WORD $0x5e6f; BYTE $0x0b // VMOVDQU64 ZMM27, 0x2c0[rsi]
LONG $0x48fe6162; WORD $0x666f; BYTE $0x0c // VMOVDQU64 ZMM28, 0x300[rsi]
LONG $0x48fe6162; WORD $0x6e6f; BYTE $0x0d // VMOVDQU64 ZMM29, 0x340[rsi]
LONG $0x48fe6162; WORD $0x766f; BYTE $0x0e // VMOVDQU64 ZMM30, 0x380[rsi]
LONG $0x48fe6162; WORD $0x7e6f; BYTE $0x0f // VMOVDQU64 ZMM31, 0x3c0[rsi]
MOVQ $15, BX
MOVQ BX, X5
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
MOVB addTo+56(FP), AX
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
WORD $0xf749; BYTE $0xe0 // mul r8
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ 48(DX), R10 // R10: &out[2][0]
MOVQ 72(DX), R12 // R12: &out[3][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel84:
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
LONG $0xc9fed162; WORD $0x326f // VMOVDQU64 ZMM6{k1}{z}, [r10]
LONG $0xc9fed162; WORD $0x3c6f; BYTE $0x24 // VMOVDQU64 ZMM7{k1}{z}, [r12]
MOVQ (SI), BX // BX: &in[0][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40bd1362; WORD $0xd043; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0x00
LONG $0x40bd1362; WORD $0xd843; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x409d1362; WORD $0xc443; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0x00
LONG $0x409d1362; WORD $0xcc43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $1
JE skip_avx512_parallel84
MOVQ 24(SI), BX // BX: &in[1][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40bd1362; WORD $0xd043; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0xaa
LONG $0x40bd1362; WORD $0xd843; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x409d1362; WORD $0xc443; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0xaa
LONG $0x409d1362; WORD $0xcc43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $2
JE skip_avx512_parallel84
MOVQ 48(SI), BX // BX: &in[2][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40b51362; WORD $0xd143; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0x00
LONG $0x40b51362; WORD $0xd943; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40951362; WORD $0xc543; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0x00
LONG $0x40951362; WORD $0xcd43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $3
JE skip_avx512_parallel84
MOVQ 72(SI), BX // BX: &in[3][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40b51362; WORD $0xd143; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0xaa
LONG $0x40b51362; WORD $0xd943; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40951362; WORD $0xc543; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0xaa
LONG $0x40951362; WORD $0xcd43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $4
JE skip_avx512_parallel84
MOVQ 96(SI), BX // BX: &in[4][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40ad1362; WORD $0xd243; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0x00
LONG $0x40ad1362; WORD $0xda43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x408d1362; WORD $0xc643; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0x00
LONG $0x408d1362; WORD $0xce43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $5
JE skip_avx512_parallel84
MOVQ 120(SI), BX // BX: &in[5][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40ad1362; WORD $0xd243; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0xaa
LONG $0x40ad1362; WORD $0xda43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x408d1362; WORD $0xc643; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0xaa
LONG $0x408d1362; WORD $0xce43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $6
JE skip_avx512_parallel84
MOVQ 144(SI), BX // BX: &in[6][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40a51362; WORD $0xd343; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0x00
LONG $0x40a51362; WORD $0xdb43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40851362; WORD $0xc743; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0x00
LONG $0x40851362; WORD $0xcf43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $7
JE skip_avx512_parallel84
MOVQ 168(SI), BX // BX: &in[7][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40a51362; WORD $0xd343; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0xaa
LONG $0x40a51362; WORD $0xdb43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40851362; WORD $0xc743; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0xaa
LONG $0x40851362; WORD $0xcf43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
skip_avx512_parallel84:
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
LONG $0x48fed162; WORD $0x327f // VMOVDQU64 [r10], ZMM6
LONG $0x48fed162; WORD $0x3c7f; BYTE $0x24 // VMOVDQU64 [r12], ZMM7
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
ADDQ $64, CX // out2+=64
ADDQ $64, R10 // out3+=64
ADDQ $64, R12 // out4+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel84
done_avx512_parallel84:
VZEROUPPER
RET