vendorize deps for snappy, blake2b and sha256 (#3476)
Bring in new optimization and portability changes. Fixes https://github.com/minio/minio-go/issues/578
This commit is contained in: parent 85c6bb9809, commit faa6b1e925
@@ -18,7 +18,6 @@ package cmd

import (
	"crypto/md5"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"hash"
@@ -26,6 +25,8 @@ import (
	"path"
	"strings"
	"time"

	"github.com/minio/sha256-simd"
)

// listMultipartUploads - lists all multipart uploads.

@@ -18,7 +18,6 @@ package cmd

import (
	"crypto/md5"
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
@@ -29,6 +28,7 @@ import (
	"strings"

	"github.com/minio/minio/pkg/mimedb"
	"github.com/minio/sha256-simd"
)

// fsObjects - Implements fs object layer.

@@ -18,9 +18,10 @@ package cmd

import (
	"crypto/md5"
	"crypto/sha256"
	"encoding/base64"
	"encoding/hex"

	"github.com/minio/sha256-simd"
)

// getSHA256Hash returns SHA-256 hash of given data.

@@ -18,7 +18,6 @@ package cmd

import (
	"crypto/md5"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"hash"
@@ -29,6 +28,7 @@ import (
	"time"

	"github.com/minio/minio/pkg/mimedb"
	"github.com/minio/sha256-simd"
)

// listMultipartUploads - lists all multipart uploads.

@@ -18,7 +18,6 @@ package cmd

import (
	"crypto/md5"
	"crypto/sha256"
	"encoding/hex"
	"hash"
	"io"
@@ -30,6 +29,7 @@ import (
	"github.com/minio/minio/pkg/bpool"
	"github.com/minio/minio/pkg/mimedb"
	"github.com/minio/minio/pkg/objcache"
	"github.com/minio/sha256-simd"
)

// list all errors which can be ignored in object operations.
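Editor's note: the hunks above drop the standard library "crypto/sha256" import from these cmd files and pull in the vendored "github.com/minio/sha256-simd" package instead, which is intended as a drop-in replacement that picks a SIMD implementation at runtime. A minimal sketch of using the replacement package, illustrative only and not taken from the diff:

```go
package main

import (
	"encoding/hex"
	"fmt"

	// Drop-in for "crypto/sha256": same New/Sum256 style API, with SIMD
	// (AVX2/AVX/SSE) implementations selected at runtime when available.
	sha256 "github.com/minio/sha256-simd"
)

func main() {
	// Streaming use, as in the multipart/object code paths above.
	h := sha256.New()
	h.Write([]byte("hello, minio"))
	fmt.Println(hex.EncodeToString(h.Sum(nil)))

	// One-shot use.
	sum := sha256.Sum256([]byte("hello, minio"))
	fmt.Println(hex.EncodeToString(sum[:]))
}
```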
101 vendor/github.com/golang/snappy/decode_other.go (generated, vendored, new file)
@@ -0,0 +1,101 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64 appengine !gc noasm

package snappy

// decode writes the decoding of src to dst. It assumes that the varint-encoded
// length of the decompressed bytes has already been read, and that len(dst)
// equals that length.
//
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
func decode(dst, src []byte) int {
	var d, s, offset, length int
	for s < len(src) {
		switch src[s] & 0x03 {
		case tagLiteral:
			x := uint32(src[s] >> 2)
			switch {
			case x < 60:
				s++
			case x == 60:
				s += 2
				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
					return decodeErrCodeCorrupt
				}
				x = uint32(src[s-1])
			case x == 61:
				s += 3
				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
					return decodeErrCodeCorrupt
				}
				x = uint32(src[s-2]) | uint32(src[s-1])<<8
			case x == 62:
				s += 4
				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
					return decodeErrCodeCorrupt
				}
				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
			case x == 63:
				s += 5
				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
					return decodeErrCodeCorrupt
				}
				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
			}
			length = int(x) + 1
			if length <= 0 {
				return decodeErrCodeUnsupportedLiteralLength
			}
			if length > len(dst)-d || length > len(src)-s {
				return decodeErrCodeCorrupt
			}
			copy(dst[d:], src[s:s+length])
			d += length
			s += length
			continue

		case tagCopy1:
			s += 2
			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
				return decodeErrCodeCorrupt
			}
			length = 4 + int(src[s-2])>>2&0x7
			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))

		case tagCopy2:
			s += 3
			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
				return decodeErrCodeCorrupt
			}
			length = 1 + int(src[s-3])>>2
			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)

		case tagCopy4:
			s += 5
			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
				return decodeErrCodeCorrupt
			}
			length = 1 + int(src[s-5])>>2
			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
		}

		if offset <= 0 || d < offset || length > len(dst)-d {
			return decodeErrCodeCorrupt
		}
		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
		// the built-in copy function, this byte-by-byte copy always runs
		// forwards, even if the slices overlap. Conceptually, this is:
		//
		//	d += forwardCopy(dst[d:d+length], dst[d-offset:])
		for end := d + length; d != end; d++ {
			dst[d] = dst[d-offset]
		}
	}
	if d != len(dst) {
		return decodeErrCodeCorrupt
	}
	return 0
}
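Editor's note: decode above is the portable fallback (note the +build !amd64 appengine !gc noasm tag) behind the package's exported Decode. A short round-trip sketch through the public golang/snappy API, added for illustration and not part of the diff:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/golang/snappy"
)

func main() {
	src := bytes.Repeat([]byte("minio object data "), 64)

	// Encode prepends the varint-encoded decompressed length that decode()
	// assumes has already been consumed; Decode reads it and sizes dst.
	compressed := snappy.Encode(nil, src)
	restored, err := snappy.Decode(nil, compressed)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(src), "->", len(compressed), "bytes, round-trip ok:", bytes.Equal(src, restored))
}
```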
238 vendor/github.com/golang/snappy/encode_other.go (generated, vendored, new file)
@@ -0,0 +1,238 @@
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64 appengine !gc noasm

package snappy

func load32(b []byte, i int) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load64(b []byte, i int) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
//	dst is long enough to hold the encoded bytes
//	1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
	i, n := 0, uint(len(lit)-1)
	switch {
	case n < 60:
		dst[0] = uint8(n)<<2 | tagLiteral
		i = 1
	case n < 1<<8:
		dst[0] = 60<<2 | tagLiteral
		dst[1] = uint8(n)
		i = 2
	default:
		dst[0] = 61<<2 | tagLiteral
		dst[1] = uint8(n)
		dst[2] = uint8(n >> 8)
		i = 3
	}
	return i + copy(dst[i:], lit)
}

// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
//	dst is long enough to hold the encoded bytes
//	1 <= offset && offset <= 65535
//	4 <= length && length <= 65535
func emitCopy(dst []byte, offset, length int) int {
	i := 0
	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
	// length emitted down below is is a little lower (at 60 = 64 - 4), because
	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
	for length >= 68 {
		// Emit a length 64 copy, encoded as 3 bytes.
		dst[i+0] = 63<<2 | tagCopy2
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		i += 3
		length -= 64
	}
	if length > 64 {
		// Emit a length 60 copy, encoded as 3 bytes.
		dst[i+0] = 59<<2 | tagCopy2
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		i += 3
		length -= 60
	}
	if length >= 12 || offset >= 2048 {
		// Emit the remaining copy, encoded as 3 bytes.
		dst[i+0] = uint8(length-1)<<2 | tagCopy2
		dst[i+1] = uint8(offset)
		dst[i+2] = uint8(offset >> 8)
		return i + 3
	}
	// Emit the remaining copy, encoded as 2 bytes.
	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
	dst[i+1] = uint8(offset)
	return i + 2
}

// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
//	0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
	}
	return j
}

func hash(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}

// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
	// The table element type is uint16, as s < sLimit and sLimit < len(src)
	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
	const (
		maxTableSize = 1 << 14
		// tableMask is redundant, but helps the compiler eliminate bounds
		// checks.
		tableMask = maxTableSize - 1
	)
	shift := uint32(32 - 8)
	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
		shift--
	}
	// In Go, all array elements are zero-initialized, so there is no advantage
	// to a smaller tableSize per se. However, it matches the C++ algorithm,
	// and in the asm versions of this code, we can get away with zeroing only
	// the first tableSize elements.
	var table [maxTableSize]uint16

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	nextHash := hash(load32(src, s), shift)

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc.. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (ie. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		skip := 32

		nextS := s
		candidate := 0
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidate = int(table[nextHash&tableMask])
			table[nextHash&tableMask] = uint16(s)
			nextHash = hash(load32(src, nextS), shift)
			if load32(src, s) == load32(src, candidate) {
				break
			}
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		d += emitLiteral(dst[d:], src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.
			base := s

			// Extend the 4-byte match as long as possible.
			//
			// This is an inlined version of:
			//	s = extendMatch(src, candidate+4, s+4)
			s += 4
			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
			}

			d += emitCopy(dst[d:], base-candidate, s-base)
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load64(src, s-1)
			prevHash := hash(uint32(x>>0), shift)
			table[prevHash&tableMask] = uint16(s - 1)
			currHash := hash(uint32(x>>8), shift)
			candidate = int(table[currHash&tableMask])
			table[currHash&tableMask] = uint16(s)
			if uint32(x>>8) != load32(src, candidate) {
				nextHash = hash(uint32(x>>16), shift)
				s++
				break
			}
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
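Editor's note: encodeBlock assumes len(dst) >= MaxEncodedLen(len(src)); the exported Encode takes care of that sizing. A small illustrative sketch, not part of the diff, showing how a caller can pre-size the destination buffer so repeated calls reuse it:

```go
package main

import (
	"fmt"

	"github.com/golang/snappy"
)

func main() {
	src := make([]byte, 1<<16) // one 64 KiB block of (highly compressible) zeros

	// Pre-sizing dst with MaxEncodedLen lets Encode write in place instead
	// of allocating a new buffer on every call.
	dst := make([]byte, snappy.MaxEncodedLen(len(src)))
	out := snappy.Encode(dst, src)
	fmt.Printf("compressed %d bytes into %d bytes\n", len(src), len(out))
}
```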
2 vendor/github.com/klauspost/reedsolomon/README.md (generated, vendored)
@@ -193,6 +193,8 @@ Example of performance scaling on Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz - 4 ph
# Links
* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
* [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
2 vendor/github.com/minio/blake2b-simd/README.md (generated, vendored)
@@ -24,7 +24,7 @@ This is a summary of the performance improvements. Full details are shown below.
asm2plan9s
----------

In order to be able to work more easily with AVX2/AVX instructions, a separate tool was developed to convert AVX2/AVX instructions into the corresponding BYTE sequence as accepted by Go assembly. See [asm2plan9s](https://github.com/fwessels/asm2plan9s) for more information.
In order to be able to work more easily with AVX2/AVX instructions, a separate tool was developed to convert AVX2/AVX instructions into the corresponding BYTE sequence as accepted by Go assembly. See [asm2plan9s](https://github.com/minio/asm2plan9s) for more information.

bt2sum
------
11 vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go (generated, vendored)
@@ -23,11 +23,12 @@ package blake2b
func compressAVX2Loop(p []uint8, in, iv, t, f, shffle, out []uint64)

func compressAVX2(d *digest, p []uint8) {
	var (
		in     [8]uint64
		out    [8]uint64
		shffle [8]uint64
	)

	in := make([]uint64, 8, 8)
	out := make([]uint64, 8, 8)

	shffle := make([]uint64, 8, 8)
	// vector for PSHUFB instruction
	shffle[0] = 0x0201000706050403
	shffle[1] = 0x0a09080f0e0d0c0b
@@ -40,7 +41,7 @@ func compressAVX2(d *digest, p []uint8) {

	in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]

	compressAVX2Loop(p, in, iv[:], d.t[:], d.f[:], shffle, out)
	compressAVX2Loop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])

	d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
}
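Editor's note: the hunk above (and the matching one in compressAvx_amd64.go below) swaps the fixed-size arrays declared with var for slices allocated with make, and passes every argument to the assembly-backed loop as an explicit slice expression. A hedged sketch of that calling pattern; sumUint64 is a hypothetical pure-Go stand-in for the real assembly routine so the example compiles on its own:

```go
package main

import "fmt"

// In the vendored package, compressAVX2Loop has no Go body and is implemented
// in assembly; it receives slice headers (pointer, length, capacity).
// sumUint64 is a hypothetical stand-in used only to keep this sketch runnable.
func sumUint64(in, out []uint64) {
	var s uint64
	for _, v := range in {
		s += v
	}
	out[0] = s
}

func main() {
	// Pattern after the change: allocate with make and pass slices (or
	// full-slice expressions such as in[:]) so the callee always receives
	// a proper slice header.
	in := make([]uint64, 8, 8)
	out := make([]uint64, 8, 8)
	for i := range in {
		in[i] = uint64(i + 1)
	}
	sumUint64(in[:], out[:])
	fmt.Println("out[0] =", out[0]) // 1+2+...+8 = 36
}
```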
11 vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go (generated, vendored)
@@ -23,18 +23,19 @@ package blake2b
func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64)

func compressAVX(d *digest, p []uint8) {
	var (
		in     [8]uint64
		out    [8]uint64
		shffle [2]uint64
	)

	in := make([]uint64, 8, 8)
	out := make([]uint64, 8, 8)

	shffle := make([]uint64, 2, 2)
	// vector for PSHUFB instruction
	shffle[0] = 0x0201000706050403
	shffle[1] = 0x0a09080f0e0d0c0b

	in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]

	blockAVXLoop(p, in, iv[:], d.t[:], d.f[:], shffle, out)
	blockAVXLoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])

	d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
}
496 vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s (generated, vendored)
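Editor's note: the assembly diff below rewrites the hand-encoded instructions from runs of BYTE directives into packed LONG (and WORD) directives; the instruction comments on both sides of each pair are identical, so only the source form changes, with the opcode bytes grouped four (or two) at a time in little-endian order. An illustrative Go snippet, not part of the diff, showing the packing for one sequence taken from the hunk below:

```go
package main

import "fmt"

func main() {
	// From the diff: "BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2"
	// (VPADDQ XMM0,XMM0,XMM2) becomes "LONG $0xc2d4f9c5" - the same four
	// opcode bytes packed little-endian into a single 32-bit operand.
	b := [4]byte{0xc5, 0xf9, 0xd4, 0xc2}
	long := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
	fmt.Printf("LONG $0x%08x\n", long) // LONG $0xc2d4f9c5
}
```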
@ -44,43 +44,43 @@
|
||||
|
||||
#define G1 \
|
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
LONG $0xd479c1c4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
LONG $0xd471c1c4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf670f9c5; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
LONG $0xff70f9c5; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0x0069c2c4; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
LONG $0x0061c2c4; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
#define G2 \
|
||||
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
LONG $0xd479c1c4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
LONG $0xd471c1c4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf670fbc5; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
LONG $0xf670fac5; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
LONG $0xff70fbc5; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
LONG $0xff70fac5; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0xfad469c5 \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
|
||||
LONG $0xd273e9c5; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef69c1c4; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
LONG $0xfbd461c5 \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */
|
||||
LONG $0xd373e1c5; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef61c1c4; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
#define DIAGONALIZE \
|
||||
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
@ -89,14 +89,14 @@
|
||||
MOVOU X4, X6 \ /* row4l = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X6, X5 \ /* row3h = row4l;\ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
LONG $0x6c1141c4; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
LONG $0x6d41c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d11c1c4; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d69c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d61c1c4; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
|
||||
#define UNDIAGONALIZE \
|
||||
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
@ -105,14 +105,14 @@
|
||||
MOVOU X13, X5 \ /* row3h = t0;\ */
|
||||
MOVOU X2, X13 \ /* t0 = row2l;\ */
|
||||
MOVOU X6, X14 \ /* t1 = row4l;\ */
|
||||
BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
LONG $0xfa6c69c5 \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
LONG $0x6d61c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d11c1c4; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d49c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d41c1c4; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
|
||||
#define LOAD_SHUFFLE \
|
||||
\ // Load shuffle value
|
||||
@ -154,14 +154,14 @@ TEXT ·blockAVXLoop(SB), 7, $0
|
||||
loop:
|
||||
// Increment counter
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVQ 0(SI), R9 //
|
||||
MOVQ 0(SI), R9
|
||||
ADDQ $128, R9 // /* d.t[0] += BlockSize */
|
||||
MOVQ R9, 0(SI) //
|
||||
MOVQ R9, 0(SI)
|
||||
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
|
||||
JGE noincr //
|
||||
MOVQ 8(SI), R9 //
|
||||
JGE noincr
|
||||
MOVQ 8(SI), R9
|
||||
ADDQ $1, R9 // /* d.t[1]++ */
|
||||
MOVQ R9, 8(SI) //
|
||||
MOVQ R9, 8(SI)
|
||||
noincr: // /* } */
|
||||
|
||||
// Load initialization vector
|
||||
@ -181,582 +181,472 @@ noincr: // /* } */
|
||||
// R O U N D 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
BYTE $0x08
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xc5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
|
||||
LONG $0x0f0943c4; WORD $0x08c5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
|
||||
LONG $0x6d1941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
|
||||
LONG $0x6c0141c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
|
||||
LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
|
||||
LONG $0x6d1141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
|
||||
BYTE $0x08
|
||||
LONG $0x6c0141c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
|
||||
LONG $0x0f0943c4; WORD $0x08dc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 4
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
|
||||
LONG $0x6d1141c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6d0141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X14 // X14 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
|
||||
LONG $0x6d1141c4; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
|
||||
LONG $0x6c0141c4; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
|
||||
LONG $0x6d1141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
|
||||
LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
|
||||
LONG $0x6c1941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 5
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
|
||||
LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
|
||||
LONG $0x6d0941c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c1941c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
|
||||
LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
|
||||
LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xd4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
|
||||
LONG $0x0f0943c4; WORD $0x08d4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 6
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
|
||||
LONG $0x6c1141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
|
||||
LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
|
||||
LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
|
||||
LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c1141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
|
||||
LONG $0x6d0941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
|
||||
LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
|
||||
LONG $0x6c0141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 7
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
|
||||
LONG $0x6c0941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
|
||||
LONG $0x6c0141c4; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xde // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
|
||||
BYTE $0x08
|
||||
LONG $0x6d1141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
|
||||
LONG $0x0f1943c4; WORD $0x08de // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
|
||||
BYTE $0x08
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
|
||||
LONG $0x0f0943c4; WORD $0x08ce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
|
||||
MOVOU 16(DX), X14 // X14 = m[2]+ m[3]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
|
||||
LONG $0x6d1141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
|
||||
LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
|
||||
LONG $0x6c0941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 8
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
|
||||
LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
|
||||
LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c0941c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
|
||||
LONG $0x0f0143c4; WORD $0x08d6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
|
||||
LONG $0x6d1141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
|
||||
LONG $0x6c0941c4; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
|
||||
LONG $0x6c1941c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
|
||||
LONG $0x6c0941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
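Several of the loads above use VPALIGNR with an 8-byte shift to pull one message word out of each of two registers. As a hedged reference (hypothetical helper name, modelling each XMM register as a pair of little-endian uint64 message words), the operation reduces to taking the high word of the second source and the low word of the first:

// palignr8 models PALIGNR/VPALIGNR dst, src1, src2, 0x8 on registers holding
// two 64-bit message words: the 8-byte right shift of the concatenated value
// keeps src2's high word (low half of the result) and src1's low word
// (high half of the result).
func palignr8(src1, src2 [2]uint64) [2]uint64 {
	return [2]uint64{src2[1], src1[0]}
}

For example, the line above commented /* m[11], m[14] */ takes src1 = (m[14], m[15]) and src2 = (m[10], m[11]).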
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 9
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
|
||||
BYTE $0x08
|
||||
LONG $0x6c1141c4; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
|
||||
LONG $0x0f1943c4; WORD $0x08ce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
|
||||
BYTE $0x08
|
||||
LONG $0x6d0141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
|
||||
LONG $0x0f0943c4; WORD $0x08dd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xcc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
|
||||
BYTE $0x08
|
||||
LONG $0x6d0141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
|
||||
LONG $0x0f0943c4; WORD $0x08cc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
|
||||
LONG $0x6c1141c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
|
||||
LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 0
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
|
||||
LONG $0x6c0141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
|
||||
LONG $0x6d1141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
|
||||
LONG $0x6c1941c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
|
||||
LONG $0x6d0141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
|
||||
LONG $0x6d1941c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
|
||||
LONG $0x0f0143c4; WORD $0x08d5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6c0941c4; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
BYTE $0x08
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
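The twelve rounds above differ only in which message words feed G1 and G2: round r (1-based) uses row (r-1) mod 10 of the standard BLAKE2b message schedule, so Round 11 repeats the identity order of Round 1 and Round 12 repeats Round 2. A short excerpt of the schedule, shown only for orientation (standard constants, not part of this diff):

// First two rows of the BLAKE2b message schedule sigma; round r uses
// sigma[(r-1)%10], so rounds 11 and 12 above reuse rows 0 and 1.
var sigma = [2][16]byte{
	{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
}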
// Reload digest (most current value stored in &out)
11
vendor/github.com/minio/blake2b-simd/compressSse_amd64.go
generated
vendored
@ -23,18 +23,19 @@ package blake2b
func blockSSELoop(p []uint8, in, iv, t, f, shffle, out []uint64)

func compressSSE(d *digest, p []uint8) {
	var (
		in     [8]uint64
		out    [8]uint64
		shffle [2]uint64
	)

	in := make([]uint64, 8, 8)
	out := make([]uint64, 8, 8)

	shffle := make([]uint64, 2, 2)

	// vector for PSHUFB instruction
	shffle[0] = 0x0201000706050403
	shffle[1] = 0x0a09080f0e0d0c0b

	in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]

	blockSSELoop(p, in, iv[:], d.t[:], d.f[:], shffle, out)
	blockSSELoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])

	d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
}
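The two shffle words above form the 16-byte PSHUFB control mask picked up by LOAD_SHUFFLE in the assembly below: within each 64-bit lane, output byte i is taken from input byte (i+3) mod 8, which is exactly the rotation right by 24 bits that G1 needs. A minimal Go sketch of that equivalence (hypothetical package and helper names, using math/bits for brevity):

package blake2bsketch

import "math/bits"

// rotr24ViaByteShuffle reproduces, per 64-bit lane, what PSHUFB does with the
// mask 0x0201000706050403 / 0x0a09080f0e0d0c0b: output byte i is input byte
// (i+3) mod 8, i.e. a rotation right by 24 bits.
func rotr24ViaByteShuffle(x uint64) uint64 {
	var out uint64
	for i := uint(0); i < 8; i++ {
		b := (x >> (8 * ((i + 3) % 8))) & 0xff
		out |= b << (8 * i)
	}
	return out // equal to Rotr24(x)
}

// Rotr24 is the direct form, for comparison.
func Rotr24(x uint64) uint64 { return bits.RotateLeft64(x, -24) }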
500
vendor/github.com/minio/blake2b-simd/compressSse_amd64.s
generated
vendored
@ -44,48 +44,45 @@
|
||||
|
||||
#define G1 \
|
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
BYTE $0xd4 \
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
BYTE $0xdc \
|
||||
// DO NOT DELETE -- macro delimiter (previous line extended)
|
||||
LONG $0xd40f4166; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
LONG $0xd40f4166; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf6700f66; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
LONG $0xff700f66; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0x380f4166; WORD $0xd400 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
LONG $0x380f4166; WORD $0xdc00 // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
#define G2 \
|
||||
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0xd40f4166; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
LONG $0xd40f4166; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf6700ff2; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
LONG $0xf6700ff3; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
LONG $0xff700ff2; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
LONG $0xff700ff3; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
MOVOU X2, X15 \
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
LONG $0xd40f4466; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */
|
||||
LONG $0xd2730f66; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef0f4166; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
MOVOU X3, X15 \
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */
|
||||
BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
LONG $0xd40f4466; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */
|
||||
LONG $0xd3730f66; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef0f4166; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
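Taken together, the G1 and G2 macros perform one BLAKE2b mixing step on two columns at a time: G1 covers the first half (rotations by 32 via PSHUFD and by 24 via PSHUFB), G2 the second half (rotation by 16 via PSHUFLW/PSHUFHW and by 63 via the shift-and-add trick above, since SSE2 has no 64-bit rotate). A scalar Go reference of the same step, as a sketch for orientation rather than the vendored implementation:

package blake2bsketch

import "math/bits"

// g is the scalar BLAKE2b quarter-round; G1 above performs the first four
// statements for two columns at once, G2 the last four.
func g(v *[16]uint64, a, b, c, d int, x, y uint64) {
	v[a] += v[b] + x
	v[d] = bits.RotateLeft64(v[d]^v[a], -32)
	v[c] += v[d]
	v[b] = bits.RotateLeft64(v[b]^v[c], -24)
	v[a] += v[b] + y
	v[d] = bits.RotateLeft64(v[d]^v[a], -16)
	v[c] += v[d]
	v[b] = bits.RotateLeft64(v[b]^v[c], -63)
}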
#define DIAGONALIZE \
|
||||
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
@ -94,16 +91,16 @@
|
||||
MOVOU X4, X6 \ /* row4l = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X6, X5 \ /* row3h = row4l;\ */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
LONG $0x6c0f4566; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
MOVOU X7, X6 \
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
MOVOU X13, X7 \
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
LONG $0x6d0f4166; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d0f4166; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
|
||||
#define UNDIAGONALIZE \
|
||||
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
@ -112,16 +109,16 @@
|
||||
MOVOU X13, X5 \ /* row3h = t0;\ */
|
||||
MOVOU X2, X13 \ /* t0 = row2l;\ */
|
||||
MOVOU X6, X14 \ /* t1 = row4l;\ */
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
LONG $0x6c0f4466; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
MOVOU X3, X2 \
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
MOVOU X13, X3 \
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
LONG $0x6d0f4166; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d0f4166; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
|
||||
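DIAGONALIZE and UNDIAGONALIZE re-arrange the two-lane registers so that the diagonal half-round can reuse the column-step layout; on the usual 4x4 view of the working state v[0..15] they amount to rotating rows 1-3, as in this hedged scalar sketch:

// diagonalize rotates rows 1..3 of the 4x4 state left by 1, 2 and 3 places,
// so the diagonals (v0,v5,v10,v15), (v1,v6,v11,v12), ... line up as columns;
// undiagonalize applies the inverse rotation.
func diagonalize(v *[16]uint64) {
	v[4], v[5], v[6], v[7] = v[5], v[6], v[7], v[4]
	v[8], v[9], v[10], v[11] = v[10], v[11], v[8], v[9]
	v[12], v[13], v[14], v[15] = v[15], v[12], v[13], v[14]
}

func undiagonalize(v *[16]uint64) {
	v[4], v[5], v[6], v[7] = v[7], v[4], v[5], v[6]
	v[8], v[9], v[10], v[11] = v[10], v[11], v[8], v[9]
	v[12], v[13], v[14], v[15] = v[13], v[14], v[15], v[12]
}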
#define LOAD_SHUFFLE \
|
||||
\ // Load shuffle value
|
||||
@ -163,14 +160,15 @@ TEXT ·blockSSELoop(SB), 7, $0
|
||||
loop:
|
||||
// Increment counter
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVQ 0(SI), R9 //
|
||||
MOVQ 0(SI), R9
|
||||
ADDQ $128, R9 // /* d.t[0] += BlockSize */
|
||||
MOVQ R9, 0(SI) //
|
||||
MOVQ R9, 0(SI)
|
||||
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
|
||||
JGE noincr //
|
||||
MOVQ 8(SI), R9 //
|
||||
JGE noincr
|
||||
MOVQ 8(SI), R9
|
||||
ADDQ $1, R9 // /* d.t[1]++ */
|
||||
MOVQ R9, 8(SI) //
|
||||
MOVQ R9, 8(SI)
|
||||
|
||||
noincr: // /* } */
|
||||
|
||||
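The block above advances the 128-bit block counter d.t by BlockSize (128 bytes) and carries into the high word when the 64-bit addition wraps; a minimal Go sketch of the same update (hypothetical helper name):

const BlockSize = 128

// incrementCounter mirrors the ADDQ/CMPQ/JGE sequence above: add the block
// size to the low counter word and carry into the high word on wrap-around.
func incrementCounter(t *[2]uint64) {
	t[0] += BlockSize
	if t[0] < BlockSize { // the addition wrapped past 2^64
		t[1]++
	}
}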
// Load initialization vector
|
||||
@ -190,663 +188,553 @@ noincr: // /* } */
|
||||
// R O U N D 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
|
||||
MOVOU X12, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
|
||||
MOVOU X14, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
|
||||
MOVOU X12, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
|
||||
MOVOU X14, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
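Each LOAD_MSG block gathers the eight schedule words of one half-round into four two-word registers: X8 and X9 carry the words added in G1, X10 and X11 the words added in G2. In terms of the round's sigma row, the selection is (hedged sketch, hypothetical helper name):

// loadMsgPairs shows which schedule entries end up in X8..X11 for one
// half-round (pass the first or second eight entries of the round's sigma
// row): b0 and b1 feed the PADDQ in G1, b2 and b3 feed G2.
func loadMsgPairs(m *[16]uint64, s []int) (b0, b1, b2, b3 [2]uint64) {
	b0 = [2]uint64{m[s[0]], m[s[2]]}
	b1 = [2]uint64{m[s[4]], m[s[6]]}
	b2 = [2]uint64{m[s[1]], m[s[3]]}
	b3 = [2]uint64{m[s[5]], m[s[7]]}
	return
}

Round 1 above uses the identity row, which is why its comments read m[0], m[2] / m[4], m[6] for G1 and m[1], m[3] / m[5], m[7] for G2.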
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X10 // X10 = m[10]+m[11]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */
|
||||
BYTE $0xdc; BYTE $0x08
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
|
||||
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
|
||||
BYTE $0xc4; BYTE $0x08
|
||||
LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
|
||||
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */
|
||||
BYTE $0xc5; BYTE $0x08
|
||||
LONG $0x3a0f4566; WORD $0xc50f; BYTE $0x08 // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */
|
||||
MOVOU X12, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
MOVOU X13, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X12, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */
|
||||
MOVOU X15, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */
|
||||
MOVOU X13, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X11 // X11 = m[4]+ m[5]
|
||||
MOVOU 112(DX), X10 // X10 = m[14]+m[15]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */
|
||||
BYTE $0xdc; BYTE $0x08
|
||||
LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */
|
||||
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 4
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X13, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */
|
||||
MOVOU X15, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X14 // X14 = m[14]+m[15]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */
|
||||
LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */
|
||||
MOVOU X15, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X13, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */
|
||||
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */
|
||||
MOVOU X15, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */
|
||||
LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */
|
||||
MOVOU X13, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */
|
||||
MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X10 // X10 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 5
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X14, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */
|
||||
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */
|
||||
MOVOU X12, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */
|
||||
MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */
|
||||
LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */
|
||||
MOVOU X13, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */
|
||||
MOVOU X15, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */
|
||||
LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */
|
||||
MOVOU X13, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU X14, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */
|
||||
BYTE $0xd4; BYTE $0x08
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */
|
||||
LONG $0x3a0f4566; WORD $0xd40f; BYTE $0x08 // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 6
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
MOVOU X13, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */
|
||||
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */
|
||||
MOVOU X12, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */
|
||||
MOVOU X12, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */
|
||||
MOVOU X13, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */
|
||||
MOVOU X15, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */
|
||||
LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */
|
||||
MOVOU X15, X11
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 7
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X12, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */
|
||||
MOVOU X14, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
|
||||
MOVOU X15, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6c0f4566; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */
|
||||
MOVOU 80(DX), X11 // X11 = m[10]+m[11]
|
||||
MOVOU X13, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */
|
||||
BYTE $0xde; BYTE $0x08
|
||||
LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */
|
||||
LONG $0x3a0f4566; WORD $0xde0f; BYTE $0x08 // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X12, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */
|
||||
BYTE $0xce; BYTE $0x08
|
||||
LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */
|
||||
MOVOU 16(DX), X11 // X11 = m[2]+ m[3]
|
||||
MOVOU X13, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */
|
||||
LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 8
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */
|
||||
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */
|
||||
MOVOU X12, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */
|
||||
LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */
|
||||
MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU X15, X10
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */
|
||||
BYTE $0xd6; BYTE $0x08
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */
|
||||
LONG $0x3a0f4566; WORD $0xd60f; BYTE $0x08 // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register)
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X13, X8
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */
|
||||
LONG $0x6d0f4566; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */
|
||||
MOVOU X14, X9
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */
|
||||
LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */
|
||||
MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */
|
||||
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
// R O U N D 9
///////////////////////////////////////////////////////////////////////////

// LOAD_MSG_ ##r ##_1(b0, b1);
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X13, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */
LONG $0x6c0f4566; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */
MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */
BYTE $0xce; BYTE $0x08
LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */
BYTE $0xdd; BYTE $0x08
LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */
LONG $0x3a0f4566; WORD $0xdd0f; BYTE $0x08 // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */

LOAD_SHUFFLE

G1
G2

DIAGONALIZE

// LOAD_MSG_ ##r ##_3(b0, b1);
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X15, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */
MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */
BYTE $0xcc; BYTE $0x08
LONG $0x3a0f4566; WORD $0xcc0f; BYTE $0x08 // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
MOVOU X15, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */
MOVOU X13, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */
LONG $0x6c0f4566; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */
MOVOU X12, X15
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */
LONG $0x6d0f4566; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */
MOVOU X12, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */

LOAD_SHUFFLE

G1
G2

UNDIAGONALIZE

///////////////////////////////////////////////////////////////////////////
// R O U N D 1 0
///////////////////////////////////////////////////////////////////////////

// LOAD_MSG_ ##r ##_1(b0, b1);
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */
MOVOU X13, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */
MOVOU 16(DX), X10 // X10 = m[2]+ m[3]
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */
MOVOU X14, X15
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */
LONG $0x6d0f4566; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */
MOVOU X13, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */

LOAD_SHUFFLE

G1
G2

DIAGONALIZE

// LOAD_MSG_ ##r ##_3(b0, b1);
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X15, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */
MOVOU X12, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU X15, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */
BYTE $0xd5; BYTE $0x08
LONG $0x3a0f4566; WORD $0xd50f; BYTE $0x08 // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */
MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */
LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */

LOAD_SHUFFLE

G1
G2

UNDIAGONALIZE

///////////////////////////////////////////////////////////////////////////
// R O U N D 1 1
///////////////////////////////////////////////////////////////////////////

// LOAD_MSG_ ##r ##_1(b0, b1);
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */

LOAD_SHUFFLE

G1
G2

DIAGONALIZE

// LOAD_MSG_ ##r ##_3(b0, b1);
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
MOVOU X12, X10
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
MOVOU X14, X11
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */

LOAD_SHUFFLE

G1
G2

UNDIAGONALIZE

///////////////////////////////////////////////////////////////////////////
// R O U N D 1 2
///////////////////////////////////////////////////////////////////////////

// LOAD_MSG_ ##r ##_1(b0, b1);
// LOAD_MSG_ ##r ##_2(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
MOVOU 80(DX), X10 // X10 = m[10]+m[11]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ;
BYTE $0xdc; BYTE $0x08
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ;

LOAD_SHUFFLE

G1
G2

DIAGONALIZE

// LOAD_MSG_ ##r ##_3(b0, b1);
// LOAD_MSG_ ##r ##_4(b0, b1);
// (X12 used as additional temp register)
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
MOVOU X12, X8
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
BYTE $0xc4; BYTE $0x08
LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
MOVOU X14, X9
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */

LOAD_SHUFFLE

G1
G2

UNDIAGONALIZE

// Reload digest (most current value store in &out)
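A note on the rounds above: the G1/G2, DIAGONALIZE/UNDIAGONALIZE and LOAD_SHUFFLE macros are the SSE rendering of BLAKE2b's mixing function G applied across the state held in the XMM registers, and each LOAD_MSG_ block gathers the message words that round's permutation calls for; the BYTE-to-LONG changes in the diff only re-encode the same hand-assembled instructions. For orientation only, here is a scalar sketch of G as defined for BLAKE2b (RFC 7693), not code taken from blake2b-simd:

package main

import "fmt"

// rotr is a 64-bit rotate right.
func rotr(x uint64, n uint) uint64 { return x>>n | x<<(64-n) }

// g is the textbook BLAKE2b mixing function: it folds two message words
// x and y into the four state words a, b, c, d.
func g(a, b, c, d, x, y uint64) (uint64, uint64, uint64, uint64) {
	a = a + b + x
	d = rotr(d^a, 32)
	c = c + d
	b = rotr(b^c, 24)
	a = a + b + y
	d = rotr(d^a, 16)
	c = c + d
	b = rotr(b^c, 63)
	return a, b, c, d
}

func main() {
	fmt.Println(g(1, 2, 3, 4, 5, 6))
}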
9
vendor/github.com/minio/blake2b-simd/compress_generic.go
generated
vendored
@ -26,12 +26,13 @@ func compressGeneric(d *digest, p []uint8) {
v13 := iv[5] ^ d.t[1]
v14 := iv[6] ^ d.f[0]
v15 := iv[7] ^ d.f[1]
var m [16]uint64

j := 0
for i := 0; i < 16; i++ {
m[i] = uint64(p[j]) | uint64(p[j+1])<<8 | uint64(p[j+2])<<16 | uint64(p[j+3])<<24 |
uint64(p[j+4])<<32 | uint64(p[j+5])<<40 | uint64(p[j+6])<<48 | uint64(p[j+7])<<56
var m [16]uint64
for i := range m {
m[i] = uint64(p[j]) | uint64(p[j+1])<<8 | uint64(p[j+2])<<16 |
uint64(p[j+3])<<24 | uint64(p[j+4])<<32 | uint64(p[j+5])<<40 |
uint64(p[j+6])<<48 | uint64(p[j+7])<<56
j += 8
}
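The rewritten loop above packs the 128-byte input block into sixteen little-endian 64-bit message words. As a standalone illustration (using encoding/binary rather than the vendored code's manual shifts), the same load can be written as:

package main

import (
	"encoding/binary"
	"fmt"
)

// loadMsg splits a 128-byte BLAKE2b block into sixteen little-endian
// 64-bit message words, the same job the shift-and-or loop above does.
func loadMsg(block []byte) [16]uint64 {
	var m [16]uint64
	for i := range m {
		m[i] = binary.LittleEndian.Uint64(block[i*8:])
	}
	return m
}

func main() {
	block := make([]byte, 128)
	block[0] = 0x2a // so m[0] == 42
	fmt.Println(loadMsg(block)[0])
}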
22
vendor/github.com/minio/sha256-simd/cpuid_386.s
generated
vendored
@ -1,4 +1,24 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
// The MIT License (MIT)
//
// Copyright (c) 2015 Klaus Post
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// +build 386,!gccgo
22
vendor/github.com/minio/sha256-simd/cpuid_amd64.s
generated
vendored
@ -1,4 +1,24 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
// The MIT License (MIT)
//
// Copyright (c) 2015 Klaus Post
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// +build amd64,!gccgo
3
vendor/github.com/minio/sha256-simd/cpuid_arm.go
generated
vendored
@ -28,6 +28,5 @@ func xgetbv(index uint32) (eax, edx uint32) {
}

func haveArmSha() bool {
// TODO: Implement feature detection for ARM
return true
return false
}
49
vendor/github.com/minio/sha256-simd/cpuid_linux_arm64.go
generated
vendored
Normal file
@ -0,0 +1,49 @@
// +build arm64,linux

// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package sha256

import (
"bytes"
"io/ioutil"
)

func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func xgetbv(index uint32) (eax, edx uint32) {
return 0, 0
}

// File to check for cpu capabilities.
const procCPUInfo = "/proc/cpuinfo"

// Feature to check for.
const sha256Feature = "sha2"

func haveArmSha() bool {
cpuInfo, err := ioutil.ReadFile(procCPUInfo)
if err != nil {
return false
}
return bytes.Contains(cpuInfo, []byte(sha256Feature))
}
35
vendor/github.com/minio/sha256-simd/cpuid_others_arm64.go
generated
vendored
Normal file
@ -0,0 +1,35 @@
// +build arm64,!linux

// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package sha256

func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func xgetbv(index uint32) (eax, edx uint32) {
return 0, 0
}

// Check for sha2 instruction flag.
func haveArmSha() bool {
return false
}
@ -28,6 +28,5 @@ func xgetbv(index uint32) (eax, edx uint32) {
}

func haveArmSha() bool {
// TODO: Implement feature detection for ARM
return true
return false
}
32
vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go
generated
vendored
Normal file
@ -0,0 +1,32 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package sha256

func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}

func xgetbv(index uint32) (eax, edx uint32) {
return 0, 0
}

func haveArmSha() bool {
return false
}
3
vendor/github.com/minio/sha256-simd/sha256.go
generated
vendored
@ -89,7 +89,8 @@ func New() hash.Hash {
d.Reset()
return d
}
// default back to the standard golang implementation
// Fallback to the standard golang implementation
// if no features were found.
return sha256.New()
}
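Because New() above returns a hash.Hash either way (SIMD-accelerated when the CPU qualifies, crypto/sha256 otherwise), callers use the package exactly like the standard library. A small usage sketch, not part of this diff:

package main

import (
	"encoding/hex"
	"fmt"

	sha256 "github.com/minio/sha256-simd"
)

func main() {
	// New() transparently picks the accelerated implementation when the
	// CPU supports it and falls back to crypto/sha256 otherwise.
	h := sha256.New()
	h.Write([]byte("hello, world"))
	fmt.Println(hex.EncodeToString(h.Sum(nil)))
}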
22
vendor/github.com/minio/sha256-simd/sha256block_ppc64.go
generated
vendored
Normal file
@ -0,0 +1,22 @@
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package sha256

func blockAvx2Go(dig *digest, p []byte) {}
func blockAvxGo(dig *digest, p []byte) {}
func blockSsseGo(dig *digest, p []byte) {}
func blockArmGo(dig *digest, p []byte) {}
22
vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go
generated
vendored
Normal file
@ -0,0 +1,22 @@
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package sha256

func blockAvx2Go(dig *digest, p []byte) {}
func blockAvxGo(dig *digest, p []byte) {}
func blockSsseGo(dig *digest, p []byte) {}
func blockArmGo(dig *digest, p []byte) {}
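Both ppc64 files above contain nothing but empty stubs: they keep the references to the amd64/ARM block routines compiling on platforms that have no assembly, while the capability flags stay false so the generic Go path does the actual work. A single-file sketch of that dispatch idea, with hypothetical names rather than the package's real wiring:

package main

import "fmt"

// blockFast stands in for an assembly-backed routine; on platforms without
// one it is an empty stub, just like the ppc64 stubs above.
func blockFast(p []byte) {}

// blockGeneric is the portable pure-Go fallback.
func blockGeneric(p []byte) { fmt.Println("generic block over", len(p), "bytes") }

// useFast would be set by a cpuid/feature probe; on stub-only platforms it
// stays false, so the no-op stub is never selected.
var useFast = false

func block(p []byte) {
	if useFast {
		blockFast(p)
		return
	}
	blockGeneric(p)
}

func main() {
	block(make([]byte, 64))
}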
21
vendor/vendor.json
vendored
@ -1,6 +1,6 @@
{
"comment": "",
"ignore": "appengine test",
"ignore": "test",
"package": [
{
"path": "github.com/Sirupsen/logrus",
@ -62,7 +62,7 @@
"revisionTime": "2016-06-10T14:06:02+03:00"
},
{
"checksumSHA1": "2a/SsTUBMKtcM6VtpbdPGO+c6c8=",
"checksumSHA1": "W+E/2xXcE1GmJ0Qb784ald0Fn6I=",
"path": "github.com/golang/snappy",
"revision": "d9eb7a3d35ec988b8585d4a0068e462c27d28380",
"revisionTime": "2016-05-29T05:00:41Z"
@ -109,10 +109,10 @@
"revisionTime": "2016-10-16T15:41:25Z"
},
{
"checksumSHA1": "XRii0aDqXZvztXflEB2EE9TRoks=",
"checksumSHA1": "Pzd1bfm8Yj1radncaohNZu+UT1I=",
"path": "github.com/klauspost/reedsolomon",
"revision": "c54154da9e35cab25232314cf69ab9d78447f9a5",
"revisionTime": "2016-09-12T19:31:07Z"
"revision": "d0a56f72c0d40a6cdde43a1575ad9686a0098b70",
"revisionTime": "2016-10-28T07:13:20Z"
},
{
"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",
@ -137,9 +137,10 @@
"revisionTime": "2015-12-11T09:06:21+09:00"
},
{
"checksumSHA1": "IgPoMBktWdCLuyzDBfzi34sT+jg=",
"path": "github.com/minio/blake2b-simd",
"revision": "25efc542f2c5064cf312cdca043790a7af861c4c",
"revisionTime": "2016-07-06T10:29:24+02:00"
"revision": "c50cace0dc7d72a80244a5f88ddd3e08a73db8de",
"revisionTime": "2016-07-22T09:38:12Z"
},
{
"path": "github.com/minio/cli",
@ -187,10 +188,10 @@
"revisionTime": "2016-07-24T00:05:56Z"
},
{
"checksumSHA1": "i8Hl0yGP1jqorMgfFMoJCItnI38=",
"checksumSHA1": "URVle4qtadmW9w9BulDRHY3kxnA=",
"path": "github.com/minio/sha256-simd",
"revision": "6f50cd1d784b2bea46167b6929f16c0d12eefbfb",
"revisionTime": "2016-08-16T22:25:11Z"
"revision": "e82e73b775766b9011503e80e6772fc32b9afc5b",
"revisionTime": "2016-12-19T23:17:30Z"
},
{
"checksumSHA1": "Nj7vQ2GlvJiPP7sqJX5AurrDSD4=",