From faa6b1e925f01da76ca7190d73e6bf38cb740ab4 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 19 Dec 2016 19:32:55 -0800 Subject: [PATCH] vendorize deps for snappy, blake2b and sha256 (#3476) Bring in new optimization and portability changes. Fixes https://github.com/minio/minio-go/issues/578 --- cmd/fs-v1-multipart.go | 3 +- cmd/fs-v1.go | 2 +- cmd/hasher.go | 3 +- cmd/xl-v1-multipart.go | 2 +- cmd/xl-v1-object.go | 2 +- .../github.com/golang/snappy/decode_other.go | 101 ++ .../github.com/golang/snappy/encode_other.go | 238 +++ .../klauspost/reedsolomon/README.md | 2 + .../github.com/minio/blake2b-simd/README.md | 2 +- .../minio/blake2b-simd/compressAvx2_amd64.go | 11 +- .../minio/blake2b-simd/compressAvx_amd64.go | 11 +- .../minio/blake2b-simd/compressAvx_amd64.s | 1348 +++++++-------- .../minio/blake2b-simd/compressSse_amd64.go | 11 +- .../minio/blake2b-simd/compressSse_amd64.s | 1526 ++++++++--------- .../minio/blake2b-simd/compress_generic.go | 9 +- .../github.com/minio/sha256-simd/cpuid_386.s | 22 +- .../minio/sha256-simd/cpuid_amd64.s | 22 +- .../github.com/minio/sha256-simd/cpuid_arm.go | 3 +- .../minio/sha256-simd/cpuid_linux_arm64.go | 49 + .../minio/sha256-simd/cpuid_others_arm64.go | 35 + .../{cpuid_arm64.go => cpuid_ppc64.go} | 3 +- .../minio/sha256-simd/cpuid_ppc64le.go | 32 + vendor/github.com/minio/sha256-simd/sha256.go | 3 +- .../minio/sha256-simd/sha256block_ppc64.go | 22 + .../minio/sha256-simd/sha256block_ppc64le.go | 22 + vendor/vendor.json | 21 +- 26 files changed, 1915 insertions(+), 1590 deletions(-) create mode 100644 vendor/github.com/golang/snappy/decode_other.go create mode 100644 vendor/github.com/golang/snappy/encode_other.go create mode 100644 vendor/github.com/minio/sha256-simd/cpuid_linux_arm64.go create mode 100644 vendor/github.com/minio/sha256-simd/cpuid_others_arm64.go rename vendor/github.com/minio/sha256-simd/{cpuid_arm64.go => cpuid_ppc64.go} (93%) create mode 100644 vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go create mode 100644 vendor/github.com/minio/sha256-simd/sha256block_ppc64.go create mode 100644 vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go diff --git a/cmd/fs-v1-multipart.go b/cmd/fs-v1-multipart.go index 4f7e45dc3..963f6b291 100644 --- a/cmd/fs-v1-multipart.go +++ b/cmd/fs-v1-multipart.go @@ -18,7 +18,6 @@ package cmd import ( "crypto/md5" - "crypto/sha256" "encoding/hex" "fmt" "hash" @@ -26,6 +25,8 @@ import ( "path" "strings" "time" + + "github.com/minio/sha256-simd" ) // listMultipartUploads - lists all multipart uploads. diff --git a/cmd/fs-v1.go b/cmd/fs-v1.go index ead0c5745..b7588a47f 100644 --- a/cmd/fs-v1.go +++ b/cmd/fs-v1.go @@ -18,7 +18,6 @@ package cmd import ( "crypto/md5" - "crypto/sha256" "encoding/hex" "errors" "fmt" @@ -29,6 +28,7 @@ import ( "strings" "github.com/minio/minio/pkg/mimedb" + "github.com/minio/sha256-simd" ) // fsObjects - Implements fs object layer. diff --git a/cmd/hasher.go b/cmd/hasher.go index 83e2c474a..2988ede1b 100644 --- a/cmd/hasher.go +++ b/cmd/hasher.go @@ -18,9 +18,10 @@ package cmd import ( "crypto/md5" - "crypto/sha256" "encoding/base64" "encoding/hex" + + "github.com/minio/sha256-simd" ) // getSHA256Hash returns SHA-256 hash of given data. diff --git a/cmd/xl-v1-multipart.go b/cmd/xl-v1-multipart.go index 802c34ee4..1ce509639 100644 --- a/cmd/xl-v1-multipart.go +++ b/cmd/xl-v1-multipart.go @@ -18,7 +18,6 @@ package cmd import ( "crypto/md5" - "crypto/sha256" "encoding/hex" "fmt" "hash" @@ -29,6 +28,7 @@ import ( "time" "github.com/minio/minio/pkg/mimedb" + "github.com/minio/sha256-simd" ) // listMultipartUploads - lists all multipart uploads. diff --git a/cmd/xl-v1-object.go b/cmd/xl-v1-object.go index 6ee87336a..a39fef4a1 100644 --- a/cmd/xl-v1-object.go +++ b/cmd/xl-v1-object.go @@ -18,7 +18,6 @@ package cmd import ( "crypto/md5" - "crypto/sha256" "encoding/hex" "hash" "io" @@ -30,6 +29,7 @@ import ( "github.com/minio/minio/pkg/bpool" "github.com/minio/minio/pkg/mimedb" "github.com/minio/minio/pkg/objcache" + "github.com/minio/sha256-simd" ) // list all errors which can be ignored in object operations. diff --git a/vendor/github.com/golang/snappy/decode_other.go b/vendor/github.com/golang/snappy/decode_other.go new file mode 100644 index 000000000..8c9f2049b --- /dev/null +++ b/vendor/github.com/golang/snappy/decode_other.go @@ -0,0 +1,101 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc noasm + +package snappy + +// decode writes the decoding of src to dst. It assumes that the varint-encoded +// length of the decompressed bytes has already been read, and that len(dst) +// equals that length. +// +// It returns 0 on success or a decodeErrCodeXxx error code on failure. +func decode(dst, src []byte) int { + var d, s, offset, length int + for s < len(src) { + switch src[s] & 0x03 { + case tagLiteral: + x := uint32(src[s] >> 2) + switch { + case x < 60: + s++ + case x == 60: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-1]) + case x == 61: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-2]) | uint32(src[s-1])<<8 + case x == 62: + s += 4 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + case x == 63: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + } + length = int(x) + 1 + if length <= 0 { + return decodeErrCodeUnsupportedLiteralLength + } + if length > len(dst)-d || length > len(src)-s { + return decodeErrCodeCorrupt + } + copy(dst[d:], src[s:s+length]) + d += length + s += length + continue + + case tagCopy1: + s += 2 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 4 + int(src[s-2])>>2&0x7 + offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + + case tagCopy2: + s += 3 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-3])>>2 + offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + + case tagCopy4: + s += 5 + if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line. + return decodeErrCodeCorrupt + } + length = 1 + int(src[s-5])>>2 + offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + } + + if offset <= 0 || d < offset || length > len(dst)-d { + return decodeErrCodeCorrupt + } + // Copy from an earlier sub-slice of dst to a later sub-slice. Unlike + // the built-in copy function, this byte-by-byte copy always runs + // forwards, even if the slices overlap. Conceptually, this is: + // + // d += forwardCopy(dst[d:d+length], dst[d-offset:]) + for end := d + length; d != end; d++ { + dst[d] = dst[d-offset] + } + } + if d != len(dst) { + return decodeErrCodeCorrupt + } + return 0 +} diff --git a/vendor/github.com/golang/snappy/encode_other.go b/vendor/github.com/golang/snappy/encode_other.go new file mode 100644 index 000000000..dbcae905e --- /dev/null +++ b/vendor/github.com/golang/snappy/encode_other.go @@ -0,0 +1,238 @@ +// Copyright 2016 The Snappy-Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 appengine !gc noasm + +package snappy + +func load32(b []byte, i int) uint32 { + b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +func load64(b []byte, i int) uint64 { + b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line. + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +// emitLiteral writes a literal chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= len(lit) && len(lit) <= 65536 +func emitLiteral(dst, lit []byte) int { + i, n := 0, uint(len(lit)-1) + switch { + case n < 60: + dst[0] = uint8(n)<<2 | tagLiteral + i = 1 + case n < 1<<8: + dst[0] = 60<<2 | tagLiteral + dst[1] = uint8(n) + i = 2 + default: + dst[0] = 61<<2 | tagLiteral + dst[1] = uint8(n) + dst[2] = uint8(n >> 8) + i = 3 + } + return i + copy(dst[i:], lit) +} + +// emitCopy writes a copy chunk and returns the number of bytes written. +// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= 65535 +// 4 <= length && length <= 65535 +func emitCopy(dst []byte, offset, length int) int { + i := 0 + // The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The + // threshold for this loop is a little higher (at 68 = 64 + 4), and the + // length emitted down below is is a little lower (at 60 = 64 - 4), because + // it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed + // by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as + // a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as + // 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a + // tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an + // encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1. + for length >= 68 { + // Emit a length 64 copy, encoded as 3 bytes. + dst[i+0] = 63<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= 64 + } + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + dst[i+0] = 59<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + i += 3 + length -= 60 + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy2 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + return i + 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + dst[i+1] = uint8(offset) + return i + 2 +} + +// extendMatch returns the largest k such that k <= len(src) and that +// src[i:i+k-j] and src[j:k] have the same contents. +// +// It assumes that: +// 0 <= i && i < j && j <= len(src) +func extendMatch(src []byte, i, j int) int { + for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { + } + return j +} + +func hash(u, shift uint32) uint32 { + return (u * 0x1e35a7bd) >> shift +} + +// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlock(dst, src []byte) (d int) { + // Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. + // The table element type is uint16, as s < sLimit and sLimit < len(src) + // and len(src) <= maxBlockSize and maxBlockSize == 65536. + const ( + maxTableSize = 1 << 14 + // tableMask is redundant, but helps the compiler eliminate bounds + // checks. + tableMask = maxTableSize - 1 + ) + shift := uint32(32 - 8) + for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { + shift-- + } + // In Go, all array elements are zero-initialized, so there is no advantage + // to a smaller tableSize per se. However, it matches the C++ algorithm, + // and in the asm versions of this code, we can get away with zeroing only + // the first tableSize elements. + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + nextHash := hash(load32(src, s), shift) + + for { + // Copied from the C++ snappy implementation: + // + // Heuristic match skipping: If 32 bytes are scanned with no matches + // found, start looking only at every other byte. If 32 more bytes are + // scanned (or skipped), look at every third byte, etc.. When a match + // is found, immediately go back to looking at every byte. This is a + // small loss (~5% performance, ~0.1% density) for compressible data + // due to more bookkeeping, but for non-compressible data (such as + // JPEG) it's a huge win since the compressor quickly "realizes" the + // data is incompressible and doesn't bother looking for matches + // everywhere. + // + // The "skip" variable keeps track of how many bytes there are since + // the last match; dividing it by 32 (ie. right-shifting by five) gives + // the number of bytes to move ahead for each iteration. + skip := 32 + + nextS := s + candidate := 0 + for { + s = nextS + bytesBetweenHashLookups := skip >> 5 + nextS = s + bytesBetweenHashLookups + skip += bytesBetweenHashLookups + if nextS > sLimit { + goto emitRemainder + } + candidate = int(table[nextHash&tableMask]) + table[nextHash&tableMask] = uint16(s) + nextHash = hash(load32(src, nextS), shift) + if load32(src, s) == load32(src, candidate) { + break + } + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + + // Extend the 4-byte match as long as possible. + // + // This is an inlined version of: + // s = extendMatch(src, candidate+4, s+4) + s += 4 + for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 { + } + + d += emitCopy(dst[d:], base-candidate, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + // We could immediately start working at s now, but to improve + // compression we first update the hash table at s-1 and at s. If + // another emitCopy is not our next move, also calculate nextHash + // at s+1. At least on GOARCH=amd64, these three hash calculations + // are faster as one load64 call (with some shifts) instead of + // three load32 calls. + x := load64(src, s-1) + prevHash := hash(uint32(x>>0), shift) + table[prevHash&tableMask] = uint16(s - 1) + currHash := hash(uint32(x>>8), shift) + candidate = int(table[currHash&tableMask]) + table[currHash&tableMask] = uint16(s) + if uint32(x>>8) != load32(src, candidate) { + nextHash = hash(uint32(x>>16), shift) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md index cd6fc04cc..3e7f51841 100644 --- a/vendor/github.com/klauspost/reedsolomon/README.md +++ b/vendor/github.com/klauspost/reedsolomon/README.md @@ -193,6 +193,8 @@ Example of performance scaling on Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz - 4 ph # Links * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/). * [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze. +* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package. +* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance. * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests. * [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards. * [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations. diff --git a/vendor/github.com/minio/blake2b-simd/README.md b/vendor/github.com/minio/blake2b-simd/README.md index b4f6ac558..31fcbf749 100644 --- a/vendor/github.com/minio/blake2b-simd/README.md +++ b/vendor/github.com/minio/blake2b-simd/README.md @@ -24,7 +24,7 @@ This is a summary of the performance improvements. Full details are shown below. asm2plan9s ---------- -In order to be able to work more easily with AVX2/AVX instructions, a separate tool was developed to convert AVX2/AVX instructions into the corresponding BYTE sequence as accepted by Go assembly. See [asm2plan9s](https://github.com/fwessels/asm2plan9s) for more information. +In order to be able to work more easily with AVX2/AVX instructions, a separate tool was developed to convert AVX2/AVX instructions into the corresponding BYTE sequence as accepted by Go assembly. See [asm2plan9s](https://github.com/minio/asm2plan9s) for more information. bt2sum ------ diff --git a/vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go b/vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go index 1b3ebae0f..ec53599f8 100644 --- a/vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go +++ b/vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go @@ -23,11 +23,12 @@ package blake2b func compressAVX2Loop(p []uint8, in, iv, t, f, shffle, out []uint64) func compressAVX2(d *digest, p []uint8) { + var ( + in [8]uint64 + out [8]uint64 + shffle [8]uint64 + ) - in := make([]uint64, 8, 8) - out := make([]uint64, 8, 8) - - shffle := make([]uint64, 8, 8) // vector for PSHUFB instruction shffle[0] = 0x0201000706050403 shffle[1] = 0x0a09080f0e0d0c0b @@ -40,7 +41,7 @@ func compressAVX2(d *digest, p []uint8) { in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] - compressAVX2Loop(p, in, iv[:], d.t[:], d.f[:], shffle, out) + compressAVX2Loop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:]) d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] } diff --git a/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go b/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go index 7bed76c02..cfa12c04f 100644 --- a/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go +++ b/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go @@ -23,18 +23,19 @@ package blake2b func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64) func compressAVX(d *digest, p []uint8) { + var ( + in [8]uint64 + out [8]uint64 + shffle [2]uint64 + ) - in := make([]uint64, 8, 8) - out := make([]uint64, 8, 8) - - shffle := make([]uint64, 2, 2) // vector for PSHUFB instruction shffle[0] = 0x0201000706050403 shffle[1] = 0x0a09080f0e0d0c0b in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] - blockAVXLoop(p, in, iv[:], d.t[:], d.f[:], shffle, out) + blockAVXLoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:]) d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] } diff --git a/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s b/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s index 987d0bf47..f68e17392 100644 --- a/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s +++ b/vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s @@ -43,750 +43,640 @@ // #define G1 \ - \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); - BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ - BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ - BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ - BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ - BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ - BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ - BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ - BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ - BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ - BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ - BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ - BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ - BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ + \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + LONG $0xd479c1c4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ + LONG $0xd471c1c4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ + LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ + LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ + LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + LONG $0xf670f9c5; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ + LONG $0xff70f9c5; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ + LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ + LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ + LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + LONG $0x0069c2c4; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ + LONG $0x0061c2c4; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ #define G2 \ - \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); - BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ - BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ - BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ - BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ - BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ - BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ - BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ - BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ - BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ - BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ - BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ - BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ - BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ - BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */ - BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ - BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */ - BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ + \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + LONG $0xd479c1c4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ + LONG $0xd471c1c4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ + LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ + LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ + LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + LONG $0xf670fbc5; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ + LONG $0xf670fac5; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ + LONG $0xff70fbc5; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ + LONG $0xff70fac5; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ + LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ + LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ + LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + LONG $0xfad469c5 \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */ + LONG $0xd273e9c5; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */ + LONG $0xef69c1c4; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ + LONG $0xfbd461c5 \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */ + LONG $0xd373e1c5; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */ + LONG $0xef61c1c4; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ #define DIAGONALIZE \ - \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - MOVOU X6, X13 \ /* t0 = row4l;\ */ - MOVOU X2, X14 \ /* t1 = row2l;\ */ - MOVOU X4, X6 \ /* row4l = row3l;\ */ - MOVOU X5, X4 \ /* row3l = row3h;\ */ - MOVOU X6, X5 \ /* row3h = row4l;\ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ - BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ - BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ + \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X6, X13 \ /* t0 = row4l;\ */ + MOVOU X2, X14 \ /* t1 = row2l;\ */ + MOVOU X4, X6 \ /* row4l = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X6, X5 \ /* row3h = row4l;\ */ + LONG $0x6c1141c4; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ + LONG $0x6d41c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ + LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + LONG $0x6d11c1c4; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ + LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + LONG $0x6d69c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ + LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + LONG $0x6d61c1c4; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ #define UNDIAGONALIZE \ - \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - MOVOU X4, X13 \ /* t0 = row3l;\ */ - MOVOU X5, X4 \ /* row3l = row3h;\ */ - MOVOU X13, X5 \ /* row3h = t0;\ */ - MOVOU X2, X13 \ /* t0 = row2l;\ */ - MOVOU X6, X14 \ /* t1 = row4l;\ */ - BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ - BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ - BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ - BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ + \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X4, X13 \ /* t0 = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X13, X5 \ /* row3h = t0;\ */ + MOVOU X2, X13 \ /* t0 = row2l;\ */ + MOVOU X6, X14 \ /* t1 = row4l;\ */ + LONG $0xfa6c69c5 \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ + LONG $0x6d61c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ + LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + LONG $0x6d11c1c4; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ + LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + LONG $0x6d49c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ + LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + LONG $0x6d41c1c4; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ #define LOAD_SHUFFLE \ - \ // Load shuffle value - MOVQ shffle+120(FP), SI \ // SI: &shuffle - MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a + \ // Load shuffle value + MOVQ shffle+120(FP), SI \ // SI: &shuffle + MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a // func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64) TEXT ·blockAVXLoop(SB), 7, $0 - // REGISTER USE - // R8: loop counter - // DX: message pointer - // SI: temp pointer for loading - // X0 - X7: v0 - v15 - // X8 - X11: m[0] - m[7] - // X12: shuffle value - // X13 - X15: temp registers + // REGISTER USE + // R8: loop counter + // DX: message pointer + // SI: temp pointer for loading + // X0 - X7: v0 - v15 + // X8 - X11: m[0] - m[7] + // X12: shuffle value + // X13 - X15: temp registers - // Load digest - MOVQ in+24(FP), SI // SI: &in - MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ - MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ - MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ - MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + // Load digest + MOVQ in+24(FP), SI // SI: &in + MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ - // Already store digest into &out (so we can reload it later generically) - MOVQ out+144(FP), SI // SI: &out - MOVOU X0, 0(SI) // out[0]+out[1] = X0 - MOVOU X1, 16(SI) // out[2]+out[3] = X1 - MOVOU X2, 32(SI) // out[4]+out[5] = X2 - MOVOU X3, 48(SI) // out[6]+out[7] = X3 + // Already store digest into &out (so we can reload it later generically) + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 - // Initialize message pointer and loop counter - MOVQ message+0(FP), DX // DX: &p (message) - MOVQ message_len+8(FP), R8 // R8: len(message) - SHRQ $7, R8 // len(message) / 128 - CMPQ R8, $0 - JEQ complete + // Initialize message pointer and loop counter + MOVQ message+0(FP), DX // DX: &p (message) + MOVQ message_len+8(FP), R8 // R8: len(message) + SHRQ $7, R8 // len(message) / 128 + CMPQ R8, $0 + JEQ complete loop: - // Increment counter - MOVQ t+72(FP), SI // SI: &t - MOVQ 0(SI), R9 // - ADDQ $128, R9 // /* d.t[0] += BlockSize */ - MOVQ R9, 0(SI) // - CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ - JGE noincr // - MOVQ 8(SI), R9 // - ADDQ $1, R9 // /* d.t[1]++ */ - MOVQ R9, 8(SI) // -noincr: // /* } */ - - // Load initialization vector - MOVQ iv+48(FP), SI // SI: &iv - MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ - MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ - MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ - MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ - MOVQ t+72(FP), SI // SI: &t - MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */ - PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */ - MOVQ t+96(FP), SI // SI: &f - MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ - PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+m[1] - MOVOU 16(DX), X13 // X13 = m[2]+m[3] - MOVOU 32(DX), X14 // X14 = m[4]+m[5] - MOVOU 48(DX), X15 // X15 = m[6]+m[7] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 2 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 112(DX), X12 // X12 = m[14]+m[15] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 48(DX), X15 // X15 = m[6]+ m[7] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ - BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 3 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 32(DX), X12 // X12 = m[4]+ m[5] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xc5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 64(DX), X15 // X15 = m[8]+ m[9] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X14 // X14 = m[4]+ m[5] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */ - BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 4 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 112(DX), X14 // X14 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X15 // X15 = m[8]+ m[9] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 5 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xd4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 6 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 64(DX), X15 // X15 = m[8]+ m[9] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */ - MOVOU 80(DX), X12 // X12 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */ - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 7 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */ - MOVOU 80(DX), X12 // X12 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xde // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */ - BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */ - BYTE $0x08 - MOVOU 16(DX), X14 // X14 = m[2]+ m[3] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 8 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 9 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */ - BYTE $0x08 - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */ - BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */ - BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xcc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */ - BYTE $0x08 - MOVOU 32(DX), X12 // X12 = m[4]+ m[5] - MOVOU 48(DX), X15 // X15 = m[6]+ m[7] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 0 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */ - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X14 // X14 = m[4]+ m[5] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 1 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+m[1] - MOVOU 16(DX), X13 // X13 = m[2]+m[3] - MOVOU 32(DX), X14 // X14 = m[4]+m[5] - MOVOU 48(DX), X15 // X15 = m[6]+m[7] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 2 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 112(DX), X12 // X12 = m[14]+m[15] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 48(DX), X15 // X15 = m[6]+ m[7] - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ - BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ - BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ - BYTE $0x08 - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ - BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - // Reload digest (most current value store in &out) - MOVQ out+144(FP), SI // SI: &in - MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ - MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ - MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ - MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ - - // Final computations and prepare for storing - PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */ - PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */ - PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */ - PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */ - PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */ - PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */ - PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ - PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ - - // Store digest into &out - MOVQ out+144(FP), SI // SI: &out - MOVOU X0, 0(SI) // out[0]+out[1] = X0 - MOVOU X1, 16(SI) // out[2]+out[3] = X1 - MOVOU X2, 32(SI) // out[4]+out[5] = X2 - MOVOU X3, 48(SI) // out[6]+out[7] = X3 - - // Increment message pointer and check if there's more to do - ADDQ $128, DX // message += 128 - SUBQ $1, R8 - JNZ loop + // Increment counter + MOVQ t+72(FP), SI // SI: &t + MOVQ 0(SI), R9 + ADDQ $128, R9 // /* d.t[0] += BlockSize */ + MOVQ R9, 0(SI) + CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ + JGE noincr + MOVQ 8(SI), R9 + ADDQ $1, R9 // /* d.t[1]++ */ + MOVQ R9, 8(SI) +noincr: // /* } */ + + // Load initialization vector + MOVQ iv+48(FP), SI // SI: &iv + MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ + MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ + MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ + MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ + MOVQ t+72(FP), SI // SI: &t + MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */ + PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */ + MOVQ t+96(FP), SI // SI: &f + MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ + PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ + LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ + LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ + LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ + LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ + LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ + LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ + LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ + LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ + LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ + LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 3 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x0f0943c4; WORD $0x08c5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */ + LONG $0x6d1941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + LONG $0x6c0141c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */ + LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ + LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */ + LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */ + LONG $0x6d1141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X14 // X14 = m[4]+ m[5] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6c0141c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */ + LONG $0x0f0943c4; WORD $0x08dc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 4 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + LONG $0x6d1141c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */ + LONG $0x6d0141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 112(DX), X14 // X14 = m[14]+m[15] + LONG $0x6d1141c4; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */ + LONG $0x6c0141c4; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d1141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */ + LONG $0x6c1941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */ + LONG $0x6d0141c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */ + LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */ + LONG $0x6c1941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 5 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */ + LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0941c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */ + LONG $0x6c1941c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */ + LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */ + LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */ + LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */ + LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ + LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + LONG $0x0f0943c4; WORD $0x08d4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */ + LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ + LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 6 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + LONG $0x6c1141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */ + LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */ + MOVOU 80(DX), X12 // X12 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */ + LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */ + LONG $0x6c1141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */ + LONG $0x6d0141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */ + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + LONG $0x6d0941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */ + LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */ + LONG $0x6c0141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 7 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */ + LONG $0x6c0941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */ + LONG $0x6c0141c4; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */ + MOVOU 80(DX), X12 // X12 = m[10]+m[11] + LONG $0x6d1141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */ + LONG $0x0f1943c4; WORD $0x08de // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */ + LONG $0x0f0943c4; WORD $0x08ce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */ + MOVOU 16(DX), X14 // X14 = m[2]+ m[3] + LONG $0x6d1141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */ + LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */ + LONG $0x6c0941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 8 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */ + LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ + LONG $0x6c0941c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + LONG $0x0f0143c4; WORD $0x08d6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */ + LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d1141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */ + LONG $0x6c0941c4; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6c1941c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */ + LONG $0x6c0941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 9 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6c1141c4; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */ + LONG $0x0f1943c4; WORD $0x08ce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */ + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + LONG $0x6d0141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */ + LONG $0x0f0943c4; WORD $0x08dd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + LONG $0x6d0141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */ + LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */ + LONG $0x0f0943c4; WORD $0x08cc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */ + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + LONG $0x6d0141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */ + LONG $0x6c1141c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */ + LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */ + LONG $0x6c1941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 0 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6c0141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */ + LONG $0x6d1141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X14 // X14 = m[4]+ m[5] + LONG $0x6c1941c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */ + LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */ + LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */ + LONG $0x6d1941c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + LONG $0x0f0143c4; WORD $0x08d5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */ + LONG $0x6c0941c4; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ + LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ + LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ + LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ + LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ + LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ + LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ + LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ + LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ + LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ + LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + // Reload digest (most current value store in &out) + MOVQ out+144(FP), SI // SI: &in + MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + + // Final computations and prepare for storing + PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */ + PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */ + PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */ + PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */ + PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */ + PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */ + PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ + PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ + + // Store digest into &out + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 + + // Increment message pointer and check if there's more to do + ADDQ $128, DX // message += 128 + SUBQ $1, R8 + JNZ loop complete: - RET + RET diff --git a/vendor/github.com/minio/blake2b-simd/compressSse_amd64.go b/vendor/github.com/minio/blake2b-simd/compressSse_amd64.go index 7032f46e6..d539a7ade 100644 --- a/vendor/github.com/minio/blake2b-simd/compressSse_amd64.go +++ b/vendor/github.com/minio/blake2b-simd/compressSse_amd64.go @@ -23,18 +23,19 @@ package blake2b func blockSSELoop(p []uint8, in, iv, t, f, shffle, out []uint64) func compressSSE(d *digest, p []uint8) { + var ( + in [8]uint64 + out [8]uint64 + shffle [2]uint64 + ) - in := make([]uint64, 8, 8) - out := make([]uint64, 8, 8) - - shffle := make([]uint64, 2, 2) // vector for PSHUFB instruction shffle[0] = 0x0201000706050403 shffle[1] = 0x0a09080f0e0d0c0b in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] - blockSSELoop(p, in, iv[:], d.t[:], d.f[:], shffle, out) + blockSSELoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:]) d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] } diff --git a/vendor/github.com/minio/blake2b-simd/compressSse_amd64.s b/vendor/github.com/minio/blake2b-simd/compressSse_amd64.s index 051a6d6b9..6f31c949e 100644 --- a/vendor/github.com/minio/blake2b-simd/compressSse_amd64.s +++ b/vendor/github.com/minio/blake2b-simd/compressSse_amd64.s @@ -43,840 +43,728 @@ // #define G1 \ - \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ - BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ - BYTE $0x66; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ - BYTE $0xd4 \ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x38; BYTE $0x00 \ // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ - BYTE $0xdc \ - // DO NOT DELETE -- macro delimiter (previous line extended) + \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + LONG $0xd40f4166; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ + LONG $0xd40f4166; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ + LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ + LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ + LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + LONG $0xf6700f66; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ + LONG $0xff700f66; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ + LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ + LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ + LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + LONG $0x380f4166; WORD $0xd400 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ + LONG $0x380f4166; WORD $0xdc00 // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ #define G2 \ - \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xc2 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xcb \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf0 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xf9 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ - BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ - BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ - BYTE $0xf2; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ - BYTE $0xf3; BYTE $0x0f; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xe6 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xd4; BYTE $0xef \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xd4 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ - BYTE $0x66; BYTE $0x0f; BYTE $0xef; BYTE $0xdd \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ - MOVOU X2, X15 \ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */ - BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ - MOVOU X3, X15 \ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0xd4; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */ - BYTE $0x66; BYTE $0x0f; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xef; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ + \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + LONG $0xd40f4166; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ + LONG $0xd40f4166; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ + LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */ + LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */ + LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + LONG $0xf6700ff2; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ + LONG $0xf6700ff3; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ + LONG $0xff700ff2; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ + LONG $0xff700ff3; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ + LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */ + LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */ + LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + MOVOU X2, X15 \ + LONG $0xd40f4466; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */ + LONG $0xd2730f66; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */ + LONG $0xef0f4166; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ + MOVOU X3, X15 \ + LONG $0xd40f4466; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */ + LONG $0xd3730f66; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */ + LONG $0xef0f4166; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ #define DIAGONALIZE \ - \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - MOVOU X6, X13 \ /* t0 = row4l;\ */ - MOVOU X2, X14 \ /* t1 = row2l;\ */ - MOVOU X4, X6 \ /* row4l = row3l;\ */ - MOVOU X5, X4 \ /* row3l = row3h;\ */ - MOVOU X6, X5 \ /* row3h = row4l;\ */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ - MOVOU X7, X6 \ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ - MOVOU X13, X7 \ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ + \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X6, X13 \ /* t0 = row4l;\ */ + MOVOU X2, X14 \ /* t1 = row2l;\ */ + MOVOU X4, X6 \ /* row4l = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X6, X5 \ /* row3h = row4l;\ */ + LONG $0x6c0f4566; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ + MOVOU X7, X6 \ + LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ + LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + MOVOU X13, X7 \ + LONG $0x6d0f4166; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ + LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ + LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + LONG $0x6d0f4166; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ #define UNDIAGONALIZE \ - \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); - MOVOU X4, X13 \ /* t0 = row3l;\ */ - MOVOU X5, X4 \ /* row3l = row3h;\ */ - MOVOU X13, X5 \ /* row3h = t0;\ */ - MOVOU X2, X13 \ /* t0 = row2l;\ */ - MOVOU X6, X14 \ /* t1 = row4l;\ */ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ - MOVOU X3, X2 \ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ - MOVOU X13, X3 \ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ - BYTE $0x66; BYTE $0x44; BYTE $0x0f; BYTE $0x6c; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ + \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X4, X13 \ /* t0 = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X13, X5 \ /* row3h = t0;\ */ + MOVOU X2, X13 \ /* t0 = row2l;\ */ + MOVOU X6, X14 \ /* t1 = row4l;\ */ + LONG $0x6c0f4466; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ + MOVOU X3, X2 \ + LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ + LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + MOVOU X13, X3 \ + LONG $0x6d0f4166; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ + LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ + LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + LONG $0x6d0f4166; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ #define LOAD_SHUFFLE \ - \ // Load shuffle value - MOVQ shffle+120(FP), SI \ // SI: &shuffle - MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a + \ // Load shuffle value + MOVQ shffle+120(FP), SI \ // SI: &shuffle + MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a // func blockSSELoop(p []uint8, in, iv, t, f, shffle, out []uint64) TEXT ·blockSSELoop(SB), 7, $0 - // REGISTER USE - // R8: loop counter - // DX: message pointer - // SI: temp pointer for loading - // X0 - X7: v0 - v15 - // X8 - X11: m[0] - m[7] - // X12: shuffle value - // X13 - X15: temp registers + // REGISTER USE + // R8: loop counter + // DX: message pointer + // SI: temp pointer for loading + // X0 - X7: v0 - v15 + // X8 - X11: m[0] - m[7] + // X12: shuffle value + // X13 - X15: temp registers - // Load digest - MOVQ in+24(FP), SI // SI: &in - MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ - MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ - MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ - MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + // Load digest + MOVQ in+24(FP), SI // SI: &in + MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ - // Already store digest into &out (so we can reload it later generically) - MOVQ out+144(FP), SI // SI: &out - MOVOU X0, 0(SI) // out[0]+out[1] = X0 - MOVOU X1, 16(SI) // out[2]+out[3] = X1 - MOVOU X2, 32(SI) // out[4]+out[5] = X2 - MOVOU X3, 48(SI) // out[6]+out[7] = X3 + // Already store digest into &out (so we can reload it later generically) + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 - // Initialize message pointer and loop counter - MOVQ message+0(FP), DX // DX: &p (message) - MOVQ message_len+8(FP), R8 // R8: len(message) - SHRQ $7, R8 // len(message) / 128 - CMPQ R8, $0 - JEQ complete + // Initialize message pointer and loop counter + MOVQ message+0(FP), DX // DX: &p (message) + MOVQ message_len+8(FP), R8 // R8: len(message) + SHRQ $7, R8 // len(message) / 128 + CMPQ R8, $0 + JEQ complete loop: - // Increment counter - MOVQ t+72(FP), SI // SI: &t - MOVQ 0(SI), R9 // - ADDQ $128, R9 // /* d.t[0] += BlockSize */ - MOVQ R9, 0(SI) // - CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ - JGE noincr // - MOVQ 8(SI), R9 // - ADDQ $1, R9 // /* d.t[1]++ */ - MOVQ R9, 8(SI) // -noincr: // /* } */ - - // Load initialization vector - MOVQ iv+48(FP), SI // SI: &iv - MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ - MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ - MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ - MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ - MOVQ t+72(FP), SI // SI: &t - MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */ - PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */ - MOVQ t+96(FP), SI // SI: &f - MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ - PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+m[1] - MOVOU 16(DX), X13 // X13 = m[2]+m[3] - MOVOU 32(DX), X14 // X14 = m[4]+m[5] - MOVOU 48(DX), X15 // X15 = m[6]+m[7] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ - MOVOU X12, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ - MOVOU X14, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ - MOVOU X12, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ - MOVOU X14, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 2 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 112(DX), X12 // X12 = m[14]+m[15] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ - MOVOU 80(DX), X10 // X10 = m[10]+m[11] - MOVOU 48(DX), X11 // X11 = m[6]+ m[7] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; - BYTE $0xdc; BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ - BYTE $0xc4; BYTE $0x08 - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X11 // X11 = m[6]+ m[7] - MOVOU 96(DX), X10 // X10 = m[12]+m[13] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 3 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 32(DX), X12 // X12 = m[4]+ m[5] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X14, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */ - BYTE $0xc5; BYTE $0x08 - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 64(DX), X10 // X10 = m[8]+ m[9] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ - MOVOU X13, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */ - MOVOU X15, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */ - MOVOU X13, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X11 // X11 = m[4]+ m[5] - MOVOU 112(DX), X10 // X10 = m[14]+m[15] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */ - BYTE $0xdc; BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 4 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - MOVOU X13, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */ - MOVOU X15, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 64(DX), X10 // X10 = m[8]+ m[9] - MOVOU 112(DX), X14 // X14 = m[14]+m[15] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */ - MOVOU X15, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X13, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */ - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */ - MOVOU X15, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */ - MOVOU X13, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */ - MOVOU 0(DX), X11 // X11 = m[0]+ m[1] - MOVOU 48(DX), X10 // X10 = m[6]+ m[7] - MOVOU 64(DX), X15 // X15 = m[8]+ m[9] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 5 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - MOVOU X14, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */ - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */ - MOVOU 0(DX), X10 // X10 = m[0]+ m[1] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */ - MOVOU X13, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */ - MOVOU X15, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */ - MOVOU X13, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 64(DX), X11 // X11 = m[8]+ m[9] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU X14, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */ - BYTE $0xd4; BYTE $0x08 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 6 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 64(DX), X15 // X15 = m[8]+ m[9] - MOVOU X13, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */ - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */ - MOVOU 80(DX), X12 // X12 = m[10]+m[11] - MOVOU 96(DX), X10 // X10 = m[12]+m[13] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */ - MOVOU X12, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 48(DX), X14 // X14 = m[6]+ m[7] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */ - MOVOU X13, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */ - MOVOU X15, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */ - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 96(DX), X10 // X10 = m[12]+m[13] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */ - MOVOU X15, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 7 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */ - MOVOU X14, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ - MOVOU X15, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */ - MOVOU 80(DX), X11 // X11 = m[10]+m[11] - MOVOU X13, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */ - BYTE $0xde; BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */ - BYTE $0xce; BYTE $0x08 - MOVOU 16(DX), X11 // X14 = m[2]+ m[3] - MOVOU X13, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 8 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X14, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */ - MOVOU X12, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */ - MOVOU 0(DX), X11 // X11 = m[0]+ m[1] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU X15, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */ - BYTE $0xd6; BYTE $0x08 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X13, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */ - MOVOU 0(DX), X10 // X10 = m[0]+ m[1] - MOVOU 48(DX), X11 // X11 = m[6]+ m[7] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 9 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X13, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */ - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */ - BYTE $0xce; BYTE $0x08 - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 64(DX), X11 // X11 = m[8]+ m[9] - MOVOU X15, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */ - BYTE $0xdd; BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 16(DX), X13 // X13 = m[2]+ m[3] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - MOVOU X15, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */ - MOVOU X15, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */ - BYTE $0xcc; BYTE $0x08 - MOVOU 32(DX), X12 // X12 = m[4]+ m[5] - MOVOU 48(DX), X15 // X15 = m[6]+ m[7] - MOVOU X15, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */ - MOVOU X13, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */ - MOVOU X12, X15 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */ - MOVOU X12, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 0 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 48(DX), X13 // X13 = m[6]+ m[7] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 80(DX), X15 // X15 = m[10]+m[11] - MOVOU X15, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */ - MOVOU X13, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */ - MOVOU 16(DX), X10 // X10 = m[2]+ m[3] - MOVOU 32(DX), X14 // X14 = m[4]+ m[5] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */ - MOVOU X14, X15 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */ - MOVOU X13, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 64(DX), X13 // X13 = m[8]+ m[9] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X15, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */ - MOVOU X12, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */ - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU X15, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */ - BYTE $0xd5; BYTE $0x08 - MOVOU X14, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 1 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+m[1] - MOVOU 16(DX), X13 // X13 = m[2]+m[3] - MOVOU 32(DX), X14 // X14 = m[4]+m[5] - MOVOU 48(DX), X15 // X15 = m[6]+m[7] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ - MOVOU X12, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ - MOVOU X14, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 64(DX), X12 // X12 = m[8]+ m[9] - MOVOU 80(DX), X13 // X13 = m[10]+m[11] - MOVOU 96(DX), X14 // X14 = m[12]+m[13] - MOVOU 112(DX), X15 // X15 = m[14]+m[15] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ - MOVOU X12, X10 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ - MOVOU X14, X11 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - /////////////////////////////////////////////////////////////////////////// - // R O U N D 1 2 - /////////////////////////////////////////////////////////////////////////// - - // LOAD_MSG_ ##r ##_1(b0, b1); - // LOAD_MSG_ ##r ##_2(b0, b1); - // (X12 used as additional temp register) - MOVOU 112(DX), X12 // X12 = m[14]+m[15] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 64(DX), X14 // X14 = m[8]+ m[9] - MOVOU 96(DX), X15 // X15 = m[12]+m[13] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ - MOVOU 80(DX), X10 // X10 = m[10]+m[11] - MOVOU 48(DX), X11 // X11 = m[6]+ m[7] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; - BYTE $0xdc; BYTE $0x08 - - LOAD_SHUFFLE - - G1 - G2 - - DIAGONALIZE - - // LOAD_MSG_ ##r ##_3(b0, b1); - // LOAD_MSG_ ##r ##_4(b0, b1); - // (X12 used as additional temp register) - MOVOU 0(DX), X12 // X12 = m[0]+ m[1] - MOVOU 32(DX), X13 // X13 = m[4]+ m[5] - MOVOU 80(DX), X14 // X14 = m[10]+m[11] - MOVOU X12, X8 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ - BYTE $0xc4; BYTE $0x08 - MOVOU X14, X9 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ - MOVOU 16(DX), X12 // X12 = m[2]+ m[3] - MOVOU 48(DX), X11 // X11 = m[6]+ m[7] - MOVOU 96(DX), X10 // X10 = m[12]+m[13] - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6c; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x6d; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ - - LOAD_SHUFFLE - - G1 - G2 - - UNDIAGONALIZE - - // Reload digest (most current value store in &out) - MOVQ out+144(FP), SI // SI: &in - MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ - MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ - MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ - MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ - - // Final computations and prepare for storing - PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */ - PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */ - PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */ - PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */ - PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */ - PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */ - PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ - PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ - - // Store digest into &out - MOVQ out+144(FP), SI // SI: &out - MOVOU X0, 0(SI) // out[0]+out[1] = X0 - MOVOU X1, 16(SI) // out[2]+out[3] = X1 - MOVOU X2, 32(SI) // out[4]+out[5] = X2 - MOVOU X3, 48(SI) // out[6]+out[7] = X3 - - // Increment message pointer and check if there's more to do - ADDQ $128, DX // message += 128 - SUBQ $1, R8 - JNZ loop + // Increment counter + MOVQ t+72(FP), SI // SI: &t + MOVQ 0(SI), R9 + ADDQ $128, R9 // /* d.t[0] += BlockSize */ + MOVQ R9, 0(SI) + CMPQ R9, $128 // /* if d.t[0] < BlockSize { */ + JGE noincr + MOVQ 8(SI), R9 + ADDQ $1, R9 // /* d.t[1]++ */ + MOVQ R9, 8(SI) + +noincr: // /* } */ + + // Load initialization vector + MOVQ iv+48(FP), SI // SI: &iv + MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ + MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ + MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ + MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ + MOVQ t+72(FP), SI // SI: &t + MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */ + PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */ + MOVQ t+96(FP), SI // SI: &f + MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ + PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ + MOVOU X12, X10 + LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ + MOVOU X14, X11 + LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ + MOVOU X12, X10 + LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ + MOVOU X14, X11 + LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ + MOVOU X14, X9 + LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X10 // X10 = m[10]+m[11] + MOVOU 48(DX), X11 // X11 = m[6]+ m[7] + LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ + LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU X12, X8 + LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ + MOVOU X14, X9 + LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X11 // X11 = m[6]+ m[7] + MOVOU 96(DX), X10 // X10 = m[12]+m[13] + LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ + LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 3 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X14, X8 + LONG $0x3a0f4566; WORD $0xc50f; BYTE $0x08 // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */ + MOVOU X12, X9 + LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X10 // X10 = m[8]+ m[9] + LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */ + LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ + MOVOU X13, X11 + LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + MOVOU X12, X9 + LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */ + MOVOU X15, X8 + LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */ + MOVOU X13, X9 + LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X11 // X11 = m[4]+ m[5] + MOVOU 112(DX), X10 // X10 = m[14]+m[15] + LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */ + LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 4 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + MOVOU X13, X8 + LONG $0x6d0f4566; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */ + MOVOU X15, X9 + LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X10 // X10 = m[8]+ m[9] + MOVOU 112(DX), X14 // X14 = m[14]+m[15] + LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */ + MOVOU X15, X11 + LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X13, X9 + LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */ + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */ + MOVOU X15, X10 + LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */ + MOVOU X13, X9 + LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */ + MOVOU 0(DX), X11 // X11 = m[0]+ m[1] + MOVOU 48(DX), X10 // X10 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */ + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 5 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + MOVOU X14, X8 + LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */ + MOVOU X12, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */ + MOVOU 0(DX), X10 // X10 = m[0]+ m[1] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */ + LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */ + LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */ + MOVOU X13, X11 + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */ + MOVOU X15, X8 + LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */ + LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */ + MOVOU X13, X9 + LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X11 // X11 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU X14, X10 + LONG $0x3a0f4566; WORD $0xd40f; BYTE $0x08 // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */ + LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */ + LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 6 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + MOVOU X13, X8 + LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */ + MOVOU X12, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */ + MOVOU 80(DX), X12 // X12 = m[10]+m[11] + MOVOU 96(DX), X10 // X10 = m[12]+m[13] + LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */ + MOVOU X12, X11 + LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X14, X9 + LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */ + MOVOU X13, X8 + LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */ + MOVOU X15, X9 + LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */ + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 96(DX), X10 // X10 = m[12]+m[13] + LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */ + LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */ + MOVOU X15, X11 + LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 7 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X12, X9 + LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */ + MOVOU X14, X8 + LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ + MOVOU X15, X9 + LONG $0x6c0f4566; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */ + MOVOU 80(DX), X11 // X11 = m[10]+m[11] + MOVOU X13, X10 + LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */ + LONG $0x3a0f4566; WORD $0xde0f; BYTE $0x08 // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */ + MOVOU X14, X9 + LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */ + MOVOU 16(DX), X11 // X14 = m[2]+ m[3] + MOVOU X13, X10 + LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */ + LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */ + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 8 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X14, X8 + LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */ + MOVOU X12, X10 + LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */ + MOVOU 0(DX), X11 // X11 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU X15, X10 + LONG $0x3a0f4566; WORD $0xd60f; BYTE $0x08 // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */ + LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X13, X8 + LONG $0x6d0f4566; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */ + MOVOU 0(DX), X10 // X10 = m[0]+ m[1] + MOVOU 48(DX), X11 // X11 = m[6]+ m[7] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */ + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 9 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X13, X8 + LONG $0x6c0f4566; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */ + MOVOU X12, X9 + LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */ + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X11 // X11 = m[8]+ m[9] + MOVOU X15, X10 + LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */ + LONG $0x3a0f4566; WORD $0xdd0f; BYTE $0x08 // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + MOVOU X15, X9 + LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */ + MOVOU X15, X8 + LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */ + MOVOU X14, X9 + LONG $0x3a0f4566; WORD $0xcc0f; BYTE $0x08 // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */ + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + MOVOU X15, X11 + LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */ + MOVOU X13, X10 + LONG $0x6c0f4566; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */ + MOVOU X12, X15 + LONG $0x6d0f4566; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */ + MOVOU X12, X11 + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 0 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + MOVOU X15, X8 + LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */ + MOVOU X13, X9 + LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */ + MOVOU 16(DX), X10 // X10 = m[2]+ m[3] + MOVOU 32(DX), X14 // X14 = m[4]+ m[5] + LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */ + MOVOU X14, X15 + LONG $0x6d0f4566; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */ + MOVOU X13, X11 + LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X15, X8 + LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */ + MOVOU X12, X9 + LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU X15, X10 + LONG $0x3a0f4566; WORD $0xd50f; BYTE $0x08 // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */ + MOVOU X14, X11 + LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */ + MOVOU X12, X10 + LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */ + MOVOU X14, X11 + LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */ + MOVOU X14, X9 + LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */ + MOVOU X12, X10 + LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */ + MOVOU X14, X11 + LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + MOVOU X12, X8 + LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */ + MOVOU X14, X9 + LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X10 // X10 = m[10]+m[11] + MOVOU 48(DX), X11 // X11 = m[6]+ m[7] + LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */ + LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ; + + LOAD_SHUFFLE + G1 + G2 + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU X12, X8 + LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */ + MOVOU X14, X9 + LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X11 // X11 = m[6]+ m[7] + MOVOU 96(DX), X10 // X10 = m[12]+m[13] + LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */ + LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + G1 + G2 + UNDIAGONALIZE + + // Reload digest (most current value store in &out) + MOVQ out+144(FP), SI // SI: &in + MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + + // Final computations and prepare for storing + PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */ + PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */ + PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */ + PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */ + PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */ + PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */ + PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ + PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ + + // Store digest into &out + MOVQ out+144(FP), SI // SI: &out + MOVOU X0, 0(SI) // out[0]+out[1] = X0 + MOVOU X1, 16(SI) // out[2]+out[3] = X1 + MOVOU X2, 32(SI) // out[4]+out[5] = X2 + MOVOU X3, 48(SI) // out[6]+out[7] = X3 + + // Increment message pointer and check if there's more to do + ADDQ $128, DX // message += 128 + SUBQ $1, R8 + JNZ loop complete: - RET + RET diff --git a/vendor/github.com/minio/blake2b-simd/compress_generic.go b/vendor/github.com/minio/blake2b-simd/compress_generic.go index 62d81aaab..e9e16e8b9 100644 --- a/vendor/github.com/minio/blake2b-simd/compress_generic.go +++ b/vendor/github.com/minio/blake2b-simd/compress_generic.go @@ -26,12 +26,13 @@ func compressGeneric(d *digest, p []uint8) { v13 := iv[5] ^ d.t[1] v14 := iv[6] ^ d.f[0] v15 := iv[7] ^ d.f[1] - var m [16]uint64 j := 0 - for i := 0; i < 16; i++ { - m[i] = uint64(p[j]) | uint64(p[j+1])<<8 | uint64(p[j+2])<<16 | uint64(p[j+3])<<24 | - uint64(p[j+4])<<32 | uint64(p[j+5])<<40 | uint64(p[j+6])<<48 | uint64(p[j+7])<<56 + var m [16]uint64 + for i := range m { + m[i] = uint64(p[j]) | uint64(p[j+1])<<8 | uint64(p[j+2])<<16 | + uint64(p[j+3])<<24 | uint64(p[j+4])<<32 | uint64(p[j+5])<<40 | + uint64(p[j+6])<<48 | uint64(p[j+7])<<56 j += 8 } diff --git a/vendor/github.com/minio/sha256-simd/cpuid_386.s b/vendor/github.com/minio/sha256-simd/cpuid_386.s index fa38814ec..f908ae862 100644 --- a/vendor/github.com/minio/sha256-simd/cpuid_386.s +++ b/vendor/github.com/minio/sha256-simd/cpuid_386.s @@ -1,4 +1,24 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. +// The MIT License (MIT) +// +// Copyright (c) 2015 Klaus Post +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. // +build 386,!gccgo diff --git a/vendor/github.com/minio/sha256-simd/cpuid_amd64.s b/vendor/github.com/minio/sha256-simd/cpuid_amd64.s index fb45a6560..9a8a03297 100644 --- a/vendor/github.com/minio/sha256-simd/cpuid_amd64.s +++ b/vendor/github.com/minio/sha256-simd/cpuid_amd64.s @@ -1,4 +1,24 @@ -// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. +// The MIT License (MIT) +// +// Copyright (c) 2015 Klaus Post +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. // +build amd64,!gccgo diff --git a/vendor/github.com/minio/sha256-simd/cpuid_arm.go b/vendor/github.com/minio/sha256-simd/cpuid_arm.go index 28637d391..351dff4b6 100644 --- a/vendor/github.com/minio/sha256-simd/cpuid_arm.go +++ b/vendor/github.com/minio/sha256-simd/cpuid_arm.go @@ -28,6 +28,5 @@ func xgetbv(index uint32) (eax, edx uint32) { } func haveArmSha() bool { - // TODO: Implement feature detection for ARM - return true + return false } diff --git a/vendor/github.com/minio/sha256-simd/cpuid_linux_arm64.go b/vendor/github.com/minio/sha256-simd/cpuid_linux_arm64.go new file mode 100644 index 000000000..e739996d9 --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/cpuid_linux_arm64.go @@ -0,0 +1,49 @@ +// +build arm64,linux + +// Minio Cloud Storage, (C) 2016 Minio, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sha256 + +import ( + "bytes" + "io/ioutil" +) + +func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func xgetbv(index uint32) (eax, edx uint32) { + return 0, 0 +} + +// File to check for cpu capabilities. +const procCPUInfo = "/proc/cpuinfo" + +// Feature to check for. +const sha256Feature = "sha2" + +func haveArmSha() bool { + cpuInfo, err := ioutil.ReadFile(procCPUInfo) + if err != nil { + return false + } + return bytes.Contains(cpuInfo, []byte(sha256Feature)) +} diff --git a/vendor/github.com/minio/sha256-simd/cpuid_others_arm64.go b/vendor/github.com/minio/sha256-simd/cpuid_others_arm64.go new file mode 100644 index 000000000..0fb4022f7 --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/cpuid_others_arm64.go @@ -0,0 +1,35 @@ +// +build arm64,!linux + +// Minio Cloud Storage, (C) 2016 Minio, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sha256 + +func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func xgetbv(index uint32) (eax, edx uint32) { + return 0, 0 +} + +// Check for sha2 instruction flag. +func haveArmSha() bool { + return false +} diff --git a/vendor/github.com/minio/sha256-simd/cpuid_arm64.go b/vendor/github.com/minio/sha256-simd/cpuid_ppc64.go similarity index 93% rename from vendor/github.com/minio/sha256-simd/cpuid_arm64.go rename to vendor/github.com/minio/sha256-simd/cpuid_ppc64.go index 28637d391..351dff4b6 100644 --- a/vendor/github.com/minio/sha256-simd/cpuid_arm64.go +++ b/vendor/github.com/minio/sha256-simd/cpuid_ppc64.go @@ -28,6 +28,5 @@ func xgetbv(index uint32) (eax, edx uint32) { } func haveArmSha() bool { - // TODO: Implement feature detection for ARM - return true + return false } diff --git a/vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go b/vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go new file mode 100644 index 000000000..351dff4b6 --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go @@ -0,0 +1,32 @@ +// Minio Cloud Storage, (C) 2016 Minio, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sha256 + +func cpuid(op uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) { + return 0, 0, 0, 0 +} + +func xgetbv(index uint32) (eax, edx uint32) { + return 0, 0 +} + +func haveArmSha() bool { + return false +} diff --git a/vendor/github.com/minio/sha256-simd/sha256.go b/vendor/github.com/minio/sha256-simd/sha256.go index a753e2886..f28236eba 100644 --- a/vendor/github.com/minio/sha256-simd/sha256.go +++ b/vendor/github.com/minio/sha256-simd/sha256.go @@ -89,7 +89,8 @@ func New() hash.Hash { d.Reset() return d } - // default back to the standard golang implementation + // Fallback to the standard golang implementation + // if no features were found. return sha256.New() } diff --git a/vendor/github.com/minio/sha256-simd/sha256block_ppc64.go b/vendor/github.com/minio/sha256-simd/sha256block_ppc64.go new file mode 100644 index 000000000..b81017e8d --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/sha256block_ppc64.go @@ -0,0 +1,22 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package sha256 + +func blockAvx2Go(dig *digest, p []byte) {} +func blockAvxGo(dig *digest, p []byte) {} +func blockSsseGo(dig *digest, p []byte) {} +func blockArmGo(dig *digest, p []byte) {} diff --git a/vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go b/vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go new file mode 100644 index 000000000..b81017e8d --- /dev/null +++ b/vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go @@ -0,0 +1,22 @@ +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package sha256 + +func blockAvx2Go(dig *digest, p []byte) {} +func blockAvxGo(dig *digest, p []byte) {} +func blockSsseGo(dig *digest, p []byte) {} +func blockArmGo(dig *digest, p []byte) {} diff --git a/vendor/vendor.json b/vendor/vendor.json index 6698c14ed..4ac32a7ac 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -1,6 +1,6 @@ { "comment": "", - "ignore": "appengine test", + "ignore": "test", "package": [ { "path": "github.com/Sirupsen/logrus", @@ -62,7 +62,7 @@ "revisionTime": "2016-06-10T14:06:02+03:00" }, { - "checksumSHA1": "2a/SsTUBMKtcM6VtpbdPGO+c6c8=", + "checksumSHA1": "W+E/2xXcE1GmJ0Qb784ald0Fn6I=", "path": "github.com/golang/snappy", "revision": "d9eb7a3d35ec988b8585d4a0068e462c27d28380", "revisionTime": "2016-05-29T05:00:41Z" @@ -109,10 +109,10 @@ "revisionTime": "2016-10-16T15:41:25Z" }, { - "checksumSHA1": "XRii0aDqXZvztXflEB2EE9TRoks=", + "checksumSHA1": "Pzd1bfm8Yj1radncaohNZu+UT1I=", "path": "github.com/klauspost/reedsolomon", - "revision": "c54154da9e35cab25232314cf69ab9d78447f9a5", - "revisionTime": "2016-09-12T19:31:07Z" + "revision": "d0a56f72c0d40a6cdde43a1575ad9686a0098b70", + "revisionTime": "2016-10-28T07:13:20Z" }, { "checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=", @@ -137,9 +137,10 @@ "revisionTime": "2015-12-11T09:06:21+09:00" }, { + "checksumSHA1": "IgPoMBktWdCLuyzDBfzi34sT+jg=", "path": "github.com/minio/blake2b-simd", - "revision": "25efc542f2c5064cf312cdca043790a7af861c4c", - "revisionTime": "2016-07-06T10:29:24+02:00" + "revision": "c50cace0dc7d72a80244a5f88ddd3e08a73db8de", + "revisionTime": "2016-07-22T09:38:12Z" }, { "path": "github.com/minio/cli", @@ -187,10 +188,10 @@ "revisionTime": "2016-07-24T00:05:56Z" }, { - "checksumSHA1": "i8Hl0yGP1jqorMgfFMoJCItnI38=", + "checksumSHA1": "URVle4qtadmW9w9BulDRHY3kxnA=", "path": "github.com/minio/sha256-simd", - "revision": "6f50cd1d784b2bea46167b6929f16c0d12eefbfb", - "revisionTime": "2016-08-16T22:25:11Z" + "revision": "e82e73b775766b9011503e80e6772fc32b9afc5b", + "revisionTime": "2016-12-19T23:17:30Z" }, { "checksumSHA1": "Nj7vQ2GlvJiPP7sqJX5AurrDSD4=",