mirror of
https://github.com/minio/minio.git
synced 2025-11-09 05:34:56 -05:00
replace blake2b implementation (#3481)
* replace blake2b implementation replace the blake2b-simd with the golang/x/crypto implementation ``` name old time/op new time/op delta Size64-8 715ns ±13% 614ns ± 3% ~ (p=0.084 n=6+6) Size128-8 612ns ± 5% 634ns ± 8% ~ (p=0.084 n=6+6) Size1K-8 2.18µs ± 5% 2.09µs ± 7% ~ (p=0.084 n=6+6) Size8K-8 13.1µs ± 2% 13.4µs ± 3% ~ (p=0.084 n=6+6) Size32K-8 48.5µs ± 1% 49.5µs ± 3% ~ (p=0.775 n=6+6) Size128K-8 199µs ± 0% 198µs ± 3% ~ (p=0.468 n=6+6) name old speed new speed delta Size64-8 92.6MB/s ±11% 104.2MB/s ± 3% ~ (p=0.139 n=6+6) Size128-8 208MB/s ± 6% 202MB/s ± 8% ~ (p=0.102 n=6+6) Size1K-8 466MB/s ± 7% 492MB/s ± 7% ~ (p=0.139 n=6+6) Size8K-8 621MB/s ± 2% 610MB/s ± 3% ~ (p=0.102 n=6+6) Size32K-8 672MB/s ± 2% 669MB/s ± 1% ~ (p=0.818 n=6+6) Size128K-8 657MB/s ± 1% 672MB/s ± 0% +2.28% (p=0.002 n=6+6) name old time/op new time/op delta Size64-4 334ns ± 1% 243ns ± 0% -27.14% (p=0.029 n=4+4) Size128-4 296ns ± 1% 242ns ± 0% -18.21% (p=0.029 n=4+4) Size1K-4 1.44µs ± 0% 1.28µs ± 0% -10.83% (p=0.029 n=4+4) Size8K-4 10.0µs ± 0% 9.4µs ± 0% -6.23% (p=0.029 n=4+4) Size32K-4 39.8µs ± 1% 37.3µs ± 0% -6.31% (p=0.029 n=4+4) Size128K-4 162µs ± 3% 149µs ± 0% -7.72% (p=0.029 n=4+4) name old speed new speed delta Size64-4 192MB/s ± 1% 263MB/s ± 0% +37.24% (p=0.029 n=4+4) Size128-4 431MB/s ± 0% 526MB/s ± 0% +22.04% (p=0.029 n=4+4) Size1K-4 713MB/s ± 0% 800MB/s ± 0% +12.17% (p=0.029 n=4+4) Size8K-4 815MB/s ± 0% 869MB/s ± 0% +6.64% (p=0.029 n=4+4) Size32K-4 823MB/s ± 1% 878MB/s ± 0% +6.72% (p=0.029 n=4+4) Size128K-4 810MB/s ± 3% 877MB/s ± 0% +8.23% (p=0.029 n=4+4) ``` See: https://go-review.googlesource.com/#/c/34319/
This commit is contained in:
committed by
Harshavardhana
parent
15b4c49621
commit
1ac36a95aa
202
vendor/github.com/minio/blake2b-simd/LICENSE
generated
vendored
202
vendor/github.com/minio/blake2b-simd/LICENSE
generated
vendored
@@ -1,202 +0,0 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
144
vendor/github.com/minio/blake2b-simd/README.md
generated
vendored
144
vendor/github.com/minio/blake2b-simd/README.md
generated
vendored
@@ -1,144 +0,0 @@
|
||||
BLAKE2b-SIMD
|
||||
============
|
||||
|
||||
Pure Go implementation of BLAKE2b using SIMD optimizations.
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
This package was initially based on the pure Go [BLAKE2b](https://github.com/dchest/blake2b) implementation of Dmitry Chestnykh and merged with the (`cgo` dependent) AVX optimized [BLAKE2](https://github.com/codahale/blake2) implementation (which in turn is based on the [official implementation](https://github.com/BLAKE2/BLAKE2)). It does so by using [Go's Assembler](https://golang.org/doc/asm) for amd64 architectures with a Go-only fallback for other architectures.
|
||||
|
||||
In addition to AVX there is also support for AVX2 as well as SSE. Best performance is obtained with AVX2 which gives roughly a **4X** performance increase approaching hashing speeds of **1GB/sec** on a single core.
|
||||
|
||||
Benchmarks
|
||||
----------
|
||||
|
||||
This is a summary of the performance improvements. Full details are shown below.
|
||||
|
||||
| Technology | 128K |
|
||||
| ---------- |:-----:|
|
||||
| AVX2 | 3.94x |
|
||||
| AVX | 3.28x |
|
||||
| SSE | 2.85x |
|
||||
|
||||
asm2plan9s
|
||||
----------
|
||||
|
||||
In order to be able to work more easily with AVX2/AVX instructions, a separate tool was developed to convert AVX2/AVX instructions into the corresponding BYTE sequence as accepted by Go assembly. See [asm2plan9s](https://github.com/minio/asm2plan9s) for more information.
|
||||
|
||||
bt2sum
|
||||
------
|
||||
|
||||
[bt2sum](https://github.com/s3git/bt2sum) is a utility that takes advantage of the BLAKE2b SIMD optimizations to compute checksums using the BLAKE2 Tree hashing mode in so-called 'unlimited fanout' mode.
|
||||
|
||||
Technical details
|
||||
-----------------
|
||||
|
||||
BLAKE2b is a hashing algorithm that operates on 64-bit integer values. The AVX2 version uses the 256-bit wide YMM registers in order to essentially process four operations in parallel. AVX and SSE operate on 128-bit values simultaneously (two operations in parallel). Below are excerpts from `compressAvx2_amd64.s`, `compressAvx_amd64.s`, and `compress_generic.go` respectively.
|
||||
|
||||
```
|
||||
VPADDQ YMM0,YMM0,YMM1 /* v0 += v4, v1 += v5, v2 += v6, v3 += v7 */
|
||||
```
|
||||
|
||||
```
|
||||
VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
```
|
||||
|
||||
```
|
||||
v0 += v4
|
||||
v1 += v5
|
||||
v2 += v6
|
||||
v3 += v7
|
||||
```
|
||||
|
||||
Detailed benchmarks
|
||||
-------------------
|
||||
|
||||
Example performance metrics were generated on Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz - 6 physical cores, 12 logical cores running Ubuntu GNU/Linux with kernel version 4.4.0-24-generic (vanilla with no optimizations).
|
||||
|
||||
### AVX2
|
||||
|
||||
```
|
||||
$ benchcmp go.txt avx2.txt
|
||||
benchmark old ns/op new ns/op delta
|
||||
BenchmarkHash64-12 1481 849 -42.67%
|
||||
BenchmarkHash128-12 1428 746 -47.76%
|
||||
BenchmarkHash1K-12 6379 2227 -65.09%
|
||||
BenchmarkHash8K-12 37219 11714 -68.53%
|
||||
BenchmarkHash32K-12 140716 35935 -74.46%
|
||||
BenchmarkHash128K-12 561656 142634 -74.60%
|
||||
|
||||
benchmark old MB/s new MB/s speedup
|
||||
BenchmarkHash64-12 43.20 75.37 1.74x
|
||||
BenchmarkHash128-12 89.64 171.35 1.91x
|
||||
BenchmarkHash1K-12 160.52 459.69 2.86x
|
||||
BenchmarkHash8K-12 220.10 699.32 3.18x
|
||||
BenchmarkHash32K-12 232.87 911.85 3.92x
|
||||
BenchmarkHash128K-12 233.37 918.93 3.94x
|
||||
```
|
||||
|
||||
### AVX2: Comparison to other hashing techniques
|
||||
|
||||
```
|
||||
$ go test -bench=Comparison
|
||||
BenchmarkComparisonMD5-12 1000 1726121 ns/op 607.48 MB/s
|
||||
BenchmarkComparisonSHA1-12 500 2005164 ns/op 522.94 MB/s
|
||||
BenchmarkComparisonSHA256-12 300 5531036 ns/op 189.58 MB/s
|
||||
BenchmarkComparisonSHA512-12 500 3423030 ns/op 306.33 MB/s
|
||||
BenchmarkComparisonBlake2B-12 1000 1232690 ns/op 850.64 MB/s
|
||||
```
|
||||
|
||||
Benchmarks below were generated on a MacBook Pro with a 2.7 GHz Intel Core i7.
|
||||
|
||||
### AVX
|
||||
|
||||
```
|
||||
$ benchcmp go.txt avx.txt
|
||||
benchmark old ns/op new ns/op delta
|
||||
BenchmarkHash64-8 813 458 -43.67%
|
||||
BenchmarkHash128-8 766 401 -47.65%
|
||||
BenchmarkHash1K-8 4881 1763 -63.88%
|
||||
BenchmarkHash8K-8 36127 12273 -66.03%
|
||||
BenchmarkHash32K-8 140582 43155 -69.30%
|
||||
BenchmarkHash128K-8 567850 173246 -69.49%
|
||||
|
||||
benchmark old MB/s new MB/s speedup
|
||||
BenchmarkHash64-8 78.63 139.57 1.78x
|
||||
BenchmarkHash128-8 166.98 318.73 1.91x
|
||||
BenchmarkHash1K-8 209.76 580.68 2.77x
|
||||
BenchmarkHash8K-8 226.76 667.46 2.94x
|
||||
BenchmarkHash32K-8 233.09 759.29 3.26x
|
||||
BenchmarkHash128K-8 230.82 756.56 3.28x
|
||||
```
|
||||
|
||||
### SSE
|
||||
|
||||
```
|
||||
$ benchcmp go.txt sse.txt
|
||||
benchmark old ns/op new ns/op delta
|
||||
BenchmarkHash64-8 813 478 -41.21%
|
||||
BenchmarkHash128-8 766 411 -46.34%
|
||||
BenchmarkHash1K-8 4881 1870 -61.69%
|
||||
BenchmarkHash8K-8 36127 12427 -65.60%
|
||||
BenchmarkHash32K-8 140582 49512 -64.78%
|
||||
BenchmarkHash128K-8 567850 199040 -64.95%
|
||||
|
||||
benchmark old MB/s new MB/s speedup
|
||||
BenchmarkHash64-8 78.63 133.78 1.70x
|
||||
BenchmarkHash128-8 166.98 311.23 1.86x
|
||||
BenchmarkHash1K-8 209.76 547.37 2.61x
|
||||
BenchmarkHash8K-8 226.76 659.20 2.91x
|
||||
BenchmarkHash32K-8 233.09 661.81 2.84x
|
||||
BenchmarkHash128K-8 230.82 658.52 2.85x
|
||||
```
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
Released under the Apache License v2.0. You can find the complete text in the file LICENSE.
|
||||
|
||||
Contributing
|
||||
------------
|
||||
|
||||
Contributions are welcome, please send PRs for any enhancements.
|
||||
301
vendor/github.com/minio/blake2b-simd/blake2b.go
generated
vendored
301
vendor/github.com/minio/blake2b-simd/blake2b.go
generated
vendored
@@ -1,301 +0,0 @@
|
||||
// Written in 2012 by Dmitry Chestnykh.
|
||||
//
|
||||
// To the extent possible under law, the author have dedicated all copyright
|
||||
// and related and neighboring rights to this software to the public domain
|
||||
// worldwide. This software is distributed without any warranty.
|
||||
// http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
// Package blake2b implements BLAKE2b cryptographic hash function.
|
||||
package blake2b
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"hash"
|
||||
)
|
||||
|
||||
// Sizes, in bytes, of the BLAKE2b block and of its configurable parameters.
const (
	BlockSize  = 128 // bytes consumed by one compression
	Size       = 64  // largest digest that can be produced
	SaltSize   = 16  // largest accepted salt
	PersonSize = 16  // largest accepted personalization string
	KeySize    = 64  // largest accepted key
)
|
||||
|
||||
type digest struct {
|
||||
h [8]uint64 // current chain value
|
||||
t [2]uint64 // message bytes counter
|
||||
f [2]uint64 // finalization flags
|
||||
x [BlockSize]byte // buffer for data not yet compressed
|
||||
nx int // number of bytes in buffer
|
||||
|
||||
ih [8]uint64 // initial chain value (after config)
|
||||
paddedKey [BlockSize]byte // copy of key, padded with zeros
|
||||
isKeyed bool // indicates whether hash was keyed
|
||||
size uint8 // digest size in bytes
|
||||
isLastNode bool // indicates processing of the last node in tree hashing
|
||||
}
|
||||
|
||||
// iv holds the eight initialization words; initialize XORs them with the
// parameter block to obtain the starting chain value.
var iv = [8]uint64{
	0x6a09e667f3bcc908,
	0xbb67ae8584caa73b,
	0x3c6ef372fe94f82b,
	0xa54ff53a5f1d36f1,
	0x510e527fade682d1,
	0x9b05688c2b3e6c1f,
	0x1f83d9abfb41bd6b,
	0x5be0cd19137e2179,
}
|
||||
|
||||
// Config selects the hash function parameters and the optional key.
// Every field may be left at its zero value.
type Config struct {
	Size   uint8  // digest length in bytes; zero selects the 64-byte default
	Key    []byte // key for prefix-MAC
	Salt   []byte // salt; shorter than 16 bytes is zero-padded
	Person []byte // personalization; shorter than 16 bytes is zero-padded
	Tree   *Tree  // tree hashing parameters, nil for sequential hashing
}
|
||||
|
||||
// Tree collects the parameters that are specific to tree hashing.
type Tree struct {
	Fanout        uint8  // fanout
	MaxDepth      uint8  // maximal depth
	LeafSize      uint32 // maximal leaf length in bytes (0 means unlimited)
	NodeOffset    uint64 // offset of the node (0 for first, leftmost or leaf)
	NodeDepth     uint8  // depth of the node (0 for leaves)
	InnerHashSize uint8  // length of the inner hash in bytes
	IsLastNode    bool   // set when processing the last node of a layer
}
|
||||
|
||||
var (
|
||||
defaultConfig = &Config{Size: Size}
|
||||
config256 = &Config{Size: 32}
|
||||
)
|
||||
|
||||
func verifyConfig(c *Config) error {
|
||||
if c.Size > Size {
|
||||
return errors.New("digest size is too large")
|
||||
}
|
||||
if len(c.Key) > KeySize {
|
||||
return errors.New("key is too large")
|
||||
}
|
||||
if len(c.Salt) > SaltSize {
|
||||
// Smaller salt is okay: it will be padded with zeros.
|
||||
return errors.New("salt is too large")
|
||||
}
|
||||
if len(c.Person) > PersonSize {
|
||||
// Smaller personalization is okay: it will be padded with zeros.
|
||||
return errors.New("personalization is too large")
|
||||
}
|
||||
if c.Tree != nil {
|
||||
if c.Tree.Fanout == 1 {
|
||||
return errors.New("fanout of 1 is not allowed in tree mode")
|
||||
}
|
||||
if c.Tree.MaxDepth < 2 {
|
||||
return errors.New("incorrect tree depth")
|
||||
}
|
||||
if c.Tree.InnerHashSize < 1 || c.Tree.InnerHashSize > Size {
|
||||
return errors.New("incorrect tree inner hash size")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// New returns a new hash.Hash configured with the given Config.
|
||||
// Config can be nil, in which case the default one is used, calculating 64-byte digest.
|
||||
// Returns non-nil error if Config contains invalid parameters.
|
||||
func New(c *Config) (hash.Hash, error) {
|
||||
if c == nil {
|
||||
c = defaultConfig
|
||||
} else {
|
||||
if c.Size == 0 {
|
||||
// Set default size if it's zero.
|
||||
c.Size = Size
|
||||
}
|
||||
if err := verifyConfig(c); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
d := new(digest)
|
||||
d.initialize(c)
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// initialize initializes digest with the given
|
||||
// config, which must be non-nil and verified.
|
||||
func (d *digest) initialize(c *Config) {
|
||||
// Create parameter block.
|
||||
var p [BlockSize]byte
|
||||
p[0] = c.Size
|
||||
p[1] = uint8(len(c.Key))
|
||||
if c.Salt != nil {
|
||||
copy(p[32:], c.Salt)
|
||||
}
|
||||
if c.Person != nil {
|
||||
copy(p[48:], c.Person)
|
||||
}
|
||||
if c.Tree != nil {
|
||||
p[2] = c.Tree.Fanout
|
||||
p[3] = c.Tree.MaxDepth
|
||||
binary.LittleEndian.PutUint32(p[4:], c.Tree.LeafSize)
|
||||
binary.LittleEndian.PutUint64(p[8:], c.Tree.NodeOffset)
|
||||
p[16] = c.Tree.NodeDepth
|
||||
p[17] = c.Tree.InnerHashSize
|
||||
} else {
|
||||
p[2] = 1
|
||||
p[3] = 1
|
||||
}
|
||||
|
||||
// Initialize.
|
||||
d.size = c.Size
|
||||
for i := 0; i < 8; i++ {
|
||||
d.h[i] = iv[i] ^ binary.LittleEndian.Uint64(p[i*8:])
|
||||
}
|
||||
if c.Tree != nil && c.Tree.IsLastNode {
|
||||
d.isLastNode = true
|
||||
}
|
||||
|
||||
// Process key.
|
||||
if c.Key != nil {
|
||||
copy(d.paddedKey[:], c.Key)
|
||||
d.Write(d.paddedKey[:])
|
||||
d.isKeyed = true
|
||||
}
|
||||
// Save a copy of initialized state.
|
||||
copy(d.ih[:], d.h[:])
|
||||
}
|
||||
|
||||
// New512 returns a new hash.Hash computing the BLAKE2b 64-byte checksum.
|
||||
func New512() hash.Hash {
|
||||
d := new(digest)
|
||||
d.initialize(defaultConfig)
|
||||
return d
|
||||
}
|
||||
|
||||
// New256 returns a new hash.Hash computing the BLAKE2b 32-byte checksum.
|
||||
func New256() hash.Hash {
|
||||
d := new(digest)
|
||||
d.initialize(config256)
|
||||
return d
|
||||
}
|
||||
|
||||
// NewMAC returns a new hash.Hash computing BLAKE2b prefix-
|
||||
// Message Authentication Code of the given size in bytes
|
||||
// (up to 64) with the given key (up to 64 bytes in length).
|
||||
func NewMAC(outBytes uint8, key []byte) hash.Hash {
|
||||
d, err := New(&Config{Size: outBytes, Key: key})
|
||||
if err != nil {
|
||||
panic(err.Error())
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// Reset resets the state of digest to the initial state
|
||||
// after configuration and keying.
|
||||
func (d *digest) Reset() {
|
||||
copy(d.h[:], d.ih[:])
|
||||
d.t[0] = 0
|
||||
d.t[1] = 0
|
||||
d.f[0] = 0
|
||||
d.f[1] = 0
|
||||
d.nx = 0
|
||||
if d.isKeyed {
|
||||
d.Write(d.paddedKey[:])
|
||||
}
|
||||
}
|
||||
|
||||
// Size returns the digest size in bytes.
|
||||
func (d *digest) Size() int { return int(d.size) }
|
||||
|
||||
// BlockSize returns the algorithm block size in bytes.
|
||||
func (d *digest) BlockSize() int { return BlockSize }
|
||||
|
||||
func (d *digest) Write(p []byte) (nn int, err error) {
|
||||
nn = len(p)
|
||||
left := BlockSize - d.nx
|
||||
if len(p) > left {
|
||||
// Process buffer.
|
||||
copy(d.x[d.nx:], p[:left])
|
||||
p = p[left:]
|
||||
compress(d, d.x[:])
|
||||
d.nx = 0
|
||||
}
|
||||
// Process full blocks except for the last one.
|
||||
if len(p) > BlockSize {
|
||||
n := len(p) &^ (BlockSize - 1)
|
||||
if n == len(p) {
|
||||
n -= BlockSize
|
||||
}
|
||||
compress(d, p[:n])
|
||||
p = p[n:]
|
||||
}
|
||||
// Fill buffer.
|
||||
d.nx += copy(d.x[d.nx:], p)
|
||||
return
|
||||
}
|
||||
|
||||
// Sum returns the calculated checksum.
|
||||
func (d *digest) Sum(in []byte) []byte {
|
||||
// Make a copy of d so that caller can keep writing and summing.
|
||||
d0 := *d
|
||||
hash := d0.checkSum()
|
||||
return append(in, hash[:d0.size]...)
|
||||
}
|
||||
|
||||
func (d *digest) checkSum() [Size]byte {
|
||||
// Do not create unnecessary copies of the key.
|
||||
if d.isKeyed {
|
||||
for i := 0; i < len(d.paddedKey); i++ {
|
||||
d.paddedKey[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
dec := BlockSize - uint64(d.nx)
|
||||
if d.t[0] < dec {
|
||||
d.t[1]--
|
||||
}
|
||||
d.t[0] -= dec
|
||||
|
||||
// Pad buffer with zeros.
|
||||
for i := d.nx; i < len(d.x); i++ {
|
||||
d.x[i] = 0
|
||||
}
|
||||
// Set last block flag.
|
||||
d.f[0] = 0xffffffffffffffff
|
||||
if d.isLastNode {
|
||||
d.f[1] = 0xffffffffffffffff
|
||||
}
|
||||
// Compress last block.
|
||||
compress(d, d.x[:])
|
||||
|
||||
var out [Size]byte
|
||||
j := 0
|
||||
for _, s := range d.h[:(d.size-1)/8+1] {
|
||||
out[j+0] = byte(s >> 0)
|
||||
out[j+1] = byte(s >> 8)
|
||||
out[j+2] = byte(s >> 16)
|
||||
out[j+3] = byte(s >> 24)
|
||||
out[j+4] = byte(s >> 32)
|
||||
out[j+5] = byte(s >> 40)
|
||||
out[j+6] = byte(s >> 48)
|
||||
out[j+7] = byte(s >> 56)
|
||||
j += 8
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Sum512 returns a 64-byte BLAKE2b hash of data.
|
||||
func Sum512(data []byte) [64]byte {
|
||||
var d digest
|
||||
d.initialize(defaultConfig)
|
||||
d.Write(data)
|
||||
return d.checkSum()
|
||||
}
|
||||
|
||||
// Sum256 returns a 32-byte BLAKE2b hash of data.
|
||||
func Sum256(data []byte) (out [32]byte) {
|
||||
var d digest
|
||||
d.initialize(config256)
|
||||
d.Write(data)
|
||||
sum := d.checkSum()
|
||||
copy(out[:], sum[:32])
|
||||
return
|
||||
}
|
||||
47
vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go
generated
vendored
47
vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.go
generated
vendored
@@ -1,47 +0,0 @@
|
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package blake2b
|
||||
|
||||
//go:noescape
|
||||
func compressAVX2Loop(p []uint8, in, iv, t, f, shffle, out []uint64)
|
||||
|
||||
func compressAVX2(d *digest, p []uint8) {
|
||||
var (
|
||||
in [8]uint64
|
||||
out [8]uint64
|
||||
shffle [8]uint64
|
||||
)
|
||||
|
||||
// vector for PSHUFB instruction
|
||||
shffle[0] = 0x0201000706050403
|
||||
shffle[1] = 0x0a09080f0e0d0c0b
|
||||
shffle[2] = 0x0201000706050403
|
||||
shffle[3] = 0x0a09080f0e0d0c0b
|
||||
shffle[4] = 0x0100070605040302
|
||||
shffle[5] = 0x09080f0e0d0c0b0a
|
||||
shffle[6] = 0x0100070605040302
|
||||
shffle[7] = 0x09080f0e0d0c0b0a
|
||||
|
||||
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]
|
||||
|
||||
compressAVX2Loop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])
|
||||
|
||||
d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
|
||||
}
|
||||
671
vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.s
generated
vendored
671
vendor/github.com/minio/blake2b-simd/compressAvx2_amd64.s
generated
vendored
@@ -1,671 +0,0 @@
|
||||
//+build !noasm,!appengine
|
||||
|
||||
//
|
||||
// Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
//
|
||||
// Based on AVX2 implementation from https://github.com/sneves/blake2-avx2/blob/master/blake2b-common.h
|
||||
//
|
||||
// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent
|
||||
//
|
||||
// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as:
|
||||
// #define ROUND(r) \
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
//
|
||||
// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go
|
||||
//
|
||||
// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly)
|
||||
//
|
||||
// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and
|
||||
// rounds 2 & 12 are identical)
|
||||
//
|
||||
|
||||
// G1: first half of the G mixing function, applied to four 64-bit lanes at once.
// Rows: Y0 = a (v0..v3), Y1 = b (v4..v7), Y2 = c (v8..v11), Y3 = d (v12..v15).
// Y4 holds the four message words for this half-step; Y6 holds the byte-shuffle
// mask used for the 24-bit rotation.
//   a += m; a += b; d ^= a; d = rotr64(d, 32); c += d; b ^= c; b = rotr64(b, 24)
// The m[..] indices in the per-line comments are those of round 1.
#define G1 \
|
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc4 \ // VPADDQ YMM0,YMM0,YMM4 /* v0 += m[0], v1 += m[2], v2 += m[4], v3 += m[6] */
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc1 \ // VPADDQ YMM0,YMM0,YMM1 /* v0 += v4, v1 += v5, v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xd8 \ // VPXOR YMM3,YMM3,YMM0 /* v12 ^= v0, v13 ^= v1, v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0x70; BYTE $0xdb; BYTE $0xb1 \ // VPSHUFD YMM3,YMM3,0xb1 /* swap the 32-bit halves: v12 = v12<<(64-32) | v12>>32, ..., ..., v15 = v15<<(64-32) | v15>>32 */
|
||||
BYTE $0xc5; BYTE $0xed; BYTE $0xd4; BYTE $0xd3 \ // VPADDQ YMM2,YMM2,YMM3 /* v8 += v12, v9 += v13, v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xca \ // VPXOR YMM1,YMM1,YMM2 /* v4 ^= v8, v5 ^= v9, v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc4; BYTE $0xe2; BYTE $0x75; BYTE $0x00; BYTE $0xce // VPSHUFB YMM1,YMM1,YMM6 /* v4 = v4<<(64-24) | v4>>24, ..., ..., v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
// G2: second half of the G mixing function, same row layout as G1.
// Y5 holds the four message words for this half-step; Y7 holds the byte-shuffle
// mask used for the 16-bit rotation; Y15 is overwritten as a temporary.
//   a += m; a += b; d ^= a; d = rotr64(d, 16); c += d; b ^= c; b = rotr64(b, 63)
// The rotation by 63 is assembled as (b + b) ^ (b >> 63).
// The m[..] indices in the per-line comments are those of round 1.
#define G2 \
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc5 \ // VPADDQ YMM0,YMM0,YMM5 /* v0 += m[1], v1 += m[3], v2 += m[5], v3 += m[7] */
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0xd4; BYTE $0xc1 \ // VPADDQ YMM0,YMM0,YMM1 /* v0 += v4, v1 += v5, v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xd8 \ // VPXOR YMM3,YMM3,YMM0 /* v12 ^= v0, v13 ^= v1, v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc4; BYTE $0xe2; BYTE $0x65; BYTE $0x00; BYTE $0xdf \ // VPSHUFB YMM3,YMM3,YMM7 /* v12 = v12<<(64-16) | v12>>16, ..., ..., v15 = v15<<(64-16) | v15>>16 */
|
||||
BYTE $0xc5; BYTE $0xed; BYTE $0xd4; BYTE $0xd3 \ // VPADDQ YMM2,YMM2,YMM3 /* v8 += v12, v9 += v13, v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xca \ // VPXOR YMM1,YMM1,YMM2 /* v4 ^= v8, v5 ^= v9, v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc5; BYTE $0x75; BYTE $0xd4; BYTE $0xf9 \ // VPADDQ YMM15,YMM1,YMM1 /* temp reg = reg*2, i.e. reg<<1 */
|
||||
BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd1; BYTE $0x3f \ // VPSRLQ YMM1,YMM1,0x3f /* reg = reg>>63 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xef; BYTE $0xcf // VPXOR YMM1,YMM1,YMM15 /* XOR of the two parts (they share no bits, so this equals OR): v4 = v4<<(64-63) | v4>>63, ..., ..., v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
// DIAGONALIZE rotates the 64-bit lanes of rows b (Y1), c (Y2) and d (Y3) by
// one, two and three positions respectively (lane i of b receives old lane
// i+1, and so on), so that the next G1/G2 pair mixes the diagonals of the
// 4x4 state instead of its columns. Row a (Y0) is left in place.
// Each VPERMQ is emitted as five opcode bytes followed by its immediate byte.
#define DIAGONALIZE \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb \ // VPERMQ YMM3, YMM3, 0x93 /* d: lanes 3,0,1,2 */
|
||||
BYTE $0x93 \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2 \ // VPERMQ YMM2, YMM2, 0x4e /* c: lanes 2,3,0,1 */
|
||||
BYTE $0x4e \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 \ // VPERMQ YMM1, YMM1, 0x39 /* b: lanes 1,2,3,0 */
|
||||
BYTE $0x39 \
|
||||
// DO NOT DELETE -- macro delimiter (previous line extended)
|
||||
|
||||
// UNDIAGONALIZE is the inverse of DIAGONALIZE: it applies the opposite lane
// rotations to rows d (Y3), c (Y2) and b (Y1), returning the state to column
// order after the diagonal G1/G2 pair.
// Each VPERMQ is emitted as five opcode bytes followed by its immediate byte.
#define UNDIAGONALIZE \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb \ // VPERMQ YMM3, YMM3, 0x39 /* d: lanes 1,2,3,0 */
|
||||
BYTE $0x39 \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2 \ // VPERMQ YMM2, YMM2, 0x4e /* c: lanes 2,3,0,1 */
|
||||
BYTE $0x4e \
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 \ // VPERMQ YMM1, YMM1, 0x93 /* b: lanes 3,0,1,2 */
|
||||
BYTE $0x93 \
|
||||
// DO NOT DELETE -- macro delimiter (previous line extended)
|
||||
|
||||
// LOAD_SHUFFLE loads the two VPSHUFB masks from the shffle argument:
// Y6 = shffle[0..3] (used by G1), Y7 = shffle[4..7] (used by G2).
// Overwrites SI.
#define LOAD_SHUFFLE \
|
||||
MOVQ shffle+120(FP), SI \ // SI: &shffle
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x36 \ // VMOVDQU YMM6, [rsi]
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x7e; BYTE $0x20 // VMOVDQU YMM7, 32[rsi]
|
||||
|
||||
// func compressAVX2Loop(p []uint8, in, iv, t, f, shffle, out []uint64)
//
// Argument frame: seven slice headers of 24 bytes each = 168 bytes
//   p+0, in+24, iv+48, t+72, f+96, shffle+120, out+144
// The chaining value is copied from in to out before anything else, so out is
// valid even when len(p) < 128 and the loop body never runs.
TEXT ·compressAVX2Loop(SB), 7, $0-168

	// REGISTER USE
	//   Y0 - Y3:   v0 - v15 (rows a, b, c, d of the working state)
	//   Y4 - Y5:   message words selected for the current G1 / G2
	//   Y6 - Y7:   byte-shuffle masks (rotations by 24 and 16 bits)
	//   Y8 - Y9:   temp registers
	//   Y10 - Y13: copy of the full 128-byte message block
	//   Y15:       temp register (G2)
	//   DX:        pointer to the current message block
	//   R8:        number of blocks still to process

	// Load digest
	MOVQ in+24(FP), SI                                           // SI: &in
	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06               // VMOVDQU YMM0, [rsi]
	BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x4e; BYTE $0x20   // VMOVDQU YMM1, 32[rsi]

	// Store the digest into &out right away so every loop iteration can
	// reload the current chaining value from the same place.
	MOVQ out+144(FP), SI                                         // SI: &out
	BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x06               // VMOVDQU [rsi], YMM0
	BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x4e; BYTE $0x20   // VMOVDQU 32[rsi], YMM1

	// Initialize message pointer and loop counter
	MOVQ p+0(FP), DX                                             // DX: &p[0] (message)
	MOVQ p_len+8(FP), R8                                         // R8: len(p)
	SHRQ $7, R8                                                  // R8 = len(p) / 128 = number of whole blocks
	CMPQ R8, $0
	JEQ  complete                                                // nothing to compress
|
||||
|
||||
loop:
	// Advance the 128-bit byte counter (t[1]:t[0]) by one block.
	// t[0] is unsigned: after the addition it has wrapped around exactly when
	// the new value is below 128, so the comparison must be unsigned. A signed
	// JGE would treat every t[0] >= 1<<63 as negative and bump t[1] each block.
	MOVQ t+72(FP), SI   // SI: &t
	MOVQ 0(SI), R9
	ADDQ $128, R9       // d.t[0] += BlockSize
	MOVQ R9, 0(SI)
	CMPQ R9, $128       // if d.t[0] < BlockSize {   (unsigned)
	JCC  noincr         //   unsigned >= : no wrap-around, skip the carry
	MOVQ 8(SI), R9
	ADDQ $1, R9         //   d.t[1]++
	MOVQ R9, 8(SI)
noincr:                 // }
|
||||
|
||||
// Load initialization vector
|
||||
MOVQ iv+48(FP), SI // SI: &iv
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x16 // VMOVDQU YMM2, [rsi]
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x5e; BYTE $0x20 // VMOVDQU YMM3, 32[rsi]
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
BYTE $0xc4; BYTE $0x63; BYTE $0x3d; BYTE $0x38; BYTE $0x06 // VINSERTI128 YMM8, YMM8, [rsi], 0 /* Y8 = t[0]+t[1] */
|
||||
BYTE $0x00
|
||||
MOVQ t+96(FP), SI // SI: &f
|
||||
BYTE $0xc4; BYTE $0x63; BYTE $0x3d; BYTE $0x38; BYTE $0x06 // VINSERTI128 YMM8, YMM8, [rsi], 1 /* Y8 = t[0]+t[1]+f[0]+f[1] */
|
||||
BYTE $0x01
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x65; BYTE $0xef; BYTE $0xd8 // VPXOR YMM3,YMM3,YMM8 /* Y3 = Y3 ^ Y8 */
|
||||
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x12 // VMOVDQU YMM10, [rdx] /* Y10 = m[0]+ m[1]+ m[2]+ m[3] */
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x5a; BYTE $0x20 // VMOVDQU YMM11, 32[rdx] /* Y11 = m[4]+ m[5]+ m[6]+ m[7] */
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x62; BYTE $0x40 // VMOVDQU YMM12, 64[rdx] /* Y12 = m[8]+ m[9]+m[10]+m[11] */
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x6a; BYTE $0x60 // VMOVDQU YMM13, 96[rdx] /* Y13 = m[12]+m[13]+m[14]+m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6c; BYTE $0xe3 // VPUNPCKLQDQ YMM4, YMM10, YMM11 /* m[0], m[4], m[2], m[6] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6d; BYTE $0xeb // VPUNPCKHQDQ YMM5, YMM10, YMM11 /* m[1], m[5], m[3], m[7] */
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6c; BYTE $0xe5 // VPUNPCKLQDQ YMM4, YMM12, YMM13 /* m[8], m[12], m[10], m[14] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6d; BYTE $0xed // VPUNPCKHQDQ YMM5, YMM12, YMM13 /* m[9], m[13], m[11], m[15] */
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Column step, G1 operand: Y4 = m[14], m[4], m[9], m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13 /* m[4], ____, ____, m[14] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x03 /* m[14], m[4], ____, ____ */ /* xxxx 0011 = 0x03 */
|
||||
BYTE $0x03
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM12, YMM13 /* m[9], m[13], ____, ____ */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[14], m[4], m[9], m[13] */ /* 0010 0000 = 0x20 */
|
||||
BYTE $0x20
|
||||
|
||||
// Column step, G2 operand: Y5 = m[10], m[8], m[15], m[6]
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x02 /* m[10], m[8], ____, ____ */ /* xxxx 0010 = 0x02 */
|
||||
BYTE $0x02
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x30 /* ____, ____, m[15], ____ */ /* xx11 xxxx = 0x30 */
|
||||
BYTE $0x30
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x35; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM9, YMM11 /* ____, ____, m[15], m[6] */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[10], m[8], m[15], m[6] */ /* 0011 0000 = 0x30 */
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// Diagonal step, G1 operand: Y4 = m[1], m[0], m[11], m[5]
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x01 /* m[1], m[0], ____, ____ */ /* xxxx 0001 = 0x01 */
|
||||
BYTE $0x01
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12 /* m[5], ____, ____, m[11] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x03 /* m[11], m[5], ____, ____ */ /* xxxx 0011 = 0x03 */
|
||||
BYTE $0x03
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[1], m[0], m[11], m[5] */ /* 0010 0000 = 0x20 */
|
||||
BYTE $0x20
|
||||
|
||||
// Diagonal step, G2 operand: Y5 = m[12], m[2], m[7], m[3]
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM10, YMM13 /* ___, m[12], m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x09 /* m[12], m[2], ____, ____ */ /* xxxx 1001 = 0x09 */
|
||||
BYTE $0x09
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10 /* ____, ____, m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[12], m[2], m[7], m[3] */ /* 0011 0000 = 0x30 */
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0x00
|
||||
BYTE $0x00
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM11, YMM13
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc2 // VPUNPCKLQDQ YMM8, YMM12, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x55
|
||||
BYTE $0x55
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM10, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM12, YMM8
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM13, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x00
|
||||
BYTE $0x00
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 4
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM11, YMM10
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM13, YMM12
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM12, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x08
|
||||
BYTE $0x08
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0x55
|
||||
BYTE $0x55
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc4 // VPUNPCKLQDQ YMM8, YMM11, YMM12
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 5
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM12, YMM11
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM13, YMM8
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM11, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0x00
|
||||
BYTE $0x00
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM10, YMM8
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x55
|
||||
BYTE $0x55
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM12, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 6
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM10, YMM12
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc4 // VPUNPCKLQDQ YMM8, YMM13, YMM12
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM12, YMM10
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM13, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM13, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x55
|
||||
BYTE $0x55
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM13, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 7
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x55
|
||||
BYTE $0x55
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM13, YMM8
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM13, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ YMM8, YMM11, YMM13
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xaa
|
||||
BYTE $0xaa
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM13, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x01
|
||||
BYTE $0x01
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc2 // VPUNPCKHQDQ YMM8, YMM11, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM10, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 8
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc3 // VPUNPCKHQDQ YMM8, YMM13, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ YMM9, YMM13, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0xaa
|
||||
BYTE $0xaa
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM10, YMM12
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ YMM8, YMM11, YMM13
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6c; BYTE $0xca // VPUNPCKLQDQ YMM9, YMM12, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x0c
|
||||
BYTE $0x0c
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ YMM9, YMM11, YMM12
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 9
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xca // VPERMQ YMM9, YMM10, 0x00
|
||||
BYTE $0x00
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM12, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ YMM8, YMM13, YMM12
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0x00
|
||||
BYTE $0x00
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcc // VPERMQ YMM9, YMM12, 0xaa
|
||||
BYTE $0xaa
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xc9 // VPUNPCKHQDQ YMM9, YMM10, YMM9
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x15; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM13, YMM9, 0x20
|
||||
BYTE $0x20
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc3 // VPERMQ YMM8, YMM11, 0xff
|
||||
BYTE $0xff
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ YMM8, YMM10, YMM8
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x04
|
||||
BYTE $0x04
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 10
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x20
|
||||
BYTE $0x20
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc3 // VPUNPCKLQDQ YMM8, YMM10, YMM11
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcb // VPERMQ YMM9, YMM11, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ YMM8, YMM13, YMM12
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM10, YMM13
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x60
|
||||
BYTE $0x60
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x31
|
||||
BYTE $0x31
|
||||
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc5 // VPERMQ YMM8, YMM13, 0xaa
|
||||
BYTE $0xaa
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xc0 // VPUNPCKHQDQ YMM8, YMM12, YMM8
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x15; BYTE $0x6c; BYTE $0xca // VPUNPCKLQDQ YMM9, YMM13, YMM10
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x21
|
||||
BYTE $0x21
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6c; BYTE $0xe3 // VPUNPCKLQDQ YMM4, YMM10, YMM11 /* m[0], m[4], m[2], m[6] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x2d; BYTE $0x6d; BYTE $0xeb // VPUNPCKHQDQ YMM5, YMM10, YMM11 /* m[1], m[5], m[3], m[7] */
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6c; BYTE $0xe5 // VPUNPCKLQDQ YMM4, YMM12, YMM13 /* m[8], m[12], m[10], m[14] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x1d; BYTE $0x6d; BYTE $0xed // VPUNPCKHQDQ YMM5, YMM12, YMM13 /* m[9], m[13], m[11], m[15] */
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xe4 // VPERMQ YMM4, YMM4, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xed // VPERMQ YMM5, YMM5, 0xd8 /* 0x1101 1000 = 0xd8 */
|
||||
BYTE $0xd8
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Same message schedule as round 2.
// Column step, G1 operand: Y4 = m[14], m[4], m[9], m[13]
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM11, YMM13 /* m[4], ____, ____, m[14] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x03 /* m[14], m[4], ____, ____ */ /* xxxx 0011 = 0x03 */
|
||||
BYTE $0x03
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x1d; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ YMM9, YMM12, YMM13 /* m[9], m[13], ____, ____ */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[14], m[4], m[9], m[13] */ /* 0010 0000 = 0x20 */
|
||||
BYTE $0x20
|
||||
|
||||
// Column step, G2 operand: Y5 = m[10], m[8], m[15], m[6]
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc4 // VPERMQ YMM8, YMM12, 0x02 /* m[10], m[8], ____, ____ */ /* xxxx 0010 = 0x02 */
|
||||
BYTE $0x02
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xcd // VPERMQ YMM9, YMM13, 0x30 /* ____, ____, m[15], ____ */ /* xx11 xxxx = 0x30 */
|
||||
BYTE $0x30
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x35; BYTE $0x6c; BYTE $0xcb // VPUNPCKLQDQ YMM9, YMM9, YMM11 /* ____, ____, m[15], m[6] */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[10], m[8], m[15], m[6] */ /* 0011 0000 = 0x30 */
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
DIAGONALIZE
|
||||
|
||||
// Diagonal step, G1 operand: Y4 = m[1], m[0], m[11], m[5]
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc2 // VPERMQ YMM8, YMM10, 0x01 /* m[1], m[0], ____, ____ */ /* xxxx 0001 = 0x01 */
|
||||
BYTE $0x01
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ YMM9, YMM11, YMM12 /* m[5], ____, ____, m[11] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc9 // VPERMQ YMM9, YMM9, 0x03 /* m[11], m[5], ____, ____ */ /* xxxx 0011 = 0x03 */
|
||||
BYTE $0x03
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe1 // VPERM2I128 YMM4, YMM8, YMM9, 0x20 /* m[1], m[0], m[11], m[5] */ /* 0010 0000 = 0x20 */
|
||||
BYTE $0x20
|
||||
|
||||
// Diagonal step, G2 operand: Y5 = m[12], m[2], m[7], m[3]
BYTE $0xc4; BYTE $0x41; BYTE $0x2d; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ YMM8, YMM10, YMM13 /* ___, m[12], m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0xfd; BYTE $0x00; BYTE $0xc0 // VPERMQ YMM8, YMM8, 0x09 /* m[12], m[2], ____, ____ */ /* xxxx 1001 = 0x09 */
|
||||
BYTE $0x09
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x25; BYTE $0x6d; BYTE $0xca // VPUNPCKHQDQ YMM9, YMM11, YMM10 /* ____, ____, m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0xc3; BYTE $0x3d; BYTE $0x46; BYTE $0xe9 // VPERM2I128 YMM5, YMM8, YMM9, 0x30 /* m[12], m[2], m[7], m[3] */ /* 0011 0000 = 0x30 */
|
||||
BYTE $0x30
|
||||
|
||||
G1
|
||||
G2
|
||||
|
||||
UNDIAGONALIZE
|
||||
|
||||
// Reload digest (most current value store in &out)
|
||||
MOVQ out+144(FP), SI // SI: &in
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x26 // VMOVDQU YMM12, [rsi]
|
||||
BYTE $0xc5; BYTE $0x7e; BYTE $0x6f; BYTE $0x6e; BYTE $0x20 // VMOVDQU YMM13, 32[rsi]
|
||||
|
||||
BYTE $0xc5; BYTE $0xfd; BYTE $0xef; BYTE $0xc2 // VPXOR YMM0,YMM0,YMM2 /* X0 = X0 ^ X4, X1 = X1 ^ X5 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xef; BYTE $0xc4 // VPXOR YMM0,YMM0,YMM12 /* X0 = X0 ^ X12, X1 = X1 ^ X13 */
|
||||
BYTE $0xc5; BYTE $0xf5; BYTE $0xef; BYTE $0xcb // VPXOR YMM1,YMM1,YMM3 /* X2 = X2 ^ X6, X3 = X3 ^ X7 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xef; BYTE $0xcd // VPXOR YMM1,YMM1,YMM13 /* X2 = X2 ^ X14, X3 = X3 ^ X15 */
|
||||
|
||||
// Store digest into &out
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x06 // VMOVDQU [rsi], YMM0
|
||||
BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x4e; BYTE $0x20 // VMOVDQU 32[rsi], YMM1
|
||||
|
||||
// Increment message pointer and check if there's more to do
|
||||
ADDQ $128, DX // message += 128
|
||||
SUBQ $1, R8
|
||||
JNZ loop
|
||||
|
||||
complete:
|
||||
BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER /* Prevent further context switches */
|
||||
RET
|
||||
|
||||
41
vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go
generated
vendored
41
vendor/github.com/minio/blake2b-simd/compressAvx_amd64.go
generated
vendored
@@ -1,41 +0,0 @@
|
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package blake2b
|
||||
|
||||
// blockAVXLoop is implemented in assembly (compressAvx_amd64.s).
//
// It walks p in 128-byte blocks (len(p)/128 iterations; nothing is compressed
// when that count is zero). The starting chaining value is read from in and is
// copied to out before the first block is processed. For every block the
// routine adds 128 to t[0] (carrying into t[1]) and mixes iv, t and f into the
// working state; shffle supplies the 16-byte byte-shuffle mask used for the
// rotate-by-24 step. The caller reads the resulting chaining value from out.
//
// NOTE(review): in and out are accessed as 8 words, t/f/shffle as 2 words and
// iv as 8 words by the assembly — callers must pass slices at least that long.
//go:noescape
|
||||
func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64)
|
||||
|
||||
func compressAVX(d *digest, p []uint8) {
|
||||
var (
|
||||
in [8]uint64
|
||||
out [8]uint64
|
||||
shffle [2]uint64
|
||||
)
|
||||
|
||||
// vector for PSHUFB instruction
|
||||
shffle[0] = 0x0201000706050403
|
||||
shffle[1] = 0x0a09080f0e0d0c0b
|
||||
|
||||
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]
|
||||
|
||||
blockAVXLoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])
|
||||
|
||||
d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
|
||||
}
|
||||
682
vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s
generated
vendored
682
vendor/github.com/minio/blake2b-simd/compressAvx_amd64.s
generated
vendored
@@ -1,682 +0,0 @@
|
||||
// Build only when BOTH the noasm and appengine tags are absent. A comma means AND;
// a space would mean OR and would disagree with the two +build lines in compressAvx_amd64.go.
//+build !noasm,!appengine
|
||||
|
||||
//
|
||||
// Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
//
|
||||
// Based on SSE implementation from https://github.com/BLAKE2/BLAKE2/blob/master/sse/blake2b.c
|
||||
//
|
||||
// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent
|
||||
//
|
||||
// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as:
|
||||
// #define ROUND(r) \
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
//
|
||||
// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go
|
||||
//
|
||||
// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly)
|
||||
//
|
||||
// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and
|
||||
// rounds 2 & 12 are identical)
|
||||
//
|
||||
|
||||
// G1: first half of the BLAKE2b mixing function, applied to all four columns
// (or diagonals) at once. Register roles, as used by the instructions below:
//   X0/X1 = row1l/row1h, X2/X3 = row2l/row2h, X4/X5 = row3l/row3h,
//   X6/X7 = row4l/row4h, X8/X9 = message words for this step,
//   X12   = byte-shuffle mask (must be loaded first, see LOAD_SHUFFLE).
// Steps: row1 += msg; row1 += row2; row4 ^= row1; row4 = rotr(row4, 32);
//        row3 += row4; row2 ^= row3; row2 = rotr(row2, 24).
// Instructions are hand-encoded with LONG/BYTE; the mnemonic each sequence
// encodes is given in the trailing comment. The m[..] indices in those
// comments are the ones that apply in round 1.
#define G1 \
|
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
LONG $0xd479c1c4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
LONG $0xd471c1c4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf670f9c5; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
LONG $0xff70f9c5; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0x0069c2c4; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
LONG $0x0061c2c4; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
// G2: second half of the BLAKE2b mixing function. Same register roles as G1,
// except that the message words come from X10/X11 and X15 is used as a
// scratch register (its previous contents are destroyed).
// Steps: row1 += msg; row1 += row2; row4 ^= row1; row4 = rotr(row4, 16);
//        row3 += row4; row2 ^= row3; row2 = rotr(row2, 63).
// rotr 16 is done with a VPSHUFLW/VPSHUFHW pair (immediate 0x39); rotr 63 is
// built from (x + x) combined with (x >> 63).
// The m[..] indices in the trailing comments are the ones that apply in round 1.
#define G2 \
|
||||
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
LONG $0xd479c1c4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
LONG $0xd471c1c4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf670fbc5; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
LONG $0xf670fac5; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
LONG $0xff70fbc5; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
LONG $0xff70fac5; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0xfad469c5 \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
|
||||
LONG $0xd273e9c5; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef69c1c4; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* XOR acts as OR here (the two halves share no set bits): v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
LONG $0xfbd461c5 \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */
|
||||
LONG $0xd373e1c5; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef61c1c4; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* XOR acts as OR here (the two halves share no set bits): v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
// DIAGONALIZE re-arranges rows 2-4 of the working state so that the next
// G1/G2 pair operates on the diagonals instead of the columns.
// Register roles: X2/X3 = row2l/row2h, X4/X5 = row3l/row3h, X6/X7 = row4l/row4h.
// Row 3 has its two halves swapped (X6 serves as the swap temporary after its
// value has been saved in X13); rows 2 and 4 are rebuilt from
// unpack-low/unpack-high pairs as spelled out in the trailing comments.
// Clobbers X13 (t0), X14 (t1) and X15 (scratch for the unpack-low results).
#define DIAGONALIZE \
|
||||
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X6, X13 \ /* t0 = row4l;\ */
|
||||
MOVOU X2, X14 \ /* t1 = row2l;\ */
|
||||
MOVOU X4, X6 \ /* row4l = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X6, X5 \ /* row3h = row4l;\ */
|
||||
LONG $0x6c1141c4; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
LONG $0x6d41c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d11c1c4; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d69c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d61c1c4; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
|
||||
// UNDIAGONALIZE is the counterpart of DIAGONALIZE: it is applied after the
// diagonal G1/G2 pair to re-arrange rows 2-4 again before the next round.
// Register roles: X2/X3 = row2l/row2h, X4/X5 = row3l/row3h, X6/X7 = row4l/row4h.
// Row 3 has its halves swapped via X13; rows 2 and 4 are rebuilt from
// unpack-low/unpack-high pairs as spelled out in the trailing comments.
// Clobbers X13 (t0), X14 (t1) and X15 (scratch for the unpack-low results).
#define UNDIAGONALIZE \
|
||||
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X4, X13 \ /* t0 = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X13, X5 \ /* row3h = t0;\ */
|
||||
MOVOU X2, X13 \ /* t0 = row2l;\ */
|
||||
MOVOU X6, X14 \ /* t1 = row4l;\ */
|
||||
LONG $0xfa6c69c5 \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
LONG $0x6d61c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d11c1c4; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d49c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d41c1c4; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
|
||||
// LOAD_SHUFFLE reloads the 16-byte byte-shuffle mask into X12 from the data
// pointer of the shffle slice argument (offset 120 in the argument frame).
// It has to run before every G1, because the message-loading code uses X12 as
// a temporary and G1's VPSHUFB reads the mask from X12. Clobbers SI.
#define LOAD_SHUFFLE \
|
||||
\ // Load shuffle value
|
||||
MOVQ shffle+120(FP), SI \ // SI: &shuffle
|
||||
MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a
|
||||
|
||||
// func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64)
|
||||
TEXT ·blockAVXLoop(SB), 7, $0
|
||||
// REGISTER USE
|
||||
// R8: loop counter
|
||||
// DX: message pointer
|
||||
// SI: temp pointer for loading
|
||||
// X0 - X7: v0 - v15
|
||||
// X8 - X11: m[0] - m[7]
|
||||
// X12: shuffle value
|
||||
// X13 - X15: temp registers
|
||||
|
||||
// Load digest
|
||||
MOVQ in+24(FP), SI // SI: &in
|
||||
MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Already store digest into &out (so we can reload it later generically)
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
MOVOU X0, 0(SI) // out[0]+out[1] = X0
|
||||
MOVOU X1, 16(SI) // out[2]+out[3] = X1
|
||||
MOVOU X2, 32(SI) // out[4]+out[5] = X2
|
||||
MOVOU X3, 48(SI) // out[6]+out[7] = X3
|
||||
|
||||
// Initialize message pointer and loop counter
|
||||
MOVQ message+0(FP), DX // DX: &p (message)
|
||||
MOVQ message_len+8(FP), R8 // R8: len(message)
|
||||
SHRQ $7, R8 // len(message) / 128
|
||||
CMPQ R8, $0
|
||||
JEQ complete
|
||||
|
||||
loop:
|
||||
// Increment counter
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVQ 0(SI), R9
|
||||
ADDQ $128, R9 // /* d.t[0] += BlockSize */
|
||||
MOVQ R9, 0(SI)
|
||||
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
|
||||
JGE noincr
|
||||
MOVQ 8(SI), R9
|
||||
ADDQ $1, R9 // /* d.t[1]++ */
|
||||
MOVQ R9, 8(SI)
|
||||
noincr: // /* } */
|
||||
|
||||
// Load initialization vector
|
||||
MOVQ iv+48(FP), SI // SI: &iv
|
||||
MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */
|
||||
MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */
|
||||
MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */
|
||||
MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */
|
||||
PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */
|
||||
MOVQ t+96(FP), SI // SI: &f
|
||||
MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */
|
||||
PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x0f0943c4; WORD $0x08c5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
|
||||
LONG $0x6d1941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
LONG $0x6c0141c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
|
||||
LONG $0x6d1141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6c0141c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
|
||||
LONG $0x0f0943c4; WORD $0x08dc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 4
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
LONG $0x6d1141c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
|
||||
LONG $0x6d0141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X14 // X14 = m[14]+m[15]
|
||||
LONG $0x6d1141c4; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
|
||||
LONG $0x6c0141c4; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d1141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
|
||||
LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
|
||||
LONG $0x6c1941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 5
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0941c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c1941c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
|
||||
LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
LONG $0x0f0943c4; WORD $0x08d4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 6
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
LONG $0x6c1141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
|
||||
LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
|
||||
LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c1141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
|
||||
LONG $0x6d0141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
LONG $0x6d0941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
|
||||
LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
|
||||
LONG $0x6c0141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 7
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
|
||||
LONG $0x6c0941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
|
||||
LONG $0x6c0141c4; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
LONG $0x6d1141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
|
||||
LONG $0x0f1943c4; WORD $0x08de // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
|
||||
LONG $0x0f0943c4; WORD $0x08ce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
|
||||
MOVOU 16(DX), X14 // X14 = m[2]+ m[3]
|
||||
LONG $0x6d1141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
|
||||
LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
|
||||
LONG $0x6c0941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 8
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
|
||||
LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
LONG $0x6c0941c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
LONG $0x0f0143c4; WORD $0x08d6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d1141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
|
||||
LONG $0x6c0941c4; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6c1941c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
|
||||
LONG $0x6c0941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 9
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6c1141c4; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
|
||||
LONG $0x0f1943c4; WORD $0x08ce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
LONG $0x6d0141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
|
||||
LONG $0x0f0943c4; WORD $0x08dd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
LONG $0x6d0141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
|
||||
LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
|
||||
LONG $0x0f0943c4; WORD $0x08cc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
LONG $0x6d0141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
|
||||
LONG $0x6c1141c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
|
||||
LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
|
||||
LONG $0x6c1941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 0
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6c0141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
|
||||
LONG $0x6d1141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
|
||||
LONG $0x6c1941c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
|
||||
LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
|
||||
LONG $0x6d1941c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
LONG $0x0f0143c4; WORD $0x08d5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6c0941c4; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
// Reload digest (the most current value is stored in &out)
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Final computations and prepare for storing
|
||||
PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */
|
||||
PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */
|
||||
PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */
|
||||
PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */
|
||||
PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */
|
||||
PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */
|
||||
PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */
|
||||
PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */
|
||||
|
||||
// Store digest into &out
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
MOVOU X0, 0(SI) // out[0]+out[1] = X0
|
||||
MOVOU X1, 16(SI) // out[2]+out[3] = X1
|
||||
MOVOU X2, 32(SI) // out[4]+out[5] = X2
|
||||
MOVOU X3, 48(SI) // out[6]+out[7] = X3
|
||||
|
||||
// Increment message pointer and check if there's more to do
|
||||
ADDQ $128, DX // message += 128
|
||||
SUBQ $1, R8
|
||||
JNZ loop
|
||||
|
||||
complete:
|
||||
RET
|
||||
41
vendor/github.com/minio/blake2b-simd/compressSse_amd64.go
generated
vendored
41
vendor/github.com/minio/blake2b-simd/compressSse_amd64.go
generated
vendored
@@ -1,41 +0,0 @@
|
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package blake2b
|
||||
|
||||
//go:noescape
|
||||
func blockSSELoop(p []uint8, in, iv, t, f, shffle, out []uint64)
|
||||
|
||||
func compressSSE(d *digest, p []uint8) {
|
||||
var (
|
||||
in [8]uint64
|
||||
out [8]uint64
|
||||
shffle [2]uint64
|
||||
)
|
||||
|
||||
// vector for PSHUFB instruction
|
||||
shffle[0] = 0x0201000706050403
|
||||
shffle[1] = 0x0a09080f0e0d0c0b
|
||||
|
||||
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]
|
||||
|
||||
blockSSELoop(p, in[:], iv[:], d.t[:], d.f[:], shffle[:], out[:])
|
||||
|
||||
d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]
|
||||
}
|
||||
770
vendor/github.com/minio/blake2b-simd/compressSse_amd64.s
generated
vendored
770
vendor/github.com/minio/blake2b-simd/compressSse_amd64.s
generated
vendored
@@ -1,770 +0,0 @@
|
||||
// Build only when neither the "noasm" nor the "appengine" tag is set, so this file is
// selected under exactly the same conditions as its Go declaration in compressSse_amd64.go.
//+build !noasm,!appengine
|
||||
|
||||
//
|
||||
// Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
//
|
||||
// Based on SSE implementation from https://github.com/BLAKE2/BLAKE2/blob/master/sse/blake2b.c
|
||||
//
|
||||
// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent
|
||||
//
|
||||
// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as:
|
||||
// #define ROUND(r) \
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
//
|
||||
// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go
|
||||
//
|
||||
// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly)
|
||||
//
|
||||
// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and
|
||||
// rounds 2 & 12 are identical)
|
||||
//
|
||||
|
||||
// G1 is the first half of the BLAKE2b mixing step, applied to all four columns at once.
// Register roles: X0/X1 = row 1 (v0-v3), X2/X3 = row 2 (v4-v7), X4/X5 = row 3 (v8-v11),
// X6/X7 = row 4 (v12-v15), X8/X9 = message words, X12 = PSHUFB control mask.
// Steps: row1 += msg; row1 += row2; row4 ^= row1; row4 rotated right by 32;
// row3 += row4; row2 ^= row3; row2 rotated right by 24 (byte shuffle through X12).
// The instructions are emitted as raw opcode bytes; the decoded mnemonic is in each comment.
#define G1 \
|
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
LONG $0xd40f4166; BYTE $0xc0 \ // PADDQ XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
LONG $0xd40f4166; BYTE $0xc9 \ // PADDQ XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf6700f66; BYTE $0xb1 \ // PSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
LONG $0xff700f66; BYTE $0xb1 \ // PSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
LONG $0x380f4166; WORD $0xd400 \ // PSHUFB XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
LONG $0x380f4166; WORD $0xdc00 // PSHUFB XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
// G2 is the second half of the BLAKE2b mixing step, applied to all four columns at once.
// Register roles: X0-X7 = state rows 1-4 (as in G1), X10/X11 = message words.
// Steps: row1 += msg; row1 += row2; row4 ^= row1; row4 rotated right by 16
// (PSHUFLW + PSHUFHW); row3 += row4; row2 ^= row3; row2 rotated right by 63,
// built as (x + x) combined with (x >> 63).
// Clobbers X15, which is used as the temporary for the rotate by 63.
#define G2 \
|
||||
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
LONG $0xd40f4166; BYTE $0xc2 \ // PADDQ XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
LONG $0xd40f4166; BYTE $0xcb \ // PADDQ XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
LONG $0xc2d40f66 \ // PADDQ XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
LONG $0xcbd40f66 \ // PADDQ XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
LONG $0xf0ef0f66 \ // PXOR XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
LONG $0xf9ef0f66 \ // PXOR XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
LONG $0xf6700ff2; BYTE $0x39 \ // PSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
LONG $0xf6700ff3; BYTE $0x39 \ // PSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
LONG $0xff700ff2; BYTE $0x39 \ // PSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
LONG $0xff700ff3; BYTE $0x39 \ // PSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
LONG $0xe6d40f66 \ // PADDQ XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
LONG $0xefd40f66 \ // PADDQ XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
LONG $0xd4ef0f66 \ // PXOR XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
LONG $0xddef0f66 \ // PXOR XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
MOVOU X2, X15 \ // temp reg = copy of v4, v5
|
||||
LONG $0xd40f4466; BYTE $0xfa \ // PADDQ XMM15,XMM2 /* temp reg = reg*2 */
|
||||
LONG $0xd2730f66; BYTE $0x3f \ // PSRLQ XMM2,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef0f4166; BYTE $0xd7 \ // PXOR XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
MOVOU X3, X15 \ // temp reg = copy of v6, v7
|
||||
LONG $0xd40f4466; BYTE $0xfb \ // PADDQ XMM15,XMM3 /* temp reg = reg*2 */
|
||||
LONG $0xd3730f66; BYTE $0x3f \ // PSRLQ XMM3,0x3f /* reg = reg>>63 */
|
||||
LONG $0xef0f4166; BYTE $0xdf // PXOR XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
// DIAGONALIZE rearranges rows 2, 3 and 4 of the state (X2/X3, X4/X5, X6/X7) between the
// two halves of a round; row 1 (X0/X1) is left untouched. It follows the DIAGONALIZE
// macro of the reference SSE implementation quoted in the per-line comments.
// Clobbers X13 (t0), X14 (t1) and X15 (unpack scratch).
// Only the high quadword of X15 is consumed by each PUNPCKHQDQ that follows a PUNPCKLQDQ.
#define DIAGONALIZE \
|
||||
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X6, X13 \ /* t0 = row4l;\ */
|
||||
MOVOU X2, X14 \ /* t1 = row2l;\ */
|
||||
MOVOU X4, X6 \ /* row4l = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X6, X5 \ /* row3h = row4l;\ */
|
||||
LONG $0x6c0f4566; BYTE $0xfd \ // PUNPCKLQDQ XMM15, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
MOVOU X7, X6 \ // copy row4h; the PUNPCKHQDQ below works in place on X6
|
||||
LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
MOVOU X13, X7 \ // copy t0 (the saved row4l); the PUNPCKHQDQ below works in place on X7
|
||||
LONG $0x6d0f4166; BYTE $0xff \ // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d0f4166; BYTE $0xdf // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
|
||||
// UNDIAGONALIZE rearranges rows 2, 3 and 4 of the state (X2/X3, X4/X5, X6/X7) at the end
// of a round; row 1 (X0/X1) is left untouched. It follows the UNDIAGONALIZE macro of the
// reference SSE implementation quoted in the per-line comments.
// Clobbers X13 (t0), X14 (t1) and X15 (unpack scratch).
// Only the high quadword of X15 is consumed by each PUNPCKHQDQ that follows a PUNPCKLQDQ.
#define UNDIAGONALIZE \
|
||||
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X4, X13 \ /* t0 = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X13, X5 \ /* row3h = t0;\ */
|
||||
MOVOU X2, X13 \ /* t0 = row2l;\ */
|
||||
MOVOU X6, X14 \ /* t1 = row4l;\ */
|
||||
LONG $0x6c0f4466; BYTE $0xfa \ // PUNPCKLQDQ XMM15, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
MOVOU X3, X2 \ // copy row2h; the PUNPCKHQDQ below works in place on X2
|
||||
LONG $0x6d0f4166; BYTE $0xd7 \ // PUNPCKHQDQ XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xfb \ // PUNPCKLQDQ XMM15, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
MOVOU X13, X3 \ // copy t0 (the saved row2l); the PUNPCKHQDQ below works in place on X3
|
||||
LONG $0x6d0f4166; BYTE $0xdf \ // PUNPCKHQDQ XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
LONG $0x6c0f4466; BYTE $0xff \ // PUNPCKLQDQ XMM15, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
LONG $0x6d0f4166; BYTE $0xf7 \ // PUNPCKHQDQ XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
LONG $0x6c0f4566; BYTE $0xfe \ // PUNPCKLQDQ XMM15, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
LONG $0x6d0f4166; BYTE $0xff // PUNPCKHQDQ XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
|
||||
// LOAD_SHUFFLE reloads the PSHUFB control mask into X12 from the shffle slice argument.
// It is invoked before every G1 because the message-loading code uses X12 as a temporary.
// Clobbers SI.
#define LOAD_SHUFFLE \
|
||||
\ // Load shuffle value
|
||||
MOVQ shffle+120(FP), SI \ // SI: &shuffle
|
||||
MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a
|
||||
|
||||
// func blockSSELoop(p []uint8, in, iv, t, f, shffle, out []uint64)
|
||||
TEXT ·blockSSELoop(SB), 7, $0
|
||||
// REGISTER USE
|
||||
// R8: loop counter
|
||||
// DX: message pointer
|
||||
// SI: temp pointer for loading
|
||||
// X0 - X7: v0 - v15
|
||||
// X8 - X11: m[0] - m[7]
|
||||
// X12: shuffle value
|
||||
// X13 - X15: temp registers
|
||||
|
||||
// Load digest
|
||||
MOVQ in+24(FP), SI // SI: &in
|
||||
MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Already store digest into &out (so we can reload it later generically)
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
MOVOU X0, 0(SI) // out[0]+out[1] = X0
|
||||
MOVOU X1, 16(SI) // out[2]+out[3] = X1
|
||||
MOVOU X2, 32(SI) // out[4]+out[5] = X2
|
||||
MOVOU X3, 48(SI) // out[6]+out[7] = X3
|
||||
|
||||
// Initialize message pointer and loop counter
|
||||
MOVQ message+0(FP), DX // DX: &p (message)
|
||||
MOVQ message_len+8(FP), R8 // R8: len(message)
|
||||
SHRQ $7, R8 // len(message) / 128
|
||||
CMPQ R8, $0
|
||||
JEQ complete
|
||||
|
||||
loop:
|
||||
// Increment counter
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVQ 0(SI), R9
|
||||
ADDQ $128, R9 // /* d.t[0] += BlockSize */
|
||||
MOVQ R9, 0(SI)
|
||||
CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
|
||||
JGE noincr
|
||||
MOVQ 8(SI), R9
|
||||
ADDQ $1, R9 // /* d.t[1]++ */
|
||||
MOVQ R9, 8(SI)
|
||||
|
||||
noincr: // /* } */
|
||||
|
||||
// Load initialization vector
|
||||
MOVQ iv+48(FP), SI // SI: &iv
|
||||
MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */
|
||||
MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */
|
||||
MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */
|
||||
MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */
|
||||
MOVQ t+72(FP), SI // SI: &t
|
||||
MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */
|
||||
PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */
|
||||
MOVQ t+96(FP), SI // SI: &f
|
||||
MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */
|
||||
PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
|
||||
MOVOU X12, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
|
||||
MOVOU X14, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
|
||||
MOVOU X12, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
|
||||
MOVOU X14, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X10 // X10 = m[10]+m[11]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
|
||||
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU X12, X8
|
||||
LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 3
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X8
|
||||
LONG $0x3a0f4566; WORD $0xc50f; BYTE $0x08 // PALIGNR XMM8, XMM13, 0x8 /* m[11], m[12] */
|
||||
MOVOU X12, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[5], m[15] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[8], m[0] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
MOVOU X13, X11
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[2], ___ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X12, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[3] */
|
||||
MOVOU X15, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[10], ___ */
|
||||
MOVOU X13, X9
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[7], m[9] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X11 // X11 = m[4]+ m[5]
|
||||
MOVOU 112(DX), X10 // X10 = m[14]+m[15]
|
||||
LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[14], m[6] */
|
||||
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[1], m[4] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 4
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X13, X8
|
||||
LONG $0x6d0f4566; BYTE $0xc4 // PUNPCKHQDQ XMM8, XMM12 /* m[7], m[3] */
|
||||
MOVOU X15, X9
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[13], m[11] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X10 // X10 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X14 // X14 = m[14]+m[15]
|
||||
LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* m[9], m[1] */
|
||||
MOVOU X15, X11
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[12], m[14] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X13, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* ___, m[5] */
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[2], ____ */
|
||||
MOVOU X15, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* ___, m[15] */
|
||||
MOVOU X13, X9
|
||||
LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[4], ____ */
|
||||
MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X10 // X10 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[6], m[10] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[0], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 5
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X14, X8
|
||||
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[9], m[5] */
|
||||
MOVOU X12, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[2], m[10] */
|
||||
MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[7] */
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[0], ____ */
|
||||
LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[15] */
|
||||
MOVOU X13, X11
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[11] */
|
||||
MOVOU X15, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[14], ____ */
|
||||
LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[3] */
|
||||
MOVOU X13, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[6], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU X14, X10
|
||||
LONG $0x3a0f4566; WORD $0xd40f; BYTE $0x08 // PALIGNR XMM10, XMM12, 0x8 /* m[1], m[12] */
|
||||
LONG $0x6d0f4566; BYTE $0xf6 // PUNPCKHQDQ XMM14, XMM14 /* ___, m[13] */
|
||||
LONG $0x6c0f4566; BYTE $0xde // PUNPCKLQDQ XMM11, XMM14 /* m[8], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 6
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
|
||||
MOVOU X13, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[2], m[6] */
|
||||
MOVOU X12, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[0], m[8] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[10] */
|
||||
MOVOU X12, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[11], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X9
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* ___, m[7] */
|
||||
MOVOU X13, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[4], ____ */
|
||||
MOVOU X15, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[15], m[1] */
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[13], m[5] */
|
||||
LONG $0x6d0f4566; BYTE $0xe4 // PUNPCKHQDQ XMM12, XMM12 /* ___, m[9] */
|
||||
MOVOU X15, X11
|
||||
LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[14], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 7
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X12, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* ___, m[1] */
|
||||
MOVOU X14, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
|
||||
MOVOU X15, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcd // PUNPCKLQDQ XMM9, XMM13 /* m[14], m[4] */
|
||||
MOVOU 80(DX), X11 // X11 = m[10]+m[11]
|
||||
MOVOU X13, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd7 // PUNPCKHQDQ XMM10, XMM15 /* m[5], m[15] */
|
||||
LONG $0x3a0f4566; WORD $0xde0f; BYTE $0x08 // PALIGNR XMM11, XMM14, 0x8 /* m[13], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[6] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[9], m[8] */
|
||||
MOVOU 16(DX), X11 // X11 = m[2]+ m[3]
|
||||
MOVOU X13, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[7], m[3] */
|
||||
LONG $0x6d0f4566; BYTE $0xff // PUNPCKHQDQ XMM15, XMM15 /* ___, m[11] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[2], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 8
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X14, X8
|
||||
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[13], m[7] */
|
||||
MOVOU X12, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd4 // PUNPCKHQDQ XMM10, XMM12 /* ___, m[3] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xca // PUNPCKLQDQ XMM9, XMM10 /* m[12], ____ */
|
||||
MOVOU 0(DX), X11 // X11 = m[0]+ m[1]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU X15, X10
|
||||
LONG $0x3a0f4566; WORD $0xd60f; BYTE $0x08 // PALIGNR XMM10, XMM14, 0x8 /* m[11], m[14] */
|
||||
LONG $0x6d0f4566; BYTE $0xdd // PUNPCKHQDQ XMM11, XMM13 /* m[1], m[9] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X13, X8
|
||||
LONG $0x6d0f4566; BYTE $0xc7 // PUNPCKHQDQ XMM8, XMM15 /* m[5], m[15] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcc // PUNPCKLQDQ XMM9, XMM12 /* m[8], m[2] */
|
||||
MOVOU 0(DX), X10 // X10 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
LONG $0x6c0f4566; BYTE $0xd5 // PUNPCKLQDQ XMM10, XMM13 /* m[0], m[4] */
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], m[10] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 9
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X13, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc7 // PUNPCKLQDQ XMM8, XMM15 /* m[6], m[14] */
|
||||
MOVOU X12, X9
|
||||
LONG $0x3a0f4566; WORD $0xce0f; BYTE $0x08 // PALIGNR XMM9, XMM14, 0x8 /* m[11], m[0] */
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X11 // X11 = m[8]+ m[9]
|
||||
MOVOU X15, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd3 // PUNPCKHQDQ XMM10, XMM11 /* m[15], m[9] */
|
||||
LONG $0x3a0f4566; WORD $0xdd0f; BYTE $0x08 // PALIGNR XMM11, XMM13, 0x8 /* m[3], m[8] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X15, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* ___, m[13] */
|
||||
MOVOU X15, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc1 // PUNPCKLQDQ XMM8, XMM9 /* m[12], ____ */
|
||||
MOVOU X14, X9
|
||||
LONG $0x3a0f4566; WORD $0xcc0f; BYTE $0x08 // PALIGNR XMM9, XMM12, 0x8 /* m[1], m[10] */
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
|
||||
MOVOU X15, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* ___, m[7] */
|
||||
MOVOU X13, X10
|
||||
LONG $0x6c0f4566; BYTE $0xd3 // PUNPCKLQDQ XMM10, XMM11 /* m[2], ____ */
|
||||
MOVOU X12, X15
|
||||
LONG $0x6d0f4566; BYTE $0xfc // PUNPCKHQDQ XMM15, XMM12 /* ___, m[5] */
|
||||
MOVOU X12, X11
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 0
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11]
|
||||
MOVOU X15, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc6 // PUNPCKLQDQ XMM8, XMM14 /* m[10], m[8] */
|
||||
MOVOU X13, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcc // PUNPCKHQDQ XMM9, XMM12 /* m[7], m[1] */
|
||||
MOVOU 16(DX), X10 // X10 = m[2]+ m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[2], m[4] */
|
||||
MOVOU X14, X15
|
||||
LONG $0x6d0f4566; BYTE $0xfe // PUNPCKHQDQ XMM15, XMM14 /* ___, m[5] */
|
||||
MOVOU X13, X11
|
||||
LONG $0x6c0f4566; BYTE $0xdf // PUNPCKLQDQ XMM11, XMM15 /* m[6], ____ */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X15, X8
|
||||
LONG $0x6d0f4566; BYTE $0xc5 // PUNPCKHQDQ XMM8, XMM13 /* m[15], m[9] */
|
||||
MOVOU X12, X9
|
||||
LONG $0x6d0f4566; BYTE $0xce // PUNPCKHQDQ XMM9, XMM14 /* m[3], m[13] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU X15, X10
|
||||
LONG $0x3a0f4566; WORD $0xd50f; BYTE $0x08 // PALIGNR XMM10, XMM13, 0x8 /* m[11], m[14] */
|
||||
MOVOU X14, X11
|
||||
LONG $0x6c0f4566; BYTE $0xdc // PUNPCKLQDQ XMM11, XMM12 /* m[12], m[0] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 1
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1]
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3]
|
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5]
|
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[0], m[2] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[4], m[6] */
|
||||
MOVOU X12, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[1], m[3] */
|
||||
MOVOU X14, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11]
|
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13]
|
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[8],m[10] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6c0f4566; BYTE $0xcf // PUNPCKLQDQ XMM9, XMM15 /* m[12],m[14] */
|
||||
MOVOU X12, X10
|
||||
LONG $0x6d0f4566; BYTE $0xd5 // PUNPCKHQDQ XMM10, XMM13 /* m[9],m[11] */
|
||||
MOVOU X14, X11
|
||||
LONG $0x6d0f4566; BYTE $0xdf // PUNPCKHQDQ XMM11, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// R O U N D 1 2
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
|
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
|
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13]
|
||||
MOVOU X12, X8
|
||||
LONG $0x6c0f4566; BYTE $0xc5 // PUNPCKLQDQ XMM8, XMM13 /* m[14], m[4] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcf // PUNPCKHQDQ XMM9, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X10 // X10 = m[10]+m[11]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
LONG $0x6c0f4566; BYTE $0xd6 // PUNPCKLQDQ XMM10, XMM14 /* m[10], m[8] */
|
||||
LONG $0x3a0f4566; WORD $0xdc0f; BYTE $0x08 // PALIGNR XMM11, XMM12, 0x8 /* m[15], m[6] */; ; ; ; ;
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
DIAGONALIZE
|
||||
|
||||
// LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
|
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
|
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11]
|
||||
MOVOU X12, X8
|
||||
LONG $0x3a0f4566; WORD $0xc40f; BYTE $0x08 // PALIGNR XMM8, XMM12, 0x8 /* m[1], m[0] */
|
||||
MOVOU X14, X9
|
||||
LONG $0x6d0f4566; BYTE $0xcd // PUNPCKHQDQ XMM9, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
|
||||
MOVOU 48(DX), X11 // X11 = m[6]+ m[7]
|
||||
MOVOU 96(DX), X10 // X10 = m[12]+m[13]
|
||||
LONG $0x6c0f4566; BYTE $0xd4 // PUNPCKLQDQ XMM10, XMM12 /* m[12], m[2] */
|
||||
LONG $0x6d0f4566; BYTE $0xdc // PUNPCKHQDQ XMM11, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE
|
||||
G1
|
||||
G2
|
||||
UNDIAGONALIZE
|
||||
|
||||
// Reload digest (most current value store in &out)
|
||||
MOVQ out+144(FP), SI // SI: &in
|
||||
MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Final computations and prepare for storing
|
||||
PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */
|
||||
PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */
|
||||
PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */
|
||||
PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */
|
||||
PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */
|
||||
PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */
|
||||
PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */
|
||||
PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */
|
||||
|
||||
// Store digest into &out
|
||||
MOVQ out+144(FP), SI // SI: &out
|
||||
MOVOU X0, 0(SI) // out[0]+out[1] = X0
|
||||
MOVOU X1, 16(SI) // out[2]+out[3] = X1
|
||||
MOVOU X2, 32(SI) // out[4]+out[5] = X2
|
||||
MOVOU X3, 48(SI) // out[6]+out[7] = X3
|
||||
|
||||
// Increment message pointer and check if there's more to do
|
||||
ADDQ $128, DX // message += 128
|
||||
SUBQ $1, R8
|
||||
JNZ loop
|
||||
|
||||
complete:
|
||||
RET
|
||||
30
vendor/github.com/minio/blake2b-simd/compress_amd64.go
generated
vendored
30
vendor/github.com/minio/blake2b-simd/compress_amd64.go
generated
vendored
@@ -1,30 +0,0 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package blake2b
|
||||
|
||||
func compress(d *digest, p []uint8) {
|
||||
// Verifies if AVX2 or AVX is available, use optimized code path.
|
||||
if avx2 {
|
||||
compressAVX2(d, p)
|
||||
} else if avx {
|
||||
compressAVX(d, p)
|
||||
} else if ssse3 {
|
||||
compressSSE(d, p)
|
||||
} else {
|
||||
compressGeneric(d, p)
|
||||
}
|
||||
}
|
||||
1419
vendor/github.com/minio/blake2b-simd/compress_generic.go
generated
vendored
1419
vendor/github.com/minio/blake2b-simd/compress_generic.go
generated
vendored
File diff suppressed because it is too large
Load Diff
23
vendor/github.com/minio/blake2b-simd/compress_noasm.go
generated
vendored
23
vendor/github.com/minio/blake2b-simd/compress_noasm.go
generated
vendored
@@ -1,23 +0,0 @@
|
||||
//+build !amd64 noasm appengine
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package blake2b
|
||||
|
||||
func compress(d *digest, p []uint8) {
|
||||
compressGeneric(d, p)
|
||||
}
|
||||
60
vendor/github.com/minio/blake2b-simd/cpuid.go
generated
vendored
60
vendor/github.com/minio/blake2b-simd/cpuid.go
generated
vendored
@@ -1,60 +0,0 @@
|
||||
// +build 386,!gccgo amd64,!gccgo
|
||||
|
||||
// Copyright 2016 Frank Wessels <fwessels@xs4all.nl>
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package blake2b
|
||||
|
||||
func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
|
||||
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
|
||||
func xgetbv(index uint32) (eax, edx uint32)
|
||||
|
||||
// True when SIMD instructions are available.
|
||||
var avx2 = haveAVX2()
|
||||
var avx = haveAVX()
|
||||
var ssse3 = haveSSSE3()
|
||||
|
||||
// haveAVX returns true when there is AVX support
|
||||
func haveAVX() bool {
|
||||
_, _, c, _ := cpuid(1)
|
||||
|
||||
// Check XGETBV, OXSAVE and AVX bits
|
||||
if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
|
||||
// Check for OS support
|
||||
eax, _ := xgetbv(0)
|
||||
return (eax & 0x6) == 0x6
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// haveAVX2 returns true when there is AVX2 support
|
||||
func haveAVX2() bool {
|
||||
mfi, _, _, _ := cpuid(0)
|
||||
|
||||
// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
|
||||
if mfi >= 7 && haveAVX() {
|
||||
_, ebx, _, _ := cpuidex(7, 0)
|
||||
return (ebx & 0x00000020) != 0
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// haveSSSE3 returns true when there is SSSE3 support
|
||||
func haveSSSE3() bool {
|
||||
|
||||
_, _, c, _ := cpuid(1)
|
||||
|
||||
return (c & 0x00000200) != 0
|
||||
}
|
||||
33
vendor/github.com/minio/blake2b-simd/cpuid_386.s
generated
vendored
33
vendor/github.com/minio/blake2b-simd/cpuid_386.s
generated
vendored
@@ -1,33 +0,0 @@
|
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
|
||||
|
||||
// +build 386,!gccgo
|
||||
|
||||
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
//
// Runs CPUID for leaf op with CX cleared and copies the four result
// registers into the Go return slots (op at 0(FP), results at 4..16(FP)).
TEXT ·cpuid(SB), 7, $0
	MOVL op+0(FP), AX    // AX = requested leaf
	XORL CX, CX          // CX = 0
	CPUID
	MOVL AX, eax+4(FP)   // return eax
	MOVL BX, ebx+8(FP)   // return ebx
	MOVL CX, ecx+12(FP)  // return ecx
	MOVL DX, edx+16(FP)  // return edx
	RET
|
||||
|
||||
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
//
// Runs CPUID with AX = op and CX = op2 and copies the four result
// registers into the Go return slots (arguments at 0 and 4(FP), results at
// 8..20(FP)).
TEXT ·cpuidex(SB), 7, $0
	MOVL op2+4(FP), CX   // CX = op2
	MOVL op+0(FP), AX    // AX = op
	CPUID
	MOVL AX, eax+8(FP)   // return eax
	MOVL BX, ebx+12(FP)  // return ebx
	MOVL CX, ecx+16(FP)  // return ecx
	MOVL DX, edx+20(FP)  // return edx
	RET
|
||||
|
||||
// func xgetbv(index uint32) (eax, edx uint32)
//
// Loads index into CX, executes XGETBV (emitted as the raw opcode bytes
// 0F 01 D0) and returns AX and DX (index at 0(FP), results at 4 and 8(FP)).
TEXT ·xgetbv(SB), 7, $0
	MOVL index+0(FP), CX  // CX = index
	BYTE $0x0f            // XGETBV
	BYTE $0x01
	BYTE $0xd0
	MOVL AX, eax+4(FP)    // return eax
	MOVL DX, edx+8(FP)    // return edx
	RET
|
||||
34
vendor/github.com/minio/blake2b-simd/cpuid_amd64.s
generated
vendored
34
vendor/github.com/minio/blake2b-simd/cpuid_amd64.s
generated
vendored
@@ -1,34 +0,0 @@
|
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
|
||||
|
||||
// +build amd64,!gccgo
|
||||
|
||||
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
//
// Runs CPUID for leaf op with CX cleared and copies the four result
// registers into the Go return slots (op at 0(FP), results at 8..20(FP)).
TEXT ·cpuid(SB), 7, $0
	MOVL op+0(FP), AX    // AX = requested leaf
	XORQ CX, CX          // CX = 0
	CPUID
	MOVL AX, eax+8(FP)   // return eax
	MOVL BX, ebx+12(FP)  // return ebx
	MOVL CX, ecx+16(FP)  // return ecx
	MOVL DX, edx+20(FP)  // return edx
	RET
|
||||
|
||||
|
||||
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
//
// Runs CPUID with AX = op and CX = op2 and copies the four result
// registers into the Go return slots (arguments at 0 and 4(FP), results at
// 8..20(FP)).
TEXT ·cpuidex(SB), 7, $0
	MOVL op2+4(FP), CX   // CX = op2
	MOVL op+0(FP), AX    // AX = op
	CPUID
	MOVL AX, eax+8(FP)   // return eax
	MOVL BX, ebx+12(FP)  // return ebx
	MOVL CX, ecx+16(FP)  // return ecx
	MOVL DX, edx+20(FP)  // return edx
	RET
|
||||
|
||||
// func xgetbv(index uint32) (eax, edx uint32)
//
// Loads index into CX, executes XGETBV (emitted as the raw opcode bytes
// 0F 01 D0) and returns AX and DX (index at 0(FP), results at 8 and 12(FP)).
TEXT ·xgetbv(SB), 7, $0
	MOVL index+0(FP), CX  // CX = index
	BYTE $0x0f            // XGETBV
	BYTE $0x01
	BYTE $0xd0
	MOVL AX, eax+8(FP)    // return eax
	MOVL DX, edx+12(FP)   // return edx
	RET
|
||||
Reference in New Issue
Block a user