xl: Moved to minio/minio - fixes #1112

2025-11-09 13:39:46 -05:00 · 2016-02-10 16:40:09 -08:00
parent 33bd97d581
commit 62f6ffb6db
137 changed files with 9408 additions and 515 deletions
--- a/pkg/crypto/sha1/.gitignore
+++ b/pkg/crypto/sha1/.gitignore
@@ -0,0 +1 @@
+*.syso
--- a/pkg/crypto/sha1/LICENSE
+++ b/pkg/crypto/sha1/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/pkg/crypto/sha1/sha1.go
+++ b/pkg/crypto/sha1/sha1.go
@@ -0,0 +1,150 @@
+/*
+ * Minio Cloud Storage, (C) 2015-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// Package sha1 implements the SHA1 hash algorithm as defined in RFC 3174.
+package sha1
+
+import "hash"
+
+// Size is the size of a SHA1 checksum in bytes.
+const Size = 20
+
+// BlockSize is the block size of SHA1 in bytes.
+const BlockSize = 64
+
+const (
+	chunk = 64         // capacity of digest.x; Write hands input to block in multiples of this
+	init0 = 0x67452301 // value Reset stores in digest.h[0]
+	init1 = 0xEFCDAB89 // value Reset stores in digest.h[1]
+	init2 = 0x98BADCFE // value Reset stores in digest.h[2]
+	init3 = 0x10325476 // value Reset stores in digest.h[3]
+	init4 = 0xC3D2E1F0 // value Reset stores in digest.h[4]
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	h   [5]uint32   // running state words; Reset loads init0..init4, checkSum serialises them
+	x   [chunk]byte // input bytes accepted by Write but not yet passed to block
+	nx  int         // number of valid bytes at the start of x
+	len uint64      // total number of bytes passed to Write since the last Reset
+}
+
+// Reset restores the initial hash state and discards any buffered input.
+func (d *digest) Reset() {
+	d.h[0] = init0
+	d.h[1] = init1
+	d.h[2] = init2
+	d.h[3] = init3
+	d.h[4] = init4
+	d.nx = 0  // nothing buffered in d.x
+	d.len = 0 // no bytes written yet
+}
+
+// New returns a new hash.Hash computing the SHA1 checksum.
+func New() hash.Hash {
+	d := new(digest)
+	d.Reset() // load the init0..init4 starting state
+	return d
+}
+
+// Size returns the checksum length in bytes, i.e. the Size constant.
+func (d *digest) Size() int { return Size }
+
+// BlockSize returns the hash block length in bytes, i.e. the BlockSize constant.
+func (d *digest) BlockSize() int { return BlockSize }
+
+// Write adds p to the running hash; it always returns len(p) and a nil error.
+func (d *digest) Write(p []byte) (nn int, err error) {
+	nn = len(p)
+	d.len += uint64(nn) // total bytes written so far; checkSum derives the padding from this
+	if d.nx > 0 { // a partial chunk is buffered: top it up first
+		n := copy(d.x[d.nx:], p)
+		d.nx += n
+		if d.nx == chunk {
+			block(d, d.x[:]) // block is not defined in this file; presumably the SHA1 compression step - TODO confirm
+			d.nx = 0
+		}
+		p = p[n:]
+	}
+	if len(p) >= chunk {
+		n := len(p) &^ (chunk - 1) // largest multiple of chunk that fits in p
+		block(d, p[:n])
+		p = p[n:]
+	}
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p) // keep the remaining tail (shorter than chunk) for the next call
+	}
+	return
+}
+
+// Sum appends the current checksum to in and returns the resulting slice.
+func (d *digest) Sum(in []byte) []byte {
+	// Make a copy of d so that caller can keep writing and summing.
+	d0 := *d
+	hash := d0.checkSum() // checkSum modifies its receiver, hence the copy; hash shadows the imported package here
+	return append(in, hash[:]...)
+}
+
+// checkSum finalises d in place (padding and bit length go through Write) and returns the digest.
+func (d *digest) checkSum() [Size]byte {
+	len := d.len // NOTE: shadows the builtin len for the rest of this function
+	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
+	var tmp [64]byte
+	tmp[0] = 0x80
+	if len%64 < 56 {
+		d.Write(tmp[0 : 56-len%64])
+	} else {
+		d.Write(tmp[0 : 64+56-len%64])
+	}
+
+	// Length in bits.
+	len <<= 3
+	for i := uint(0); i < 8; i++ {
+		tmp[i] = byte(len >> (56 - 8*i)) // most significant byte first
+	}
+	d.Write(tmp[0:8])
+
+	if d.nx != 0 { // padding plus the 8 length bytes must end exactly on a chunk boundary
+		panic("d.nx != 0")
+	}
+
+	var digest [Size]byte
+	for i, s := range d.h { // serialise each state word most significant byte first
+		digest[i*4] = byte(s >> 24)
+		digest[i*4+1] = byte(s >> 16)
+		digest[i*4+2] = byte(s >> 8)
+		digest[i*4+3] = byte(s)
+	}
+
+	return digest
+}
+
+// Sum returns the SHA1 checksum of data, using a throwaway digest.
+func Sum(data []byte) [Size]byte {
+	var d digest
+	d.Reset()
+	d.Write(data)
+	return d.checkSum()
+}
--- a/pkg/crypto/sha1/sha1_linux.S
+++ b/pkg/crypto/sha1/sha1_linux.S
@@ -0,0 +1,967 @@
+/*
+ *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Ilya Albrekht <ilya.albrekht@intel.com>
+ * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+ * Ronen Zohar <ronen.zohar@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
+ *
+ *This implementation is based on the previous SSSE3 release:
+ * https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ *
+ *Updates 20-byte SHA-1 record in 'hash' for even number of
+ *'num_blocks' consecutive 64-byte blocks
+ *
+ */
+
+/*
+ * Using this part of Minio codebase under the license
+ * Apache License Version 2.0 with modifications
+ *
+ */
+
+#ifdef HAS_AVX2
+#ifndef ENTRY /* fallback: global, aligned label that opens a function */
+#define ENTRY(name) \
+        .globl name             ; \
+        .align 4,0x90           ; \
+        name:
+#endif
+
+#ifndef END /* fallback: set the symbol size to the bytes emitted since its label */
+#define END(name) \
+        .size name, .-name
+#endif
+
+#ifndef ENDPROC /* fallback: mark the symbol as a function, then END it */
+#define ENDPROC(name) \
+        .type name, @function   ; \
+        END(name)
+#endif
+
+#define NUM_INVALID		100 /* value left by R32_NUM, R64_NUM and XMM_NUM when the operand is not a known register */
+
+#define TYPE_R32		0 /* TYPE result: 32-bit general purpose register */
+#define TYPE_R64		1 /* TYPE result: 64-bit general purpose register */
+#define TYPE_XMM		2 /* TYPE result: xmm register */
+#define TYPE_INVALID	100 /* TYPE result: none of the above */
+
+	.macro R32_NUM opd r32 /* set symbol \opd to the encoding number of 32-bit register \r32, else NUM_INVALID */
+	\opd = NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+#ifdef X86_64 /* %r8d-%r15d are only recognised when X86_64 is defined */
+	.ifc \r32,%r8d
+	\opd = 8
+	.endif
+	.ifc \r32,%r9d
+	\opd = 9
+	.endif
+	.ifc \r32,%r10d
+	\opd = 10
+	.endif
+	.ifc \r32,%r11d
+	\opd = 11
+	.endif
+	.ifc \r32,%r12d
+	\opd = 12
+	.endif
+	.ifc \r32,%r13d
+	\opd = 13
+	.endif
+	.ifc \r32,%r14d
+	\opd = 14
+	.endif
+	.ifc \r32,%r15d
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	.macro R64_NUM opd r64 /* set symbol \opd to the encoding number of 64-bit register \r64, else NUM_INVALID */
+	\opd = NUM_INVALID
+#ifdef X86_64 /* without X86_64 every operand yields NUM_INVALID */
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	.macro XMM_NUM opd xmm /* set symbol \opd to the number (0-15) of xmm register \xmm, else NUM_INVALID */
+	\opd = NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+
+	.macro TYPE type reg /* set symbol \type to TYPE_R64, TYPE_R32, TYPE_XMM or TYPE_INVALID for operand \reg */
+	R32_NUM reg_type_r32 \reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> NUM_INVALID /* the 64-bit match is tested first */
+	\type = TYPE_R64
+	.elseif reg_type_r32 <> NUM_INVALID
+	\type = TYPE_R32
+	.elseif reg_type_xmm <> NUM_INVALID
+	\type = TYPE_XMM
+	.else
+	\type = TYPE_INVALID
+	.endif
+	.endm
+
+	.macro PFX_OPD_SIZE /* emit the single prefix byte 0x66 */
+	.byte 0x66
+	.endm
+
+	.macro PFX_REX opd1 opd2 W=0 /* emit a 0x4x prefix byte only if bit 3 of either operand number is set or W is non-zero */
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) /* bit 0 from opd1, bit 2 from opd2, bit 3 from W */
+	.endif
+	.endm
+
+	.macro MODRM mod opd1 opd2 /* emit one byte: \mod, low 3 bits of \opd1 in bits 0-2, low 3 bits of \opd2 in bits 3-5 */
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+
+	.macro PSHUFB_XMM xmm1 xmm2 /* hand-encode a register-to-register instruction as raw bytes; not invoked in this file */
+	XMM_NUM pshufb_opd1 \xmm1
+	XMM_NUM pshufb_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX pshufb_opd1 pshufb_opd2
+	.byte 0x0f, 0x38, 0x00 /* opcode bytes; presumably PSHUFB as the macro name says */
+	MODRM 0xc0 pshufb_opd1 pshufb_opd2
+	.endm
+
+	.macro PCLMULQDQ imm8 xmm1 xmm2 /* hand-encode a register-to-register instruction plus immediate; not invoked in this file */
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44 /* opcode bytes; presumably PCLMULQDQ as the macro name says */
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8 /* trailing immediate operand */
+	.endm
+
+	.macro PEXTRD imm8 xmm gpr /* hand-encode an xmm/32-bit-register instruction plus immediate; not invoked in this file */
+	R32_NUM extrd_opd1 \gpr
+	XMM_NUM extrd_opd2 \xmm
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16 /* opcode bytes; presumably PEXTRD as the macro name says */
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8 /* trailing immediate operand */
+	.endm
+
+	.macro MOVQ_R64_XMM opd1 opd2 /* hand-encode a 64-bit move between an xmm and a 64-bit register; not invoked in this file */
+	TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM /* operand order decides which number macro applies to which operand */
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 /* W=1, so the prefix byte is always emitted */
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	.byte 0x0f, 0x7e /* opcode used when the first operand is an xmm register */
+	.else
+	.byte 0x0f, 0x6e /* opcode used otherwise */
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
+
+#define	CTX	%rdi	/* arg1: hash state pointer, copied to HASH_PTR by sha1_transform */
+#define BUF	%rsi	/* arg2: input pointer, copied to BUFFER_PTR by sha1_transform */
+#define CNT	%rdx	/* arg3: block count, turned into BUFFER_END by sha1_transform */
+
+#define	REG_A	%ecx
+#define	REG_B	%esi /* same register as BUF; the argument is copied out before the rounds run */
+#define	REG_C	%edi /* same register as CTX; the argument is copied out before the rounds run */
+#define	REG_D	%eax
+#define	REG_E	%edx /* same register as CNT; the argument is consumed before the rounds run */
+#define	REG_TB	%ebx
+#define	REG_TA	%r12d
+#define	REG_RA	%rcx /* REG_Rx are the 64-bit names of the REG_x registers above */
+#define	REG_RB	%rsi
+#define	REG_RC	%rdi
+#define	REG_RD	%rax
+#define	REG_RE	%rdx
+#define	REG_RTA	%r12
+#define	REG_RTB	%rbx
+#define	REG_T1	%ebp
+#define	xmm_mov	vmovups
+#define	avx2_zeroupper	vzeroupper
+#define	RND_F1	1 /* RND_Fx: selector values compared by RND_FUN */
+#define	RND_F2	2
+#define	RND_F3	3
+
+.macro REGALLOC /* bind the symbolic state names to their initial registers; ROTATE_STATE re-binds them later */
+	.set A, REG_A
+	.set B, REG_B
+	.set C, REG_C
+	.set D, REG_D
+	.set E, REG_E
+	.set TB, REG_TB
+	.set TA, REG_TA
+
+	.set RA, REG_RA /* 64-bit names of the same registers */
+	.set RB, REG_RB
+	.set RC, REG_RC
+	.set RD, REG_RD
+	.set RE, REG_RE
+
+	.set RTA, REG_RTA
+	.set RTB, REG_RTB
+
+	.set T1, REG_T1
+.endm
+
+#define K_BASE		%r8 /* address of K_XMM_AR; the buffer pointers are also set to it as an end-of-input marker */
+#define HASH_PTR	%r9 /* copy of CTX */
+#define BUFFER_PTR	%r10 /* starts at BUF, advanced by 128 bytes per loop iteration */
+#define BUFFER_PTR2	%r13 /* starts at BUF+64, advanced by 128 bytes per loop iteration */
+#define BUFFER_END	%r11 /* BUF + 64*CNT + 64 */
+
+#define PRECALC_BUF	%r14 /* buffer the PRECALC macros store K+w values into */
+#define WK_BUF		%r15 /* buffer the rounds read K+w values from; swapped with PRECALC_BUF each iteration */
+
+#define W_TMP		%xmm0
+#define WY_TMP		%ymm0 /* 256-bit name of the same register as W_TMP */
+#define WY_TMP2		%ymm9
+
+# AVX2 variables
+#define WY0		%ymm3
+#define WY4		%ymm5
+#define WY08		%ymm7
+#define WY12		%ymm8
+#define WY16		%ymm12
+#define WY20		%ymm13
+#define WY24		%ymm14
+#define WY28		%ymm15
+
+#define YMM_SHUFB_BSWAP	%ymm10 /* loaded from BSWAP_SHUFB_CTL by sha1_transform */
+
+/*
+ * Keep 2 iterations precalculated at a time:
+ *    - 80 DWORDs per iteration * 2
+ */
+#define W_SIZE		(80*2*2 +16) /* counted in 4-byte units: RESERVE_STACK multiplies it by 4 */
+
+#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) /* slot read by round t: 32 bytes per group of 4 rounds, t >= 80 uses the upper 16 bytes */
+#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF) /* 4*t bytes into the buffer being filled */
+
+
+.macro UPDATE_HASH  hash, val /* \val += \hash, then store the sum back to \hash */
+	add	\hash, \val
+	mov	\val, \hash
+.endm
+
+.macro PRECALC_RESET_WY /* bind the WY_nn aliases to their initial ymm registers */
+	.set WY_00, WY0
+	.set WY_04, WY4
+	.set WY_08, WY08
+	.set WY_12, WY12
+	.set WY_16, WY16
+	.set WY_20, WY20
+	.set WY_24, WY24
+	.set WY_28, WY28
+	.set WY_32, WY_00 /* scratch alias used by PRECALC_ROTATE_WY */
+.endm
+
+.macro PRECALC_ROTATE_WY
+	/* Rotate aliases: the register of WY_28 becomes WY_00, every other WY_nn moves up by 4 */
+	.set WY_32, WY_28
+	.set WY_28, WY_24
+	.set WY_24, WY_20
+	.set WY_20, WY_16
+	.set WY_16, WY_12
+	.set WY_12, WY_08
+	.set WY_08, WY_04
+	.set WY_04, WY_00
+	.set WY_00, WY_32
+
+	/* Aliases relative to WY, the vector the PRECALC macros write next */
+	.set WY, WY_00
+	.set WY_minus_04, WY_04
+	.set WY_minus_08, WY_08
+	.set WY_minus_12, WY_12
+	.set WY_minus_16, WY_16
+	.set WY_minus_20, WY_20
+	.set WY_minus_24, WY_24
+	.set WY_minus_28, WY_28
+	.set WY_minus_32, WY /* same register as WY */
+.endm
+
+.macro PRECALC_00_15
+	.if (i == 0) # Initialize and rotate registers
+		PRECALC_RESET_WY
+		PRECALC_ROTATE_WY
+	.endif
+
+	/* message scheduling pre-compute for rounds 0-15 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP /* 16 input bytes via BUFFER_PTR into the xmm register */
+	.elseif ((i & 7) == 1)
+		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+			 WY_TMP, WY_TMP /* 16 input bytes via BUFFER_PTR2 into the upper 128 bits */
+	.elseif ((i & 7) == 2)
+		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY /* reorder bytes with the BSWAP_SHUFB_CTL mask */
+	.elseif ((i & 7) == 4)
+		vpaddd  K_XMM(K_BASE), WY, WY_TMP /* add the constants at offset K_XMM of the K table */
+	.elseif ((i & 7) == 7)
+		vmovdqu  WY_TMP, PRECALC_WK(i&~7) /* store the sums for the rounds to read through WK() */
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_16_31
+	/*
+	 * message scheduling pre-compute for rounds 16-31
+	 * calculating last 32 w[i] values in 8 XMM registers
+	 * pre-calculate K+w[i] values and store to mem
+	 * for later load by ALU add instruction
+	 *
+	 * "brute force" vectorization for rounds 16-31 only
+	 * due to w[i]->w[i-3] dependency
+	 */
+	.if   ((i & 7) == 0)
+		/*
+		 * blended AVX2 and ALU instruction scheduling
+		 * 1 vector iteration per 8 rounds
+		 */
+		/* w[i-14] */
+		vpalignr	$8, WY_minus_16, WY_minus_12, WY
+		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
+	.elseif ((i & 7) == 1)
+		vpxor	WY_minus_08, WY, WY
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 2)
+		vpxor	WY_TMP, WY, WY
+		vpslldq	$12, WY, WY_TMP2
+	.elseif ((i & 7) == 3)
+		vpslld	$1, WY, WY_TMP /* this shift pair plus the vpor below rotate each dword left by 1 */
+		vpsrld	$31, WY, WY
+	.elseif ((i & 7) == 4)
+		vpor	WY, WY_TMP, WY_TMP
+		vpslld	$2, WY_TMP2, WY /* with the vpsrld $30 below: both halves of a rotate-left-by-2 of WY_TMP2 */
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY_TMP2, WY_TMP2
+		vpxor	WY, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 7)
+		vpxor	WY_TMP2, WY_TMP, WY
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP /* add the constants at offset K_XMM of the K table */
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7) /* store the sums for the rounds to read through WK() */
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC_32_79
+	/*
+	 * in SHA-1 specification:
+	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+	 * instead we do equal:
+	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	 * allows more efficient vectorization
+	 * since w[i]=>w[i-3] dependency is broken
+	 */
+
+	.if   ((i & 7) == 0)
+	/*
+	 * blended AVX2 and ALU instruction scheduling
+	 * 1 vector iteration per 8 rounds
+	 */
+		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
+	.elseif ((i & 7) == 1)
+		/* W is W_minus_32 before xor */
+		vpxor	WY_minus_28, WY, WY
+	.elseif ((i & 7) == 2)
+		vpxor	WY_minus_16, WY_TMP, WY_TMP
+	.elseif ((i & 7) == 3)
+		vpxor	WY_TMP, WY, WY
+	.elseif ((i & 7) == 4)
+		vpslld	$2, WY, WY_TMP /* with the two instructions below: rotate each dword left by 2 */
+	.elseif ((i & 7) == 5)
+		vpsrld	$30, WY, WY
+		vpor	WY, WY_TMP, WY
+	.elseif ((i & 7) == 7)
+		vpaddd	K_XMM(K_BASE), WY, WY_TMP /* add the constants at offset K_XMM of the K table */
+		vmovdqu	WY_TMP, PRECALC_WK(i&~7) /* store the sums for the rounds to read through WK() */
+
+		PRECALC_ROTATE_WY
+	.endif
+.endm
+
+.macro PRECALC r, s /* emit scheduling step \r (0..159); no caller in this file passes \s */
+	.set i, \r
+
+	.if (i < 40) /* pick the 32-byte row of K_XMM_AR used by this step */
+		.set K_XMM, 32*0
+	.elseif (i < 80)
+		.set K_XMM, 32*1
+	.elseif (i < 120)
+		.set K_XMM, 32*2
+	.else
+		.set K_XMM, 32*3
+	.endif
+
+	.if (i<32) /* the thresholds here are twice the round numbers in the sub-macro names */
+		PRECALC_00_15	\s
+	.elseif (i<64)
+		PRECALC_16_31	\s
+	.elseif (i < 160)
+		PRECALC_32_79	\s
+	.endif
+.endm
+
+.macro ROTATE_STATE /* re-bind names only, no data moves: new A=old E, B=old TB, C=old B, D=old C, E=old D, TB=old A */
+	.set T_REG, E
+	.set E, D
+	.set D, C
+	.set C, B
+	.set B, TB
+	.set TB, A
+	.set A, T_REG
+
+	.set T_REG, RE /* the same rotation for the 64-bit names */
+	.set RE, RD
+	.set RD, RC
+	.set RC, RB
+	.set RB, RTB
+	.set RTB, RA
+	.set RA, T_REG
+.endm
+
+/* Dispatch to ROUND_F1, ROUND_F2 or ROUND_F3 according to \f; RR passes the saved ROUND_FUNC */
+
+.macro RND_FUN f, r
+	.if (\f == RND_F1)
+		ROUND_F1	\r
+	.elseif (\f == RND_F2)
+		ROUND_F2	\r
+	.elseif (\f == RND_F3)
+		ROUND_F3	\r
+	.endif
+.endm
+
+.macro RR r /* emit two consecutive rounds, \r and \r+1 */
+	.set round_id, (\r % 80)
+
+	.if (round_id == 0)        /* Precalculate F for first round */
+		.set ROUND_FUNC, RND_F1
+		mov	B, TB
+
+		rorx	$(32-30), B, B    /* b>>>2 */
+		andn	D, TB, T1
+		and	C, TB
+		xor	T1, TB
+	.endif
+
+	RND_FUN ROUND_FUNC, \r
+	ROTATE_STATE
+
+	.if   (round_id == 18) /* the new function first runs in the odd round of this pair (19, 39, 59) */
+		.set ROUND_FUNC, RND_F2
+	.elseif (round_id == 38)
+		.set ROUND_FUNC, RND_F3
+	.elseif (round_id == 58)
+		.set ROUND_FUNC, RND_F2
+	.endif
+
+	.set round_id, ( (\r+1) % 80)
+
+	RND_FUN ROUND_FUNC, (\r+1)
+	ROTATE_STATE
+.endm
+
+.macro ROUND_F1 r /* one round that also prepares F for the following round */
+	add	WK(\r), E /* E += precalculated K+w value for round \r */
+
+	andn	C, A, T1			/* ~b&d */
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30),A, TB		/* b>>>2 for next round */
+
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	/*
+	 * Calculate F for the next round
+	 * (b & c) ^ andn[b, d]
+	 */
+	and	B, A			/* b&c */
+	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */
+
+	lea	(RE,RTA), E		/* E += A >>> 5 */
+.endm
+
+.macro ROUND_F2 r /* one round; the next-round F work is skipped when round_id is 79 */
+	add	WK(\r), E /* E += precalculated K+w value for round \r */
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	/* Calculate F for the next round */
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	.if ((round_id) < 79)
+		rorx	$(32-30), A, TB	/* b>>>2 for next round */
+	.endif
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	.if ((round_id) < 79)
+		xor	B, A
+	.endif
+
+	add	TA, E			/* E += A >>> 5 */
+
+	.if ((round_id) < 79)
+		xor	C, A
+	.endif
+.endm
+
+.macro ROUND_F3 r /* one round that also prepares F for the following round */
+	add	WK(\r), E /* E += precalculated K+w value for round \r */
+	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
+
+	lea	(RE,RTB), E		/* Add F from the previous round */
+
+	mov	B, T1
+	or	A, T1
+
+	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
+	rorx	$(32-30), A, TB		/* b>>>2 for next round */
+
+	/* Calculate F for the next round
+	 * (b and c) or (d and (b or c))
+	 */
+	and	C, T1
+	and	B, A
+	or	T1, A
+
+	add	TA, E			/* E += A >>> 5 */
+
+.endm
+
+/*
+ * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
+ */
+.macro SHA1_PIPELINED_MAIN_BODY
+
+	REGALLOC
+
+	mov	(HASH_PTR), A
+	mov	4(HASH_PTR), B
+	mov	8(HASH_PTR), C
+	mov	12(HASH_PTR), D
+	mov	16(HASH_PTR), E
+
+	mov	%rsp, PRECALC_BUF /* first K+w buffer starts at the bottom of the reserved stack area */
+	lea	(2*4*80+32)(%rsp), WK_BUF /* second K+w buffer, 672 bytes above the first */
+
+	# Precalc WK for first 2 blocks
+	PRECALC_OFFSET = 0
+	.set i, 0
+	.rept    160
+		PRECALC i
+		.set i, i + 1
+	.endr
+	PRECALC_OFFSET = 128
+	xchg	WK_BUF, PRECALC_BUF /* the rounds now read the buffer that was just filled */
+
+	.align 32
+_loop:
+	/*
+	 * code loops through more than one block
+	 * we use K_BASE value as a signal of a last block,
+	 * it is set below by: cmovae K_BASE, BUFFER_PTR
+	 */
+	cmp	K_BASE, BUFFER_PTR
+	jne	_begin
+	.align 32
+	jmp	_end
+	.align 32
+_begin:
+
+	/*
+	 * Do first block
+	 * rounds: 0,2,4,6,8
+	 */
+	.set j, 0
+	.rept 5
+		RR	j
+		.set j, j+2
+	.endr
+
+	jmp _loop0 /* target is the label on the next line */
+_loop0:
+
+	/*
+	 * rounds:
+	 * 10,12,14,16,18
+	 * 20,22,24,26,28
+	 * 30,32,34,36,38
+	 * 40,42,44,46,48
+	 * 50,52,54,56,58
+	 */
+	.rept 25
+		RR	j
+		.set j, j+2
+	.endr
+
+	add	$(2*64), BUFFER_PTR       /* move to next odd-64-byte block */
+	cmp	BUFFER_END, BUFFER_PTR    /* is current block the last one? */
+	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
+
+	/*
+	 * rounds
+	 * 60,62,64,66,68
+	 * 70,72,74,76,78
+	 */
+	.rept 10
+		RR	j
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
+	je	_loop
+
+	mov	TB, B
+
+	/* Process second block */
+	/*
+	 * rounds
+	 *  0+80, 2+80, 4+80, 6+80, 8+80
+	 * 10+80,12+80,14+80,16+80,18+80
+	 */
+
+	.set j, 0
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	jmp	_loop1
+_loop1:
+	/*
+	 * rounds
+	 * 20+80,22+80,24+80,26+80,28+80
+	 * 30+80,32+80,34+80,36+80,38+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	jmp	_loop2
+_loop2:
+
+	/*
+	 * rounds
+	 * 40+80,42+80,44+80,46+80,48+80
+	 * 50+80,52+80,54+80,56+80,58+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	add	$(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
+
+	cmp	BUFFER_END, BUFFER_PTR2   /* is current block the last one */
+	cmovae	K_BASE, BUFFER_PTR       /* signal the last iteration smartly (writes BUFFER_PTR, not BUFFER_PTR2) */
+
+	jmp	_loop3
+_loop3:
+
+	/*
+	 * rounds
+	 * 60+80,62+80,64+80,66+80,68+80
+	 * 70+80,72+80,74+80,76+80,78+80
+	 */
+	.rept 10
+		RR	j+80
+		.set j, j+2
+	.endr
+
+	UPDATE_HASH	(HASH_PTR), A
+	UPDATE_HASH	4(HASH_PTR), TB
+	UPDATE_HASH	8(HASH_PTR), C
+	UPDATE_HASH	12(HASH_PTR), D
+	UPDATE_HASH	16(HASH_PTR), E
+
+	/* Reset state for AVX2 reg permutation */
+	mov	A, TA
+	mov	TB, A
+	mov	C, TB
+	mov	E, C
+	mov	D, B
+	mov	TA, D
+
+	REGALLOC
+
+	xchg	WK_BUF, PRECALC_BUF /* swap the two K+w buffers for the next pair of blocks */
+
+	jmp	_loop
+
+	.align 32
+	_end:
+
+.endm
+
+.section .rodata
+
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+.align 128
+K_XMM_AR: /* four 32-byte rows, one per constant; PRECALC selects a row through the K_XMM offset */
+	.long K1, K1, K1, K1
+	.long K1, K1, K1, K1
+	.long K2, K2, K2, K2
+	.long K2, K2, K2, K2
+	.long K3, K3, K3, K3
+	.long K3, K3, K3, K3
+	.long K4, K4, K4, K4
+	.long K4, K4, K4, K4
+
+BSWAP_SHUFB_CTL: /* vpshufb mask loaded into YMM_SHUFB_BSWAP; the same 16 bytes are repeated for both 128-bit halves */
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x08090a0b
+	.long 0x0c0d0e0f
+
+# void sha1_transform(int32_t *hash, const char* input, size_t num_blocks) ;
+        .text
+	ENTRY(sha1_transform)
+
+	push	%rbx /* save the six registers restored by the pops before ret */
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	RESERVE_STACK  = (W_SIZE*4 + 8+24)
+
+	/* Align stack */
+	mov	%rsp, %rbx
+	and	$~(0x20-1), %rsp /* round %rsp down to a multiple of 32 */
+	push	%rbx /* keep the unaligned %rsp; it is popped back into %rsp on exit */
+	sub	$RESERVE_STACK, %rsp
+
+	avx2_zeroupper
+
+	lea	K_XMM_AR(%rip), K_BASE
+
+	mov	CTX, HASH_PTR
+	mov	BUF, BUFFER_PTR
+	lea	64(BUF), BUFFER_PTR2
+
+	shl	$6, CNT			/* mul by 64 */
+	add	BUF, CNT
+	add	$64, CNT
+	mov	CNT, BUFFER_END /* BUFFER_END = BUF + 64*num_blocks + 64 */
+
+	cmp	BUFFER_END, BUFFER_PTR2
+	cmovae	K_BASE, BUFFER_PTR2 /* NOTE(review): only taken when num_blocks is 0; with num_blocks == 1 the PRECALC loads via BUFFER_PTR2 read BUF+64..BUF+127 - confirm callers keep that readable */
+
+	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
+
+	SHA1_PIPELINED_MAIN_BODY
+
+	avx2_zeroupper
+
+	add	$RESERVE_STACK, %rsp
+	pop	%rsp /* restore the unaligned stack pointer saved above */
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+
+	ENDPROC(sha1_transform)
+#endif
--- a/pkg/crypto/sha1/sha1_sse3_amd64.asm
+++ b/pkg/crypto/sha1/sha1_sse3_amd64.asm
@@ -0,0 +1,579 @@
+;---------------------
+;   https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+;
+; License information:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  This implementation notably advances the performance of SHA-1 algorithm compared to existing
+;  implementations. We are encouraging all projects utilizing SHA-1 to integrate this new fast
+;  implementation and are ready to help if issues or concerns arise (you are welcome to leave
+;  a comment or write an email to the authors). It is provided 'as is' and free for either
+;  commercial or non-commercial use.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+;   This code implements two interfaces of SHA-1 update function: 1) working on a single
+;   64-byte block and 2) working on a buffer of multiple 64-byte blocks. Multiple blocks
+;   version of code is software pipelined and faster overall, it is a default. Assemble
+;   with -DINTEL_SHA1_SINGLEBLOCK to select single 64-byte block function interface.
+;
+;   C++ prototypes of implemented functions are below:
+;
+;   #ifndef INTEL_SHA1_SINGLEBLOCK
+;      // Updates 20-byte SHA-1 record in 'hash' for 'num_blocks' consecutive 64-byte blocks
+;      extern "C" void sha1_update_intel(int *hash, const char* input, size_t num_blocks );
+;   #else
+;      // Updates 20-byte SHA-1 record in 'hash' for one 64-byte block pointed by 'input'
+;      extern "C" void sha1_update_intel(int *hash, const char* input);
+;   #endif
+;
+;   Function name 'sha1_update_intel' can be changed in the source or via macro:
+;     -DINTEL_SHA1_UPDATE_FUNCNAME=my_sha1_update_func_name
+;
+;   It implements both UNIX(default) and Windows ABIs, use -DWIN_ABI on Windows
+;
+;   Code checks CPU for SSSE3 support via CPUID feature flag (CPUID.1.ECX.SSSE3[bit 9]==1),
+;   and performs dispatch. Since in most cases the functionality on non-SSSE3 supporting CPUs
+;   is also required, the default (e.g. one being replaced) function can be provided for
+;   dispatch on such CPUs, the name of old function can be changed in the source or via macro:
+;      -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=default_sha1_update_function_name
+;
+;   Authors: Maxim Locktyukhin and Ronen Zohar at Intel.com
+;
+
+;; Routine selected by the dispatcher when the CPU does not report SSSE3.
+;; Unless the build overrides it, this is the UD2 stub defined at the end of
+;; this file; an overriding name is declared as an external symbol instead.
+%ifndef INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
+ ;; can be replaced with a default SHA-1 update function name
+%define INTEL_SHA1_UPDATE_DEFAULT_DISPATCH  sha1_intel_non_ssse3_cpu_stub_
+%else
+extern  INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
+%endif
+
+;; provide alternative SHA-1 update function's name here
+%ifndef INTEL_SHA1_UPDATE_FUNCNAME
+%define INTEL_SHA1_UPDATE_FUNCNAME     sha1_update_intel
+%endif
+
+;; the public entry point is the only symbol exported from this file
+global INTEL_SHA1_UPDATE_FUNCNAME
+
+
+;; multiblock selects which interface SHA1_VECTOR_ASM generates:
+;; 1 = several 64-byte blocks per call (default), 0 = exactly one block
+%ifndef INTEL_SHA1_SINGLEBLOCK
+%assign multiblock 1
+%else
+%assign multiblock 0
+%endif
+
+
+;; 64-bit code; memory operands are RIP-relative by default
+bits 64
+default rel
+
+;; Registers carrying the first three integer arguments: rcx/rdx/r8 when
+;; WIN_ABI is defined, rdi/rsi/rdx otherwise.
+%ifdef WIN_ABI
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+%endif
+
+;; ctx = pointer to the hash state, buf = input data, cnt = block count
+;; (cnt is only read by the multi-block variant)
+%xdefine ctx arg1
+%xdefine buf arg2
+%xdefine cnt arg3
+
+;; REGALLOC binds the five SHA-1 working variables and the two scratch
+;; registers to 32-bit general purpose registers. Several of them alias the
+;; argument registers, which is why SHA1_VECTOR_ASM copies ctx/buf/cnt into
+;; HASH_PTR/BUFFER_PTR/BUFFER_END before the main body runs.
+%macro REGALLOC 0
+ %xdefine A ecx
+ %xdefine B esi
+ %xdefine C edi
+ %xdefine D ebp
+ %xdefine E edx
+
+ %xdefine T1 eax
+ %xdefine T2 ebx
+%endmacro
+
+;; Pointer registers that stay fixed while the rounds run:
+;;   K_BASE     - address of the K_XMM_AR constant table (also the
+;;                end-of-input sentinel value for BUFFER_PTR)
+;;   HASH_PTR   - the five-word hash state being updated
+;;   BUFFER_PTR - block whose message schedule is currently being loaded
+;;   BUFFER_END - first byte past the input (multi-block variant only)
+%xdefine K_BASE     r8
+%xdefine HASH_PTR   r9
+%xdefine BUFFER_PTR r10
+%xdefine BUFFER_END r11
+
+;; vector temporaries used by the message-schedule macros
+%xdefine W_TMP  xmm0
+%xdefine W_TMP2 xmm9
+
+;; eight XMM registers, four schedule words each; W_PRECALC_RESET and
+;; W_PRECALC_ROTATE bind them to the sliding-window names W, W_minus_04, ...
+%xdefine W0  xmm1
+%xdefine W4  xmm2
+%xdefine W8  xmm3
+%xdefine W12 xmm4
+%xdefine W16 xmm5
+%xdefine W20 xmm6
+%xdefine W24 xmm7
+%xdefine W28 xmm8
+
+;; holds the byte-swap shuffle mask (bswap_shufb_ctl)
+%xdefine XMM_SHUFB_BSWAP xmm10
+
+;; we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a
+;; circular buffer at the bottom of the stack frame; WK(t) addresses the
+;; slot for round t
+%xdefine WK(t) (rsp + (t & 15)*4)
+
+;------------------------------------------------------------------------------
+;
+; macro implements SHA-1 function's body for single or several 64-byte blocks
+; first param: function's name
+; second param: =0 - function implements single 64-byte block hash
+;               =1 - function implements multiple 64-byte blocks hash
+;                    3rd function's argument is a number, greater 0, of 64-byte blocks to calc hash for
+;
+; The generated function saves the registers it uses, reserves a stack frame
+; whose first 64 bytes are the WK() circular buffer, runs
+; SHA1_PIPELINED_MAIN_BODY and restores everything before returning.
+;
+%macro  SHA1_VECTOR_ASM  2
+align 4096
+%1:
+ ;; rbx (T2) and rbp (D) are used as working registers by the rounds
+ push rbx
+ push rbp
+
+ %ifdef WIN_ABI
+ ;; rdi (C) and rsi (B) are saved as well in the Windows build
+ push rdi
+ push rsi
+
+ ;; 64 bytes of WK buffer + 5 saved XMM registers + 8
+ %xdefine stack_size (16*4 + 16*5 + 8)
+ %else
+ ;; 64 bytes of WK buffer + 8
+ ;; NOTE(review): the extra 8 presumably restores 16-byte stack alignment
+ ;; for the movdqa accesses - confirm
+ %xdefine stack_size (16*4 + 8)
+ %endif
+
+ sub     rsp, stack_size
+
+ %ifdef WIN_ABI
+ ;; xmm6-xmm10 are overwritten below (W20, W24, W28, W_TMP2 and the shuffle
+ ;; mask), so they are spilled just above the WK buffer
+ %xdefine xmm_save_base (rsp + 16*4)
+
+ xmm_mov [xmm_save_base + 0*16], xmm6
+ xmm_mov [xmm_save_base + 1*16], xmm7
+ xmm_mov [xmm_save_base + 2*16], xmm8
+ xmm_mov [xmm_save_base + 3*16], xmm9
+ xmm_mov [xmm_save_base + 4*16], xmm10
+ %endif
+
+ ;; move the arguments out of the registers that REGALLOC reuses
+ mov     HASH_PTR, ctx
+ mov     BUFFER_PTR, buf
+
+ %if (%2 == 1)
+ ;; BUFFER_END = buf + cnt*64
+ shl     cnt, 6           ;; mul by 64
+ add     cnt, buf
+ mov     BUFFER_END, cnt
+ %endif
+
+ lea     K_BASE, [K_XMM_AR]
+ xmm_mov XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
+
+ SHA1_PIPELINED_MAIN_BODY %2
+
+ %ifdef WIN_ABI
+ xmm_mov xmm6, [xmm_save_base + 0*16]
+ xmm_mov xmm7, [xmm_save_base + 1*16]
+ xmm_mov xmm8, [xmm_save_base + 2*16]
+ xmm_mov xmm9, [xmm_save_base + 3*16]
+ xmm_mov xmm10,[xmm_save_base + 4*16]
+ %endif
+
+ add rsp, stack_size
+
+ %ifdef WIN_ABI
+ pop rsi
+ pop rdi
+ %endif
+
+ pop rbp
+ pop rbx
+
+ ret
+%endmacro
+
+;--------------------------------------------
+; macro implements 80 rounds of SHA-1, for one 64-byte block or multiple blocks with s/w pipelining
+; macro param: =0 - process single 64-byte block
+;              =1 - multiple blocks
+;
+; The message schedule runs W_PRECALC_AHEAD rounds ahead of the scalar
+; rounds: the first 16 schedule steps are emitted up front, and every RR
+; emits the schedule work for a later pair of rounds. In the multi-block
+; variant the last rounds of a block already load the next block.
+;
+%macro SHA1_PIPELINED_MAIN_BODY 1
+
+ REGALLOC
+
+ ;; load the current hash state into the working registers
+ mov A, [HASH_PTR   ]
+ mov B, [HASH_PTR+ 4]
+ mov C, [HASH_PTR+ 8]
+ mov D, [HASH_PTR+12]
+
+ mov E, [HASH_PTR+16]
+
+ ;; schedule steps 0..W_PRECALC_AHEAD-1 of the first block
+ %assign i 0
+ %rep    W_PRECALC_AHEAD
+ W_PRECALC i
+ %assign i i+1
+ %endrep
+
+ ;; F names the round function that RR expands; it changes every 20 rounds
+ %xdefine F F1
+
+ %if (%1 == 1)                         ;; code loops through more than one block
+ %%_loop:
+ cmp BUFFER_PTR, K_BASE          ;; we use K_BASE value as a signal of a last block,
+ jne %%_begin                    ;; it is set below by: cmovae BUFFER_PTR, K_BASE
+ jmp %%_end
+
+ align 32
+ %%_begin:
+ %endif
+ ;; rounds 0-19; the register arguments rotate from one RR to the next, so
+ ;; no register moves are needed between rounds
+ RR A,B,C,D,E,0
+ RR D,E,A,B,C,2
+ RR B,C,D,E,A,4
+ RR E,A,B,C,D,6
+ RR C,D,E,A,B,8
+
+ RR A,B,C,D,E,10
+ RR D,E,A,B,C,12
+ RR B,C,D,E,A,14
+ RR E,A,B,C,D,16
+ RR C,D,E,A,B,18
+
+ ;; rounds 20-39
+ %xdefine F F2
+
+ RR A,B,C,D,E,20
+ RR D,E,A,B,C,22
+ RR B,C,D,E,A,24
+ RR E,A,B,C,D,26
+ RR C,D,E,A,B,28
+
+ RR A,B,C,D,E,30
+ RR D,E,A,B,C,32
+ RR B,C,D,E,A,34
+ RR E,A,B,C,D,36
+ RR C,D,E,A,B,38
+
+ ;; rounds 40-59
+ %xdefine F F3
+
+ RR A,B,C,D,E,40
+ RR D,E,A,B,C,42
+ RR B,C,D,E,A,44
+ RR E,A,B,C,D,46
+ RR C,D,E,A,B,48
+
+ RR A,B,C,D,E,50
+ RR D,E,A,B,C,52
+ RR B,C,D,E,A,54
+ RR E,A,B,C,D,56
+ RR C,D,E,A,B,58
+
+ ;; rounds 60-79
+ %xdefine F F4
+
+ ;; From round 64 on, W_PRECALC loads schedule words for the next block
+ ;; through BUFFER_PTR. After the last block BUFFER_PTR is redirected to
+ ;; K_BASE, so those loads read the constant table rather than memory past
+ ;; the end of the input.
+ %if (%1 == 1)                         ;; if code loops through more than one block
+ add   BUFFER_PTR, 64            ;; move to next 64-byte block
+ cmp   BUFFER_PTR, BUFFER_END    ;; check if current block is the last one
+ cmovae BUFFER_PTR, K_BASE       ;; smart way to signal the last iteration
+ %else
+ %xdefine W_NO_TAIL_PRECALC 1    ;; no software pipelining for single block interface
+ %endif
+
+ RR A,B,C,D,E,60
+ RR D,E,A,B,C,62
+ RR B,C,D,E,A,64
+ RR E,A,B,C,D,66
+ RR C,D,E,A,B,68
+
+ RR A,B,C,D,E,70
+ RR D,E,A,B,C,72
+ RR B,C,D,E,A,74
+ RR E,A,B,C,D,76
+ RR C,D,E,A,B,78
+
+ ;; add the working registers into the stored state; each register keeps
+ ;; the new state word, ready for the next block
+ UPDATE_HASH [HASH_PTR   ],A
+ UPDATE_HASH [HASH_PTR+ 4],B
+ UPDATE_HASH [HASH_PTR+ 8],C
+ UPDATE_HASH [HASH_PTR+12],D
+ UPDATE_HASH [HASH_PTR+16],E
+
+ %if (%1 == 1)
+ jmp %%_loop
+
+ align 32
+ %%_end:
+ %endif
+
+
+ ;; restore the defaults for any later expansion; F is pointed at the
+ ;; error directive, presumably so that a stray use fails to assemble
+ %xdefine W_NO_TAIL_PRECALC 0
+ %xdefine F %error
+
+%endmacro
+
+
+;; F1: round function for rounds 0-19.
+;; With arguments (b, c, d): T1 = ((c xor d) and b) xor d, which selects the
+;; bits of c where b is set and the bits of d elsewhere.
+%macro F1 3
+ mov T1,%2
+ xor T1,%3
+ and T1,%1
+ xor T1,%3
+%endmacro
+
+;; F2: round function for rounds 20-39 (and, through F4, rounds 60-79).
+;; With arguments (b, c, d): T1 = b xor c xor d.
+%macro F2 3
+ mov T1,%3
+ xor T1,%2
+ xor T1,%1
+%endmacro
+
+;; F3: round function for rounds 40-59.
+;; With arguments (b, c, d): T1 = ((b or c) and d) or (b and c), the bitwise
+;; majority of the three inputs. Uses T2 as a second scratch register.
+%macro F3 3
+ mov T1,%2
+ mov T2,%1
+ or  T1,%1
+ and T2,%2
+ and T1,%3
+ or  T1,T2
+%endmacro
+
+;; rounds 60-79 use the same xor function as rounds 20-39
+%define F4 F2
+
+;; UPDATE_HASH mem, reg: adds the stored hash word to reg and writes the sum
+;; back, leaving the updated word in both the register and memory.
+%macro UPDATE_HASH 2
+ add %2, %1
+ mov %1, %2
+%endmacro
+
+
+;; W_PRECALC emits one quarter of a four-word message-schedule step for
+;; schedule index i (the macro argument), so that vector work is interleaved
+;; with the scalar rounds. It picks the byte offset of the round constant
+;; in the K table (K_XMM) and then dispatches on i:
+;;   0-15 and 80..80+W_PRECALC_AHEAD-1 : load, byte-swap and add K for a
+;;          fresh block (indices of 80 and above wrap to the next block;
+;;          skipped when W_NO_TAIL_PRECALC is non-zero)
+;;   16-31 : W_PRECALC_16_31
+;;   32-79 : W_PRECALC_32_79
+;; Note that W_PRECALC_00_15 adds the constant at offset 0 directly and does
+;; not use K_XMM.
+%macro W_PRECALC 1
+ %xdefine i (%1)
+
+ %if (i < 20)
+ %xdefine K_XMM  0
+ %elif (i < 40)
+ %xdefine K_XMM  16
+ %elif (i < 60)
+ %xdefine K_XMM  32
+ %else
+ %xdefine K_XMM  48
+ %endif
+
+ %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
+
+ %if (W_NO_TAIL_PRECALC == 0)
+
+ %xdefine i ((%1) % 80)        ;; pre-compute for the next iteration
+
+ ;; start of a block: rebind the window names to their initial registers
+ %if (i == 0)
+ W_PRECALC_RESET
+ %endif
+
+
+ W_PRECALC_00_15
+ %endif
+
+ %elif (i < 32)
+ W_PRECALC_16_31
+ %elif (i < 80)   ;; rounds 32-79
+ W_PRECALC_32_79
+ %endif
+%endmacro
+
+;; W_PRECALC_RESET binds the sliding-window names to their initial XMM
+;; registers. W and W_minus_32 start out naming the same register (W0).
+%macro W_PRECALC_RESET 0
+ %xdefine    W             W0
+ %xdefine    W_minus_04    W4
+ %xdefine    W_minus_08    W8
+ %xdefine    W_minus_12    W12
+ %xdefine    W_minus_16    W16
+ %xdefine    W_minus_20    W20
+ %xdefine    W_minus_24    W24
+ %xdefine    W_minus_28    W28
+ %xdefine    W_minus_32    W
+%endmacro
+
+;; W_PRECALC_ROTATE advances the window by four schedule words. It only
+;; renames registers at assembly time and emits no instructions: every name
+;; takes over the register of its newer neighbour, and W ends up naming the
+;; register that was W_minus_28 before the rotation.
+%macro W_PRECALC_ROTATE 0
+ %xdefine    W_minus_32    W_minus_28
+ %xdefine    W_minus_28    W_minus_24
+ %xdefine    W_minus_24    W_minus_20
+ %xdefine    W_minus_20    W_minus_16
+ %xdefine    W_minus_16    W_minus_12
+ %xdefine    W_minus_12    W_minus_08
+ %xdefine    W_minus_08    W_minus_04
+ %xdefine    W_minus_04    W
+ %xdefine    W             W_minus_32
+%endmacro
+
+;; number of rounds by which the message schedule runs ahead of the rounds
+%xdefine W_PRECALC_AHEAD   16
+;; non-zero suppresses loading the next block at the end of the rounds
+;; (set by the single-block variant of SHA1_PIPELINED_MAIN_BODY)
+%xdefine W_NO_TAIL_PRECALC 0
+
+
+;; 16-byte move used for the shuffle mask and the XMM save area; movdqa
+;; requires 16-byte aligned memory operands
+%xdefine xmm_mov            movdqa
+
+;; W_PRECALC_00_15 handles schedule words 0-15, which come straight from
+;; the input block. Over four consecutive values of i it loads 16 input
+;; bytes (unaligned load), byte-swaps them, keeps the words in W, adds the
+;; first round constant and stores the four sums to the WK buffer.
+%macro W_PRECALC_00_15 0
+ ;; message scheduling pre-compute for rounds 0-15
+ %if ((i & 3) == 0)       ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqu W_TMP, [BUFFER_PTR + (i * 4)]
+ %elif ((i & 3) == 1)
+ pshufb W_TMP, XMM_SHUFB_BSWAP
+ movdqa W, W_TMP
+ %elif ((i & 3) == 2)
+ paddd  W_TMP, [K_BASE]
+ %elif ((i & 3) == 3)
+ movdqa  [WK(i&~3)], W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+;; W_PRECALC_16_31 computes four schedule words per group of four i values,
+;; keeps them in W and stores w[i]+K to the WK buffer. The work is split
+;; into four slices selected by (i & 3); the last slice also rotates the
+;; register window.
+%macro W_PRECALC_16_31 0
+ ;; message scheduling pre-compute for rounds 16-31
+ ;; calculating last 32 w[i] values in 8 XMM registers
+ ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
+ ;;
+ ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
+ ;;
+ %if ((i & 3) == 0)    ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa  W, W_minus_12
+ palignr W, W_minus_16, 8       ;; w[i-14]
+ movdqa  W_TMP, W_minus_04
+ psrldq  W_TMP, 4               ;; w[i-3]
+ pxor    W, W_minus_08
+ %elif ((i & 3) == 1)
+ pxor    W_TMP, W_minus_16
+ pxor    W, W_TMP
+ movdqa  W_TMP2, W
+ movdqa  W_TMP, W
+ pslldq  W_TMP2, 12
+ %elif ((i & 3) == 2)
+ ;; W_TMP = rotate-left-by-1 of each word (shift pair combined with por)
+ psrld   W, 31
+ pslld   W_TMP, 1
+ por     W_TMP, W
+ ;; W and W_TMP2 are derived from the copy shifted up by 12 bytes and are
+ ;; xor-ed into the result in the next slice
+ movdqa  W, W_TMP2
+ psrld   W_TMP2, 30
+ pslld   W, 2
+ %elif ((i & 3) == 3)
+ pxor    W_TMP, W
+ pxor    W_TMP, W_TMP2
+ movdqa  W, W_TMP
+ paddd   W_TMP, [K_BASE + K_XMM]
+ movdqa  [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+;; W_PRECALC_32_79 computes four schedule words per group of four i values
+;; using the rewritten recurrence described below, keeps them in W and
+;; stores w[i]+K to the WK buffer. The work is split into four slices
+;; selected by (i & 3); the last slice also rotates the register window.
+%macro W_PRECALC_32_79 0
+ ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
+ ;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+ ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
+ ;;
+ %if ((i & 3) == 0)    ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa  W_TMP, W_minus_04
+ pxor    W, W_minus_28         ;; W is W_minus_32 before xor
+ palignr W_TMP, W_minus_08, 8
+ %elif ((i & 3) == 1)
+ pxor    W, W_minus_16
+ pxor    W, W_TMP
+ movdqa  W_TMP, W
+ %elif ((i & 3) == 2)
+ ;; rotate each word left by 2 (shift pair combined with por)
+ psrld   W, 30
+ pslld   W_TMP, 2
+ por     W_TMP, W
+ %elif ((i & 3) == 3)
+ movdqa  W, W_TMP
+ paddd   W_TMP, [K_BASE + K_XMM]
+ movdqa  [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+;; RR a, b, c, d, e, round
+;; The first five arguments are the registers currently holding the SHA-1
+;; working variables; the sixth is the (even) index of the first of the two
+;; rounds. The result of the first round is written to the e register and
+;; the result of the second to the d register, while b and a are rotated
+;; left by 30 in place. The caller therefore passes the registers in
+;; rotated order on the next invocation instead of moving values around.
+%macro RR 6             ;; RR does two rounds of SHA-1 back to back with W pre-calculation
+
+ ;;     TEMP = A
+ ;;     A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
+ ;;     C = ROTATE_LEFT( B, 30 )
+ ;;     D = C
+ ;;     E = D
+ ;;     B = TEMP
+
+ ;; first round: e += F(b,c,d) + (w[i]+K) + rol(a,5); b = rol(b,30)
+ ;; (the w[i+1]+K term of the second round is added to d early)
+ W_PRECALC (%6 + W_PRECALC_AHEAD)
+ F    %2, %3, %4     ;; F returns result in T1
+ add  %5, [WK(%6)]
+ rol  %2, 30
+ mov  T2, %1
+ add  %4, [WK(%6 + 1)]
+ rol  T2, 5
+ add  %5, T1
+
+ ;; second round: d += F(a, b, c) + rol(e,5), using the updated b and e;
+ ;; then a = rol(a,30)
+ W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
+ add  T2, %5
+ mov  %5, T2
+ rol  T2, 5
+ add  %4, T2
+ F    %1, %2, %3    ;; F returns result in T1
+ add  %4, T1
+ rol  %1, 30
+
+;; write:  %1, %2
+;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
+%endmacro
+
+
+
+;;----------------------
+section .data align=128
+
+;; the four SHA-1 round constants, one per group of 20 rounds
+%xdefine K1 0x5a827999
+%xdefine K2 0x6ed9eba1
+%xdefine K3 0x8f1bbcdc
+%xdefine K4 0xca62c1d6
+
+;; Round-constant table: one 16-byte row (four copies) per constant, so the
+;; K_XMM offsets 0/16/32/48 select K1..K4. K_BASE holds the address of this
+;; table.
+align 128
+K_XMM_AR:
+ DD K1, K1, K1, K1
+ DD K2, K2, K2, K2
+ DD K3, K3, K3, K3
+ DD K4, K4, K4, K4
+
+;; pshufb control that reverses the byte order within each 32-bit word
+align 16
+bswap_shufb_ctl:
+ DD 00010203h
+ DD 04050607h
+ DD 08090a0bh
+ DD 0c0d0e0fh
+
+;; dispatch pointer, points to the init routine for the first invocation
+;; (it is overwritten by sha1_update_intel_dispacth_init_, which is why it
+;; lives in the writable data section)
+sha1_update_intel_dispatched:
+ DQ  sha1_update_intel_init_
+
+;;----------------------
+section .text align=4096
+
+;; instantiate the SSSE3 implementation (multi-block unless
+;; INTEL_SHA1_SINGLEBLOCK was defined)
+SHA1_VECTOR_ASM     sha1_update_intel_ssse3_, multiblock
+
+;; Public entry point and first-call trampoline. Every call jumps through
+;; the dispatch pointer. On the first call the pointer targets
+;; sha1_update_intel_init_, which runs the CPUID probe, then falls through
+;; into the entry point again and jumps to the routine just selected.
+align 32
+sha1_update_intel_init_:       ;; we get here with the first time invocation
+ call    sha1_update_intel_dispacth_init_
+INTEL_SHA1_UPDATE_FUNCNAME:    ;; we get here after init
+ jmp     qword [sha1_update_intel_dispatched]
+
+;; CPUID feature flag based dispatch
+;; Stores the address of the SSSE3 implementation in the dispatch pointer
+;; when CPUID leaf 1 reports SSSE3, and the address of the default routine
+;; otherwise. All registers it touches (including those written by CPUID)
+;; are saved and restored, so the caller's argument registers survive.
+sha1_update_intel_dispacth_init_:
+ push    rax
+ push    rbx
+ push    rcx
+ push    rdx
+ push    rsi
+
+ ;; start with the fallback; replaced below if SSSE3 is available
+ lea     rsi, [INTEL_SHA1_UPDATE_DEFAULT_DISPATCH]
+
+ mov     eax, 1
+ cpuid
+
+ test    ecx, 0200h          ;; SSSE3 support, CPUID.1.ECX[bit 9]
+ jz      _done
+
+ lea     rsi, [sha1_update_intel_ssse3_]
+
+_done:
+ mov     [sha1_update_intel_dispatched], rsi
+
+ pop     rsi
+ pop     rdx
+ pop     rcx
+ pop     rbx
+ pop     rax
+ ret
+
+;;----------------------
+;; in the case a default SHA-1 update function implementation was not provided
+;; and code was invoked on a non-SSSE3 supporting CPU, dispatch handles this
+;; failure in a safest way - jumps to the stub function with UD2 instruction below
+;; (UD2 raises an invalid-opcode exception, so the ret after it is not
+;; expected to execute)
+sha1_intel_non_ssse3_cpu_stub_:
+ ud2     ;; in the case no default SHA-1 was provided non-SSSE3 CPUs safely fail here
+ ret
+
+; END
+;----------------------
--- a/pkg/crypto/sha1/sha1_test.go
+++ b/pkg/crypto/sha1/sha1_test.go
@@ -0,0 +1,154 @@
+/*
+ * Minio Cloud Storage, (C) 2015-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// SHA1 hash algorithm.  See RFC 3174.
+
+package sha1
+
+import (
+	"crypto/rand"
+	"fmt"
+	"io"
+	"testing"
+)
+
+// sha1Test pairs a message with its expected SHA-1 digest. The field order
+// matters: the golden table below uses positional literals.
+type sha1Test struct {
+	out string // expected digest in lowercase hex
+	in  string // message to hash
+}
+
+// golden lists known-answer vectors as {expected hex digest, input} pairs.
+// TestGolden runs every entry through both Sum and the streaming interface.
+var golden = []sha1Test{
+	{"da39a3ee5e6b4b0d3255bfef95601890afd80709", ""},
+	{"86f7e437faa5a7fce15d1ddcb9eaeaea377667b8", "a"},
+	{"da23614e02469a0d7c7bd1bdab5c9c474b1904dc", "ab"},
+	{"a9993e364706816aba3e25717850c26c9cd0d89d", "abc"},
+	{"81fe8bfe87576c3ecb22426f8e57847382917acf", "abcd"},
+	{"03de6c570bfe24bfc328ccd7ca46b76eadaf4334", "abcde"},
+	{"1f8ac10f23c5b5bc1167bda84b833e5c057a77d2", "abcdef"},
+	{"2fb5e13419fc89246865e7a324f476ec624e8740", "abcdefg"},
+	{"425af12a0743502b322e93a015bcf868e324d56a", "abcdefgh"},
+	{"c63b19f1e4c8b5f76b25c49b8b87f57d8e4872a1", "abcdefghi"},
+	{"d68c19a0a345b7eab78d5e11e991c026ec60db63", "abcdefghij"},
+	{"ebf81ddcbe5bf13aaabdc4d65354fdf2044f38a7", "Discard medicine more than two years old."},
+	{"e5dea09392dd886ca63531aaa00571dc07554bb6", "He who has a shady past knows that nice guys finish last."},
+	{"45988f7234467b94e3e9494434c96ee3609d8f8f", "I wouldn't marry him with a ten foot pole."},
+	{"55dee037eb7460d5a692d1ce11330b260e40c988", "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
+	{"b7bc5fb91080c7de6b582ea281f8a396d7c0aee8", "The days of the digital watch are numbered.  -Tom Stoppard"},
+	{"c3aed9358f7c77f523afe86135f06b95b3999797", "Nepal premier won't resign."},
+	{"6e29d302bf6e3a5e4305ff318d983197d6906bb9", "For every action there is an equal and opposite government program."},
+	{"597f6a540010f94c15d71806a99a2c8710e747bd", "His money is twice tainted: 'taint yours and 'taint mine."},
+	{"6859733b2590a8a091cecf50086febc5ceef1e80", "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
+	{"514b2630ec089b8aee18795fc0cf1f4860cdacad", "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
+	{"c5ca0d4a7b6676fc7aa72caa41cc3d5df567ed69", "size:  a.out:  bad magic"},
+	{"74c51fa9a04eadc8c1bbeaa7fc442f834b90a00a", "The major problem is with sendmail.  -Mark Horton"},
+	{"0b4c4ce5f52c3ad2821852a8dc00217fa18b8b66", "Give me a rock, paper and scissors and I will move the world.  CCFestoon"},
+	{"3ae7937dd790315beb0f48330e8642237c61550a", "If the enemy is within range, then so are you."},
+	{"410a2b296df92b9a47412b13281df8f830a9f44b", "It's well we cannot hear the screams/That we create in others' dreams."},
+	{"841e7c85ca1adcddbdd0187f1289acb5c642f7f5", "You remind me of a TV show, but that's all right: I watch it anyway."},
+	{"163173b825d03b952601376b25212df66763e1db", "C is as portable as Stonehedge!!"},
+	{"32b0377f2687eb88e22106f133c586ab314d5279", "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
+	{"0885aaf99b569542fd165fa44e322718f4a984e0", "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction.  Lewis-Randall Rule"},
+	{"6627d6904d71420b0bf3886ab629623538689f45", "How can you write a big system without C++?  -Paul Glick"},
+}
+
+// TestGolden checks every golden vector against the one-shot Sum function
+// and against a single reused hash in three passes: two plain writes of the
+// whole input, then one write split in half with an intermediate Sum call,
+// which must not disturb the running state. The hash is Reset between
+// passes.
+func TestGolden(t *testing.T) {
+	for _, tc := range golden {
+		if got := fmt.Sprintf("%x", Sum([]byte(tc.in))); got != tc.out {
+			t.Fatalf("Sum function: sha1(%s) = %s want %s", tc.in, got, tc.out)
+		}
+
+		h := New()
+		half := len(tc.in) / 2
+		for pass := 0; pass < 3; pass++ {
+			switch pass {
+			case 2:
+				io.WriteString(h, tc.in[:half])
+				h.Sum(nil)
+				io.WriteString(h, tc.in[half:])
+			default:
+				io.WriteString(h, tc.in)
+			}
+			if got := fmt.Sprintf("%x", h.Sum(nil)); got != tc.out {
+				t.Fatalf("sha1[%d](%s) = %s want %s", pass, tc.in, got, tc.out)
+			}
+			h.Reset()
+		}
+	}
+}
+
+// TestSize verifies that a new hash reports the package-level Size.
+func TestSize(t *testing.T) {
+	got := New().Size()
+	if got == Size {
+		return
+	}
+	t.Errorf("Size = %d; want %d", got, Size)
+}
+
+// TestBlockSize verifies that a new hash reports the package-level BlockSize.
+func TestBlockSize(t *testing.T) {
+	got := New().BlockSize()
+	if got == BlockSize {
+		return
+	}
+	t.Errorf("BlockSize = %d; want %d", got, BlockSize)
+}
+
+// TestBlockGeneric checks that blockGeneric (pure Go) and block (whichever
+// implementation this build selects) leave a digest in the same state after
+// consuming the same random input.
+func TestBlockGeneric(t *testing.T) {
+	gen, asm := New().(*digest), New().(*digest)
+	buf := make([]byte, BlockSize*20) // arbitrary factor
+	// Fail loudly rather than silently comparing the two implementations
+	// on an unfilled buffer.
+	if _, err := rand.Read(buf); err != nil {
+		t.Fatalf("rand.Read: %v", err)
+	}
+	blockGeneric(gen, buf)
+	block(asm, buf)
+	if *gen != *asm {
+		t.Error("block and blockGeneric resulted in different states")
+	}
+}
+
+// bench is the hash reused by every benchmark, and buf is the shared input
+// buffer; each benchmark hashes a prefix of buf.
+var bench = New()
+var buf = make([]byte, 1024*1024)
+
+// benchmarkSize times hashing the first size bytes of buf with the shared
+// bench hash. The digest is appended into a scratch slice that is reused on
+// every iteration.
+func benchmarkSize(b *testing.B, size int) {
+	scratch := make([]byte, bench.Size())
+	b.SetBytes(int64(size))
+	for iter := 0; iter != b.N; iter++ {
+		bench.Reset()
+		bench.Write(buf[:size])
+		bench.Sum(scratch[:0])
+	}
+}
+
+// BenchmarkHash8Bytes measures hashing an 8-byte input.
+func BenchmarkHash8Bytes(b *testing.B) {
+	benchmarkSize(b, 8)
+}
+
+// BenchmarkHash1K measures hashing a 1024-byte input.
+func BenchmarkHash1K(b *testing.B) {
+	benchmarkSize(b, 1024)
+}
+
+// BenchmarkHash8K measures hashing an 8192-byte input.
+func BenchmarkHash8K(b *testing.B) {
+	benchmarkSize(b, 8192)
+}
+
+// BenchmarkHash1M measures hashing the whole 1 MiB buffer.
+func BenchmarkHash1M(b *testing.B) {
+	benchmarkSize(b, 1024*1024)
+}
--- a/pkg/crypto/sha1/sha1_yasm_darwin.go
+++ b/pkg/crypto/sha1/sha1_yasm_darwin.go
@@ -0,0 +1,21 @@
+// +build darwin,amd64
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+//go:generate yasm -f macho64 -DINTEL_SHA1_UPDATE_FUNCNAME=_sha1_update_intel sha1_sse3_amd64.asm -o sha1_sse3_amd64.syso
--- a/pkg/crypto/sha1/sha1_yasm_linux.go
+++ b/pkg/crypto/sha1/sha1_yasm_linux.go
@@ -0,0 +1,21 @@
+// +build linux,amd64
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+//go:generate yasm -f elf64 sha1_sse3_amd64.asm -o sha1_sse3_amd64.syso
--- a/pkg/crypto/sha1/sha1_yasm_windows.go
+++ b/pkg/crypto/sha1/sha1_yasm_windows.go
@@ -0,0 +1,21 @@
+// +build windows,amd64
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+//go:generate yasm -f win64 -DWIN_ABI=1 sha1_sse3_amd64.asm -o sha1_sse3_amd64.syso
--- a/pkg/crypto/sha1/sha1block.go
+++ b/pkg/crypto/sha1/sha1block.go
@@ -0,0 +1,43 @@
+// +build amd64,cgo
+// +build darwin windows
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+// #include <stdint.h>
+// #include <stdlib.h>
+// void sha1_update_intel(int32_t *hash, const char* input, size_t num_blocks);
+import "C"
+import (
+	"unsafe"
+
+	"github.com/minio/minio/pkg/cpu"
+)
+
+// block folds the complete blocks of p into dig, using the assembly routine
+// when the CPU reports SSE4.1 and the pure Go implementation otherwise.
+func block(dig *digest, p []byte) {
+	if cpu.HasSSE41() {
+		blockSSE3(dig, p)
+		return
+	}
+	blockGeneric(dig, p)
+}
+
+// blockSSE3 folds the complete chunk-sized blocks of p into dig.h by calling
+// the sha1_update_intel assembly routine. Any trailing partial block is
+// ignored. When p holds no complete block the call is skipped, matching
+// blockGeneric: taking &p[0] of an empty slice would panic, and the
+// assembly documents its block count as greater than zero.
+func blockSSE3(dig *digest, p []byte) {
+	blocks := len(p) / chunk
+	if blocks == 0 {
+		return
+	}
+	C.sha1_update_intel((*C.int32_t)(unsafe.Pointer(&dig.h[0])), (*C.char)(unsafe.Pointer(&p[0])), C.size_t(blocks))
+}
--- a/pkg/crypto/sha1/sha1block_generic.go
+++ b/pkg/crypto/sha1/sha1block_generic.go
@@ -0,0 +1,110 @@
+/*
+ * Minio Cloud Storage, (C) 2015-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+package sha1
+
+// Additive round constants; blockGeneric uses _K0 for rounds 0-19, _K1 for
+// rounds 20-39, _K2 for rounds 40-59 and _K3 for rounds 60-79.
+const (
+	_K0 = 0x5A827999
+	_K1 = 0x6ED9EBA1
+	_K2 = 0x8F1BBCDC
+	_K3 = 0xCA62C1D6
+)
+
+// blockGeneric is the portable, pure Go implementation of the SHA-1 block
+// step. It folds every complete chunk-sized block of p into dig.h and
+// ignores a trailing partial block. The platform-specific block functions
+// fall back to it, and the tests use it as the reference implementation.
+func blockGeneric(dig *digest, p []byte) {
+	// sched is a 16-word circular window over the message schedule.
+	var sched [16]uint32
+
+	s0, s1, s2, s3, s4 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4]
+	for ; len(p) >= chunk; p = p[chunk:] {
+		// Load the block as sixteen big-endian 32-bit words.
+		for n := range sched {
+			off := n * 4
+			sched[n] = uint32(p[off])<<24 | uint32(p[off+1])<<16 | uint32(p[off+2])<<8 | uint32(p[off+3])
+		}
+
+		a, b, c, d, e := s0, s1, s2, s3, s4
+
+		// Rounds 0-15: the schedule words come straight from the block.
+		for r := 0; r < 16; r++ {
+			t := (a<<5 | a>>27) + (b&c | ^b&d) + e + sched[r] + _K0
+			e, d, c, b, a = d, c, b<<30|b>>2, a, t
+		}
+
+		// Rounds 16-19: same round function, but from here on every
+		// schedule word is derived from earlier ones in the window.
+		for r := 16; r < 20; r++ {
+			x := sched[(r-3)&0xf] ^ sched[(r-8)&0xf] ^ sched[(r-14)&0xf] ^ sched[r&0xf]
+			sched[r&0xf] = x<<1 | x>>31
+			t := (a<<5 | a>>27) + (b&c | ^b&d) + e + sched[r&0xf] + _K0
+			e, d, c, b, a = d, c, b<<30|b>>2, a, t
+		}
+
+		// Rounds 20-39: xor of b, c and d.
+		for r := 20; r < 40; r++ {
+			x := sched[(r-3)&0xf] ^ sched[(r-8)&0xf] ^ sched[(r-14)&0xf] ^ sched[r&0xf]
+			sched[r&0xf] = x<<1 | x>>31
+			t := (a<<5 | a>>27) + (b ^ c ^ d) + e + sched[r&0xf] + _K1
+			e, d, c, b, a = d, c, b<<30|b>>2, a, t
+		}
+
+		// Rounds 40-59: bitwise majority of b, c and d.
+		for r := 40; r < 60; r++ {
+			x := sched[(r-3)&0xf] ^ sched[(r-8)&0xf] ^ sched[(r-14)&0xf] ^ sched[r&0xf]
+			sched[r&0xf] = x<<1 | x>>31
+			t := (a<<5 | a>>27) + ((b|c)&d | b&c) + e + sched[r&0xf] + _K2
+			e, d, c, b, a = d, c, b<<30|b>>2, a, t
+		}
+
+		// Rounds 60-79: xor again, with the last constant.
+		for r := 60; r < 80; r++ {
+			x := sched[(r-3)&0xf] ^ sched[(r-8)&0xf] ^ sched[(r-14)&0xf] ^ sched[r&0xf]
+			sched[r&0xf] = x<<1 | x>>31
+			t := (a<<5 | a>>27) + (b ^ c ^ d) + e + sched[r&0xf] + _K3
+			e, d, c, b, a = d, c, b<<30|b>>2, a, t
+		}
+
+		// Fold the working variables back into the running state.
+		s0, s1, s2, s3, s4 = s0+a, s1+b, s2+c, s3+d, s4+e
+	}
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4] = s0, s1, s2, s3, s4
+}
--- a/pkg/crypto/sha1/sha1block_linux.go
+++ b/pkg/crypto/sha1/sha1block_linux.go
@@ -0,0 +1,50 @@
+// +build linux,amd64,cgo
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+// #cgo CFLAGS: -DHAS_AVX2
+// #include <stdint.h>
+// #include <stdlib.h>
+// void sha1_transform(int32_t *hash, const char* input, size_t num_blocks);
+// void sha1_update_intel(int32_t *hash, const char* input, size_t num_blocks);
+import "C"
+import (
+	"unsafe"
+
+	"github.com/minio/minio/pkg/cpu"
+)
+
+// block folds the complete blocks of p into dig, preferring the AVX2
+// routine, then the SSSE3 routine (selected when the CPU reports SSE4.1),
+// and finally the pure Go implementation.
+func block(dig *digest, p []byte) {
+	if cpu.HasAVX2() {
+		blockAVX2(dig, p)
+		return
+	}
+	if cpu.HasSSE41() {
+		blockSSE3(dig, p)
+		return
+	}
+	blockGeneric(dig, p)
+}
+
+// blockAVX2 folds the complete chunk-sized blocks of p into dig.h by calling
+// the sha1_transform assembly routine. Any trailing partial block is
+// ignored. When p holds no complete block the call is skipped, matching
+// blockGeneric: taking &p[0] of an empty slice would panic, and the
+// assembly should not be entered with a block count of zero.
+func blockAVX2(dig *digest, p []byte) {
+	blocks := len(p) / chunk
+	if blocks == 0 {
+		return
+	}
+	C.sha1_transform((*C.int32_t)(unsafe.Pointer(&dig.h[0])), (*C.char)(unsafe.Pointer(&p[0])), C.size_t(blocks))
+}
+
+// blockSSE3 folds the complete chunk-sized blocks of p into dig.h by calling
+// the sha1_update_intel assembly routine. Any trailing partial block is
+// ignored. When p holds no complete block the call is skipped, matching
+// blockGeneric: taking &p[0] of an empty slice would panic, and the
+// assembly documents its block count as greater than zero.
+func blockSSE3(dig *digest, p []byte) {
+	blocks := len(p) / chunk
+	if blocks == 0 {
+		return
+	}
+	C.sha1_update_intel((*C.int32_t)(unsafe.Pointer(&dig.h[0])), (*C.char)(unsafe.Pointer(&p[0])), C.size_t(blocks))
+}
--- a/pkg/crypto/sha1/sha1block_nocgo.go
+++ b/pkg/crypto/sha1/sha1block_nocgo.go
@@ -0,0 +1,23 @@
+// +build !cgo arm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha1
+
+// block always uses the pure Go implementation in builds selected by this
+// file's build constraint (no cgo, or arm).
+// NOTE(review): cgo builds on other OS/architecture pairs are not obviously
+// covered by any of the block definitions in this package - confirm.
+func block(dig *digest, p []byte) {
+	blockGeneric(dig, p)
+}
--- a/pkg/crypto/sha256/LICENSE
+++ b/pkg/crypto/sha256/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/pkg/crypto/sha256/sha256-avx-asm_linux_amd64.S
+++ b/pkg/crypto/sha256/sha256-avx-asm_linux_amd64.S
@@ -0,0 +1,759 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+# Using this part of Minio codebase under the license
+# Apache License Version 2.0 with modifications
+##
+
+#ifdef HAS_AVX
+# ENTRY(name) makes name a global symbol, aligns the location counter
+# (.align 4 with fill byte 0x90) and emits the label. It is only defined
+# here when no earlier definition of ENTRY exists.
+#ifndef ENTRY
+#define ENTRY(name) \
+        .globl name             ; \
+        .align 4,0x90           ; \
+        name:
+#endif
+
+# END(name) records the symbol size as the distance from the label to the
+# current location.
+#ifndef END
+#define END(name) \
+        .size name, .-name
+#endif
+
+# ENDPROC(name) marks name as a function symbol and then applies END(name).
+#ifndef ENDPROC
+#define ENDPROC(name) \
+        .type name, @function   ; \
+        END(name)
+#endif
+
+# Value the R32_NUM / R64_NUM / XMM_NUM macros leave in their output symbol
+# when the operand matches none of the register names they compare against.
+#define NUM_INVALID		100
+
+# Register class codes assigned by the TYPE macro.
+#define TYPE_R32		0
+#define TYPE_R64		1
+#define TYPE_XMM		2
+#define TYPE_INVALID	100
+
+	# R32_NUM opd r32
+	# Set the assembler symbol opd to the number associated with the
+	# 32-bit register name r32 (%eax = 0 through %edi = 7), or leave it at
+	# NUM_INVALID when r32 matches none of the names compared below.
+	# The %r8d through %r15d comparisons are assembled only when the
+	# preprocessor symbol X86_64 is defined; nothing in this file defines it.
+	.macro R32_NUM opd r32
+	\opd = NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+#ifdef X86_64
+	.ifc \r32,%r8d
+	\opd = 8
+	.endif
+	.ifc \r32,%r9d
+	\opd = 9
+	.endif
+	.ifc \r32,%r10d
+	\opd = 10
+	.endif
+	.ifc \r32,%r11d
+	\opd = 11
+	.endif
+	.ifc \r32,%r12d
+	\opd = 12
+	.endif
+	.ifc \r32,%r13d
+	\opd = 13
+	.endif
+	.ifc \r32,%r14d
+	\opd = 14
+	.endif
+	.ifc \r32,%r15d
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# R64_NUM opd r64
+	# Set the assembler symbol opd to the number associated with the
+	# 64-bit register name r64 (%rax = 0 through %r15 = 15), or leave it at
+	# NUM_INVALID when nothing matches.
+	# Every comparison sits inside the X86_64 preprocessor guard, so when
+	# X86_64 is not defined the macro always yields NUM_INVALID.
+	# NOTE(review): nothing in this file defines X86_64 - confirm the build
+	# supplies it wherever this macro is relied upon.
+	.macro R64_NUM opd r64
+	\opd = NUM_INVALID
+#ifdef X86_64
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# XMM_NUM opd xmm
+	# Set the assembler symbol opd to the index of the XMM register named
+	# by xmm (%xmm0 = 0 through %xmm15 = 15), or leave it at NUM_INVALID
+	# when xmm matches none of those names.
+	.macro XMM_NUM opd xmm
+	\opd = NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+
+	# TYPE type reg
+	# Classify the register name reg and store the class in the symbol
+	# type. reg is run through R32_NUM, R64_NUM and XMM_NUM; the first
+	# lookup that is not NUM_INVALID wins, tested in the order 64-bit,
+	# 32-bit, XMM. The result is TYPE_R64, TYPE_R32, TYPE_XMM, or
+	# TYPE_INVALID when no lookup recognises reg.
+	# Side effect: overwrites reg_type_r32, reg_type_r64 and reg_type_xmm.
+	.macro TYPE type reg
+	R32_NUM reg_type_r32 \reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> NUM_INVALID
+	\type = TYPE_R64
+	.elseif reg_type_r32 <> NUM_INVALID
+	\type = TYPE_R32
+	.elseif reg_type_xmm <> NUM_INVALID
+	\type = TYPE_XMM
+	.else
+	\type = TYPE_INVALID
+	.endif
+	.endm
+
+	# PFX_OPD_SIZE
+	# Emit the single prefix byte 0x66 (the x86 operand-size prefix).
+	.macro PFX_OPD_SIZE
+	.byte 0x66
+	.endm
+
+	# PFX_REX opd1 opd2 W=0
+	# Emit a REX prefix byte when one is needed, i.e. when either register
+	# number has bit 3 set (register 8 or above) or W is non-zero.
+	# Byte layout: 0x40, bit 0 taken from bit 3 of opd1, bit 2 taken from
+	# bit 3 of opd2, bit 3 taken from W. Nothing is emitted otherwise.
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+	.endif
+	.endm
+
+	# MODRM mod opd1 opd2
+	# Emit one ModRM byte: mod is OR-ed in as given (callers pass 0xc0),
+	# the low three bits of opd1 go to bits 0-2 and the low three bits of
+	# opd2 go to bits 3-5.
+	.macro MODRM mod opd1 opd2
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+
+	# PSHUFB_XMM xmm1 xmm2
+	# Hand-encode a register-to-register instruction as raw bytes:
+	# 0x66, optional REX, 0x0f 0x38 0x00, then a ModRM byte with mod 0xc0,
+	# xmm1 in the low (r/m) field and xmm2 in the reg field.
+	# Side effect: overwrites the symbols pshufb_opd1 and pshufb_opd2.
+	# This macro is not invoked anywhere in this file.
+	.macro PSHUFB_XMM xmm1 xmm2
+	XMM_NUM pshufb_opd1 \xmm1
+	XMM_NUM pshufb_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX pshufb_opd1 pshufb_opd2
+	.byte 0x0f, 0x38, 0x00
+	MODRM 0xc0 pshufb_opd1 pshufb_opd2
+	.endm
+
+	# PCLMULQDQ imm8 xmm1 xmm2
+	# Hand-encode a register-to-register instruction as raw bytes:
+	# 0x66, optional REX, 0x0f 0x3a 0x44, a ModRM byte with mod 0xc0
+	# (xmm1 in the r/m field, xmm2 in the reg field), then the immediate
+	# byte imm8.
+	# Side effect: overwrites the symbols clmul_opd1 and clmul_opd2.
+	# This macro is not invoked anywhere in this file.
+	.macro PCLMULQDQ imm8 xmm1 xmm2
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8
+	.endm
+
+	# PEXTRD imm8 xmm gpr
+	# Hand-encode a register-to-register instruction as raw bytes:
+	# 0x66, optional REX, 0x0f 0x3a 0x16, a ModRM byte with mod 0xc0
+	# (the 32-bit register gpr in the r/m field, xmm in the reg field),
+	# then the immediate byte imm8.
+	# gpr is resolved with R32_NUM, so %r8d-%r15d are only recognised when
+	# X86_64 is defined.
+	# Side effect: overwrites the symbols extrd_opd1 and extrd_opd2.
+	# This macro is not invoked anywhere in this file.
+	.macro PEXTRD imm8 xmm gpr
+	R32_NUM extrd_opd1 \gpr
+	XMM_NUM extrd_opd2 \xmm
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8
+	.endm
+
+	# MOVQ_R64_XMM opd1 opd2
+	# Hand-encode a 64-bit move between a general register and an XMM
+	# register, in either direction. TYPE decides which operand is the XMM
+	# register:
+	#   opd1 is XMM  -> opcode bytes 0x0f 0x7e
+	#   otherwise    -> opcode bytes 0x0f 0x6e
+	# Emitted bytes: 0x66, REX with W forced to 1, the opcode, then a
+	# ModRM byte with mod 0xc0.
+	# The general register is resolved with R64_NUM, which yields
+	# NUM_INVALID unless X86_64 is defined.
+	# Side effect: overwrites movq_r64_xmm_opd1_type, movq_r64_xmm_opd1 and
+	# movq_r64_xmm_opd2. This macro is not invoked anywhere in this file.
+	.macro MOVQ_R64_XMM opd1 opd2
+	TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
+
+## assume buffers not aligned
+#define    VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add the memory operand p1 into the register p2, then store the register
+# back to p1. Afterwards both the register and the memory location hold
+# the sum.
+.macro addm p1 p2
+	add     \p1, \p2
+	mov     \p2, \p1
+.endm
+
+
+# MY_ROR count, reg
+# Rotate reg right by count bits (callers in this file pass 32-bit
+# registers). Implemented as a double shift left by 32-count with reg as
+# both source and destination, which yields the same result as the rotate.
+.macro MY_ROR p1 p2
+	shld    $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load 16 bytes from mem into xmm with VMOVDQ (defined above as vmovdqu,
+# so mem need not be aligned), then permute the bytes of xmm with vpshufb
+# using byte_flip_mask as the control, which byte swaps each dword.
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+# Message schedule registers; each holds four consecutive dwords of W.
+# rotate_Xs renames these symbols after every four rounds.
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+# Scratch vector registers used while computing new schedule words.
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+# XFER carries the sum of round constants and schedule words on its way to
+# the _XFER stack slot.
+XFER = %xmm9
+XTMP5 = %xmm11
+
+# Shuffle control masks, loaded once from the data section at function entry.
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+# Incoming arguments.
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+# Working state a..h plus loop bookkeeping. Note the register sharing:
+# SRND reuses the INP register and e is the low half of the NUM_BLKS
+# register, so the function spills INP and the end pointer to the stack
+# before using them.
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %rbp
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+# Scalar scratch registers for the round function.
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+# Sizes in bytes of the stack frame slots.
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+# Offsets of the slots from the aligned %rsp. With the sizes above:
+# _INP_END = 0, _INP = 8, _XFER = 16, _XMM_SAVE = 32, STACK_SIZE = 32.
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+# This is an assembly-time renaming only; no instruction is emitted.
+# Afterwards X0 names the register X1 named before, X1 the old X2, X2 the
+# old X3, and X3 the old X0. X_ is used as the temporary symbol.
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+# This is an assembly-time renaming only; no instruction is emitted.
+# Afterwards a names the register h named before, b the old a, c the old b,
+# and so on down to h naming the old g. TMP_ is used as the temporary symbol.
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+# FOUR_ROUNDS_AND_SCHED
+# Run four rounds of the compression function on a..h, interleaved with
+# the vector code that derives the next four message schedule words.
+#
+# Inputs:  a..h hold the working state; X0..X3 hold the last sixteen
+#          schedule words; the four dwords at _XFER(%rsp) hold k + w for
+#          the four rounds (stored by the caller just before the macro).
+# Outputs: the four new schedule words are written to the register named
+#          X0 on entry, which rotate_Xs renames to X3; a..h are renamed by
+#          ROTATE_ARGS after each round, four times in total.
+# Clobbers y0, y1, y2 and XTMP0..XTMP5.
+#
+# Comment notation: on the scalar registers (a, e, y0, y1) the >> sign
+# stands for a 32-bit rotate right, since those steps are done with MY_ROR.
+# On the vector registers >> and << are plain shifts and MY_ROR n marks a
+# rotate. The scalar and vector instruction streams are independent of each
+# other and are interleaved only for scheduling.
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+
+	mov     e, y0			# y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 ^= W[-15] << (32-18)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
+	add     y0, y2                  # y2 = S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + %1 * 4]
+## DO_ROUND round
+## Run one round of the compression function on a..h with no message
+## scheduling. The k + w value for the round is read from the dword at
+## offset round * 4 inside the _XFER stack slot. Clobbers y0, y1, y2,
+## redefines the symbol named offset, and ends by renaming a..h with
+## ROTATE_ARGS. In the comments below >> stands for a 32-bit rotate right
+## (the steps are done with MY_ROR).
+.macro DO_ROUND round
+	mov	e, y0			# y0 = e
+        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+        mov     a, y1                   # y1 = a
+        xor     e, y0                   # y0 = e ^ (e >> (25-11))
+        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+        mov     f, y2                   # y2 = f
+        xor     a, y1                   # y1 = a ^ (a >> (22-13))
+        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+        xor     g, y2                   # y2 = f^g
+        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+        and     e, y2                   # y2 = (f^g)&e
+        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+        add     y0, y2                  # y2 = S1 + CH
+        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+        offset = \round * 4 + _XFER     # stack offset of this round's k + w
+        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
+        mov     a, y0			# y0 = a
+        add     y2, h                   # h = h + S1 + CH + k + w
+        mov     a, y2                   # y2 = a
+        or      c, y0                   # y0 = a|c
+        add     h, d                    # d = d + h + S1 + CH + k + w
+        and     c, y2                   # y2 = a&c
+        and     b, y0                   # y0 = (a|c)&b
+        add     y1, h                   # h = h + S1 + CH + k + w + S0
+        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+        ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+##
+## Processes num_blks consecutive 64-byte blocks starting at input_data and
+## accumulates the result into the eight dwords at digest. Returns without
+## touching digest when num_blks is zero. The input is loaded with
+## unaligned moves, so input_data needs no particular alignment.
+##
+## Registers rbx, rbp, r12-r15 are saved on entry and restored on exit;
+## r12 keeps the original stack pointer while rsp is realigned.
+########################################################################
+.text
+ENTRY(sha256_transform_avx)
+.align 32
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	mov	%rsp, %r12		# remember rsp for the epilogue
+	subq    $STACK_SIZE, %rsp	# allocate stack space
+	and	$~15, %rsp		# align stack pointer
+
+	shl     $6, NUM_BLKS		# convert to bytes
+	jz      done_hash		# zero blocks: nothing to do
+	add     INP, NUM_BLKS		# pointer to end of data
+	mov     NUM_BLKS, _INP_END(%rsp)	# spill: e shares the NUM_BLKS register
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	## load the shuffle masks once; they stay live for every block
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+	## loop0: one iteration per 64-byte input block
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)		# spill: SRND shares the INP register
+
+	## schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov     $3, SRND
+.align 16
+	## loop1: 16 rounds with message scheduling per iteration
+loop1:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  1*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  2*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  3*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add	$4*16, TBL		# advance to the next 16 constants
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
+	## loop2: the last 16 rounds, 8 per iteration, no scheduling needed
+	mov     $2, SRND
+loop2:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vpaddd  1*16(TBL), X1, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	## second iteration consumes the words held in X2 and X3
+	vmovdqa X2, X0
+	vmovdqa X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	## add the working state into the digest; a..h keep the new digest
+	## values, ready for the next block
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP		# reload the input pointer
+	add     $64, INP		# advance one block
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	mov	%r12, %rsp		# drop the aligned frame
+
+	popq	%r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+	ret
+ENDPROC(sha256_transform_avx)
+
+.data
+.align 64
+# K256: table of 64 dword constants. The transform function walks it 16
+# bytes at a time through TBL and adds each group of four constants to
+# four message schedule words. The 64-byte alignment here together with
+# the 256-byte table size keeps the three masks that follow 16-byte
+# aligned, which the vmovdqa loads of those masks require.
+K256:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+# vpshufb control that reverses the byte order inside each dword
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+# (dwords 0 and 2 of the source move to dwords 0 and 1; the 0xFF control
+# bytes zero the upper half)
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+# (dwords 0 and 2 of the source move to dwords 2 and 3; the 0xFF control
+# bytes zero the lower half)
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif
--- a/pkg/crypto/sha256/sha256-avx2-asm_linux_amd64.S
+++ b/pkg/crypto/sha256/sha256-avx2-asm_linux_amd64.S
--- a/pkg/crypto/sha256/sha256-ssse3-asm_linux_amd64.S
+++ b/pkg/crypto/sha256/sha256-ssse3-asm_linux_amd64.S
@@ -0,0 +1,772 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+#
+# Using this part of Minio codebase under the license
+# Apache License Version 2.0 with modifications
+##
+
+#ifdef HAS_SSE41
+# Fallback definitions of the ENTRY, END and ENDPROC helpers; each one is
+# only defined when the build has not already supplied it.
+# ENTRY makes the symbol global, aligns with 0x90 fill bytes and places
+# the label.
+#ifndef ENTRY
+#define ENTRY(name) \
+        .globl name             ; \
+        .align 4,0x90           ; \
+        name:
+#endif
+
+# END records the symbol size as the distance from its label to this point.
+#ifndef END
+#define END(name) \
+        .size name, .-name
+#endif
+
+# ENDPROC marks the symbol as a function and then applies END.
+#ifndef ENDPROC
+#define ENDPROC(name) \
+        .type name, @function   ; \
+        END(name)
+#endif
+
+# Sentinel assigned by R32_NUM, R64_NUM and XMM_NUM when the operand is not
+# a register of the class being looked up.
+#define NUM_INVALID		100
+
+# Operand class codes assigned by the TYPE macro.
+#define TYPE_R32		0
+#define TYPE_R64		1
+#define TYPE_XMM		2
+#define TYPE_INVALID	100
+
+	# R32_NUM opd r32
+	# Set the assembler symbol opd to the encoding number (0-15) of the
+	# 32-bit register named by r32, or to NUM_INVALID when r32 matches none
+	# of the names below.  The %r8d-%r15d names are only compared when
+	# X86_64 is defined at preprocessing time.
+	.macro R32_NUM opd r32
+	\opd = NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+#ifdef X86_64
+	.ifc \r32,%r8d
+	\opd = 8
+	.endif
+	.ifc \r32,%r9d
+	\opd = 9
+	.endif
+	.ifc \r32,%r10d
+	\opd = 10
+	.endif
+	.ifc \r32,%r11d
+	\opd = 11
+	.endif
+	.ifc \r32,%r12d
+	\opd = 12
+	.endif
+	.ifc \r32,%r13d
+	\opd = 13
+	.endif
+	.ifc \r32,%r14d
+	\opd = 14
+	.endif
+	.ifc \r32,%r15d
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# R64_NUM opd r64
+	# Set the assembler symbol opd to the encoding number (0-15) of the
+	# 64-bit register named by r64, or to NUM_INVALID when nothing matches.
+	# All comparisons sit inside the X86_64 conditional, so without that
+	# define the result is always NUM_INVALID.
+	.macro R64_NUM opd r64
+	\opd = NUM_INVALID
+#ifdef X86_64
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# XMM_NUM opd xmm
+	# Set the assembler symbol opd to the register number (0-15) of the
+	# SSE register named by xmm, or to NUM_INVALID when xmm is not one of
+	# %xmm0-%xmm15.
+	.macro XMM_NUM opd xmm
+	\opd = NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+
+	# TYPE type reg
+	# Classify the register operand reg: the symbol type becomes TYPE_R64,
+	# TYPE_R32 or TYPE_XMM according to which lookup macro recognises it
+	# (checked in that order), or TYPE_INVALID when none does.  The symbols
+	# reg_type_r32, reg_type_r64 and reg_type_xmm are overwritten as scratch.
+	.macro TYPE type reg
+	R32_NUM reg_type_r32 \reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> NUM_INVALID
+	\type = TYPE_R64
+	.elseif reg_type_r32 <> NUM_INVALID
+	\type = TYPE_R32
+	.elseif reg_type_xmm <> NUM_INVALID
+	\type = TYPE_XMM
+	.else
+	\type = TYPE_INVALID
+	.endif
+	.endm
+
+	# PFX_OPD_SIZE
+	# Emit the 0x66 operand-size prefix byte.
+	.macro PFX_OPD_SIZE
+	.byte 0x66
+	.endm
+
+	# PFX_REX opd1 opd2 W=0
+	# Emit a REX prefix byte, but only when one of the operand numbers is 8
+	# or above or W is non-zero.  Bit 3 of opd1 lands in REX bit 0, bit 3 of
+	# opd2 in REX bit 2, and W in REX bit 3.
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+	.endif
+	.endm
+
+	# MODRM mod opd1 opd2
+	# Emit a ModRM byte.  mod is passed already shifted into bits 6-7 (the
+	# callers use 0xc0), the low three bits of opd1 fill bits 0-2 and the
+	# low three bits of opd2 fill bits 3-5.
+	.macro MODRM mod opd1 opd2
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+
+	# PSHUFB_XMM xmm1 xmm2
+	# Emit the register-to-register pshufb encoding (66 [REX] 0F 38 00 /r)
+	# as raw bytes.  xmm1 is placed in the ModRM r/m field and xmm2 in the
+	# reg field.
+	.macro PSHUFB_XMM xmm1 xmm2
+	XMM_NUM pshufb_opd1 \xmm1
+	XMM_NUM pshufb_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX pshufb_opd1 pshufb_opd2
+	.byte 0x0f, 0x38, 0x00
+	MODRM 0xc0 pshufb_opd1 pshufb_opd2
+	.endm
+
+	# PCLMULQDQ imm8 xmm1 xmm2
+	# Emit the register-to-register pclmulqdq encoding
+	# (66 [REX] 0F 3A 44 /r ib) as raw bytes.  xmm1 is placed in the ModRM
+	# r/m field, xmm2 in the reg field, and imm8 follows as the last byte.
+	.macro PCLMULQDQ imm8 xmm1 xmm2
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8
+	.endm
+
+	# PEXTRD imm8 xmm gpr
+	# Emit the register form of pextrd (66 [REX] 0F 3A 16 /r ib) as raw
+	# bytes.  The 32-bit register gpr is placed in the ModRM r/m field, xmm
+	# in the reg field, and imm8 follows as the last byte.
+	.macro PEXTRD imm8 xmm gpr
+	R32_NUM extrd_opd1 \gpr
+	XMM_NUM extrd_opd2 \xmm
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8
+	.endm
+
+	# MOVQ_R64_XMM opd1 opd2
+	# Emit a 64-bit movq between an xmm register and a general register as
+	# raw bytes: 66 REX.W 0F 7E when opd1 is an xmm register, otherwise
+	# 66 REX.W 0F 6E, followed by a register-direct ModRM byte.  In both
+	# cases opd1 goes to the ModRM r/m field and opd2 to the reg field.
+	# NOTE(review): the operand-to-field mapping for the 0F 7E form has not
+	# been checked against the instruction reference - confirm before use.
+	.macro MOVQ_R64_XMM opd1 opd2
+	TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
+
+## Input buffers are not assumed to be 16-byte aligned, so the load used by
+## COPY_XMM_AND_BSWAP is the unaligned form.
+#define    MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add the memory operand p1 into the register p2, then store the sum back
+# to p1, so that both the register and the memory location hold mem + reg.
+.macro addm p1 p2
+        add     \p1, \p2
+        mov     \p2, \p1
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load 16 bytes from mem (p2) into xmm (p1) with MOVDQ, then shuffle the
+# bytes of xmm through the mask register p3, which byte swaps each dword
+# when p3 holds PSHUFFLE_BYTE_FLIP_MASK.
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+        MOVDQ \p2, \p1
+        pshufb \p3, \p1
+.endm
+
+################################
+
+# Message schedule registers: together they hold 16 schedule dwords, four
+# per register.  rotate_Xs renames them after every four rounds.
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+# Vector scratch registers used while computing new schedule dwords.
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9            # K + W for four rounds, spilled to _XFER(%rsp)
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+# Working variables a..h.  ROTATE_ARGS renames them after every round.
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx          # low half of NUM_BLKS, which is saved to the stack first
+TBL = %rbp        # pointer into K256
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+# Scalar scratch registers for the round function.
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+# Stack frame: sizes of the individual slots in bytes.
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 16
+_XMM_SAVE_SIZE = 0
+
+# Slot offsets from the aligned %rsp:
+#   _INP_END   pointer to the end of the input data
+#   _INP       saved input pointer (its register is reused as SRND)
+#   _XFER      four K + W dwords consumed by the next four rounds
+#   _XMM_SAVE  empty here, since _XMM_SAVE_SIZE is 0
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+# Only the assembler symbol bindings change; no instruction is emitted.
+# Afterwards X0 names the register that was X1, X1 the old X2, X2 the old
+# X3 and X3 the old X0.  X_ is a temporary symbol.
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+# Only the assembler symbol bindings change; no instruction is emitted.
+# Afterwards a names the register that was h, b the old a, c the old b and
+# so on down to h naming the old g.  TMP_ is a temporary symbol.
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+# FOUR_ROUNDS_AND_SCHED
+# Run four SHA-256 rounds using the four K + W dwords stored at
+# _XFER(%rsp), interleaved with the vector computation of the next four
+# message schedule dwords from X0..X3.  The new dwords are written to X0,
+# and the a..h and X0..X3 symbols are rotated at the end so the macro can
+# be invoked again unchanged.
+# In the scalar comments the >> symbol stands for a 32-bit rotate right
+# (the instructions are ror); in the vector comments it is a plain shift.
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+	movdqa  X3, XTMP0
+	mov     e, y0			# y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	movdqa  X1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp) , y2        # y2 = k + w + S1 + CH
+	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pslld   $(32-7), XTMP1          # XTMP1 = W[-15] << (32-7)
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	psrld   $7, XTMP2               # XTMP2 = W[-15] >> 7
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	pslld   $(32-18), XTMP3         # XTMP3 = W[-15] << (32-18)
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	psrld   $18, XTMP2              # XTMP2 = W[-15] >> 18
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pxor    XTMP4, XTMP1            # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
+	and     b, y0			# y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP2
+	add     y0, y2                  # y2 = S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
+	mov     e, y0                   # y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	pxor    XTMP3, XTMP2            #
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, X0               # X0 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## DO_ROUND round
+## Run one SHA-256 round without any message scheduling.
+## input is [rsp + _XFER + %1 * 4], i.e. the K + W dword number round
+## (0..3) of the _XFER stack slot.  The a..h symbols are rotated at the end.
+## In the comments the >> symbol stands for a 32-bit rotate right (ror).
+.macro DO_ROUND round
+	mov     e, y0                 # y0 = e
+	ror     $(25-11), y0          # y0 = e >> (25-11)
+	mov     a, y1                 # y1 = a
+	xor     e, y0                 # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1          # y1 = a >> (22-13)
+	mov     f, y2                 # y2 = f
+	xor     a, y1                 # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                 # y2 = f^g
+	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and     e, y2                 # y2 = (f^g)&e
+	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
+	add     y0, y2                # y2 = S1 + CH
+	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	offset = \round * 4 + _XFER
+	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
+	mov     a, y0                 # y0 = a
+	add     y2, h                 # h = h + S1 + CH + k + w
+	mov     a, y2                 # y2 = a
+	or      c, y0                 # y0 = a|c
+	add     h, d                  # d = d + h + S1 + CH + k + w
+	and     c, y2                 # y2 = a&c
+	and     b, y0                 # y0 = (a|c)&b
+	add     y1, h                 # h = h + S1 + CH + k + w + S0
+	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+##
+## Hashes num_blks consecutive 64-byte blocks starting at input_data and
+## updates the eight 32-bit state words at digest in place.  Returns
+## without reading or writing digest when num_blks is 0.
+########################################################################
+.text
+ENTRY(sha256_transform_ssse3)
+.align 32
+	# Save the registers this routine overwrites and restores on exit.
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	# Keep the incoming stack pointer in %r12, then carve out a 16-byte
+	# aligned frame (the movdqa stores to _XFER need the alignment).
+	mov	%rsp, %r12
+	subq    $STACK_SIZE, %rsp
+	and	$~15, %rsp
+
+	shl     $6, NUM_BLKS		 # convert to bytes
+	jz      done_hash
+	add     INP, NUM_BLKS
+	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	movdqa  _SHUF_00BA(%rip), SHUF_00BA
+	movdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+	# One iteration of loop0 processes one 64-byte input block.
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	# INP shares its register with SRND, so park it on the stack.
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 rounds of 16 each
+	mov     $3, SRND
+.align 16
+loop1:
+	movdqa  (TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  1*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  2*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  3*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	add     $4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
+	# Last 16 rounds: the schedule is already complete, so only the round
+	# function runs.  Each pass of loop2 does eight rounds from X0 and X1.
+	mov     $2, SRND
+loop2:
+	paddd   (TBL), X0
+	movdqa  X0, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+	paddd   1*16(TBL), X1
+	movdqa  X1, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	# Bring the remaining two schedule registers forward for the second pass.
+	movdqa  X2, X0
+	movdqa  X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	# Add the working variables into the digest; registers and memory
+	# both end up holding the updated state for the next block.
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	# Advance to the next block and stop at the saved end pointer.
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	# Drop the frame by restoring the stack pointer saved in %r12.
+	mov	%r12, %rsp
+
+	popq    %r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+
+	ret
+ENDPROC(sha256_transform_ssse3)
+
+# Constant tables used by sha256_transform_ssse3.
+.data
+.align 64
+# K256: the 64 SHA-256 round constants K[0..63], one dword each.  The
+# routine loads them 16 bytes at a time through TBL and adds them to the
+# message schedule dwords.
+K256:
+        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+# pshufb control mask that reverses the byte order inside each dword;
+# loaded into BYTE_FLIP_MASK.
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+# (source dwords 0 and 2 go to dwords 0 and 1; the 0xFF selectors zero the rest)
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+# (source dwords 0 and 2 go to dwords 2 and 3; the 0xFF selectors zero the rest)
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif
--- a/pkg/crypto/sha256/sha256.go
+++ b/pkg/crypto/sha256/sha256.go
@@ -0,0 +1,41 @@
+// +build darwin windows 386 arm !cgo
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+import (
+	"hash"
+
+	"crypto/sha256"
+)
+
+// Exported SHA256 parameters.
+const (
+	// Size - The size of a SHA256 checksum in bytes.
+	Size = 32
+
+	// BlockSize - The blocksize of SHA256 in bytes.
+	BlockSize = 64
+)
+
+// New returns a new hash.Hash computing SHA256, backed by the standard
+// library implementation in crypto/sha256.
+func New() hash.Hash {
+	h := sha256.New()
+	return h
+}
+
+// Sum256 returns the SHA256 checksum of data in one call, delegating to
+// the standard library implementation in crypto/sha256.
+func Sum256(data []byte) [Size]byte {
+	sum := sha256.Sum256(data)
+	return sum
+}
--- a/pkg/crypto/sha256/sha256_linux.go
+++ b/pkg/crypto/sha256/sha256_linux.go
@@ -0,0 +1,177 @@
+// +build linux,amd64,cgo
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// Package sha256 provides SHA256 hashing accelerated with SSE, AVX and AVX2 instructions.
+package sha256
+
+import (
+	"hash"
+
+	"github.com/minio/minio/pkg/cpu"
+)
+
+// Exported SHA256 parameters.
+const (
+	// Size - The size of a SHA256 checksum in bytes.
+	Size = 32
+
+	// BlockSize - The blocksize of SHA256 in bytes.
+	BlockSize = 64
+)
+
+// Internal parameters of the digest.
+const (
+	// chunk is the size in bytes of the digest buffer and of the blocks
+	// that Write hands to block.
+	chunk = 64
+	// init0 through init7 are the values Reset loads into the eight
+	// state words of a digest.
+	init0 = 0x6A09E667
+	init1 = 0xBB67AE85
+	init2 = 0x3C6EF372
+	init3 = 0xA54FF53A
+	init4 = 0x510E527F
+	init5 = 0x9B05688C
+	init6 = 0x1F83D9AB
+	init7 = 0x5BE0CD19
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	// h holds the eight 32-bit state words; checkSum serializes them
+	// big-endian to form the checksum.
+	h   [8]uint32
+	// x buffers input bytes that do not yet fill a whole chunk.
+	x   [chunk]byte
+	// nx is the number of bytes currently buffered in x.
+	nx  int
+	// len is the total number of bytes passed to Write so far.
+	len uint64
+}
+
+// Reset restores the digest to its initial state: the state words are
+// reloaded with init0..init7 and the buffered-byte and total-length
+// counters are cleared.
+func (d *digest) Reset() {
+	d.h = [8]uint32{init0, init1, init2, init3, init4, init5, init6, init7}
+	d.nx, d.len = 0, 0
+}
+
+// block hashes p into dig using the first implementation the CPU
+// supports, checked in the order AVX2, AVX, SSE4.1, and falls back to
+// blockGeneric when none of them is available. Write only passes slices
+// whose length is a multiple of chunk.
+func block(dig *digest, p []byte) {
+	// A tagless switch evaluates each boolean case directly; comparing
+	// against a literal true is redundant.
+	switch {
+	case cpu.HasAVX2():
+		blockAVX2(dig, p)
+	case cpu.HasAVX():
+		blockAVX(dig, p)
+	case cpu.HasSSE41():
+		blockSSE(dig, p)
+	default:
+		blockGeneric(dig, p)
+	}
+}
+
+// New returns a new hash.Hash computing the SHA256 checksum. The returned
+// digest has already been reset to its initial state.
+func New() hash.Hash {
+	d := &digest{}
+	d.Reset()
+	return d
+}
+
+// Sum256 - single caller sha256 helper
+func Sum256(data []byte) [Size]byte {
+	var d digest
+	d.Reset()
+	d.Write(data)
+	return d.checkSum()
+}
+
+// Size returns the number of bytes in a checksum, which is the constant Size.
+func (d *digest) Size() int {
+	return Size
+}
+
+// BlockSize returns the block size of the hash in bytes, which is the
+// constant BlockSize.
+func (d *digest) BlockSize() int {
+	return BlockSize
+}
+
+// Write absorbs p into the digest. Whole chunks are hashed immediately
+// and any trailing partial chunk is kept in d.x for a later call. It
+// always reports len(p) bytes written and a nil error.
+func (d *digest) Write(p []byte) (nn int, err error) {
+	nn = len(p)
+	d.len += uint64(nn)
+
+	// Top up a partially filled buffer left over from an earlier call.
+	if d.nx > 0 {
+		copied := copy(d.x[d.nx:], p)
+		d.nx += copied
+		p = p[copied:]
+		if d.nx == chunk {
+			block(d, d.x[:])
+			d.nx = 0
+		}
+	}
+
+	// Hash every whole chunk straight from the caller's slice.
+	if whole := len(p) &^ (chunk - 1); whole > 0 {
+		block(d, p[:whole])
+		p = p[whole:]
+	}
+
+	// Keep whatever is left for the next call.
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p)
+	}
+	return nn, nil
+}
+
+// Sum appends the checksum of everything written so far to in and returns
+// the resulting slice. The receiver is left untouched because the
+// finalization runs on a copy, so the caller can keep writing and summing.
+func (d *digest) Sum(in []byte) []byte {
+	snapshot := *d
+	sum := snapshot.checkSum()
+	return append(in, sum[:]...)
+}
+
+// checkSum finalizes the digest: it writes the padding and the message
+// length in bits to the receiver, then returns the eight state words
+// serialized big-endian. It modifies d, which is why Sum calls it on a copy.
+func (d *digest) checkSum() [Size]byte {
+	total := d.len
+
+	// Padding: one 0x80 byte, then zeros until the length is 56 mod 64.
+	var scratch [64]byte
+	scratch[0] = 0x80
+	rem := total % 64
+	padLen := 64 + 56 - rem
+	if rem < 56 {
+		padLen = 56 - rem
+	}
+	d.Write(scratch[:padLen])
+
+	// Message length in bits, most significant byte first.
+	bits := total << 3
+	for i := 0; i < 8; i++ {
+		scratch[i] = byte(bits >> uint(56-8*i))
+	}
+	d.Write(scratch[:8])
+
+	// Padding plus length must have completed the final chunk exactly.
+	if d.nx != 0 {
+		panic("d.nx != 0")
+	}
+
+	var out [Size]byte
+	for i := range d.h {
+		word := d.h[i]
+		out[4*i] = byte(word >> 24)
+		out[4*i+1] = byte(word >> 16)
+		out[4*i+2] = byte(word >> 8)
+		out[4*i+3] = byte(word)
+	}
+	return out
+}
--- a/pkg/crypto/sha256/sha256_test.go
+++ b/pkg/crypto/sha256/sha256_test.go
@@ -0,0 +1,141 @@
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// SHA256 hash algorithm.  See FIPS 180-2.
+
+package sha256
+
+import (
+	"fmt"
+	"io"
+	"testing"
+)
+
+// sha256Test is one golden vector: the expected lowercase hex digest (out)
+// for a given input string (in).
+type sha256Test struct {
+	out string
+	in  string
+}
+
+// golden lists input strings together with their expected SHA256 digests in
+// hex; TestGolden checks both Sum256 and the streaming hash against it.
+var golden = []sha256Test{
+	{"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", ""},
+	{"ca978112ca1bbdcafac231b39a23dc4da786eff8147c4e72b9807785afee48bb", "a"},
+	{"fb8e20fc2e4c3f248c60c39bd652f3c1347298bb977b8b4d5903b85055620603", "ab"},
+	{"ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad", "abc"},
+	{"88d4266fd4e6338d13b845fcf289579d209c897823b9217da3e161936f031589", "abcd"},
+	{"36bbe50ed96841d10443bcb670d6554f0a34b761be67ec9c4a8ad2c0c44ca42c", "abcde"},
+	{"bef57ec7f53a6d40beb640a780a639c83bc29ac8a9816f1fc6c5c6dcd93c4721", "abcdef"},
+	{"7d1a54127b222502f5b79b5fb0803061152a44f92b37e23c6527baf665d4da9a", "abcdefg"},
+	{"9c56cc51b374c3ba189210d5b6d4bf57790d351c96c47c02190ecf1e430635ab", "abcdefgh"},
+	{"19cc02f26df43cc571bc9ed7b0c4d29224a3ec229529221725ef76d021c8326f", "abcdefghi"},
+	{"72399361da6a7754fec986dca5b7cbaf1c810a28ded4abaf56b2106d06cb78b0", "abcdefghij"},
+	{"a144061c271f152da4d151034508fed1c138b8c976339de229c3bb6d4bbb4fce", "Discard medicine more than two years old."},
+	{"6dae5caa713a10ad04b46028bf6dad68837c581616a1589a265a11288d4bb5c4", "He who has a shady past knows that nice guys finish last."},
+	{"ae7a702a9509039ddbf29f0765e70d0001177914b86459284dab8b348c2dce3f", "I wouldn't marry him with a ten foot pole."},
+	{"6748450b01c568586715291dfa3ee018da07d36bb7ea6f180c1af6270215c64f", "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
+	{"14b82014ad2b11f661b5ae6a99b75105c2ffac278cd071cd6c05832793635774", "The days of the digital watch are numbered.  -Tom Stoppard"},
+	{"7102cfd76e2e324889eece5d6c41921b1e142a4ac5a2692be78803097f6a48d8", "Nepal premier won't resign."},
+	{"23b1018cd81db1d67983c5f7417c44da9deb582459e378d7a068552ea649dc9f", "For every action there is an equal and opposite government program."},
+	{"8001f190dfb527261c4cfcab70c98e8097a7a1922129bc4096950e57c7999a5a", "His money is twice tainted: 'taint yours and 'taint mine."},
+	{"8c87deb65505c3993eb24b7a150c4155e82eee6960cf0c3a8114ff736d69cad5", "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
+	{"bfb0a67a19cdec3646498b2e0f751bddc41bba4b7f30081b0b932aad214d16d7", "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
+	{"7f9a0b9bf56332e19f5a0ec1ad9c1425a153da1c624868fda44561d6b74daf36", "size:  a.out:  bad magic"},
+	{"b13f81b8aad9e3666879af19886140904f7f429ef083286195982a7588858cfc", "The major problem is with sendmail.  -Mark Horton"},
+	{"b26c38d61519e894480c70c8374ea35aa0ad05b2ae3d6674eec5f52a69305ed4", "Give me a rock, paper and scissors and I will move the world.  CCFestoon"},
+	{"049d5e26d4f10222cd841a119e38bd8d2e0d1129728688449575d4ff42b842c1", "If the enemy is within range, then so are you."},
+	{"0e116838e3cc1c1a14cd045397e29b4d087aa11b0853fc69ec82e90330d60949", "It's well we cannot hear the screams/That we create in others' dreams."},
+	{"4f7d8eb5bcf11de2a56b971021a444aa4eafd6ecd0f307b5109e4e776cd0fe46", "You remind me of a TV show, but that's all right: I watch it anyway."},
+	{"61c0cc4c4bd8406d5120b3fb4ebc31ce87667c162f29468b3c779675a85aebce", "C is as portable as Stonehedge!!"},
+	{"1fb2eb3688093c4a3f80cd87a5547e2ce940a4f923243a79a2a1e242220693ac", "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
+	{"395585ce30617b62c80b93e8208ce866d4edc811a177fdb4b82d3911d8696423", "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction.  Lewis-Randall Rule"},
+	{"4f9b189a13d030838269dce846b16a1ce9ce81fe63e65de2f636863336a98fe6", "How can you write a big system without C++?  -Paul Glick"},
+}
+
+// TestGolden checks Sum256 and the streaming hash against every golden
+// vector. The streaming digest is exercised three times per vector: twice
+// with a single write, and once with the input split in half around an
+// intermediate Sum call. The digest is reset after each pass.
+func TestGolden(t *testing.T) {
+	for _, g := range golden {
+		if got := fmt.Sprintf("%x", Sum256([]byte(g.in))); got != g.out {
+			t.Fatalf("Sum256 function: sha256(%s) = %s want %s", g.in, got, g.out)
+		}
+
+		c := New()
+		for pass := 0; pass < 3; pass++ {
+			switch {
+			case pass < 2:
+				io.WriteString(c, g.in)
+			default:
+				// An intermediate Sum must not disturb the running state.
+				half := len(g.in) / 2
+				io.WriteString(c, g.in[:half])
+				c.Sum(nil)
+				io.WriteString(c, g.in[half:])
+			}
+			if got := fmt.Sprintf("%x", c.Sum(nil)); got != g.out {
+				t.Fatalf("sha256[%d](%s) = %s want %s", pass, g.in, got, g.out)
+			}
+			c.Reset()
+		}
+	}
+}
+
+// TestSize checks that a new hash reports the Size constant.
+func TestSize(t *testing.T) {
+	got := New().Size()
+	if got != Size {
+		t.Errorf("Size = %d; want %d", got, Size)
+	}
+}
+
+// TestBlockSize checks that a new hash reports the BlockSize constant.
+func TestBlockSize(t *testing.T) {
+	got := New().BlockSize()
+	if got != BlockSize {
+		t.Errorf("BlockSize = %d want %d", got, BlockSize)
+	}
+}
+
+// bench is the hash instance shared by all benchmarks; benchmarkSize resets
+// it on every iteration.
+var bench = New()
+
+// buf is the 1 MiB zero-filled input that the benchmarks slice to size.
+var buf = make([]byte, 1024*1024)
+
+// benchmarkSize measures hashing of the first size bytes of buf: each
+// iteration resets the shared hash, writes the input and sums into a reused
+// output slice.
+func benchmarkSize(b *testing.B, size int) {
+	b.SetBytes(int64(size))
+	scratch := make([]byte, bench.Size())
+	for iter := 0; iter < b.N; iter++ {
+		bench.Reset()
+		bench.Write(buf[:size])
+		bench.Sum(scratch[:0])
+	}
+}
+
+// BenchmarkHash8Bytes benchmarks hashing an 8-byte input.
+func BenchmarkHash8Bytes(b *testing.B) {
+	benchmarkSize(b, 8)
+}
+
+// BenchmarkHash1K benchmarks hashing a 1024-byte input.
+func BenchmarkHash1K(b *testing.B) {
+	benchmarkSize(b, 1024)
+}
+
+// BenchmarkHash8K benchmarks hashing an 8192-byte input.
+func BenchmarkHash8K(b *testing.B) {
+	benchmarkSize(b, 8192)
+}
+
+// BenchmarkHash1M benchmarks hashing a 1 MiB input (all of buf).
+func BenchmarkHash1M(b *testing.B) {
+	benchmarkSize(b, 1024*1024)
+}
--- a/pkg/crypto/sha256/sha256block.go
+++ b/pkg/crypto/sha256/sha256block.go
@@ -0,0 +1,162 @@
+// +build linux,amd64,cgo
+
+//
+// Minio Cloud Storage, (C) 2015 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Software block transform are provided by The Go Authors:
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+package sha256
+
+// #cgo CFLAGS: -DHAS_SSE41 -DHAS_AVX -DHAS_AVX2
+// #include <stdint.h>
+// void sha256_transform_ssse3 (const char *input_data, uint32_t *digest, unsigned long num_blks);
+// void sha256_transform_avx (const char *input_data, uint32_t *digest, unsigned long num_blks);
+// void sha256_transform_rorx (const char *input_data, uint32_t *digest, unsigned long num_blks);
+import "C"
+import "unsafe"
+
+// blockSSE folds every complete 64-byte block of p into dig.h by calling the
+// sha256_transform_ssse3 C routine declared in the cgo preamble.
+//
+// Inputs shorter than one block are ignored, matching blockGeneric, instead
+// of panicking on the &p[0] index expression when p is empty.
+func blockSSE(dig *digest, p []byte) {
+	nblocks := len(p) / 64
+	if nblocks == 0 {
+		return
+	}
+	C.sha256_transform_ssse3((*C.char)(unsafe.Pointer(&p[0])), (*C.uint32_t)(unsafe.Pointer(&dig.h[0])), (C.ulong)(nblocks))
+}
+
+// blockAVX folds every complete 64-byte block of p into dig.h by calling the
+// sha256_transform_avx C routine declared in the cgo preamble.
+//
+// Inputs shorter than one block are ignored, matching blockGeneric, instead
+// of panicking on the &p[0] index expression when p is empty.
+func blockAVX(dig *digest, p []byte) {
+	nblocks := len(p) / 64
+	if nblocks == 0 {
+		return
+	}
+	C.sha256_transform_avx((*C.char)(unsafe.Pointer(&p[0])), (*C.uint32_t)(unsafe.Pointer(&dig.h[0])), (C.ulong)(nblocks))
+}
+
+// blockAVX2 folds every complete 64-byte block of p into dig.h by calling the
+// sha256_transform_rorx C routine declared in the cgo preamble.
+//
+// Inputs shorter than one block are ignored, matching blockGeneric, instead
+// of panicking on the &p[0] index expression when p is empty.
+func blockAVX2(dig *digest, p []byte) {
+	nblocks := len(p) / 64
+	if nblocks == 0 {
+		return
+	}
+	C.sha256_transform_rorx((*C.char)(unsafe.Pointer(&p[0])), (*C.uint32_t)(unsafe.Pointer(&dig.h[0])), (C.ulong)(nblocks))
+}
+
+// blockGeneric is the pure-Go SHA256 block transform. It folds every complete
+// chunk-sized block of p into the eight state words dig.h; trailing bytes
+// shorter than chunk are ignored.
+func blockGeneric(dig *digest, p []byte) {
+	var w [64]uint32
+	// Work on local copies of the state and store them back once at the end.
+	h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
+	for len(p) >= chunk {
+		// Can interlace the computation of w with the
+		// rounds below if needed for speed.
+		// The first 16 schedule words are the block read as big-endian uint32s.
+		for i := 0; i < 16; i++ {
+			j := i * 4
+			w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3])
+		}
+		// Expand to 64 words: t1 is rotr17^rotr19^shr10 of w[i-2],
+		// t2 is rotr7^rotr18^shr3 of w[i-15].
+		for i := 16; i < 64; i++ {
+			v1 := w[i-2]
+			t1 := (v1>>17 | v1<<(32-17)) ^ (v1>>19 | v1<<(32-19)) ^ (v1 >> 10)
+			v2 := w[i-15]
+			t2 := (v2>>7 | v2<<(32-7)) ^ (v2>>18 | v2<<(32-18)) ^ (v2 >> 3)
+			w[i] = t1 + w[i-7] + t2 + w[i-16]
+		}
+
+		a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7
+
+		// 64 compression rounds over the working variables a..h.
+		for i := 0; i < 64; i++ {
+			// t1 = h + (rotr6^rotr11^rotr25 of e) + Ch(e,f,g) + K[i] + w[i]
+			t1 := h + ((e>>6 | e<<(32-6)) ^ (e>>11 | e<<(32-11)) ^ (e>>25 | e<<(32-25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i]
+
+			// t2 = (rotr2^rotr13^rotr22 of a) + Maj(a,b,c)
+			t2 := ((a>>2 | a<<(32-2)) ^ (a>>13 | a<<(32-13)) ^ (a>>22 | a<<(32-22))) + ((a & b) ^ (a & c) ^ (b & c))
+
+			h = g
+			g = f
+			f = e
+			e = d + t1
+			d = c
+			c = b
+			b = a
+			a = t1 + t2
+		}
+
+		// Add the result of this block to the running state.
+		h0 += a
+		h1 += b
+		h2 += c
+		h3 += d
+		h4 += e
+		h5 += f
+		h6 += g
+		h7 += h
+
+		p = p[chunk:]
+	}
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h0, h1, h2, h3, h4, h5, h6, h7
+}
+
+// _K holds the 64 SHA256 round constants, indexed by round number in
+// blockGeneric. Per FIPS 180 these are the first 32 bits of the fractional
+// parts of the cube roots of the first 64 primes.
+var _K = []uint32{
+	0x428a2f98,
+	0x71374491,
+	0xb5c0fbcf,
+	0xe9b5dba5,
+	0x3956c25b,
+	0x59f111f1,
+	0x923f82a4,
+	0xab1c5ed5,
+	0xd807aa98,
+	0x12835b01,
+	0x243185be,
+	0x550c7dc3,
+	0x72be5d74,
+	0x80deb1fe,
+	0x9bdc06a7,
+	0xc19bf174,
+	0xe49b69c1,
+	0xefbe4786,
+	0x0fc19dc6,
+	0x240ca1cc,
+	0x2de92c6f,
+	0x4a7484aa,
+	0x5cb0a9dc,
+	0x76f988da,
+	0x983e5152,
+	0xa831c66d,
+	0xb00327c8,
+	0xbf597fc7,
+	0xc6e00bf3,
+	0xd5a79147,
+	0x06ca6351,
+	0x14292967,
+	0x27b70a85,
+	0x2e1b2138,
+	0x4d2c6dfc,
+	0x53380d13,
+	0x650a7354,
+	0x766a0abb,
+	0x81c2c92e,
+	0x92722c85,
+	0xa2bfe8a1,
+	0xa81a664b,
+	0xc24b8b70,
+	0xc76c51a3,
+	0xd192e819,
+	0xd6990624,
+	0xf40e3585,
+	0x106aa070,
+	0x19a4c116,
+	0x1e376c08,
+	0x2748774c,
+	0x34b0bcb5,
+	0x391c0cb3,
+	0x4ed8aa4a,
+	0x5b9cca4f,
+	0x682e6ff3,
+	0x748f82ee,
+	0x78a5636f,
+	0x84c87814,
+	0x8cc70208,
+	0x90befffa,
+	0xa4506ceb,
+	0xbef9a3f7,
+	0xc67178f2,
+}
--- a/pkg/crypto/sha512/LICENSE
+++ b/pkg/crypto/sha512/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/pkg/crypto/sha512/sha512-avx-asm_linux_amd64.S
+++ b/pkg/crypto/sha512/sha512-avx-asm_linux_amd64.S
@@ -0,0 +1,686 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# Using this part of Minio codebase under the license
+# Apache License Version 2.0 with modifications
+##
+
+#ifdef HAS_AVX
+# ENTRY(name) exports name as a global symbol, emits an align directive
+# (4, padded with 0x90 bytes) and then the label itself. Only defined here
+# when no earlier definition exists.
+#ifndef ENTRY
+#define ENTRY(name) \
+        .globl name             ; \
+        .align 4,0x90           ; \
+        name:
+#endif
+
+# END(name) records the symbol size as the distance from name to the
+# current location. Only defined here when no earlier definition exists.
+#ifndef END
+#define END(name) \
+        .size name, .-name
+#endif
+
+# ENDPROC(name) marks name as a function symbol and then applies END(name).
+# Only defined here when no earlier definition exists.
+#ifndef ENDPROC
+#define ENDPROC(name) \
+        .type name, @function   ; \
+        END(name)
+#endif
+
+# Sentinel left in the output symbol by R32_NUM, R64_NUM and XMM_NUM when the
+# operand is not a register they recognize.
+#define NUM_INVALID		100
+
+# Register classes reported by the TYPE macro.
+#define TYPE_R32		0
+#define TYPE_R64		1
+#define TYPE_XMM		2
+#define TYPE_INVALID	100
+
+	# R32_NUM opd r32
+	# Sets the assembler symbol opd to the encoding number of the 32-bit
+	# register r32 (eax=0 ... edi=7), or to NUM_INVALID when r32 is not
+	# recognized. The numbers 8-15 for r8d-r15d are only available when
+	# X86_64 is defined at preprocessing time.
+	.macro R32_NUM opd r32
+	\opd = NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+#ifdef X86_64
+	.ifc \r32,%r8d
+	\opd = 8
+	.endif
+	.ifc \r32,%r9d
+	\opd = 9
+	.endif
+	.ifc \r32,%r10d
+	\opd = 10
+	.endif
+	.ifc \r32,%r11d
+	\opd = 11
+	.endif
+	.ifc \r32,%r12d
+	\opd = 12
+	.endif
+	.ifc \r32,%r13d
+	\opd = 13
+	.endif
+	.ifc \r32,%r14d
+	\opd = 14
+	.endif
+	.ifc \r32,%r15d
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# R64_NUM opd r64
+	# Sets the assembler symbol opd to the encoding number of the 64-bit
+	# register r64 (rax=0 ... r15=15), or to NUM_INVALID when r64 is not
+	# recognized. Every comparison sits under the X86_64 guard, so without
+	# that preprocessor symbol the result is always NUM_INVALID.
+	.macro R64_NUM opd r64
+	\opd = NUM_INVALID
+#ifdef X86_64
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# XMM_NUM opd xmm
+	# Sets the assembler symbol opd to the index (0-15) of the register
+	# xmm0-xmm15 named by xmm, or to NUM_INVALID when xmm is not one of them.
+	.macro XMM_NUM opd xmm
+	\opd = NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+
+	# TYPE type reg
+	# Classifies reg by running all three *_NUM lookups and sets the symbol
+	# type to TYPE_R64, TYPE_R32 or TYPE_XMM (checked in that order), or to
+	# TYPE_INVALID when none of the lookups recognized reg.
+	.macro TYPE type reg
+	R32_NUM reg_type_r32 \reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> NUM_INVALID
+	\type = TYPE_R64
+	.elseif reg_type_r32 <> NUM_INVALID
+	\type = TYPE_R32
+	.elseif reg_type_xmm <> NUM_INVALID
+	\type = TYPE_XMM
+	.else
+	\type = TYPE_INVALID
+	.endif
+	.endm
+
+	# PFX_OPD_SIZE emits the single prefix byte 0x66 (the x86 operand-size
+	# prefix) ahead of a hand-encoded instruction.
+	.macro PFX_OPD_SIZE
+	.byte 0x66
+	.endm
+
+	# PFX_REX opd1 opd2 W=0
+	# Emits a REX prefix byte only when one is needed: when either operand
+	# number has bit 3 set (registers 8-15) or W is nonzero. The byte is
+	# 0x40 with bit 0 taken from opd1, bit 2 from opd2 and bit 3 from W.
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+	.endif
+	.endm
+
+	# MODRM mod opd1 opd2
+	# Emits one ModRM byte: mod supplies the top bits as given, the low three
+	# bits of opd1 land in bits 0-2 and the low three bits of opd2 in bits 3-5.
+	.macro MODRM mod opd1 opd2
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+
+	# PSHUFB_XMM xmm1 xmm2
+	# Hand-encodes a register-to-register pshufb as raw bytes:
+	# 0x66, optional REX, 0x0f 0x38 0x00, ModRM (mod 0xc0, xmm1 in the low
+	# field, xmm2 in the middle field). Presumably provided for assemblers
+	# that lack the mnemonic - TODO confirm.
+	.macro PSHUFB_XMM xmm1 xmm2
+	XMM_NUM pshufb_opd1 \xmm1
+	XMM_NUM pshufb_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX pshufb_opd1 pshufb_opd2
+	.byte 0x0f, 0x38, 0x00
+	MODRM 0xc0 pshufb_opd1 pshufb_opd2
+	.endm
+
+	# PCLMULQDQ imm8 xmm1 xmm2
+	# Hand-encodes a register-to-register pclmulqdq as raw bytes:
+	# 0x66, optional REX, 0x0f 0x3a 0x44, ModRM (mod 0xc0, xmm1 in the low
+	# field, xmm2 in the middle field), followed by the immediate byte imm8.
+	.macro PCLMULQDQ imm8 xmm1 xmm2
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8
+	.endm
+
+	# PEXTRD imm8 xmm gpr
+	# Hand-encodes pextrd with a 32-bit general register operand as raw bytes:
+	# 0x66, optional REX, 0x0f 0x3a 0x16, ModRM (mod 0xc0, gpr in the low
+	# field, xmm in the middle field), followed by the immediate byte imm8.
+	.macro PEXTRD imm8 xmm gpr
+	R32_NUM extrd_opd1 \gpr
+	XMM_NUM extrd_opd2 \xmm
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8
+	.endm
+
+	# MOVQ_R64_XMM opd1 opd2
+	# Hand-encodes a movq between a 64-bit general register and an XMM
+	# register. The class of opd1 (via TYPE) decides how both operands are
+	# numbered and which opcode is emitted: 0x0f 0x7e when opd1 is an XMM
+	# register, 0x0f 0x6e otherwise. The REX prefix is always emitted because
+	# W is passed as 1.
+	.macro MOVQ_R64_XMM opd1 opd2
+	TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
+
+.text
+
+# Virtual Registers
+# Symbolic names for the general-purpose registers used by the round code.
+# The a_64..h_64 names are reassigned by RotateState after every round.
+# ARG1 - base address of the input message, see MSG(i)
+msg	= %rdi
+# ARG2 - base address of the digest words, see DIGEST(i)
+digest	= %rsi
+# ARG3
+msglen	= %rdx
+# Scratch registers used by the round computation
+T1	= %rcx
+T2	= %r8
+# Working state a..h
+a_64	= %r9
+b_64	= %r10
+c_64	= %r11
+d_64	= %r12
+e_64	= %r13
+f_64	= %r14
+g_64	= %r15
+h_64	= %rbx
+# Temporary used by the rotate/xor chains
+tmp0	= %rax
+
+# Local variables (stack frame)
+# Sizes are in bytes; the frame_* symbols below are cumulative offsets from
+# the frame base, laid out in the order W, WK, RSPSAVE, GPRSAVE.
+
+# Message Schedule: 80 qwords
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+# One qword, presumably for the saved stack pointer - TODO confirm
+RSPSAVE_SIZE = 1*8
+# Five qwords, presumably for saved general-purpose registers - TODO confirm
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1): qword i, at byte offset 8*i from msg
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2): qword i, at byte offset 8*i from digest
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem): qword i of K512, addressed relative to rip
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame): qword i of the W area
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame): slot i%2 of the two-qword WK area
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+# RotateState emits no instructions. It only rebinds the assembler symbols
+# a_64..h_64 so that each name takes over the register of its predecessor
+# (b takes the old a, ..., h takes the old g, a takes the old h).
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+# RORQ p1 p2
+# Rotates the 64-bit register p1 right by p2 bits. It is written as a
+# double-precision left shift of p1 into itself by 64 minus p2 bits, which
+# yields the same result.
+.macro RORQ p1 p2
+	# shld is faster than ror on Sandybridge
+	shld	$(64-\p2), \p1, \p1
+.endm
+
+# SHA512_Round rnd
+# Performs one SHA-512 round on the working state a_64..h_64, taking the
+# precomputed W[t]+K[t] from the WK_2 slot selected by rnd. The new e is
+# accumulated in the register named d_64 and the new a in the register named
+# h_64; the closing RotateState renames the symbols so they line up again.
+# The trailing numbers on the RORQ lines are the cumulative rotate amounts.
+.macro SHA512_Round rnd
+	# Compute the round selected by rnd
+	mov     f_64, T1          # T1 = f
+	mov     e_64, tmp0        # tmp = e
+	xor     g_64, T1          # T1 = f ^ g
+	RORQ    tmp0, 23   # 41    # tmp = e ror 23
+	and     e_64, T1          # T1 = (f ^ g) & e
+	xor     e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor     g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add     WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	RORQ    tmp0, 4   # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor     e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov     a_64, T2          # T2 = a
+	add     h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ    tmp0, 14  # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add     tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
+	mov     a_64, tmp0        # tmp = a
+	xor     c_64, T2          # T2 = a ^ c
+	and     c_64, tmp0        # tmp = a & c
+	and     b_64, T2          # T2 = (a ^ c) & b
+	xor     tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov     a_64, tmp0        # tmp = a
+	RORQ    tmp0, 5  # 39     # tmp = a ror 5
+	xor     a_64, tmp0        # tmp = (a ror 5) ^ a
+	add     T1, d_64          # e(next_state) = d + T1
+	RORQ    tmp0, 6  # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor     a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea     (T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	RORQ    tmp0, 28  # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add     tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+	#   The scalar instructions repeat the sequence of SHA512_Round twice,
+	# once before and once after the first RotateState.
+	#   rnd is pasted textually into the idx expressions, so pass a single
+	# symbol or number (the caller in this file passes t).
+
+	idx = \rnd - 2
+	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
+	idx = \rnd - 15
+	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
+	mov	f_64, T1
+	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
+	mov	e_64, tmp0
+	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
+	xor	g_64, T1
+	RORQ	tmp0, 23 # 41
+	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
+	and	e_64, T1
+	xor	e_64, tmp0
+	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1#
+	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 # 18
+	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
+	xor	e_64, tmp0
+	mov	a_64, T2
+	add	h_64, T1
+	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
+	mov	a_64, tmp0
+	xor	c_64, T2
+	vpsllq	$(64-61), %xmm4, %xmm3  # XMM3 = W[t-2]<<3
+	and	c_64, tmp0
+	and	b_64, T2
+	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 # 39
+	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+					#  W[t-15]>>7 ^ W[t-15]<<63
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	vpsllq	$(64-19), %xmm4, %xmm4  # XMM4 = W[t-2]<<25
+	add	tmp0, h_64
+	RotateState
+	vpxor	%xmm4, %xmm0, %xmm0     # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+					#        W[t-2]<<25
+	mov	f_64, T1
+	vpxor	%xmm2, %xmm0, %xmm0     # XMM0 = s1(W[t-2])
+	mov	e_64, tmp0
+	xor	g_64, T1
+	idx = \rnd - 16
+	vpaddq	W_t(idx), %xmm0, %xmm0  # XMM0 = s1(W[t-2]) + W[t-16]
+	idx = \rnd - 7
+	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	RORQ	tmp0, 23 # 41
+	and	e_64, T1
+	xor	e_64, tmp0
+	xor	g_64, T1
+	vpsllq	$(64-8), %xmm5, %xmm5   # XMM5 = W[t-15]<<56
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	vpxor	%xmm5, %xmm6, %xmm6     # XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 # 18
+	vpaddq	%xmm6, %xmm0, %xmm0     # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	xor	e_64, tmp0
+	vpaddq	%xmm1, %xmm0, %xmm0     # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+					#               s0(W[t-15]) + W[t-16]
+	mov	a_64, T2
+	add	h_64, T1
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	idx = \rnd
+	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
+	vpaddq	K_t(idx), %xmm0, %xmm0  # Compute W[t]+K[t]
+	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	mov	a_64, tmp0
+	xor	c_64, T2
+	and	c_64, tmp0
+	and	b_64, T2
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	RORQ	tmp0, 5 # 39
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	add	tmp0, h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+# message blocks.
+# L is the message length in SHA512 blocks
+# Returns immediately, without touching the stack or D, when L is zero.
+########################################################################
+ENTRY(sha512_transform_avx)
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	# The caller %rsp is kept in %rax, the frame is carved out and %rsp is
+	# rounded down to a 32-byte boundary; the old %rsp is then parked in
+	# the frame so it can be restored on exit.
+	mov	%rsp, %rax
+	sub     $frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	# (these five registers are restored from the frame before returning)
+	mov     %rbx, frame_GPRSAVE(%rsp)
+	mov     %r12, frame_GPRSAVE +8*1(%rsp)
+	mov     %r13, frame_GPRSAVE +8*2(%rsp)
+	mov     %r14, frame_GPRSAVE +8*3(%rsp)
+	mov     %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+	# Load state variables
+	mov     DIGEST(0), a_64
+	mov     DIGEST(1), b_64
+	mov     DIGEST(2), c_64
+	mov     DIGEST(3), d_64
+	mov     DIGEST(4), e_64
+	mov     DIGEST(5), f_64
+	mov     DIGEST(6), g_64
+	mov     DIGEST(7), h_64
+
+	# The block below is fully unrolled at assembly time: t is an
+	# assembler symbol stepped by 2, not a run-time loop counter.
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			# %xmm1 keeps the byte-swap mask for the t < 16 iterations
+			# below, which do not reload it.
+			vmovdqa  XMM_QWORD_BSWAP(%rip), %xmm1
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			vmovdqa  %xmm0, WK_2(t) # Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS, compute 2 rounds
+			# WK_2 is overwritten only after round t-1 has consumed the
+			# previous pair.
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			SHA512_Round t-2    # Round t-2
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			SHA512_Round t-1    # Round t-1
+			vmovdqa  %xmm0, WK_2(t)# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS, compute 2 rounds
+			SHA512_2Sched_2Round_avx t
+		.else
+			# Compute 2 Rounds
+			# (t = 80: nothing is left to schedule, only rounds 78 and 79)
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add     a_64, DIGEST(0)
+	add     b_64, DIGEST(1)
+	add     c_64, DIGEST(2)
+	add     d_64, DIGEST(3)
+	add     e_64, DIGEST(4)
+	add     f_64, DIGEST(5)
+	add     g_64, DIGEST(6)
+	add     h_64, DIGEST(7)
+
+	# Advance to next message block
+	# (one block is 16 qwords = 128 bytes)
+	add     $16*8, msg
+	dec     msglen
+	jnz     updateblock
+
+	# Restore GPRs
+	mov     frame_GPRSAVE(%rsp),      %rbx
+	mov     frame_GPRSAVE +8*1(%rsp), %r12
+	mov     frame_GPRSAVE +8*2(%rsp), %r13
+	mov     frame_GPRSAVE +8*3(%rsp), %r14
+	mov     frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif
--- a/pkg/crypto/sha512/sha512-avx2-asm_linux_amd64.S
+++ b/pkg/crypto/sha512/sha512-avx2-asm_linux_amd64.S
--- a/pkg/crypto/sha512/sha512-ssse3-asm_linux_amd64.S
+++ b/pkg/crypto/sha512/sha512-ssse3-asm_linux_amd64.S
@@ -0,0 +1,686 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# Using this part of Minio codebase under the license
+# Apache License Version 2.0 with modifications
+##
+
+#ifdef HAS_SSE41
+# Fallback definitions of the ENTRY / END / ENDPROC helpers, used only when
+# the build has not already provided them.
+# ENTRY(name) exports name as a global symbol, aligns it (padding with 0x90
+# bytes) and opens the label.
+#ifndef ENTRY
+#define ENTRY(name) \
+        .globl name             ; \
+        .align 4,0x90           ; \
+        name:
+#endif
+
+# END(name) records the symbol size as the distance from name to here.
+#ifndef END
+#define END(name) \
+        .size name, .-name
+#endif
+
+# ENDPROC(name) marks name as a function symbol and then applies END(name).
+#ifndef ENDPROC
+#define ENDPROC(name) \
+        .type name, @function   ; \
+        END(name)
+#endif
+
+# Sentinel stored by R32_NUM / R64_NUM / XMM_NUM when a name is not recognised.
+#define NUM_INVALID		100
+
+# Operand classes reported by the TYPE macro.
+#define TYPE_R32		0
+#define TYPE_R64		1
+#define TYPE_XMM		2
+#define TYPE_INVALID	100
+
+	# R32_NUM opd r32: set the assembler symbol named by opd to the hardware
+	# number (0-15) of the 32-bit register r32, or to NUM_INVALID when r32
+	# is not one of the names tested below. The names %r8d-%r15d are only
+	# tested when X86_64 is defined for the C preprocessor.
+	.macro R32_NUM opd r32
+	\opd = NUM_INVALID
+	.ifc \r32,%eax
+	\opd = 0
+	.endif
+	.ifc \r32,%ecx
+	\opd = 1
+	.endif
+	.ifc \r32,%edx
+	\opd = 2
+	.endif
+	.ifc \r32,%ebx
+	\opd = 3
+	.endif
+	.ifc \r32,%esp
+	\opd = 4
+	.endif
+	.ifc \r32,%ebp
+	\opd = 5
+	.endif
+	.ifc \r32,%esi
+	\opd = 6
+	.endif
+	.ifc \r32,%edi
+	\opd = 7
+	.endif
+#ifdef X86_64
+	.ifc \r32,%r8d
+	\opd = 8
+	.endif
+	.ifc \r32,%r9d
+	\opd = 9
+	.endif
+	.ifc \r32,%r10d
+	\opd = 10
+	.endif
+	.ifc \r32,%r11d
+	\opd = 11
+	.endif
+	.ifc \r32,%r12d
+	\opd = 12
+	.endif
+	.ifc \r32,%r13d
+	\opd = 13
+	.endif
+	.ifc \r32,%r14d
+	\opd = 14
+	.endif
+	.ifc \r32,%r15d
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# R64_NUM opd r64: set the assembler symbol named by opd to the hardware
+	# number (0-15) of the 64-bit register r64, or to NUM_INVALID when the
+	# name is not recognised. Every comparison sits inside the X86_64
+	# preprocessor guard, so without that definition the result is always
+	# NUM_INVALID.
+	.macro R64_NUM opd r64
+	\opd = NUM_INVALID
+#ifdef X86_64
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+#endif
+	.endm
+
+	# XMM_NUM opd xmm: set the assembler symbol named by opd to the index
+	# (0-15) of the XMM register xmm, or to NUM_INVALID when the name is
+	# not one of %xmm0-%xmm15.
+	.macro XMM_NUM opd xmm
+	\opd = NUM_INVALID
+	.ifc \xmm,%xmm0
+	\opd = 0
+	.endif
+	.ifc \xmm,%xmm1
+	\opd = 1
+	.endif
+	.ifc \xmm,%xmm2
+	\opd = 2
+	.endif
+	.ifc \xmm,%xmm3
+	\opd = 3
+	.endif
+	.ifc \xmm,%xmm4
+	\opd = 4
+	.endif
+	.ifc \xmm,%xmm5
+	\opd = 5
+	.endif
+	.ifc \xmm,%xmm6
+	\opd = 6
+	.endif
+	.ifc \xmm,%xmm7
+	\opd = 7
+	.endif
+	.ifc \xmm,%xmm8
+	\opd = 8
+	.endif
+	.ifc \xmm,%xmm9
+	\opd = 9
+	.endif
+	.ifc \xmm,%xmm10
+	\opd = 10
+	.endif
+	.ifc \xmm,%xmm11
+	\opd = 11
+	.endif
+	.ifc \xmm,%xmm12
+	\opd = 12
+	.endif
+	.ifc \xmm,%xmm13
+	\opd = 13
+	.endif
+	.ifc \xmm,%xmm14
+	\opd = 14
+	.endif
+	.ifc \xmm,%xmm15
+	\opd = 15
+	.endif
+	.endm
+
+	# TYPE type reg: classify the register name reg and store TYPE_R64,
+	# TYPE_R32, TYPE_XMM or TYPE_INVALID in the symbol named by type.
+	# The name is run through all three lookup macros; the first one that
+	# recognised it (checked in the order R64, R32, XMM) decides the class.
+	# Side effect: the scratch symbols reg_type_r32, reg_type_r64 and
+	# reg_type_xmm are overwritten.
+	.macro TYPE type reg
+	R32_NUM reg_type_r32 \reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> NUM_INVALID
+	\type = TYPE_R64
+	.elseif reg_type_r32 <> NUM_INVALID
+	\type = TYPE_R32
+	.elseif reg_type_xmm <> NUM_INVALID
+	\type = TYPE_XMM
+	.else
+	\type = TYPE_INVALID
+	.endif
+	.endm
+
+	# PFX_OPD_SIZE: emit the single prefix byte 0x66 (the x86 operand-size
+	# prefix) ahead of a hand-encoded instruction.
+	.macro PFX_OPD_SIZE
+	.byte 0x66
+	.endm
+
+	# PFX_REX opd1 opd2 W: emit a REX prefix byte when one is needed, i.e.
+	# when either register number is 8 or above, or when W is non-zero.
+	# Byte layout: 0x40, bit 0 from bit 3 of opd1, bit 2 from bit 3 of opd2,
+	# bit 3 from W. Nothing is emitted otherwise.
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
+	.endif
+	.endm
+
+	# MODRM mod opd1 opd2: emit one ModR/M byte. mod supplies the top bits
+	# (callers here pass 0xc0), the low three bits of opd1 fill bits 0-2
+	# and the low three bits of opd2 fill bits 3-5.
+	.macro MODRM mod opd1 opd2
+	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
+	.endm
+
+	# PSHUFB_XMM xmm1 xmm2: emit the raw bytes 66 [REX] 0F 38 00 /r with
+	# xmm1 in the low ModR/M field and xmm2 in the middle field, i.e. a
+	# register-to-register pshufb written out by hand.
+	# Not invoked in this file, which uses the pshufb mnemonic directly.
+	.macro PSHUFB_XMM xmm1 xmm2
+	XMM_NUM pshufb_opd1 \xmm1
+	XMM_NUM pshufb_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX pshufb_opd1 pshufb_opd2
+	.byte 0x0f, 0x38, 0x00
+	MODRM 0xc0 pshufb_opd1 pshufb_opd2
+	.endm
+
+	# PCLMULQDQ imm8 xmm1 xmm2: emit the raw bytes 66 [REX] 0F 3A 44 /r ib
+	# with xmm1 in the low ModR/M field, xmm2 in the middle field and imm8
+	# as the trailing immediate byte.
+	# Not invoked in this file.
+	.macro PCLMULQDQ imm8 xmm1 xmm2
+	XMM_NUM clmul_opd1 \xmm1
+	XMM_NUM clmul_opd2 \xmm2
+	PFX_OPD_SIZE
+	PFX_REX clmul_opd1 clmul_opd2
+	.byte 0x0f, 0x3a, 0x44
+	MODRM 0xc0 clmul_opd1 clmul_opd2
+	.byte \imm8
+	.endm
+
+	# PEXTRD imm8 xmm gpr: emit the raw bytes 66 [REX] 0F 3A 16 /r ib with
+	# the 32-bit register gpr in the low ModR/M field, xmm in the middle
+	# field and imm8 as the trailing immediate byte.
+	# Not invoked in this file.
+	.macro PEXTRD imm8 xmm gpr
+	R32_NUM extrd_opd1 \gpr
+	XMM_NUM extrd_opd2 \xmm
+	PFX_OPD_SIZE
+	PFX_REX extrd_opd1 extrd_opd2
+	.byte 0x0f, 0x3a, 0x16
+	MODRM 0xc0 extrd_opd1 extrd_opd2
+	.byte \imm8
+	.endm
+
+	# MOVQ_R64_XMM opd1 opd2: hand-encode a 64-bit move between an XMM
+	# register and a general-purpose register, in either direction.
+	# When opd1 is an XMM register the opcode bytes are 0F 7E, otherwise
+	# 0F 6E; both are preceded by 0x66 and a REX prefix with W set.
+	# In both cases the number of opd1 goes to the low ModR/M field and
+	# the number of opd2 to the middle field.
+	# NOTE(review): for opcode 0F 7E the instruction reference appears to
+	# put the XMM register in the middle (reg) field, whereas this macro
+	# puts it in the low field when opd1 is XMM; the two coincide only
+	# when both register numbers are equal. Confirm before relying on it.
+	# Not invoked in this file.
+	.macro MOVQ_R64_XMM opd1 opd2
+	TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
+
+.text
+
+# Virtual Registers
+# Assembler symbols naming the role each hard register plays below.
+# a_64..h_64 are re-bound by RotateState after every round.
+# ARG1
+msg =		%rdi
+# ARG2
+digest =	%rsi
+# ARG3
+msglen =	%rdx
+T1 =		%rcx
+T2 =		%r8
+a_64 =		%r9
+b_64 =		%r10
+c_64 =		%r11
+d_64 =		%r12
+e_64 =		%r13
+f_64 =		%r14
+g_64 =		%r15
+h_64 =		%rbx
+tmp0 =		%rax
+
+# Local variables (stack frame)
+# Sizes are in bytes: 80 schedule qwords, one pair of W+K qwords, the saved
+# stack pointer and five saved general-purpose registers.
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+# Offsets from the aligned stack pointer: W at 0, WK at 640, RSPSAVE at 656,
+# GPRSAVE at 664; frame_size works out to 704.
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+# The index argument of MSG, DIGEST, K_t and W_t is pasted textually after
+# 8*, so pass a single symbol or number rather than a sum or difference.
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	# Only assembler symbols are re-bound here; no instruction is emitted.
+	# After the macro, a_64 names the register that was h_64, b_64 the one
+	# that was a_64, and so on. TMP is a scratch symbol for the swap.
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute one SHA-512 round on the working state a..h. The round input
+	# W[t]+K[t] is read from WK_2(rnd), where the message scheduler put it.
+	# The first number after each rotate (41/18/14 and 39/34/28) is the
+	# total rotation that term has received once the chained rotates are
+	# done: 23+4+14, 4+14, 14 for S1(e) and 5+6+28, 6+28, 28 for S0(a).
+	# The new a is written into the register currently named h_64, and
+	# RotateState then renames the symbols so it becomes a_64.
+	mov	f_64, T1          # T1 = f
+	mov	e_64, tmp0        # tmp = e
+	xor	g_64, T1          # T1 = f ^ g
+	ror	$23, tmp0 # 41    # tmp = e ror 23
+	and	e_64, T1          # T1 = (f ^ g) & e
+	xor	e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor	g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add	WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	ror	$4, tmp0  # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2          # T2 = a
+	add	h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	$14, tmp0 # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
+	mov	a_64, tmp0        # tmp = a
+	xor	c_64, T2          # T2 = a ^ c
+	and	c_64, tmp0        # tmp = a & c
+	and	b_64, T2          # T2 = (a ^ c) & b
+	xor	tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0        # tmp = a
+	ror	$5, tmp0 # 39     # tmp = a ror 5
+	xor	a_64, tmp0        # tmp = (a ror 5) ^ a
+	add	T1, d_64          # e(next_state) = d + T1
+	ror	$6, tmp0 # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	ror	$28, tmp0 # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+	#   Integer instructions belong to the rounds calculation and vectored
+	# instructions to the message scheduler. (An earlier layout is said to
+	# have told them apart by indentation; in this copy both appear to use
+	# a single tab.)
+	#   The rotates of s0/s1 are built from chained shifts: the right-shift
+	# chain (XMM0, XMM3) and the left-shift chain (XMM1, XMM4) are XORed
+	# together at the end.
+	#   rnd is pasted textually into the idx expressions and into W_t and
+	# K_t, so pass a single symbol or number (the caller passes t).
+
+	mov	f_64, T1
+	idx = \rnd -2
+	movdqa	W_t(idx), %xmm2		    # XMM2 = W[t-2]
+	xor	g_64, T1
+	and	e_64, T1
+	movdqa	%xmm2, %xmm0	            # XMM0 = W[t-2]
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	idx = \rnd - 15
+	movdqu	W_t(idx), %xmm5		    # XMM5 = W[t-15]
+	mov	e_64, tmp0
+	ror	$23, tmp0 # 41
+	movdqa	%xmm5, %xmm3	            # XMM3 = W[t-15]
+	xor	e_64, tmp0
+	ror	$4, tmp0 # 18
+	psrlq	$61-19, %xmm0		    # XMM0 = W[t-2] >> 42
+	xor	e_64, tmp0
+	ror	$14, tmp0 # 14
+	psrlq	$(8-7), %xmm3		    # XMM3 = W[t-15] >> 1
+	add	tmp0, T1
+	add	h_64, T1
+	pxor	%xmm2, %xmm0                # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	a_64, T2
+	xor	c_64, T2
+	pxor	%xmm5, %xmm3                # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	b_64, T2
+	mov	a_64, tmp0
+	psrlq	$(19-6), %xmm0		    # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	c_64, tmp0
+	xor	tmp0, T2
+	psrlq	$(7-1), %xmm3		    # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	a_64, tmp0
+	ror	$5, tmp0 # 39
+	pxor	%xmm2, %xmm0	            # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	a_64, tmp0
+	ror	$6, tmp0 # 34
+	pxor	%xmm5, %xmm3                # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	a_64, tmp0
+	ror	$28, tmp0 # 28
+	psrlq	$6, %xmm0                   # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	tmp0, T2
+	add	T1, d_64
+	psrlq	$1, %xmm3                   # XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
+	lea	(T1, T2), h_64
+	RotateState
+	movdqa	%xmm2, %xmm1	            # XMM1 = W[t-2]
+	mov	f_64, T1
+	xor	g_64, T1
+	movdqa	%xmm5, %xmm4		    # XMM4 = W[t-15]
+	and	e_64, T1
+	xor	g_64, T1
+	psllq	$(64-19)-(64-61) , %xmm1    # XMM1 = W[t-2] << 42
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	mov	e_64, tmp0
+	psllq	$(64-1)-(64-8), %xmm4	    # XMM4 = W[t-15] << 7
+	ror	$23, tmp0 # 41
+	xor	e_64, tmp0
+	pxor	%xmm2, %xmm1		    # XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	$4, tmp0 # 18
+	xor	e_64, tmp0
+	pxor	%xmm5, %xmm4		    # XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	$14, tmp0 # 14
+	add	tmp0, T1
+	psllq	$(64-61), %xmm1		    # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	h_64, T1
+	mov	a_64, T2
+	psllq	$(64-8), %xmm4		    # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	c_64, T2
+	and	b_64, T2
+	pxor	%xmm1, %xmm0		    # XMM0 = s1(W[t-2])
+	mov	a_64, tmp0
+	and	c_64, tmp0
+	idx = \rnd - 7
+	movdqu	W_t(idx), %xmm1		    # XMM1 = W[t-7]
+	xor	tmp0, T2
+	pxor	%xmm4, %xmm3                # XMM3 = s0(W[t-15])
+	mov	a_64, tmp0
+	paddq	%xmm3, %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	$5, tmp0 # 39
+	idx =\rnd-16
+	paddq	W_t(idx), %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	a_64, tmp0
+	paddq	%xmm1, %xmm0	            # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	$6, tmp0 # 34
+	movdqa	%xmm0, W_t(\rnd)	    # Store scheduled qwords
+	xor	a_64, tmp0
+	paddq	K_t(\rnd), %xmm0	    # Compute W[t]+K[t]
+	ror	$28, tmp0 # 28
+	idx = \rnd
+	movdqa	%xmm0, WK_2(idx)	    # Store W[t]+K[t] for next rounds
+	add	tmp0, T2
+	add	T1, d_64
+	lea	(T1, T2), h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+#   message blocks.
+# L is the message length in SHA512 blocks.
+# Returns immediately, without touching the stack or D, when L is zero.
+########################################################################
+ENTRY(sha512_transform_ssse3)
+
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	# The caller %rsp is kept in %rax, the frame is carved out and %rsp is
+	# rounded down to a 32-byte boundary, which keeps the movdqa accesses
+	# to W_t and WK_2 aligned. The old %rsp is parked in the frame.
+	mov	%rsp, %rax
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	# (the registers bound to h_64, d_64, e_64, f_64 and g_64; restored
+	# from the frame before returning)
+	mov	%rbx, frame_GPRSAVE(%rsp)
+	mov	%r12, frame_GPRSAVE +8*1(%rsp)
+	mov	%r13, frame_GPRSAVE +8*2(%rsp)
+	mov	%r14, frame_GPRSAVE +8*3(%rsp)
+	mov	%r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+# Load state variables
+	mov	DIGEST(0), a_64
+	mov	DIGEST(1), b_64
+	mov	DIGEST(2), c_64
+	mov	DIGEST(3), d_64
+	mov	DIGEST(4), e_64
+	mov	DIGEST(5), f_64
+	mov	DIGEST(6), g_64
+	mov	DIGEST(7), h_64
+
+	# The block below is fully unrolled at assembly time: t is an
+	# assembler symbol stepped by 2, not a run-time loop counter.
+	# 80 rounds mean 80 RotateState calls, a multiple of 8, so a_64..h_64
+	# are bound to their initial registers again when the loop ends.
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			# %xmm1 keeps the byte-swap mask for the t < 16 iterations
+			# below, which do not reload it; SHA512_Round uses no XMM
+			# register.
+			movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			movdqa	%xmm0, WK_2(t)	# Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS, compute 2 rounds
+			# WK_2 is overwritten only after round t-1 has consumed the
+			# previous pair.
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			SHA512_Round t-2	# Round t-2
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			SHA512_Round t-1	# Round t-1
+			movdqa	%xmm0, WK_2(t)	# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS, compute 2 rounds
+			SHA512_2Sched_2Round_sse t
+		.else
+			# Compute 2 Rounds
+			# (t = 80: nothing is left to schedule, only rounds 78 and 79)
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add	a_64, DIGEST(0)
+	add	b_64, DIGEST(1)
+	add	c_64, DIGEST(2)
+	add	d_64, DIGEST(3)
+	add	e_64, DIGEST(4)
+	add	f_64, DIGEST(5)
+	add	g_64, DIGEST(6)
+	add	h_64, DIGEST(7)
+
+	# Advance to next message block
+	# (one block is 16 qwords = 128 bytes)
+	add	$16*8, msg
+	dec	msglen
+	jnz	updateblock
+
+	# Restore GPRs
+	mov	frame_GPRSAVE(%rsp),      %rbx
+	mov	frame_GPRSAVE +8*1(%rsp), %r12
+	mov	frame_GPRSAVE +8*2(%rsp), %r13
+	mov	frame_GPRSAVE +8*3(%rsp), %r14
+	mov	frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif
--- a/pkg/crypto/sha512/sha512.go
+++ b/pkg/crypto/sha512/sha512.go
@@ -0,0 +1,41 @@
+// +build darwin windows 386 arm !cgo
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha512
+
+import (
+	"hash"
+
+	"crypto/sha512"
+)
+
+// Size - The size of a SHA512 checksum in bytes.
+const Size = 64
+
+// BlockSize - The blocksize of SHA512 in bytes.
+const BlockSize = 128
+
+// New returns a new hash.Hash computing the SHA512 checksum. This build
+// variant simply delegates to the standard library implementation.
+func New() hash.Hash {
+	return sha512.New()
+}
+
+// Sum512 returns the SHA512 checksum of data in a single call. This build
+// variant simply delegates to the standard library implementation.
+func Sum512(data []byte) [Size]byte {
+	return sha512.Sum512(data)
+}
--- a/pkg/crypto/sha512/sha512_linux.go
+++ b/pkg/crypto/sha512/sha512_linux.go
@@ -0,0 +1,166 @@
+// +build linux,amd64,cgo
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// Package sha512 implements the SHA512 hash algorithms as defined
+// in FIPS 180-2.
+package sha512
+
+import (
+	"hash"
+
+	"github.com/minio/minio/pkg/cpu"
+)
+
+// Size - The size of a SHA512 checksum in bytes.
+const Size = 64
+
+// BlockSize - The blocksize of SHA512 in bytes.
+const BlockSize = 128
+
+const (
+	// chunk is the number of message bytes buffered in digest.x before a
+	// block is hashed; Write only hands whole multiples of it to block.
+	chunk = 128
+	// init0..init7 are the eight initial hash words that Reset loads
+	// into digest.h.
+	init0 = 0x6a09e667f3bcc908
+	init1 = 0xbb67ae8584caa73b
+	init2 = 0x3c6ef372fe94f82b
+	init3 = 0xa54ff53a5f1d36f1
+	init4 = 0x510e527fade682d1
+	init5 = 0x9b05688c2b3e6c1f
+	init6 = 0x1f83d9abfb41bd6b
+	init7 = 0x5be0cd19137e2179
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	h   [8]uint64   // current hash state words
+	x   [chunk]byte // bytes written but not yet hashed (a partial block)
+	nx  int         // number of valid bytes in x
+	len uint64      // total number of bytes written so far
+}
+
+// block hashes p into dig using the fastest implementation the running CPU
+// supports, tried in the order AVX2, AVX, SSE4.1, and falling back to the
+// generic implementation when none of them is available. Callers in this
+// file always pass a whole number of chunk-sized blocks.
+func block(dig *digest, p []byte) {
+	switch {
+	case cpu.HasAVX2():
+		blockAVX2(dig, p)
+	case cpu.HasAVX():
+		blockAVX(dig, p)
+	case cpu.HasSSE41():
+		blockSSE(dig, p)
+	default:
+		blockGeneric(dig, p)
+	}
+}
+
+// Reset returns the digest to its initial state: the eight initial hash
+// words are reloaded and the buffered-byte and total-length counters are
+// cleared.
+func (d *digest) Reset() {
+	d.h = [8]uint64{init0, init1, init2, init3, init4, init5, init6, init7}
+	d.nx, d.len = 0, 0
+}
+
+// New returns a new hash.Hash computing the SHA512 checksum, already reset
+// to the initial state.
+func New() hash.Hash {
+	d := &digest{}
+	d.Reset()
+	return d
+}
+
+// Sum512 - single caller sha512 helper
+func Sum512(data []byte) [Size]byte {
+	var d digest
+	d.Reset()
+	d.Write(data)
+	return d.checkSum()
+}
+
+// Size returns the number of bytes Sum appends, i.e. the package constant Size.
+func (d *digest) Size() int { return Size }
+
+// BlockSize returns the hash block size in bytes, i.e. the package constant
+// BlockSize.
+func (d *digest) BlockSize() int { return BlockSize }
+
+// Write absorbs p into the running hash. Complete chunk-sized blocks are
+// hashed right away; any trailing partial block is kept in d.x until more
+// data arrives. It always reports len(p) bytes written and a nil error.
+func (d *digest) Write(p []byte) (nn int, err error) {
+	nn = len(p)
+	d.len += uint64(nn)
+
+	// Top up a previously buffered partial block first.
+	if d.nx > 0 {
+		copied := copy(d.x[d.nx:], p)
+		d.nx += copied
+		p = p[copied:]
+		if d.nx == chunk {
+			block(d, d.x[:])
+			d.nx = 0
+		}
+	}
+
+	// Hash every whole block straight from the caller's slice.
+	if whole := len(p) &^ (chunk - 1); whole > 0 {
+		block(d, p[:whole])
+		p = p[whole:]
+	}
+
+	// Stash whatever is left for the next call.
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p)
+	}
+	return nn, err
+}
+
+// Sum appends the SHA512 checksum of everything written so far to in and
+// returns the resulting slice. The receiver is left untouched: finalisation
+// runs on a copy, so the caller can keep writing and summing.
+func (d *digest) Sum(in []byte) []byte {
+	snapshot := *d
+	sum := snapshot.checkSum()
+	return append(in, sum[:]...)
+}
+
+// checkSum finalises the hash and returns it as a [Size]byte array.
+// It appends the padding (a single 0x80 byte, then zero bytes up to
+// 112 bytes mod 128) followed by the message length in bits as a 16-byte
+// big-endian integer, and then serialises the eight state words in
+// big-endian order. The receiver is modified, so callers that want to keep
+// hashing must run it on a copy (as Sum does).
+func (d *digest) checkSum() [Size]byte {
+	// Capture the length before the padding writes below change d.len.
+	msgLen := d.len
+
+	// Padding.  Add a 1 bit and 0 bits until 112 bytes mod 128.
+	var tmp [128]byte
+	tmp[0] = 0x80
+	if rem := msgLen % 128; rem < 112 {
+		d.Write(tmp[0 : 112-rem])
+	} else {
+		d.Write(tmp[0 : 128+112-rem])
+	}
+
+	// Length in bits, written as 16 big-endian bytes. Shift counts of 64
+	// or more yield zero, so the upper eight bytes come out as zero.
+	bits := msgLen << 3
+	for i := uint(0); i < 16; i++ {
+		tmp[i] = byte(bits >> (120 - 8*i))
+	}
+	d.Write(tmp[0:16])
+
+	// Padding plus length must end exactly on a block boundary.
+	if d.nx != 0 {
+		panic("d.nx != 0")
+	}
+
+	var out [Size]byte
+	for i, s := range d.h[:] {
+		out[i*8] = byte(s >> 56)
+		out[i*8+1] = byte(s >> 48)
+		out[i*8+2] = byte(s >> 40)
+		out[i*8+3] = byte(s >> 32)
+		out[i*8+4] = byte(s >> 24)
+		out[i*8+5] = byte(s >> 16)
+		out[i*8+6] = byte(s >> 8)
+		out[i*8+7] = byte(s)
+	}
+
+	return out
+}
--- a/pkg/crypto/sha512/sha512_test.go
+++ b/pkg/crypto/sha512/sha512_test.go
@@ -0,0 +1,141 @@
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+// Using this part of Minio codebase under the license
+// Apache License Version 2.0 with modifications
+
+// SHA512 hash algorithm.  See FIPS 180-2.
+
+package sha512
+
+import (
+	"fmt"
+	"io"
+	"testing"
+)
+
+type sha512Test struct { // one golden test vector
+	out string // expected checksum of in, as lowercase hex
+	in  string // message to hash
+}
+
+var golden = []sha512Test{ // {expected hex checksum, input} pairs verified by TestGolden
+	{"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", ""},
+	{"1f40fc92da241694750979ee6cf582f2d5d7d28e18335de05abc54d0560e0f5302860c652bf08d560252aa5e74210546f369fbbbce8c12cfc7957b2652fe9a75", "a"},
+	{"2d408a0717ec188158278a796c689044361dc6fdde28d6f04973b80896e1823975cdbf12eb63f9e0591328ee235d80e9b5bf1aa6a44f4617ff3caf6400eb172d", "ab"},
+	{"ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f", "abc"},
+	{"d8022f2060ad6efd297ab73dcc5355c9b214054b0d1776a136a669d26a7d3b14f73aa0d0ebff19ee333368f0164b6419a96da49e3e481753e7e96b716bdccb6f", "abcd"},
+	{"878ae65a92e86cac011a570d4c30a7eaec442b85ce8eca0c2952b5e3cc0628c2e79d889ad4d5c7c626986d452dd86374b6ffaa7cd8b67665bef2289a5c70b0a1", "abcde"},
+	{"e32ef19623e8ed9d267f657a81944b3d07adbb768518068e88435745564e8d4150a0a703be2a7d88b61e3d390c2bb97e2d4c311fdc69d6b1267f05f59aa920e7", "abcdef"},
+	{"d716a4188569b68ab1b6dfac178e570114cdf0ea3a1cc0e31486c3e41241bc6a76424e8c37ab26f096fc85ef9886c8cb634187f4fddff645fb099f1ff54c6b8c", "abcdefg"},
+	{"a3a8c81bc97c2560010d7389bc88aac974a104e0e2381220c6e084c4dccd1d2d17d4f86db31c2a851dc80e6681d74733c55dcd03dd96f6062cdda12a291ae6ce", "abcdefgh"},
+	{"f22d51d25292ca1d0f68f69aedc7897019308cc9db46efb75a03dd494fc7f126c010e8ade6a00a0c1a5f1b75d81e0ed5a93ce98dc9b833db7839247b1d9c24fe", "abcdefghi"},
+	{"ef6b97321f34b1fea2169a7db9e1960b471aa13302a988087357c520be957ca119c3ba68e6b4982c019ec89de3865ccf6a3cda1fe11e59f98d99f1502c8b9745", "abcdefghij"},
+	{"2210d99af9c8bdecda1b4beff822136753d8342505ddce37f1314e2cdbb488c6016bdaa9bd2ffa513dd5de2e4b50f031393d8ab61f773b0e0130d7381e0f8a1d", "Discard medicine more than two years old."},
+	{"a687a8985b4d8d0a24f115fe272255c6afaf3909225838546159c1ed685c211a203796ae8ecc4c81a5b6315919b3a64f10713da07e341fcdbb08541bf03066ce", "He who has a shady past knows that nice guys finish last."},
+	{"8ddb0392e818b7d585ab22769a50df660d9f6d559cca3afc5691b8ca91b8451374e42bcdabd64589ed7c91d85f626596228a5c8572677eb98bc6b624befb7af8", "I wouldn't marry him with a ten foot pole."},
+	{"26ed8f6ca7f8d44b6a8a54ae39640fa8ad5c673f70ee9ce074ba4ef0d483eea00bab2f61d8695d6b34df9c6c48ae36246362200ed820448bdc03a720366a87c6", "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
+	{"e5a14bf044be69615aade89afcf1ab0389d5fc302a884d403579d1386a2400c089b0dbb387ed0f463f9ee342f8244d5a38cfbc0e819da9529fbff78368c9a982", "The days of the digital watch are numbered.  -Tom Stoppard"},
+	{"420a1faa48919e14651bed45725abe0f7a58e0f099424c4e5a49194946e38b46c1f8034b18ef169b2e31050d1648e0b982386595f7df47da4b6fd18e55333015", "Nepal premier won't resign."},
+	{"d926a863beadb20134db07683535c72007b0e695045876254f341ddcccde132a908c5af57baa6a6a9c63e6649bba0c213dc05fadcf9abccea09f23dcfb637fbe", "For every action there is an equal and opposite government program."},
+	{"9a98dd9bb67d0da7bf83da5313dff4fd60a4bac0094f1b05633690ffa7f6d61de9a1d4f8617937d560833a9aaa9ccafe3fd24db418d0e728833545cadd3ad92d", "His money is twice tainted: 'taint yours and 'taint mine."},
+	{"d7fde2d2351efade52f4211d3746a0780a26eec3df9b2ed575368a8a1c09ec452402293a8ea4eceb5a4f60064ea29b13cdd86918cd7a4faf366160b009804107", "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
+	{"b0f35ffa2697359c33a56f5c0cf715c7aeed96da9905ca2698acadb08fbc9e669bf566b6bd5d61a3e86dc22999bcc9f2224e33d1d4f32a228cf9d0349e2db518", "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
+	{"3d2e5f91778c9e66f7e061293aaa8a8fc742dd3b2e4f483772464b1144189b49273e610e5cccd7a81a19ca1fa70f16b10f1a100a4d8c1372336be8484c64b311", "size:  a.out:  bad magic"},
+	{"b2f68ff58ac015efb1c94c908b0d8c2bf06f491e4de8e6302c49016f7f8a33eac3e959856c7fddbc464de618701338a4b46f76dbfaf9a1e5262b5f40639771c7", "The major problem is with sendmail.  -Mark Horton"},
+	{"d8c92db5fdf52cf8215e4df3b4909d29203ff4d00e9ad0b64a6a4e04dec5e74f62e7c35c7fb881bd5de95442123df8f57a489b0ae616bd326f84d10021121c57", "Give me a rock, paper and scissors and I will move the world.  CCFestoon"},
+	{"19a9f8dc0a233e464e8566ad3ca9b91e459a7b8c4780985b015776e1bf239a19bc233d0556343e2b0a9bc220900b4ebf4f8bdf89ff8efeaf79602d6849e6f72e", "If the enemy is within range, then so are you."},
+	{"00b4c41f307bde87301cdc5b5ab1ae9a592e8ecbb2021dd7bc4b34e2ace60741cc362560bec566ba35178595a91932b8d5357e2c9cec92d393b0fa7831852476", "It's well we cannot hear the screams/That we create in others' dreams."},
+	{"91eccc3d5375fd026e4d6787874b1dce201cecd8a27dbded5065728cb2d09c58a3d467bb1faf353bf7ba567e005245d5321b55bc344f7c07b91cb6f26c959be7", "You remind me of a TV show, but that's all right: I watch it anyway."},
+	{"fabbbe22180f1f137cfdc9556d2570e775d1ae02a597ded43a72a40f9b485d500043b7be128fb9fcd982b83159a0d99aa855a9e7cc4240c00dc01a9bdf8218d7", "C is as portable as Stonehedge!!"},
+	{"2ecdec235c1fa4fc2a154d8fba1dddb8a72a1ad73838b51d792331d143f8b96a9f6fcb0f34d7caa351fe6d88771c4f105040e0392f06e0621689d33b2f3ba92e", "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
+	{"7ad681f6f96f82f7abfa7ecc0334e8fa16d3dc1cdc45b60b7af43fe4075d2357c0c1d60e98350f1afb1f2fe7a4d7cd2ad55b88e458e06b73c40b437331f5dab4", "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction.  Lewis-Randall Rule"},
+	{"833f9248ab4a3b9e5131f745fda1ffd2dd435b30e965957e78291c7ab73605fd1912b0794e5c233ab0a12d205a39778d19b83515d6a47003f19cdee51d98c7e0", "How can you write a big system without C++?  -Paul Glick"},
+}
+
+func TestGolden(t *testing.T) { // checks Sum512 and the streaming hash against every golden vector
+	for _, g := range golden {
+		if got := fmt.Sprintf("%x", Sum512([]byte(g.in))); got != g.out {
+			t.Fatalf("Sum512 function: sha512(%s) = %s want %s", g.in, got, g.out)
+		}
+		c := New()
+		for pass := 0; pass < 3; pass++ {
+			if pass == 2 {
+				// Last pass: split the input and call Sum between the halves.
+				half := len(g.in) / 2
+				io.WriteString(c, g.in[:half])
+				c.Sum(nil)
+				io.WriteString(c, g.in[half:])
+			} else {
+				io.WriteString(c, g.in)
+			}
+			if got := fmt.Sprintf("%x", c.Sum(nil)); got != g.out {
+				t.Fatalf("sha512[%d](%s) = %s want %s", pass, g.in, got, g.out)
+			}
+			c.Reset()
+		}
+	}
+}
+
+// TestSize checks that a new hash reports Size as its checksum length.
+func TestSize(t *testing.T) {
+	if got := New().Size(); got != Size {
+		t.Errorf("Size = %d; want %d", got, Size)
+	}
+}
+
+// TestBlockSize checks that a new hash reports BlockSize as its block size.
+func TestBlockSize(t *testing.T) {
+	if got := New().BlockSize(); got != BlockSize {
+		t.Errorf("BlockSize = %d; want %d", got, BlockSize)
+	}
+}
+
+var bench = New()                 // hash instance shared by all benchmarks
+var buf = make([]byte, 1024*1024) // 1 MiB of zero bytes; benchmarks hash a prefix of it
+
+func benchmarkSize(b *testing.B, size int) { // hashes buf[:size] once per iteration
+	b.SetBytes(int64(size))
+	out := make([]byte, 0, bench.Size()) // room for one checksum, reused by every Sum call
+	for n := 0; n < b.N; n++ {
+		bench.Reset()
+		bench.Write(buf[:size])
+		bench.Sum(out)
+	}
+}
+
+func BenchmarkHash8Bytes(b *testing.B) {
+	benchmarkSize(b, 8) // hash an 8-byte input per iteration
+}
+
+func BenchmarkHash1K(b *testing.B) {
+	benchmarkSize(b, 1024) // hash a 1 KiB input per iteration
+}
+
+func BenchmarkHash8K(b *testing.B) {
+	benchmarkSize(b, 8192) // hash an 8 KiB input per iteration
+}
+
+func BenchmarkHash1M(b *testing.B) {
+	benchmarkSize(b, 1024*1024) // hash the whole 1 MiB buffer per iteration
+}
--- a/pkg/crypto/sha512/sha512block.go
+++ b/pkg/crypto/sha512/sha512block.go
@@ -0,0 +1,181 @@
+// +build linux,amd64,cgo
+
+/*
+ * Minio Cloud Storage, (C) 2014-2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// The software block transform is provided by The Go Authors:
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file of
+// Golang project:
+//    https://github.com/golang/go/blob/master/LICENSE
+
+package sha512
+
+// #cgo CFLAGS: -DHAS_SSE41 -DHAS_AVX -DHAS_AVX2
+// #include <stdint.h>
+// void sha512_transform_ssse3 (const void* M, void* D, uint64_t L);
+// void sha512_transform_avx (const void* M, void* D, uint64_t L);
+// void sha512_transform_rorx (const void* M, void* D, uint64_t L);
+import "C"
+import (
+	"unsafe"
+)
+
+func blockSSE(dig *digest, p []byte) { // calls C sha512_transform_ssse3 with p, dig.h and the block count len(p)/chunk; p must be non-empty because &p[0] is taken
+	C.sha512_transform_ssse3(unsafe.Pointer(&p[0]), unsafe.Pointer(&dig.h[0]), (C.uint64_t)(len(p)/chunk))
+}
+
+func blockAVX(dig *digest, p []byte) { // calls C sha512_transform_avx with p, dig.h and the block count len(p)/chunk; p must be non-empty because &p[0] is taken
+	C.sha512_transform_avx(unsafe.Pointer(&p[0]), unsafe.Pointer(&dig.h[0]), (C.uint64_t)(len(p)/chunk))
+}
+
+func blockAVX2(dig *digest, p []byte) { // calls C sha512_transform_rorx with p, dig.h and the block count len(p)/chunk; p must be non-empty because &p[0] is taken
+	C.sha512_transform_rorx(unsafe.Pointer(&p[0]), unsafe.Pointer(&dig.h[0]), (C.uint64_t)(len(p)/chunk))
+}
+
+func blockGeneric(dig *digest, p []byte) { // pure-Go block transform: updates dig.h from each whole chunk of p; a trailing partial chunk is ignored
+	var w [80]uint64
+	h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
+	for len(p) >= chunk {
+		for i := 0; i < 16; i++ { // load 16 big-endian 64-bit words from p[0:128]; NOTE(review): chunk is presumably 128 — confirm
+			j := i * 8
+			w[i] = uint64(p[j])<<56 | uint64(p[j+1])<<48 | uint64(p[j+2])<<40 | uint64(p[j+3])<<32 |
+				uint64(p[j+4])<<24 | uint64(p[j+5])<<16 | uint64(p[j+6])<<8 | uint64(p[j+7])
+		}
+		for i := 16; i < 80; i++ { // derive words 16..79 from earlier words using rotates and shifts
+			v1 := w[i-2]
+			t1 := (v1>>19 | v1<<(64-19)) ^ (v1>>61 | v1<<(64-61)) ^ (v1 >> 6)
+			v2 := w[i-15]
+			t2 := (v2>>1 | v2<<(64-1)) ^ (v2>>8 | v2<<(64-8)) ^ (v2 >> 7)
+
+			w[i] = t1 + w[i-7] + t2 + w[i-16]
+		}
+
+		a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7 // working variables start from the current state
+
+		for i := 0; i < 80; i++ { // 80 rounds, each mixing in _K[i] and w[i]
+			t1 := h + ((e>>14 | e<<(64-14)) ^ (e>>18 | e<<(64-18)) ^ (e>>41 | e<<(64-41))) + ((e & f) ^ (^e & g)) + _K[i] + w[i]
+
+			t2 := ((a>>28 | a<<(64-28)) ^ (a>>34 | a<<(64-34)) ^ (a>>39 | a<<(64-39))) + ((a & b) ^ (a & c) ^ (b & c))
+
+			h = g
+			g = f
+			f = e
+			e = d + t1
+			d = c
+			c = b
+			b = a
+			a = t1 + t2
+		}
+
+		h0 += a // add the round results back into the state (uint64 addition wraps)
+		h1 += b
+		h2 += c
+		h3 += d
+		h4 += e
+		h5 += f
+		h6 += g
+		h7 += h
+
+		p = p[chunk:] // advance to the next chunk
+	}
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h0, h1, h2, h3, h4, h5, h6, h7 // store the updated state
+}
+
+var _K = []uint64{ // 80 round constants; blockGeneric adds _K[i] in round i
+	0x428a2f98d728ae22,
+	0x7137449123ef65cd,
+	0xb5c0fbcfec4d3b2f,
+	0xe9b5dba58189dbbc,
+	0x3956c25bf348b538,
+	0x59f111f1b605d019,
+	0x923f82a4af194f9b,
+	0xab1c5ed5da6d8118,
+	0xd807aa98a3030242,
+	0x12835b0145706fbe,
+	0x243185be4ee4b28c,
+	0x550c7dc3d5ffb4e2,
+	0x72be5d74f27b896f,
+	0x80deb1fe3b1696b1,
+	0x9bdc06a725c71235,
+	0xc19bf174cf692694,
+	0xe49b69c19ef14ad2,
+	0xefbe4786384f25e3,
+	0x0fc19dc68b8cd5b5,
+	0x240ca1cc77ac9c65,
+	0x2de92c6f592b0275,
+	0x4a7484aa6ea6e483,
+	0x5cb0a9dcbd41fbd4,
+	0x76f988da831153b5,
+	0x983e5152ee66dfab,
+	0xa831c66d2db43210,
+	0xb00327c898fb213f,
+	0xbf597fc7beef0ee4,
+	0xc6e00bf33da88fc2,
+	0xd5a79147930aa725,
+	0x06ca6351e003826f,
+	0x142929670a0e6e70,
+	0x27b70a8546d22ffc,
+	0x2e1b21385c26c926,
+	0x4d2c6dfc5ac42aed,
+	0x53380d139d95b3df,
+	0x650a73548baf63de,
+	0x766a0abb3c77b2a8,
+	0x81c2c92e47edaee6,
+	0x92722c851482353b,
+	0xa2bfe8a14cf10364,
+	0xa81a664bbc423001,
+	0xc24b8b70d0f89791,
+	0xc76c51a30654be30,
+	0xd192e819d6ef5218,
+	0xd69906245565a910,
+	0xf40e35855771202a,
+	0x106aa07032bbd1b8,
+	0x19a4c116b8d2d0c8,
+	0x1e376c085141ab53,
+	0x2748774cdf8eeb99,
+	0x34b0bcb5e19b48a8,
+	0x391c0cb3c5c95a63,
+	0x4ed8aa4ae3418acb,
+	0x5b9cca4f7763e373,
+	0x682e6ff3d6b2b8a3,
+	0x748f82ee5defb2fc,
+	0x78a5636f43172f60,
+	0x84c87814a1f0ab72,
+	0x8cc702081a6439ec,
+	0x90befffa23631e28,
+	0xa4506cebde82bde9,
+	0xbef9a3f7b2c67915,
+	0xc67178f2e372532b,
+	0xca273eceea26619c,
+	0xd186b8c721c0c207,
+	0xeada7dd6cde0eb1e,
+	0xf57d4f7fee6ed178,
+	0x06f067aa72176fba,
+	0x0a637dc5a2c898a6,
+	0x113f9804bef90dae,
+	0x1b710b35131c471b,
+	0x28db77f523047d84,
+	0x32caab7b40c72493,
+	0x3c9ebe0a15c9bebc,
+	0x431d67c49c100d4c,
+	0x4cc5d4becb3e42b6,
+	0x597f299cfc657e2a,
+	0x5fcb6fab3ad6faec,
+	0x6c44198c4a475817,
+}