diff --git a/appveyor.yml b/appveyor.yml
index c6ecd38ba..e88d6ed1c 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -40,17 +40,19 @@ test_script:
   - mkdir build\coverage
   - go test -v -timeout 17m -race github.com/minio/minio/cmd...
   - go test -v -race github.com/minio/minio/pkg...
-  - go test -v -timeout 17m -coverprofile=build\coverage\coverage.txt -covermode=atomic github.com/minio/minio/cmd
+  # FIXME(aead): enable codecov after issue https://github.com/golang/go/issues/18468 is solved.
+  # - go test -v -timeout 17m -coverprofile=build\coverage\coverage.txt -covermode=atomic github.com/minio/minio/cmd
   - ps: Update-AppveyorTest "Unit Tests" -Outcome Passed
 
 after_test:
-  - go tool cover -html=build\coverage\coverage.txt -o build\coverage\coverage.html
-  - ps: Push-AppveyorArtifact build\coverage\coverage.txt
-  - ps: Push-AppveyorArtifact build\coverage\coverage.html
+  # FIXME(aead): enable codecov after issue https://github.com/golang/go/issues/18468 is solved.
+  # - go tool cover -html=build\coverage\coverage.txt -o build\coverage\coverage.html
+  # - ps: Push-AppveyorArtifact build\coverage\coverage.txt
+  # - ps: Push-AppveyorArtifact build\coverage\coverage.html
   # Upload coverage report.
-  - "SET PATH=C:\\Python34;C:\\Python34\\Scripts;%PATH%"
-  - pip install codecov
-  - codecov -X gcov -f "build\coverage\coverage.txt"
+  # - "SET PATH=C:\\Python34;C:\\Python34\\Scripts;%PATH%"
+  # - pip install codecov
+  # - codecov -X gcov -f "build\coverage\coverage.txt"
 
 # to disable deployment
 deploy: off
diff --git a/cmd/api-errors.go b/cmd/api-errors.go
index 339545afb..e3bb31d52 100644
--- a/cmd/api-errors.go
+++ b/cmd/api-errors.go
@@ -122,6 +122,17 @@ const (
 	ErrUnsupportedMetadata
 	// Add new error codes here.
 
+	// Server-Side-Encryption (with Customer provided key) related API errors.
+
+	ErrInsecureSSECustomerRequest
+	ErrSSEEncryptedObject
+	ErrInvalidEncryptionParameters
+	ErrInvalidSSECustomerAlgorithm
+	ErrInvalidSSECustomerKey
+	ErrMissingSSECustomerKey
+	ErrMissingSSECustomerKeyMD5
+	ErrSSECustomerKeyMD5Mismatch
+
 	// Bucket notification related errors.
 	ErrEventNotification
 	ErrARNNotification
@@ -159,6 +170,7 @@ const (
 	ErrAdminConfigNoQuorum
 	ErrAdminCredentialsMismatch
 	ErrInsecureClientRequest
+	ErrObjectTampered
 )
 
 // error code to APIError structure, these fields carry respective
@@ -574,6 +586,51 @@ var errorCodeResponse = map[APIErrorCode]APIError{
 		Description:    "Range specified is not valid for source object",
 		HTTPStatusCode: http.StatusBadRequest,
 	},
+	ErrMetadataTooLarge: {
+		Code:           "InvalidArgument",
+		Description:    "Your metadata headers exceed the maximum allowed metadata size.",
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrInsecureSSECustomerRequest: {
+		Code:           "InvalidRequest",
+		Description:    errInsecureSSERequest.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrSSEEncryptedObject: {
+		Code:           "InvalidRequest",
+		Description:    errEncryptedObject.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrInvalidEncryptionParameters: {
+		Code:           "InvalidRequest",
+		Description:    "The encryption parameters are not applicable to this object.",
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrInvalidSSECustomerAlgorithm: {
+		Code:           "InvalidArgument",
+		Description:    errInvalidSSEAlgorithm.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrInvalidSSECustomerKey: {
+		Code:           "InvalidArgument",
+		Description:    errInvalidSSEKey.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrMissingSSECustomerKey: {
+		Code:           "InvalidArgument",
+		Description:    errMissingSSEKey.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrMissingSSECustomerKeyMD5: {
+		Code:           "InvalidArgument",
+		Description:    errMissingSSEKeyMD5.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
+	ErrSSECustomerKeyMD5Mismatch: {
+		Code:           "InvalidArgument",
+		Description:    errSSEKeyMD5Mismatch.Error(),
+		HTTPStatusCode: http.StatusBadRequest,
+	},
 
 	/// S3 extensions.
 	ErrContentSHA256Mismatch: {
@@ -653,11 +710,6 @@ var errorCodeResponse = map[APIErrorCode]APIError{
 		Description:    "A timeout occurred while trying to lock a resource",
 		HTTPStatusCode: http.StatusRequestTimeout,
 	},
-	ErrMetadataTooLarge: {
-		Code:           "InvalidArgument",
-		Description:    "Your metadata headers exceed the maximum allowed metadata size.",
-		HTTPStatusCode: http.StatusBadRequest,
-	},
 	ErrUnsupportedMetadata: {
 		Code:           "InvalidArgument",
 		Description:    "Your metadata headers are not supported.",
@@ -668,6 +720,11 @@ var errorCodeResponse = map[APIErrorCode]APIError{
 		Description:    "All parts except the last part should be of the same size.",
 		HTTPStatusCode: http.StatusBadRequest,
 	},
+	ErrObjectTampered: {
+		Code:           "XMinioObjectTampered",
+		Description:    errObjectTampered.Error(),
+		HTTPStatusCode: http.StatusPartialContent,
+	},
 	// Add your error structure here.
 }
 
@@ -699,6 +756,27 @@ func toAPIErrorCode(err error) (apiErr APIErrorCode) {
 		return apiErr
 	}
 
+	switch err { // SSE errors
+	case errInsecureSSERequest:
+		return ErrInsecureSSECustomerRequest
+	case errInvalidSSEAlgorithm:
+		return ErrInvalidSSECustomerAlgorithm
+	case errInvalidSSEKey:
+		return ErrInvalidSSECustomerKey
+	case errMissingSSEKey:
+		return ErrMissingSSECustomerKey
+	case errMissingSSEKeyMD5:
+		return ErrMissingSSECustomerKeyMD5
+	case errSSEKeyMD5Mismatch:
+		return ErrSSECustomerKeyMD5Mismatch
+	case errObjectTampered:
+		return ErrObjectTampered
+	case errEncryptedObject:
+		return ErrSSEEncryptedObject
+	case errSSEKeyMismatch:
+		return ErrAccessDenied // no access without correct key
+	}
+
 	switch err.(type) {
 	case StorageFull:
 		apiErr = ErrStorageFull
diff --git a/cmd/api-errors_test.go b/cmd/api-errors_test.go
index 88a73d7aa..f92395126 100644
--- a/cmd/api-errors_test.go
+++ b/cmd/api-errors_test.go
@@ -23,116 +23,48 @@ import (
 	"github.com/minio/minio/pkg/hash"
 )
 
+var toAPIErrorCodeTests = []struct {
+	err     error
+	errCode APIErrorCode
+}{
+	{err: hash.BadDigest{}, errCode: ErrBadDigest},
+	{err: hash.SHA256Mismatch{}, errCode: ErrContentSHA256Mismatch},
+	{err: IncompleteBody{}, errCode: ErrIncompleteBody},
+	{err: ObjectExistsAsDirectory{}, errCode: ErrObjectExistsAsDirectory},
+	{err: BucketNameInvalid{}, errCode: ErrInvalidBucketName},
+	{err: BucketExists{}, errCode: ErrBucketAlreadyOwnedByYou},
+	{err: ObjectNotFound{}, errCode: ErrNoSuchKey},
+	{err: ObjectNameInvalid{}, errCode: ErrInvalidObjectName},
+	{err: InvalidUploadID{}, errCode: ErrNoSuchUpload},
+	{err: InvalidPart{}, errCode: ErrInvalidPart},
+	{err: InsufficientReadQuorum{}, errCode: ErrReadQuorum},
+	{err: InsufficientWriteQuorum{}, errCode: ErrWriteQuorum},
+	{err: UnsupportedDelimiter{}, errCode: ErrNotImplemented},
+	{err: InvalidMarkerPrefixCombination{}, errCode: ErrNotImplemented},
+	{err: InvalidUploadIDKeyCombination{}, errCode: ErrNotImplemented},
+	{err: MalformedUploadID{}, errCode: ErrNoSuchUpload},
+	{err: PartTooSmall{}, errCode: ErrEntityTooSmall},
+	{err: BucketNotEmpty{}, errCode: ErrBucketNotEmpty},
+	{err: BucketNotFound{}, errCode: ErrNoSuchBucket},
+	{err: StorageFull{}, errCode: ErrStorageFull},
+	{err: NotImplemented{}, errCode: ErrNotImplemented},
+	{err: errSignatureMismatch, errCode: ErrSignatureDoesNotMatch},
+
+	// SSE-C errors
+	{err: errInsecureSSERequest, errCode: ErrInsecureSSECustomerRequest},
+	{err: errInvalidSSEAlgorithm, errCode: ErrInvalidSSECustomerAlgorithm},
+	{err: errMissingSSEKey, errCode: ErrMissingSSECustomerKey},
+	{err: errInvalidSSEKey, errCode: ErrInvalidSSECustomerKey},
+	{err: errMissingSSEKeyMD5, errCode: ErrMissingSSECustomerKeyMD5},
+	{err: errSSEKeyMD5Mismatch, errCode: ErrSSECustomerKeyMD5Mismatch},
+	{err: errObjectTampered, errCode: ErrObjectTampered},
+
+	{err: nil, errCode: ErrNone},
+	{err: errors.New("Custom error"), errCode: ErrInternalError}, // Case where err type is unknown.
+}
+
 func TestAPIErrCode(t *testing.T) {
-	testCases := []struct {
-		err     error
-		errCode APIErrorCode
-	}{
-		// Valid cases.
-		{
-			hash.BadDigest{},
-			ErrBadDigest,
-		},
-		{
-			hash.SHA256Mismatch{},
-			ErrContentSHA256Mismatch,
-		},
-		{
-			IncompleteBody{},
-			ErrIncompleteBody,
-		},
-		{
-			ObjectExistsAsDirectory{},
-			ErrObjectExistsAsDirectory,
-		},
-		{
-			BucketNameInvalid{},
-			ErrInvalidBucketName,
-		},
-		{
-			BucketExists{},
-			ErrBucketAlreadyOwnedByYou,
-		},
-		{
-			ObjectNotFound{},
-			ErrNoSuchKey,
-		},
-		{
-			ObjectNameInvalid{},
-			ErrInvalidObjectName,
-		},
-		{
-			InvalidUploadID{},
-			ErrNoSuchUpload,
-		},
-		{
-			InvalidPart{},
-			ErrInvalidPart,
-		},
-		{
-			InsufficientReadQuorum{},
-			ErrReadQuorum,
-		},
-		{
-			InsufficientWriteQuorum{},
-			ErrWriteQuorum,
-		},
-		{
-			UnsupportedDelimiter{},
-			ErrNotImplemented,
-		},
-		{
-			InvalidMarkerPrefixCombination{},
-			ErrNotImplemented,
-		},
-		{
-			InvalidUploadIDKeyCombination{},
-			ErrNotImplemented,
-		},
-		{
-			MalformedUploadID{},
-			ErrNoSuchUpload,
-		},
-		{
-			PartTooSmall{},
-			ErrEntityTooSmall,
-		},
-		{
-			BucketNotEmpty{},
-			ErrBucketNotEmpty,
-		},
-		{
-			BucketNotFound{},
-			ErrNoSuchBucket,
-		},
-		{
-			StorageFull{},
-			ErrStorageFull,
-		},
-		{
-			NotImplemented{},
-			ErrNotImplemented,
-		},
-		{
-			errSignatureMismatch,
-			ErrSignatureDoesNotMatch,
-		}, // End of all valid cases.
-
-		// Case where err is nil.
-		{
-			nil,
-			ErrNone,
-		},
-
-		// Case where err type is unknown.
-		{
-			errors.New("Custom error"),
-			ErrInternalError,
-		},
-	}
-
-	// Validate all the errors with their API error codes.
-	for i, testCase := range testCases {
+	for i, testCase := range toAPIErrorCodeTests {
 		errCode := toAPIErrorCode(testCase.err)
 		if errCode != testCase.errCode {
 			t.Errorf("Test %d: Expected error code %d, got %d", i+1, testCase.errCode, errCode)
diff --git a/cmd/encryption-v1.go b/cmd/encryption-v1.go
new file mode 100644
index 000000000..06156b1c9
--- /dev/null
+++ b/cmd/encryption-v1.go
@@ -0,0 +1,300 @@
+/*
+ * Minio Cloud Storage, (C) 2017 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cmd
+
+import (
+	"bytes"
+	"crypto/hmac"
+	"crypto/md5"
+	"crypto/rand"
+	"encoding/base64"
+	"errors"
+	"io"
+	"net/http"
+
+	sha256 "github.com/minio/sha256-simd"
+	"github.com/minio/sio"
+)
+
+var (
+	// AWS errors for invalid SSE-C requests.
+	errInsecureSSERequest  = errors.New("Requests specifying Server Side Encryption with Customer provided keys must be made over a secure connection")
+	errEncryptedObject     = errors.New("The object was stored using a form of Server Side Encryption. The correct parameters must be provided to retrieve the object")
+	errInvalidSSEAlgorithm = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide a valid encryption algorithm")
+	errMissingSSEKey       = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide an appropriate secret key")
+	errInvalidSSEKey       = errors.New("The secret key was invalid for the specified algorithm")
+	errMissingSSEKeyMD5    = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide the client calculated MD5 of the secret key")
+	errSSEKeyMD5Mismatch   = errors.New("The calculated MD5 hash of the key did not match the hash that was provided")
+	errSSEKeyMismatch      = errors.New("The client provided key does not match the key provided when the object was encrypted") // this msg is not shown to the client
+
+	// Additional Minio errors for SSE-C requests.
+	errObjectTampered = errors.New("The requested object was modified and may be compromised")
+)
+
+const (
+	// SSECustomerAlgorithm is the AWS SSE-C algorithm HTTP header key.
+	SSECustomerAlgorithm = "X-Amz-Server-Side-Encryption-Customer-Algorithm"
+	// SSECustomerKey is the AWS SSE-C encryption key HTTP header key.
+	SSECustomerKey = "X-Amz-Server-Side-Encryption-Customer-Key"
+	// SSECustomerKeyMD5 is the AWS SSE-C encryption key MD5 HTTP header key.
+	SSECustomerKeyMD5 = "X-Amz-Server-Side-Encryption-Customer-Key-MD5"
+)
+
+const (
+	// SSECustomerKeySize is the size of valid client provided encryption keys in bytes.
+	// Currently AWS supports only AES256. So the SSE-C key size is fixed to 32 bytes.
+	SSECustomerKeySize = 32
+
+	// SSECustomerAlgorithmAES256 the only valid S3 SSE-C encryption algorithm identifier.
+	SSECustomerAlgorithmAES256 = "AES256"
+)
+
+// SSE-C key derivation:
+// H: Hash function, M: MAC function
+//
+// key     := 32 bytes              # client provided key
+// r       := H(random(32 bytes))   # saved as object metadata [ServerSideEncryptionIV]
+// key_mac := M(H(key), r)          # saved as object metadata [ServerSideEncryptionKeyMAC]
+// enc_key := M(key, key_mac)
+//
+//
+// SSE-C key verification:
+// H: Hash function, M: MAC function
+//
+// key      := 32 bytes             # client provided key
+// r        := object metadata [ServerSideEncryptionIV]
+// key_mac  := object metadata [ServerSideEncryptionKeyMAC]
+// key_mac' := M(H(key), r)
+//
+// check:   key_mac != key_mac' => fail with invalid key
+//
+// enc_key  := M(key, key_mac')
+const (
+	// ServerSideEncryptionIV is a 32 byte randomly generated IV used to derive an
+	// unique encryption key from the client provided key. The combination of this value
+	// and the client-provided key must be unique to provide the DARE tamper-proof property.
+	ServerSideEncryptionIV = ReservedMetadataPrefix + "Server-Side-Encryption-Iv"
+
+	// ServerSideEncryptionKDF is the combination of a hash and MAC function used to derive
+	// the SSE-C encryption key from the user-provided key.
+	ServerSideEncryptionKDF = ReservedMetadataPrefix + "Server-Side-Encryption-Kdf"
+
+	// ServerSideEncryptionKeyMAC is the MAC of the hash of the client-provided key and the
+	// X-Minio-Server-Side-Encryption-Iv. This value must be used to verify that the client
+	// provided the correct key to follow S3 spec.
+	ServerSideEncryptionKeyMAC = ReservedMetadataPrefix + "Server-Side-Encryption-Key-Mac"
+)
+
+// SSEKeyDerivationHmacSha256 specifies SHA-256 as hash function and HMAC-SHA256 as MAC function
+// as the functions used to derive the SSE-C encryption keys from the client-provided key.
+const SSEKeyDerivationHmacSha256 = "HMAC-SHA256"
+
+// IsSSECustomerRequest returns true if the given HTTP header
+// contains server-side-encryption with customer provided key fields.
+func IsSSECustomerRequest(header http.Header) bool {
+	return header.Get(SSECustomerAlgorithm) != "" || header.Get(SSECustomerKey) != "" || header.Get(SSECustomerKeyMD5) != ""
+}
+
+// ParseSSECustomerRequest parses the SSE-C header fields of the provided request.
+// It returns the client provided key on success.
+func ParseSSECustomerRequest(r *http.Request) (key []byte, err error) {
+	if !globalIsSSL { // minio only supports HTTP or HTTPS requests not both at the same time
+		// we cannot use r.TLS == nil here because Go's http implementation reflects on
+		// the net.Conn and sets the TLS field of http.Request only if it's an tls.Conn.
+		// Minio uses a BufConn (wrapping a tls.Conn) so the type check within the http package
+		// will always fail -> r.TLS is always nil even for TLS requests.
+		return nil, errInsecureSSERequest
+	}
+	header := r.Header
+	if algorithm := header.Get(SSECustomerAlgorithm); algorithm != SSECustomerAlgorithmAES256 {
+		return nil, errInvalidSSEAlgorithm
+	}
+	if header.Get(SSECustomerKey) == "" {
+		return nil, errMissingSSEKey
+	}
+	if header.Get(SSECustomerKeyMD5) == "" {
+		return nil, errMissingSSEKeyMD5
+	}
+
+	key, err = base64.StdEncoding.DecodeString(header.Get(SSECustomerKey))
+	if err != nil {
+		return nil, errInvalidSSEKey
+	}
+	header.Del(SSECustomerKey) // make sure we do not save the key by accident
+
+	if len(key) != SSECustomerKeySize {
+		return nil, errInvalidSSEKey
+	}
+
+	keyMD5, err := base64.StdEncoding.DecodeString(header.Get(SSECustomerKeyMD5))
+	if err != nil {
+		return nil, errSSEKeyMD5Mismatch
+	}
+	if md5Sum := md5.Sum(key); !bytes.Equal(md5Sum[:], keyMD5) {
+		return nil, errSSEKeyMD5Mismatch
+	}
+	return key, nil
+}
+
+// EncryptRequest takes the client provided content and encrypts the data
+// with the client provided key. It also marks the object as client-side-encrypted
+// and sets the correct headers.
+func EncryptRequest(content io.Reader, r *http.Request, metadata map[string]string) (io.Reader, error) {
+	key, err := ParseSSECustomerRequest(r)
+	if err != nil {
+		return nil, err
+	}
+	delete(metadata, SSECustomerKey) // make sure we do not save the key by accident
+
+	// security notice:
+	// Reusing a tuple (nonce, client provided key) will produce the same encryption key
+	// twice and breaks the tamper-proof property. However objects are still confidential.
+	// Therefore the nonce must be unique but need not to be undistinguishable from true
+	// randomness.
+	nonce := make([]byte, 32) // generate random nonce to derive encryption key
+	if _, err = io.ReadFull(rand.Reader, nonce); err != nil {
+		return nil, err
+	}
+	iv := sha256.Sum256(nonce) // hash output to not reveal any stat. weaknesses of the PRNG
+
+	keyHash := sha256.Sum256(key) // derive MAC of the client-provided key
+	mac := hmac.New(sha256.New, keyHash[:])
+	mac.Write(iv[:])
+	keyMAC := mac.Sum(nil)
+
+	mac = hmac.New(sha256.New, key) // derive encryption key
+	mac.Write(keyMAC)
+	reader, err := sio.EncryptReader(content, sio.Config{Key: mac.Sum(nil)})
+	if err != nil {
+		return nil, errInvalidSSEKey
+	}
+
+	metadata[ServerSideEncryptionIV] = base64.StdEncoding.EncodeToString(iv[:])
+	metadata[ServerSideEncryptionKDF] = SSEKeyDerivationHmacSha256
+	metadata[ServerSideEncryptionKeyMAC] = base64.StdEncoding.EncodeToString(keyMAC)
+	return reader, nil
+}
+
+// DecryptRequest decrypts the object with the client provided key. It also removes
+// the client-side-encryption metadata from the object and sets the correct headers.
+func DecryptRequest(client io.Writer, r *http.Request, metadata map[string]string) (io.WriteCloser, error) {
+	key, err := ParseSSECustomerRequest(r)
+	if err != nil {
+		return nil, err
+	}
+	delete(metadata, SSECustomerKey) // make sure we do not save the key by accident
+
+	if metadata[ServerSideEncryptionKDF] != SSEKeyDerivationHmacSha256 { // currently HMAC-SHA256 is the only option
+		return nil, errObjectTampered
+	}
+	nonce, err := base64.StdEncoding.DecodeString(metadata[ServerSideEncryptionIV])
+	if err != nil || len(nonce) != 32 {
+		return nil, errObjectTampered
+	}
+	keyMAC, err := base64.StdEncoding.DecodeString(metadata[ServerSideEncryptionKeyMAC])
+	if err != nil || len(keyMAC) != 32 {
+		return nil, errObjectTampered
+	}
+
+	keyHash := sha256.Sum256(key) // verify that client provided correct key
+	mac := hmac.New(sha256.New, keyHash[:])
+	mac.Write(nonce)
+	if !hmac.Equal(keyMAC, mac.Sum(nil)) {
+		return nil, errSSEKeyMismatch // client-provided key is wrong or object metadata was modified
+	}
+
+	mac = hmac.New(sha256.New, key) // derive decryption key
+	mac.Write(keyMAC)
+	writer, err := sio.DecryptWriter(client, sio.Config{Key: mac.Sum(nil)})
+	if err != nil {
+		return nil, errInvalidSSEKey
+	}
+
+	delete(metadata, ServerSideEncryptionIV)
+	delete(metadata, ServerSideEncryptionKDF)
+	delete(metadata, ServerSideEncryptionKeyMAC)
+	return writer, nil
+}
+
+// IsEncrypted returns true if the object is marked as encrypted.
+func (o *ObjectInfo) IsEncrypted() bool {
+	if _, ok := o.UserDefined[ServerSideEncryptionIV]; ok {
+		return true
+	}
+	if _, ok := o.UserDefined[ServerSideEncryptionKDF]; ok {
+		return true
+	}
+	if _, ok := o.UserDefined[ServerSideEncryptionKeyMAC]; ok {
+		return true
+	}
+	return false
+}
+
+// DecryptedSize returns the size of the object after decryption in bytes.
+// It returns an error if the object is not encrypted or marked as encrypted
+// but has an invalid size.
+// DecryptedSize panics if the referred object is not encrypted.
+func (o *ObjectInfo) DecryptedSize() (int64, error) {
+	if !o.IsEncrypted() {
+		panic("cannot compute decrypted size of an object which is not encrypted")
+	}
+	if o.Size == 0 {
+		return o.Size, nil
+	}
+	size := (o.Size / (32 + 64*1024)) * (64 * 1024)
+	if mod := o.Size % (32 + 64*1024); mod > 0 {
+		if mod < 33 {
+			return -1, errObjectTampered // object is not 0 size but smaller than the smallest valid encrypted object
+		}
+		size += mod - 32
+	}
+	return size, nil
+}
+
+// EncryptedSize returns the size of the object after encryption.
+// An encrypted object is always larger than a plain object
+// except for zero size objects.
+func (o *ObjectInfo) EncryptedSize() int64 {
+	size := (o.Size / (64 * 1024)) * (32 + 64*1024)
+	if mod := o.Size % (64 * 1024); mod > 0 {
+		size += mod + 32
+	}
+	return size
+}
+
+// DecryptObjectInfo tries to decrypt the provided object if it is encrypted.
+// It fails if the object is encrypted and the HTTP headers don't contain
+// SSE-C headers or the object is not encrypted but SSE-C headers are provided. (AWS behavior)
+// DecryptObjectInfo returns 'ErrNone' if the object is not encrypted or the
+// decryption succeeded.
+//
+// DecryptObjectInfo also returns whether the object is encrypted or not.
+func DecryptObjectInfo(info *ObjectInfo, headers http.Header) (apiErr APIErrorCode, encrypted bool) {
+	if apiErr, encrypted = ErrNone, info.IsEncrypted(); !encrypted && IsSSECustomerRequest(headers) {
+		apiErr = ErrInvalidEncryptionParameters
+	} else if encrypted {
+		if !IsSSECustomerRequest(headers) {
+			apiErr = ErrSSEEncryptedObject
+			return
+		}
+		var err error
+		if info.Size, err = info.DecryptedSize(); err != nil {
+			apiErr = toAPIErrorCode(err)
+		}
+	}
+	return
+}
diff --git a/cmd/encryption-v1_test.go b/cmd/encryption-v1_test.go
new file mode 100644
index 000000000..a7824c7b5
--- /dev/null
+++ b/cmd/encryption-v1_test.go
@@ -0,0 +1,390 @@
+/*
+ * Minio Cloud Storage, (C) 2017 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package cmd
+
+import (
+	"bytes"
+	"net/http"
+	"testing"
+)
+
+var isSSECustomerRequestTests = []struct {
+	headers    map[string]string
+	sseRequest bool
+}{
+	{headers: map[string]string{SSECustomerAlgorithm: "AES256", SSECustomerKey: "key", SSECustomerKeyMD5: "md5"}, sseRequest: true},                    // 0
+	{headers: map[string]string{SSECustomerAlgorithm: "AES256"}, sseRequest: true},                                                                     // 1
+	{headers: map[string]string{SSECustomerKey: "key"}, sseRequest: true},                                                                              // 2
+	{headers: map[string]string{SSECustomerKeyMD5: "md5"}, sseRequest: true},                                                                           // 3
+	{headers: map[string]string{}, sseRequest: false},                                                                                                  // 4
+	{headers: map[string]string{SSECustomerAlgorithm + " ": "AES256", " " + SSECustomerKey: "key", SSECustomerKeyMD5 + " ": "md5"}, sseRequest: false}, // 5
+	{headers: map[string]string{SSECustomerAlgorithm: "", SSECustomerKey: "", SSECustomerKeyMD5: ""}, sseRequest: false},                               // 6
+}
+
+func TestIsSSECustomerRequest(t *testing.T) {
+	for i, test := range isSSECustomerRequestTests {
+		headers := http.Header{}
+		for k, v := range test.headers {
+			headers.Set(k, v)
+		}
+		if IsSSECustomerRequest(headers) != test.sseRequest {
+			t.Errorf("Test %d: Expected IsSSECustomerRequest to return %v", i, test.sseRequest)
+		}
+	}
+}
+
+var parseSSECustomerRequestTests = []struct {
+	headers map[string]string
+	useTLS  bool
+	err     error
+}{
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 0
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		useTLS: true, err: nil,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 1
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		useTLS: false, err: errInsecureSSERequest,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES 256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 2
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		useTLS: true, err: errInvalidSSEAlgorithm,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "NjE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 3
+			SSECustomerKeyMD5:    "H+jq/LwEOEO90YtiTuNFVw==",
+		},
+		useTLS: true, err: errSSEKeyMD5Mismatch,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       " jE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 4
+			SSECustomerKeyMD5:    "H+jq/LwEOEO90YtiTuNFVw==",
+		},
+		useTLS: true, err: errInvalidSSEKey,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "NjE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 5
+			SSECustomerKeyMD5:    " +jq/LwEOEO90YtiTuNFVw==",
+		},
+		useTLS: true, err: errSSEKeyMD5Mismatch,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "vFQ9ScFOF6Tu/BfzMS+rVMvlZGJHi5HmGJenJfrfKI45", // 6
+			SSECustomerKeyMD5:    "9KPgDdZNTHimuYCwnJTp5g==",
+		},
+		useTLS: true, err: errInvalidSSEKey,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "", // 7
+			SSECustomerKeyMD5:    "9KPgDdZNTHimuYCwnJTp5g==",
+		},
+		useTLS: true, err: errMissingSSEKey,
+	},
+	{
+		headers: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "vFQ9ScFOF6Tu/BfzMS+rVMvlZGJHi5HmGJenJfrfKI45", // 8
+			SSECustomerKeyMD5:    "",
+		},
+		useTLS: true, err: errMissingSSEKeyMD5,
+	},
+}
+
+func TestParseSSECustomerRequest(t *testing.T) {
+	defer func(flag bool) { globalIsSSL = flag }(globalIsSSL)
+	for i, test := range parseSSECustomerRequestTests {
+		headers := http.Header{}
+		for k, v := range test.headers {
+			headers.Set(k, v)
+		}
+		request := &http.Request{}
+		request.Header = headers
+		globalIsSSL = test.useTLS
+
+		_, err := ParseSSECustomerRequest(request)
+		if err != test.err {
+			t.Errorf("Test %d: Parse returned: %v want: %v", i, err, test.err)
+		}
+		key := request.Header.Get(SSECustomerKey)
+		if (err == nil || err == errSSEKeyMD5Mismatch) && key != "" {
+			t.Errorf("Test %d: Client key survived parsing - found key: %v", i, key)
+		}
+	}
+}
+
+var encryptedSizeTests = []struct {
+	size, encsize int64
+}{
+	{size: 0, encsize: 0},                                           // 0
+	{size: 1, encsize: 33},                                          // 1
+	{size: 1024, encsize: 1024 + 32},                                // 2
+	{size: 2 * 64 * 1024, encsize: 2 * (64*1024 + 32)},              // 3
+	{size: 100*64*1024 + 1, encsize: 100*(64*1024+32) + 33},         // 4
+	{size: 64*1024 + 1, encsize: (64*1024 + 32) + 33},               // 5
+	{size: 5 * 1024 * 1024 * 1024, encsize: 81920 * (64*1024 + 32)}, // 6
+}
+
+func TestEncryptedSize(t *testing.T) {
+	for i, test := range encryptedSizeTests {
+		objInfo := ObjectInfo{Size: test.size}
+		if size := objInfo.EncryptedSize(); test.encsize != size {
+			t.Errorf("Test %d: got encrypted size: #%d want: #%d", i, size, test.encsize)
+		}
+	}
+}
+
+var decryptSSECustomerObjectInfoTests = []struct {
+	encsize, size int64
+	err           error
+}{
+	{encsize: 0, size: 0, err: nil},                                           // 0
+	{encsize: 33, size: 1, err: nil},                                          // 1
+	{encsize: 1024 + 32, size: 1024, err: nil},                                // 2
+	{encsize: 2 * (64*1024 + 32), size: 2 * 64 * 1024, err: nil},              // 3
+	{encsize: 100*(64*1024+32) + 33, size: 100*64*1024 + 1, err: nil},         // 4
+	{encsize: (64*1024 + 32) + 33, size: 64*1024 + 1, err: nil},               // 5
+	{encsize: 81920 * (64*1024 + 32), size: 5 * 1024 * 1024 * 1024, err: nil}, // 6
+	{encsize: 0, size: 0, err: nil},                                           // 7
+	{encsize: 64*1024 + 32 + 31, size: 0, err: errObjectTampered},             // 8
+}
+
+func TestDecryptedSize(t *testing.T) {
+	for i, test := range decryptSSECustomerObjectInfoTests {
+		objInfo := ObjectInfo{Size: test.encsize}
+		objInfo.UserDefined = map[string]string{
+			ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256,
+		}
+
+		size, err := objInfo.DecryptedSize()
+		if err != test.err || (size != test.size && err == nil) {
+			t.Errorf("Test %d: decryption returned: %v want: %v", i, err, test.err)
+		}
+		if err == nil && size != test.size {
+			t.Errorf("Test %d: got decrypted size: #%d want: #%d", i, size, test.size)
+		}
+	}
+}
+
+var encryptRequestTests = []struct {
+	header   map[string]string
+	metadata map[string]string
+}{
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{},
+	},
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{
+			SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+		},
+	},
+}
+
+func TestEncryptRequest(t *testing.T) {
+	defer func(flag bool) { globalIsSSL = flag }(globalIsSSL)
+	globalIsSSL = true
+	for i, test := range encryptRequestTests {
+		content := bytes.NewReader(make([]byte, 64))
+		req := &http.Request{Header: http.Header{}}
+		for k, v := range test.header {
+			req.Header.Set(k, v)
+		}
+		_, err := EncryptRequest(content, req, test.metadata)
+		if err != nil {
+			t.Fatalf("Test %d: Failed to encrypt request: %v", i, err)
+		}
+		if key, ok := test.metadata[SSECustomerKey]; ok {
+			t.Errorf("Test %d: Client provided key survived in metadata - key: %s", i, key)
+		}
+		if kdf, ok := test.metadata[ServerSideEncryptionKDF]; !ok {
+			t.Errorf("Test %d: ServerSideEncryptionKDF must be part of metadata: %v", i, kdf)
+		}
+		if iv, ok := test.metadata[ServerSideEncryptionIV]; !ok {
+			t.Errorf("Test %d: ServerSideEncryptionIV must be part of metadata: %v", i, iv)
+		}
+		if mac, ok := test.metadata[ServerSideEncryptionKeyMAC]; !ok {
+			t.Errorf("Test %d: ServerSideEncryptionKeyMAC must be part of metadata: %v", i, mac)
+		}
+	}
+}
+
+var decryptRequestTests = []struct {
+	header     map[string]string
+	metadata   map[string]string
+	shouldFail bool
+}{
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{
+			ServerSideEncryptionKDF:    SSEKeyDerivationHmacSha256,
+			ServerSideEncryptionIV:     "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=",
+		},
+		shouldFail: false,
+	},
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{
+			ServerSideEncryptionKDF:    "HMAC-SHA3",
+			ServerSideEncryptionIV:     "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=",
+		},
+		shouldFail: true,
+	},
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{
+			ServerSideEncryptionKDF:    SSEKeyDerivationHmacSha256,
+			ServerSideEncryptionIV:     "RrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=",
+		},
+		shouldFail: true,
+	},
+	{
+		header: map[string]string{
+			SSECustomerAlgorithm: "AES256",
+			SSECustomerKey:       "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=",
+			SSECustomerKeyMD5:    "bY4wkxQejw9mUJfo72k53A==",
+		},
+		metadata: map[string]string{
+			ServerSideEncryptionKDF:    SSEKeyDerivationHmacSha256,
+			ServerSideEncryptionIV:     "XAm0dRrJsEsyPb1UuFNezv1bl9ehxuYsgUVC/MUctE2k=",
+			ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hds4rR9z/CUuPqu5N4=",
+		},
+		shouldFail: true,
+	},
+}
+
+func TestDecryptRequest(t *testing.T) {
+	defer func(flag bool) { globalIsSSL = flag }(globalIsSSL)
+	globalIsSSL = true
+	for i, test := range decryptRequestTests {
+		client := bytes.NewBuffer(nil)
+		req := &http.Request{Header: http.Header{}}
+		for k, v := range test.header {
+			req.Header.Set(k, v)
+		}
+		_, err := DecryptRequest(client, req, test.metadata)
+		if err != nil && !test.shouldFail {
+			t.Fatalf("Test %d: Failed to encrypt request: %v", i, err)
+		}
+		if key, ok := test.metadata[SSECustomerKey]; ok {
+			t.Errorf("Test %d: Client provided key survived in metadata - key: %s", i, key)
+		}
+		if kdf, ok := test.metadata[ServerSideEncryptionKDF]; ok && !test.shouldFail {
+			t.Errorf("Test %d: ServerSideEncryptionKDF should not be part of metadata: %v", i, kdf)
+		}
+		if iv, ok := test.metadata[ServerSideEncryptionIV]; ok && !test.shouldFail {
+			t.Errorf("Test %d: ServerSideEncryptionIV should not be part of metadata: %v", i, iv)
+		}
+		if mac, ok := test.metadata[ServerSideEncryptionKeyMAC]; ok && !test.shouldFail {
+			t.Errorf("Test %d: ServerSideEncryptionKeyMAC should not be part of metadata: %v", i, mac)
+		}
+	}
+}
+
+var decryptObjectInfoTests = []struct {
+	info    ObjectInfo
+	headers http.Header
+	expErr  APIErrorCode
+}{
+	{
+		info:    ObjectInfo{Size: 100},
+		headers: http.Header{},
+		expErr:  ErrNone,
+	},
+	{
+		info:    ObjectInfo{Size: 100, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}},
+		headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}},
+		expErr:  ErrNone,
+	},
+	{
+		info:    ObjectInfo{Size: 0, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}},
+		headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}},
+		expErr:  ErrNone,
+	},
+	{
+		info:    ObjectInfo{Size: 100, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}},
+		headers: http.Header{},
+		expErr:  ErrSSEEncryptedObject,
+	},
+	{
+		info:    ObjectInfo{Size: 100, UserDefined: map[string]string{}},
+		headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}},
+		expErr:  ErrInvalidEncryptionParameters,
+	},
+	{
+		info:    ObjectInfo{Size: 31, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}},
+		headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}},
+		expErr:  ErrObjectTampered,
+	},
+}
+
+func TestDecryptObjectInfo(t *testing.T) {
+	for i, test := range decryptObjectInfoTests {
+		if err, encrypted := DecryptObjectInfo(&test.info, test.headers); err != test.expErr {
+			t.Errorf("Test %d: Decryption returned wrong error code: got %d , want %d", i, err, test.expErr)
+		} else if enc := test.info.IsEncrypted(); encrypted && enc != encrypted {
+			t.Errorf("Test %d: Decryption thinks object is encrypted but it is not", i)
+		} else if !encrypted && enc != encrypted {
+			t.Errorf("Test %d: Decryption thinks object is not encrypted but it is", i)
+		}
+	}
+}
diff --git a/cmd/gateway-gcs.go b/cmd/gateway-gcs.go
index ca359f0c1..3c48c5ece 100644
--- a/cmd/gateway-gcs.go
+++ b/cmd/gateway-gcs.go
@@ -958,7 +958,6 @@ func (l *gcsGateway) PutObjectPart(bucket string, key string, uploadID string, p
 	}
 	// Make sure to close the object writer upon success.
 	w.Close()
-
 	return PartInfo{
 		PartNumber:   partNumber,
 		ETag:         etag,
diff --git a/cmd/gateway-handlers.go b/cmd/gateway-handlers.go
index d7933b8d5..edbfb90e0 100644
--- a/cmd/gateway-handlers.go
+++ b/cmd/gateway-handlers.go
@@ -18,7 +18,7 @@ package cmd
 
 import (
 	"io"
-	"io/ioutil"
+	goioutil "io/ioutil"
 	"net"
 	"net/http"
 	"strconv"
@@ -29,6 +29,7 @@ import (
 	router "github.com/gorilla/mux"
 	"github.com/minio/minio-go/pkg/policy"
 	"github.com/minio/minio/pkg/hash"
+	"github.com/minio/minio/pkg/ioutil"
 )
 
 // GetObjectHandler - GET Object
@@ -118,32 +119,19 @@ func (api gatewayAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Re
 		startOffset = hrange.offsetBegin
 		length = hrange.getLength()
 	}
-	// Indicates if any data was written to the http.ResponseWriter
-	dataWritten := false
-	// io.Writer type which keeps track if any data was written.
-	writer := funcToWriter(func(p []byte) (int, error) {
-		if !dataWritten {
-			// Set headers on the first write.
-			// Set standard object headers.
-			setObjectHeaders(w, objInfo, hrange)
-
-			// Set any additional requested response headers.
-			setHeadGetRespHeaders(w, r.URL.Query())
-
-			dataWritten = true
-		}
-		return w.Write(p)
-	})
 
 	getObject := objectAPI.GetObject
 	if reqAuthType == authTypeAnonymous {
 		getObject = objectAPI.AnonGetObject
 	}
 
+	setObjectHeaders(w, objInfo, hrange)
+	setHeadGetRespHeaders(w, r.URL.Query())
+	httpWriter := ioutil.WriteOnClose(w)
 	// Reads the object at startOffset and writes to mw.
-	if err = getObject(bucket, object, startOffset, length, writer); err != nil {
+	if err = getObject(bucket, object, startOffset, length, httpWriter); err != nil {
 		errorIf(err, "Unable to write to client.")
-		if !dataWritten {
+		if !httpWriter.HasWritten() {
 			// Error response only if no data has been written to client yet. i.e if
 			// partial data has already been written before an error
 			// occurred then no point in setting StatusCode and
@@ -152,11 +140,11 @@ func (api gatewayAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Re
 		}
 		return
 	}
-	if !dataWritten {
-		// If ObjectAPI.GetObject did not return error and no data has
-		// been written it would mean that it is a 0-byte object.
-		// call wrter.Write(nil) to set appropriate headers.
-		writer.Write(nil)
+	if err = httpWriter.Close(); err != nil {
+		if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet
+			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
+			return
+		}
 	}
 
 	// Get host and port from Request.RemoteAddr.
@@ -472,7 +460,7 @@ func (api gatewayAPIHandlers) PutBucketPolicyHandler(w http.ResponseWriter, r *h
 	// Read access policy up to maxAccessPolicySize.
 	// http://docs.aws.amazon.com/AmazonS3/latest/dev/access-policy-language-overview.html
 	// bucket policies are limited to 20KB in size, using a limit reader.
-	policyBytes, err := ioutil.ReadAll(io.LimitReader(r.Body, maxAccessPolicySize))
+	policyBytes, err := goioutil.ReadAll(io.LimitReader(r.Body, maxAccessPolicySize))
 	if err != nil {
 		errorIf(err, "Unable to read from client.")
 		writeErrorResponse(w, toAPIErrorCode(err), r.URL)
diff --git a/cmd/generic-handlers.go b/cmd/generic-handlers.go
index ac1a6b06c..97f19ad4a 100644
--- a/cmd/generic-handlers.go
+++ b/cmd/generic-handlers.go
@@ -108,6 +108,40 @@ func isHTTPHeaderSizeTooLarge(header http.Header) bool {
 	return false
 }
 
+// ReservedMetadataPrefix is the prefix of a metadata key which
+// is reserved and for internal use only.
+const ReservedMetadataPrefix = "X-Minio-Internal-"
+
+type reservedMetadataHandler struct {
+	http.Handler
+}
+
+func filterReservedMetadata(h http.Handler) http.Handler {
+	return reservedMetadataHandler{h}
+}
+
+// ServeHTTP fails if the request contains at least one reserved header which
+// would be treated as metadata.
+func (h reservedMetadataHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	if containsReservedMetadata(r.Header) {
+		writeErrorResponse(w, ErrUnsupportedMetadata, r.URL)
+		return
+	}
+	h.Handler.ServeHTTP(w, r)
+}
+
+// containsReservedMetadata returns true if the http.Header contains
+// keys which are treated as metadata but are reserved for internal use
+// and must not set by clients
+func containsReservedMetadata(header http.Header) bool {
+	for key := range header {
+		if strings.HasPrefix(key, ReservedMetadataPrefix) {
+			return true
+		}
+	}
+	return false
+}
+
 // Reserved bucket.
 const (
 	minioReservedBucket     = "minio"
diff --git a/cmd/generic-handlers_test.go b/cmd/generic-handlers_test.go
index bda87767f..a93dfee16 100644
--- a/cmd/generic-handlers_test.go
+++ b/cmd/generic-handlers_test.go
@@ -144,3 +144,38 @@ func TestIsHTTPHeaderSizeTooLarge(t *testing.T) {
 		}
 	}
 }
+
+var containsReservedMetadataTests = []struct {
+	header     http.Header
+	shouldFail bool
+}{
+	{
+		header: http.Header{"X-Minio-Key": []string{"value"}},
+	},
+	{
+		header:     http.Header{ServerSideEncryptionIV: []string{"iv"}},
+		shouldFail: true,
+	},
+	{
+		header:     http.Header{ServerSideEncryptionKDF: []string{SSEKeyDerivationHmacSha256}},
+		shouldFail: true,
+	},
+	{
+		header:     http.Header{ServerSideEncryptionKeyMAC: []string{"mac"}},
+		shouldFail: true,
+	},
+	{
+		header:     http.Header{ReservedMetadataPrefix + "Key": []string{"value"}},
+		shouldFail: true,
+	},
+}
+
+func TestContainsReservedMetadata(t *testing.T) {
+	for i, test := range containsReservedMetadataTests {
+		if contains := containsReservedMetadata(test.header); contains && !test.shouldFail {
+			t.Errorf("Test %d: contains reserved header but should not fail", i)
+		} else if !contains && test.shouldFail {
+			t.Errorf("Test %d: does not contain reserved header but failed", i)
+		}
+	}
+}
diff --git a/cmd/object-handlers.go b/cmd/object-handlers.go
index d94cf199a..186ec56d6 100644
--- a/cmd/object-handlers.go
+++ b/cmd/object-handlers.go
@@ -19,7 +19,8 @@ package cmd
 import (
 	"encoding/hex"
 	"encoding/xml"
-	"io/ioutil"
+	"io"
+	goioutil "io/ioutil"
 	"net"
 	"net/http"
 	"net/url"
@@ -28,6 +29,7 @@ import (
 
 	mux "github.com/gorilla/mux"
 	"github.com/minio/minio/pkg/hash"
+	"github.com/minio/minio/pkg/ioutil"
 )
 
 // supportedHeadGetReqParams - supported request parameters for GET and HEAD presigned request.
@@ -82,13 +84,6 @@ func errAllowableObjectNotFound(bucket string, r *http.Request) APIErrorCode {
 	return ErrNoSuchKey
 }
 
-// Simple way to convert a func to io.Writer type.
-type funcToWriter func([]byte) (int, error)
-
-func (f funcToWriter) Write(p []byte) (int, error) {
-	return f(p)
-}
-
 // GetObjectHandler - GET Object
 // ----------
 // This implementation of the GET operation retrieves object. To use GET,
@@ -129,6 +124,10 @@ func (api objectAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Req
 		writeErrorResponse(w, apiErr, r.URL)
 		return
 	}
+	if apiErr, _ := DecryptObjectInfo(&objInfo, r.Header); apiErr != ErrNone {
+		writeErrorResponse(w, apiErr, r.URL)
+		return
+	}
 
 	// Get request range.
 	var hrange *httpRange
@@ -160,40 +159,41 @@ func (api objectAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Req
 		length = hrange.getLength()
 	}
 
-	// Indicates if any data was written to the http.ResponseWriter
-	dataWritten := false
-	// io.Writer type which keeps track if any data was written.
-	writer := funcToWriter(func(p []byte) (int, error) {
-		if !dataWritten {
-			// Set headers on the first write.
-			// Set standard object headers.
-			setObjectHeaders(w, objInfo, hrange)
-
-			// Set any additional requested response headers.
-			setHeadGetRespHeaders(w, r.URL.Query())
-
-			dataWritten = true
+	var writer io.Writer
+	writer = w
+	if IsSSECustomerRequest(r.Header) {
+		writer, err = DecryptRequest(writer, r, objInfo.UserDefined)
+		if err != nil {
+			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
+			return
 		}
-		return w.Write(p)
-	})
+		w.Header().Set(SSECustomerAlgorithm, r.Header.Get(SSECustomerAlgorithm))
+		w.Header().Set(SSECustomerKeyMD5, r.Header.Get(SSECustomerKeyMD5))
 
+		if startOffset != 0 || length < objInfo.Size {
+			writeErrorResponse(w, ErrNotImplemented, r.URL) // SSE-C requests with HTTP range are not supported yet
+			return
+		}
+		length = objInfo.EncryptedSize()
+	}
+
+	setObjectHeaders(w, objInfo, hrange)
+	setHeadGetRespHeaders(w, r.URL.Query())
+
+	httpWriter := ioutil.WriteOnClose(writer)
 	// Reads the object at startOffset and writes to mw.
-	if err = objectAPI.GetObject(bucket, object, startOffset, length, writer); err != nil {
+	if err = objectAPI.GetObject(bucket, object, startOffset, length, httpWriter); err != nil {
 		errorIf(err, "Unable to write to client.")
-		if !dataWritten {
-			// Error response only if no data has been written to client yet. i.e if
-			// partial data has already been written before an error
-			// occurred then no point in setting StatusCode and
-			// sending error XML.
+		if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet
 			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
 		}
 		return
 	}
-	if !dataWritten {
-		// If ObjectAPI.GetObject did not return error and no data has
-		// been written it would mean that it is a 0-byte object.
-		// call wrter.Write(nil) to set appropriate headers.
-		writer.Write(nil)
+	if err = httpWriter.Close(); err != nil {
+		if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet
+			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
+			return
+		}
 	}
 
 	// Get host and port from Request.RemoteAddr.
@@ -252,6 +252,15 @@ func (api objectAPIHandlers) HeadObjectHandler(w http.ResponseWriter, r *http.Re
 		writeErrorResponseHeadersOnly(w, apiErr)
 		return
 	}
+	if apiErr, encrypted := DecryptObjectInfo(&objInfo, r.Header); apiErr != ErrNone {
+		writeErrorResponse(w, apiErr, r.URL)
+		return
+	} else if encrypted {
+		if _, err = DecryptRequest(w, r, objInfo.UserDefined); err != nil {
+			writeErrorResponse(w, ErrSSEEncryptedObject, r.URL)
+			return
+		}
+	}
 
 	// Validate pre-conditions if any.
 	if checkPreconditions(w, r, objInfo) {
@@ -530,9 +539,10 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req
 	var (
 		md5hex    = hex.EncodeToString(md5Bytes)
 		sha256hex = ""
-		reader    = r.Body
+		reader    io.Reader
+		s3Err     APIErrorCode
 	)
-
+	reader = r.Body
 	switch rAuthType {
 	default:
 		// For all unknown auth types return error.
@@ -541,31 +551,29 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req
 	case authTypeAnonymous:
 		// http://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html
 		sourceIP := getSourceIPAddress(r)
-		if s3Error := enforceBucketPolicy(bucket, "s3:PutObject", r.URL.Path,
-			r.Referer(), sourceIP, r.URL.Query()); s3Error != ErrNone {
-			writeErrorResponse(w, s3Error, r.URL)
+		if s3Err = enforceBucketPolicy(bucket, "s3:PutObject", r.URL.Path, r.Referer(), sourceIP, r.URL.Query()); s3Err != ErrNone {
+			writeErrorResponse(w, s3Err, r.URL)
 			return
 		}
 	case authTypeStreamingSigned:
 		// Initialize stream signature verifier.
-		var s3Error APIErrorCode
-		reader, s3Error = newSignV4ChunkedReader(r)
-		if s3Error != ErrNone {
+		reader, s3Err = newSignV4ChunkedReader(r)
+		if s3Err != ErrNone {
 			errorIf(errSignatureMismatch, "%s", dumpRequest(r))
-			writeErrorResponse(w, s3Error, r.URL)
+			writeErrorResponse(w, s3Err, r.URL)
 			return
 		}
 	case authTypeSignedV2, authTypePresignedV2:
-		s3Error := isReqAuthenticatedV2(r)
-		if s3Error != ErrNone {
+		s3Err = isReqAuthenticatedV2(r)
+		if s3Err != ErrNone {
 			errorIf(errSignatureMismatch, "%s", dumpRequest(r))
-			writeErrorResponse(w, s3Error, r.URL)
+			writeErrorResponse(w, s3Err, r.URL)
 			return
 		}
 	case authTypePresigned, authTypeSigned:
-		if s3Error := reqSignatureV4Verify(r, serverConfig.GetRegion()); s3Error != ErrNone {
+		if s3Err = reqSignatureV4Verify(r, serverConfig.GetRegion()); s3Err != ErrNone {
 			errorIf(errSignatureMismatch, "%s", dumpRequest(r))
-			writeErrorResponse(w, s3Error, r.URL)
+			writeErrorResponse(w, s3Err, r.URL)
 			return
 		}
 		if !skipContentSha256Cksum(r) {
@@ -579,7 +587,20 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req
 		return
 	}
 
-	// Create the object..
+	if IsSSECustomerRequest(r.Header) { // handle SSE-C requests
+		reader, err = EncryptRequest(hashReader, r, metadata)
+		if err != nil {
+			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
+			return
+		}
+		info := ObjectInfo{Size: size}
+		hashReader, err = hash.NewReader(reader, info.EncryptedSize(), "", "") // do not try to verify encrypted content
+		if err != nil {
+			writeErrorResponse(w, toAPIErrorCode(err), r.URL)
+			return
+		}
+	}
+
 	objInfo, err := objectAPI.PutObject(bucket, object, hashReader, metadata)
 	if err != nil {
 		errorIf(err, "Unable to create an object. %s", r.URL.Path)
@@ -587,6 +608,10 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req
 		return
 	}
 	w.Header().Set("ETag", "\""+objInfo.ETag+"\"")
+	if IsSSECustomerRequest(r.Header) {
+		w.Header().Set(SSECustomerAlgorithm, r.Header.Get(SSECustomerAlgorithm))
+		w.Header().Set(SSECustomerKeyMD5, r.Header.Get(SSECustomerKeyMD5))
+	}
 	writeSuccessResponseHeadersOnly(w)
 
 	// Get host and port from Request.RemoteAddr.
@@ -627,6 +652,12 @@ func (api objectAPIHandlers) NewMultipartUploadHandler(w http.ResponseWriter, r
 		return
 	}
 
+	if IsSSECustomerRequest(r.Header) { // handle SSE-C requests
+		// SSE-C is not implemented for multipart operations yet
+		writeErrorResponse(w, ErrNotImplemented, r.URL)
+		return
+	}
+
 	// Extract metadata that needs to be saved.
 	metadata, err := extractMetadataFromHeader(r.Header)
 	if err != nil {
@@ -666,6 +697,12 @@ func (api objectAPIHandlers) CopyObjectPartHandler(w http.ResponseWriter, r *htt
 		return
 	}
 
+	if IsSSECustomerRequest(r.Header) { // handle SSE-C requests
+		// SSE-C is not implemented for multipart operations yet
+		writeErrorResponse(w, ErrNotImplemented, r.URL)
+		return
+	}
+
 	// Copy source path.
 	cpSrcPath, err := url.QueryUnescape(r.Header.Get("X-Amz-Copy-Source"))
 	if err != nil {
@@ -784,6 +821,12 @@ func (api objectAPIHandlers) PutObjectPartHandler(w http.ResponseWriter, r *http
 		return
 	}
 
+	if IsSSECustomerRequest(r.Header) { // handle SSE-C requests
+		// SSE-C is not implemented for multipart operations yet
+		writeErrorResponse(w, ErrNotImplemented, r.URL)
+		return
+	}
+
 	/// if Content-Length is unknown/missing, throw away
 	size := r.ContentLength
 
@@ -977,7 +1020,7 @@ func (api objectAPIHandlers) CompleteMultipartUploadHandler(w http.ResponseWrite
 	// Get upload id.
 	uploadID, _, _, _ := getObjectResources(r.URL.Query())
 
-	completeMultipartBytes, err := ioutil.ReadAll(r.Body)
+	completeMultipartBytes, err := goioutil.ReadAll(r.Body)
 	if err != nil {
 		errorIf(err, "Unable to complete multipart upload.")
 		writeErrorResponse(w, ErrInternalError, r.URL)
diff --git a/cmd/routers.go b/cmd/routers.go
index 3326d8591..6f2a55cb1 100644
--- a/cmd/routers.go
+++ b/cmd/routers.go
@@ -112,6 +112,9 @@ func configureServerHandler(endpoints EndpointList) (http.Handler, error) {
 		// routes them accordingly. Client receives a HTTP error for
 		// invalid/unsupported signatures.
 		setAuthHandler,
+		// filters HTTP headers which are treated as metadata and are reserved
+		// for internal use only.
+		filterReservedMetadata,
 		// Add new handlers here.
 	}
 
diff --git a/pkg/ioutil/ioutil.go b/pkg/ioutil/ioutil.go
new file mode 100644
index 000000000..27605e6bb
--- /dev/null
+++ b/pkg/ioutil/ioutil.go
@@ -0,0 +1,61 @@
+/*
+ * Minio Cloud Storage, (C) 2017 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Package ioutil implements some I/O utility functions which are not covered
+// by the standard library.
+package ioutil
+
+import (
+	"io"
+)
+
+// WriteOnCloser implements io.WriteCloser and always
+// exectues at least one write operation if it is closed.
+//
+// This can be useful within the context of HTTP. At least
+// one write operation must happen to send the HTTP headers
+// to the peer.
+type WriteOnCloser struct {
+	io.Writer
+	hasWritten bool
+}
+
+func (w *WriteOnCloser) Write(p []byte) (int, error) {
+	w.hasWritten = true
+	return w.Writer.Write(p)
+}
+
+// Close closes the WriteOnCloser. It behaves like io.Closer.
+func (w *WriteOnCloser) Close() error {
+	if !w.hasWritten {
+		_, err := w.Write(nil)
+		if err != nil {
+			return err
+		}
+	}
+	if closer, ok := w.Writer.(io.Closer); ok {
+		return closer.Close()
+	}
+	return nil
+}
+
+// HasWritten returns true if at least one write operation was performed.
+func (w *WriteOnCloser) HasWritten() bool { return w.hasWritten }
+
+// WriteOnClose takes an io.Writer and returns an ioutil.WriteOnCloser.
+func WriteOnClose(w io.Writer) *WriteOnCloser {
+	return &WriteOnCloser{w, false}
+}
diff --git a/pkg/ioutil/ioutil_test.go b/pkg/ioutil/ioutil_test.go
new file mode 100644
index 000000000..3b6d6dc1e
--- /dev/null
+++ b/pkg/ioutil/ioutil_test.go
@@ -0,0 +1,39 @@
+/*
+ * Minio Cloud Storage, (C) 2017 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ioutil
+
+import (
+	goioutil "io/ioutil"
+	"testing"
+)
+
+func TestCloseOnWriter(t *testing.T) {
+	writer := WriteOnClose(goioutil.Discard)
+	if writer.HasWritten() {
+		t.Error("WriteOnCloser must not be marked as HasWritten")
+	}
+	writer.Write(nil)
+	if !writer.HasWritten() {
+		t.Error("WriteOnCloser must be marked as HasWritten")
+	}
+
+	writer = WriteOnClose(goioutil.Discard)
+	writer.Close()
+	if !writer.HasWritten() {
+		t.Error("WriteOnCloser must be marked as HasWritten")
+	}
+}
diff --git a/vendor/github.com/minio/sio/DARE.md b/vendor/github.com/minio/sio/DARE.md
new file mode 100644
index 000000000..67c2ab2f8
--- /dev/null
+++ b/vendor/github.com/minio/sio/DARE.md
@@ -0,0 +1,223 @@
+# Data At Rest Encryption (DARE) - Version 1.0
+
+**This is a draft**
+
+## 1. Introduction
+
+This document describes the Data At Rest Encryption (DARE) format for encrypting
+data in a tamper-resistant way. DARE is designed to securely encrypt data stored
+on (untrusted) storage providers.
+
+## 2. Overview
+
+DARE specifies how to split an arbitrary data stream into small chunks (packages)
+and concatenate them into a tamper-proof chain. Tamper-proof means that an attacker
+is not able to:
+ - decrypt one or more packages.
+ - modify the content of one or more packages.
+ - reorder/rearrange one or more packages.
+
+An attacker is defined as somebody who has full access to the encrypted data
+but not to the encryption key. An attacker can also act as storage provider.
+
+### 2.1 Cryptographic Notation
+
+DARE will use the following notations:
+ - The set **{a,b}** means select **one** of the provided values **a**, **b**.
+ - The concatenation of the byte sequences **a** and **b** is **a || b**.
+ - The function **len(seq)** returns the length of a byte sequence **seq** in bytes.
+ - The index access **seq[i]** accesses one byte at index **i** of the sequence **seq**.
+ - The range access **seq[i : j]** accesses a range of bytes starting at **i** (inclusive)
+   and ending at **j** (exclusive).
+ - The compare functions **a == b => f** and **a != b => f** succeed when **a** 
+   is equal to **b** and **a** is not equal to **b** respectively and execute the command **f**.
+ - The function **CTC(a, b)** returns **1** only if **a** and **b** are equal, 0 otherwise.
+   CTC compares both values in **constant time**.
+ - **ENC(key, nonce, plaintext, addData)** represents the byte sequence which is
+   the output from an AEAD cipher authenticating the *addData*, encrypting and
+   authenticating the *plaintext* with the secret encryption *key* and the *nonce*.
+ - **DEC(key, nonce, ciphertext, addData)** represents the byte sequence which is
+   the output from an AEAD cipher verifying the integrity of the *ciphertext* &
+   *addData* and decrypting the *ciphertext* with the secret encryption *key* and
+   the *nonce*. The decryption **always** fails if the integrity check fails. 
+
+All numbers must be converted into byte sequences by using the little endian byte
+order. An AEAD cipher will be either AES-256_GCM or CHACHA20_POLY1305. 
+
+## 2.2 Keys
+
+Both ciphers - AES-256_GCM and CHACHA20_POLY1305 - require a 32 byte key. The key
+**must** be unique for one encrypted data stream. Reusing a key **compromises**
+some security properties provided by DARE. See Appendix A for recommendations
+about generating keys and preventing key reuse. 
+
+## 2.3 Errors
+
+DARE defines the following errors:
+ - **err_unsupported_version**: Indicates that the header version is not supported.
+ - **err_unsupported_cipher**: Indicates that the cipher suite is not supported.
+ - **err_missing_header**: Indicates that the payload header is missing or incomplete.
+ - **err_payload_too_short**: Indicates that the actual payload size is smaller than the
+  payload size field of the header.
+ - **err_package_out_of_order**: Indicates that the sequence number of the package does
+   not match the expected sequence number.
+ - **err_tag_mismatch**: Indicates that the tag of the package does not match the tag
+   computed while decrypting the package.
+
+## 3. Package Format
+
+DARE splits an arbitrary data stream into a sequence of packages. Each package is
+encrypted separately. A package consists of a header, a payload and an authentication
+tag.
+
+Header   | Payload        | Tag
+---------|----------------|---------
+16 bytes | 1 byte - 64 KB | 16 bytes
+
+The header contains information about the package. It consists of:
+
+Version | Cipher suite | Payload size     | Sequence number  | nonce
+--------|--------------|------------------|------------------|---------
+1 byte  | 1 byte       | 2 bytes / uint16 | 4 bytes / uint32 | 8 bytes
+
+The first byte specifies the version of the format and is equal to 0x10 for DARE 
+version 1.0. The second byte specifies the cipher used to encrypt the package.
+
+Cipher            | Value
+------------------|-------
+AES-256_GCM       | 0x00
+CHACHA20_POLY1305 | 0x01
+
+The payload size is an uint16 number. The real payload size is defined as the payload
+size field as uint32 + 1. This ensures that the payload can be exactly 64 KB long and
+prevents empty packages without a payload.
+
+The sequence number is an uint32 number identifying the package within a sequence of
+packages. It is a monotonically increasing number. The sequence number **must** be 0 for
+the first package and **must** be incremented for every subsequent package. The 
+sequence number of the n-th package is n-1. This means a sequence of packages can consist
+of 2 ^ 32 packages and each package can hold up to 64 KB data. The maximum size
+of a data stream is limited by `64 KB * 2^32 = 256 TB`. This should be sufficient
+for current use cases. However, if necessary, the maximum size of a data stream can increased
+in the future by slightly tweaking the header (with a new version).
+
+The nonce **should** be a random value for each data stream and **should** be kept constant
+for all its packages. Even if a key is accidentally used
+twice to encrypt two different data streams an attacker should not be able to decrypt one
+of those data streams. However, an attacker is always able to exchange corresponding packages
+between the streams whenever a key is reused. DARE is only tamper-proof when the encryption
+key is unique. See Appendix A.
+
+The payload contains the encrypted data. It must be at least 1 byte long and can contain a maximum of 64 KB.
+
+The authentication tag is generated by the AEAD cipher while encrypting and authenticating the
+package. The authentication tag **must** always be verified while decrypting the package.
+Decrypted content **must never** be returned before the authentication tag is successfully 
+verified.
+
+## 4. Encryption
+
+DARE encrypts every package separately. The header version, cipher suite and nonce **should**
+be the same for all encrypted packages of one data stream. It is **recommended** to not change
+this values within one sequence of packages. The nonce **should** be generated randomly once
+at the beginning of the encryption process and repeated in every header. See Appendix B for 
+recommendations about generating random numbers.
+
+The sequence number is the sequence number of the previous package plus 1. The sequence number
+**must** be a monotonically increasing number within one sequence of packages. The sequence number
+of the first package is **always** 0.
+
+The payload field is the length of the plaintext in bytes minus 1. The encryption process is
+defined as following:
+
+```
+header[0]       = 0x10
+header[1]       = {AES-256_GCM, CHACHA20_POLY1305}
+header[2:4]     = little_endian( len(plaintext) - 1 )
+header[4:8]     = little_endian( sequence_number )
+header[8:16]    = nonce
+
+payload || tag  = ENC(key, header[4:16], plaintext, header[0:4]) 
+
+sequence_number = sequence_number + 1
+```
+
+## 5. Decryption
+
+DARE decrypts every package separately. Every package **must** be successfully decrypted **before**
+plaintext is returned. The decryption happens in three steps:
+
+1. Verify that the header version is correct (`header[0] == 0x10`) and the cipher suite is supported.
+2. Verify that the sequence number of the packages matches the expected sequence number. It is required
+   to save the first expected sequence number at the beginning of the decryption process. After every
+   successfully decrypted package this sequence number is incremented by 1. The sequence number of
+   all packages **must** match the saved / expected number.
+3. Verify that the authentication tag at the end of the package is equal to the authentication tag 
+   computed while decrypting the package. This **must** happen in constant time.
+
+The decryption is defined as following:
+
+```
+header[0]                          != 0x10                            => err_unsupported_version
+header[1]                          != {AES-256_GCM,CHACHA20_POLY1305} => err_unsupported_cipher
+little_endian_uint32(header[4:8])  != expected_sequence_number        => err_package_out_of_order
+
+payload_size      := little_endian_uint32(header[2:4]) + 1
+plaintext || tag  := DEC(key, header[4:16], ciphertext, header[0:4])
+
+CTC(ciphertext[len(plaintext) : len(plaintext) + 16], tag) != 1       => err_tag_mismatch
+
+expected_sequence_number = expected_sequence_number + 1
+``` 
+
+## Security
+
+DARE provides confidentiality and integrity of the encrypted data as long as the encryption key
+is never reused. This means that a **different** encryption key **must** be used for every data
+stream. See Appendix A for recommendations.  
+
+If the same encryption key is used to encrypt two different data streams, an attacker is able to
+exchange packages with the same sequence number. This means that the attacker is able to replace 
+any package of a sequence with another package as long as:
+ - Both packages are encrypted with the same key.
+ - The sequence numbers of both packages are equal.
+
+If two data streams are encrypted with the same key the attacker will not be able to decrypt any
+package of those streams without breaking the cipher as long as the nonces are different. To be
+more precise the attacker may only be able to decrypt a package if:
+ - There is another package encrypted with the same key.
+ - The sequence number and nonce of those two packages (encrypted with the same key) are equal.
+
+As long as the nonce of a sequence of packages differs from every other nonce (and the nonce is
+repeated within one sequence - which is **recommended**) the attacker will not be able to decrypt
+any package. It is not required that the nonce is indistinguishable from a truly random bit sequence.
+It is sufficient when the nonces differ from each other in at least one bit.
+
+## Appendices
+
+### Appendix A - Key Derivation from a Master Key
+
+DARE needs a unique encryption key per data stream. The best approach to ensure that the keys
+are unique is to derive every encryption key from a master key. Therefore a key derivation function
+(KDF) - e.g. HKDF, BLAKE2X or HChaCha20 -  can be used. The master key itself may be derived from
+a password using functions like Argon2 or scrypt. Deriving those keys is the responsibility of the
+users of DARE.
+
+It is **not recommended** to derive encryption keys from a master key and an identifier (like the
+file path). If a different data stream is stored under the same identifier - e.g. overwriting the 
+data - the derived key would be the same for both streams.
+
+Instead encryption keys should be derived from a master key and a random value. It is not required
+that the random value is indistinguishable from a truly random bit sequence. The random value **must**
+be unique but need not be secret - depending on the security properties of the KDF.
+
+To keep this simple: The combination of master key and random value used to derive the encryption key
+must be unique all the time.
+
+### Appendix B - Generating random values
+
+DARE does not require random values which are indistinguishable from a truly random bit sequence.
+However, a random value **must** never be repeated. Therefore it is **recommended** to use a
+cryptographically secure pseudorandom number generator (CSPRNG) to generate random values. Many
+operating systems and cryptographic libraries already provide appropriate PRNG implementations.
+These implementations should always be preferred over crafting a new one.
diff --git a/vendor/github.com/minio/sio/LICENSE b/vendor/github.com/minio/sio/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/vendor/github.com/minio/sio/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/vendor/github.com/minio/sio/README.md b/vendor/github.com/minio/sio/README.md
new file mode 100644
index 000000000..1ad7a6ff4
--- /dev/null
+++ b/vendor/github.com/minio/sio/README.md
@@ -0,0 +1,66 @@
+[![Godoc Reference](https://godoc.org/github.com/minio/sio?status.svg)](https://godoc.org/github.com/minio/sio)
+[![Travis CI](https://travis-ci.org/minio/sio.svg?branch=master)](https://travis-ci.org/minio/sio)
+[![Go Report Card](https://goreportcard.com/badge/minio/sio)](https://goreportcard.com/report/minio/sio)
+
+# Secure IO
+## Go implementation of the Data At Rest Encryption (DARE) format.
+
+## Introduction
+
+It is a common problem to store data securely - especially on untrusted remote storage. 
+One solution to this problem is cryptography. Before data is stored it is encrypted
+to ensure that the data is confidential. Unfortunately encrypting data is not enough to
+prevent more sophisticated attacks. Anyone who has access to the stored data can try to
+manipulate the data - even if the data is encrypted.
+
+To prevent these kinds of attacks the data must be encrypted in a tamper-resistant way.
+This means an attacker should not be able to:
+ - Read the stored data - this is achieved by modern encryption algorithms.
+ - Modify the data by changing parts of the encrypted data.
+ - Rearrange or reorder parts of the encrypted data. 
+
+Authenticated encryption schemes (AE) - like AES-GCM or ChaCha20-Poly1305 - encrypt and
+authenticate data. Any modification to the encrypted data (ciphertext) is detected while
+decrypting the data. But even an AE scheme alone is not sufficiently enough to prevent all
+kinds of data manipulation.
+
+All modern AE schemes produce an authentication tag which is verified after the ciphertext
+is decrypted. If a large amount of data is decrypted it is not always possible to buffer
+all decrypted data until the authentication tag is verified. Returning unauthenticated 
+data has the same issues like encrypting data without authentication.
+
+Splitting the data into small chunks fixes the problem of deferred authentication checks
+but introduces a new one. The chunks can be reordered - e.g. exchanging chunk 1 and 2 - 
+because every chunk is encrypted separately. Therefore the order of the chunks must be
+encoded somehow into the chunks itself to be able to detect rearranging any number of 
+chunks.     
+
+This project specifies a [format](https://github.com/minio/sio/blob/master/DARE.md) for 
+en/decrypting an arbitrary data stream and gives some [recommendations](https://github.com/minio/sio/blob/master/DARE.md#appendices)
+about how to use and implement data at rest encryption (DARE). Additionally this project
+provides a reference implementation in Go.  
+
+## Applications
+
+DARE is designed with simplicity and efficiency in mind. It combines modern AE schemes
+with a very simple reorder protection mechanism to build a tamper-resistant encryption
+scheme. DARE can be used to encrypt files, backups and even large object storage systems.
+
+Its main properties are:
+ - Security and high performance by relying on modern AEAD ciphers
+ - Small overhead - encryption increases the amount of data by ~0.05%
+ - Support for long data streams - up to 256 TB under the same key  
+ - Random access - arbitrary sequences / ranges can be decrypted independently
+
+**Install:** `go get -u github.com/minio/sio`
+
+DARE and `github.com/minio/sio` are finalized and can be used in production.
+
+## Performance
+
+Cipher            |   8 KB   |   64 KB   |   512 KB  |  1 MB
+----------------- | -------- | --------- | --------- | --------
+AES_256_GCM       |  90 MB/s | 1.96 GB/s | 2.64 GB/s | 2.83 GB/s
+CHACHA20_POLY1305 |  97 MB/s | 1.23 GB/s | 1.54 GB/s | 1.57 GB/s
+
+*On i7-6500U 2 x 2.5 GHz | Linux 4.10.0-32-generic | Go 1.8.3 | AES-NI & AVX2*
diff --git a/vendor/github.com/minio/sio/dare.go b/vendor/github.com/minio/sio/dare.go
new file mode 100644
index 000000000..1de0677b8
--- /dev/null
+++ b/vendor/github.com/minio/sio/dare.go
@@ -0,0 +1,277 @@
+// Copyright (C) 2017 Minio Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sio implements the DARE format. It provides an API
+// for secure en/decrypting IO operations.
+package sio // import "github.com/minio/sio"
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/rand"
+	"errors"
+	"io"
+
+	"golang.org/x/crypto/chacha20poly1305"
+)
+
+// Version10 specifies version 1.0
+const Version10 byte = 0x10
+
+const (
+	// AES_256_GCM specifies the cipher suite AES-GCM with 256 bit keys.
+	AES_256_GCM byte = iota
+	// CHACHA20_POLY1305 specifies the cipher suite ChaCha20Poly1305 with 256 bit keys.
+	CHACHA20_POLY1305
+)
+
+const (
+	headerSize  = 16
+	payloadSize = 64 * 1024
+	tagSize     = 16
+)
+
+var (
+	errMissingHeader      = errors.New("sio: incomplete header")
+	errPayloadTooShort    = errors.New("sio: payload too short")
+	errPackageOutOfOrder  = errors.New("sio: sequence number mismatch")
+	errTagMismatch        = errors.New("sio: authentication failed")
+	errUnsupportedVersion = errors.New("sio: unsupported version")
+	errUnsupportedCipher  = errors.New("sio: unsupported cipher suite")
+)
+
+var newAesGcm = func(key []byte) (cipher.AEAD, error) {
+	aes256, err := aes.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return cipher.NewGCM(aes256)
+}
+
+var supportedCiphers = [...]func([]byte) (cipher.AEAD, error){
+	AES_256_GCM:       newAesGcm,
+	CHACHA20_POLY1305: chacha20poly1305.New,
+}
+
+// Encrypt reads from src until it encounters an io.EOF and encrypts all received
+// data. The encrypted data is written to dst. It returns the number of bytes
+// encrypted and the first error encountered while encrypting, if any.
+//
+// Encrypt returns the number of bytes written to dst.
+func Encrypt(dst io.Writer, src io.Reader, config Config) (n int64, err error) {
+	encReader, err := EncryptReader(src, config)
+	if err != nil {
+		return
+	}
+	return io.CopyBuffer(dst, encReader, make([]byte, payloadSize))
+}
+
+// Decrypt reads from src until it encounters an io.EOF and decrypts all received
+// data. The decrypted data is written to dst. It returns the number of bytes
+// decrypted and the first error encountered while decrypting, if any.
+//
+// Decrypt returns the number of bytes written to dst. Decrypt only writes data to
+// dst if the data was decrypted successfully.
+func Decrypt(dst io.Writer, src io.Reader, config Config) (n int64, err error) {
+	decReader, err := DecryptReader(src, config)
+	if err != nil {
+		return
+	}
+	return io.CopyBuffer(dst, decReader, make([]byte, headerSize+payloadSize+tagSize))
+}
+
+// EncryptReader wraps the given src and returns an io.Reader which encrypts
+// all received data. EncryptReader returns an error if the provided encryption
+// configuration is invalid.
+func EncryptReader(src io.Reader, config Config) (io.Reader, error) {
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	cipher, err := supportedCiphers[config.CipherSuites[0]](config.Key)
+	if err != nil {
+		return nil, err
+	}
+	nonce, err := config.generateNonce()
+	if err != nil {
+		return nil, err
+	}
+	return &encryptedReader{
+		src:            src,
+		config:         config,
+		nonce:          nonce,
+		cipher:         cipher,
+		sequenceNumber: config.SequenceNumber,
+	}, nil
+}
+
+// DecryptReader wraps the given src and returns an io.Reader which decrypts
+// all received data. DecryptReader returns an error if the provided decryption
+// configuration is invalid.
+func DecryptReader(src io.Reader, config Config) (io.Reader, error) {
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	ciphers, err := config.createCiphers()
+	if err != nil {
+		return nil, err
+	}
+	return &decryptedReader{
+		src:            src,
+		config:         config,
+		ciphers:        ciphers,
+		sequenceNumber: config.SequenceNumber,
+	}, nil
+}
+
+// EncryptWriter wraps the given dst and returns an io.WriteCloser which
+// encrypts all data written to it. EncryptWriter returns an error if the
+// provided decryption configuration is invalid.
+//
+// The returned io.WriteCloser must be closed successfully to finalize the
+// encryption process.
+func EncryptWriter(dst io.Writer, config Config) (io.WriteCloser, error) {
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	cipher, err := supportedCiphers[config.CipherSuites[0]](config.Key)
+	if err != nil {
+		return nil, err
+	}
+	nonce, err := config.generateNonce()
+	if err != nil {
+		return nil, err
+	}
+	return &encryptedWriter{
+		dst:            dst,
+		config:         config,
+		nonce:          nonce,
+		cipher:         cipher,
+		sequenceNumber: config.SequenceNumber,
+	}, nil
+}
+
+// DecryptWriter wraps the given dst and returns an io.WriteCloser which
+// decrypts all data written to it. DecryptWriter returns an error if the
+// provided decryption configuration is invalid.
+//
+// The returned io.WriteCloser must be closed successfully to finalize the
+// decryption process.
+func DecryptWriter(dst io.Writer, config Config) (io.WriteCloser, error) {
+	if err := setConfigDefaults(&config); err != nil {
+		return nil, err
+	}
+	ciphers, err := config.createCiphers()
+	if err != nil {
+		return nil, err
+	}
+	return &decryptedWriter{
+		dst:            dst,
+		config:         config,
+		ciphers:        ciphers,
+		sequenceNumber: config.SequenceNumber,
+	}, nil
+}
+
+func setConfigDefaults(config *Config) error {
+	if config.MinVersion > Version10 {
+		return errors.New("sio: unknown minimum version")
+	}
+	if config.MaxVersion > Version10 {
+		return errors.New("dare: unknown maximum version")
+	}
+	if len(config.Key) != 32 {
+		return errors.New("sio: invalid key size")
+	}
+	if len(config.CipherSuites) > 2 {
+		return errors.New("sio: too many cipher suites")
+	}
+	for _, c := range config.CipherSuites {
+		if int(c) >= len(supportedCiphers) {
+			return errors.New("sio: unknown cipher suite")
+		}
+	}
+	if config.MinVersion < Version10 {
+		config.MinVersion = Version10
+	}
+	if config.MaxVersion < Version10 {
+		config.MaxVersion = config.MinVersion
+	}
+	if len(config.CipherSuites) == 0 {
+		config.CipherSuites = []byte{AES_256_GCM, CHACHA20_POLY1305}
+	}
+	if config.Rand == nil {
+		config.Rand = rand.Reader
+	}
+	return nil
+}
+
+// Config contains the format configuration. The only field
+// which must always be set manually is the secret key.
+type Config struct {
+	// The minimal supported version of the format. If
+	// not set the default value is used.
+	MinVersion byte
+	// The highest supported version of the format. If
+	// not set the default value is used.
+	MaxVersion byte
+	// A list of supported cipher suites. If not set the
+	// default value is used.
+	CipherSuites []byte
+	// The secret encryption key. It must be 32 bytes long.
+	Key []byte
+	// The first expected sequence number. It should only
+	// be set manually when appending data to an existing
+	// sequence of packages or decrypting a range within
+	// a sequence of packages.
+	SequenceNumber uint32
+	// The RNG used to generate random values. If not set
+	// the default value (crypto/rand.Reader) is used.
+	Rand io.Reader
+}
+
+func (c *Config) verifyHeader(header headerV10) error {
+	if version := header.Version(); version < c.MinVersion || version > c.MaxVersion {
+		return errUnsupportedVersion
+	}
+	if !c.isCipherSuiteSupported(header.Cipher()) {
+		return errUnsupportedCipher
+	}
+	return nil
+}
+
+func (c *Config) createCiphers() ([]cipher.AEAD, error) {
+	ciphers := make([]cipher.AEAD, len(supportedCiphers))
+	for _, v := range c.CipherSuites {
+		aeadCipher, err := supportedCiphers[v](c.Key)
+		if err != nil {
+			return nil, err
+		}
+		ciphers[v] = aeadCipher
+	}
+	return ciphers, nil
+}
+
+func (c *Config) generateNonce() (nonce [8]byte, err error) {
+	_, err = io.ReadFull(c.Rand, nonce[:])
+	return nonce, err
+}
+
+func (c *Config) isCipherSuiteSupported(suite byte) bool {
+	for _, c := range c.CipherSuites {
+		if suite == c {
+			return true
+		}
+	}
+	return false
+}
diff --git a/vendor/github.com/minio/sio/header.go b/vendor/github.com/minio/sio/header.go
new file mode 100644
index 000000000..ddaeeff1c
--- /dev/null
+++ b/vendor/github.com/minio/sio/header.go
@@ -0,0 +1,39 @@
+// Copyright (C) 2017 Minio Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sio
+
+import "encoding/binary"
+
+type headerV10 []byte
+
+func header(b []byte) headerV10 { return headerV10(b[:headerSize]) }
+
+func (h headerV10) Version() byte { return h[0] }
+
+func (h headerV10) Cipher() byte { return h[1] }
+
+func (h headerV10) Len() int { return int(binary.LittleEndian.Uint16(h[2:])) + 1 }
+
+func (h headerV10) SequenceNumber() uint32 { return binary.LittleEndian.Uint32(h[4:]) }
+
+func (h headerV10) SetVersion(version byte) { h[0] = version }
+
+func (h headerV10) SetCipher(suite byte) { h[1] = suite }
+
+func (h headerV10) SetLen(length int) { binary.LittleEndian.PutUint16(h[2:], uint16(length-1)) }
+
+func (h headerV10) SetSequenceNumber(num uint32) { binary.LittleEndian.PutUint32(h[4:], num) }
+
+func (h headerV10) SetNonce(nonce [8]byte) { copy(h[8:], nonce[:]) }
diff --git a/vendor/github.com/minio/sio/reader.go b/vendor/github.com/minio/sio/reader.go
new file mode 100644
index 000000000..ef35a3774
--- /dev/null
+++ b/vendor/github.com/minio/sio/reader.go
@@ -0,0 +1,169 @@
+// Copyright (C) 2017 Minio Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sio
+
+import (
+	"crypto/cipher"
+	"io"
+)
+
+type encryptedReader struct {
+	src    io.Reader
+	config Config
+
+	sequenceNumber uint32
+	nonce          [8]byte
+	cipher         cipher.AEAD
+
+	pack   [headerSize + payloadSize + tagSize]byte
+	offset int
+}
+
+func (r *encryptedReader) Read(p []byte) (n int, err error) {
+	if r.offset > 0 {
+		remaining := headerSize + header(r.pack[:]).Len() + tagSize - r.offset
+		if len(p) < remaining {
+			n = copy(p, r.pack[r.offset:r.offset+len(p)])
+			r.offset += n
+			return
+		}
+		n = copy(p, r.pack[r.offset:r.offset+remaining])
+		p = p[remaining:]
+		r.offset = 0
+	}
+	for len(p) >= headerSize+payloadSize+tagSize {
+		nn, err := io.ReadFull(r.src, r.pack[headerSize:headerSize+payloadSize])
+		if err != nil && err != io.ErrUnexpectedEOF {
+			return n, err
+		}
+		r.encrypt(p, nn)
+		n += headerSize + nn + tagSize
+		p = p[headerSize+nn+tagSize:]
+	}
+	if len(p) > 0 {
+		nn, err := io.ReadFull(r.src, r.pack[headerSize:headerSize+payloadSize])
+		if err != nil && err != io.ErrUnexpectedEOF {
+			return n, err
+		}
+		r.encrypt(r.pack[:], nn)
+		if headerSize+nn+tagSize < len(p) {
+			r.offset = copy(p, r.pack[:headerSize+nn+tagSize])
+		} else {
+			r.offset = copy(p, r.pack[:len(p)])
+		}
+		n += r.offset
+	}
+	return
+}
+
+func (r *encryptedReader) encrypt(dst []byte, length int) {
+	header := header(dst)
+	header.SetVersion(r.config.MaxVersion)
+	header.SetCipher(r.config.CipherSuites[0])
+	header.SetLen(length)
+	header.SetSequenceNumber(r.sequenceNumber)
+	header.SetNonce(r.nonce)
+
+	copy(dst[:headerSize], header)
+	r.cipher.Seal(dst[headerSize:headerSize], header[4:headerSize], r.pack[headerSize:headerSize+length], header[:4])
+
+	r.sequenceNumber++
+}
+
+type decryptedReader struct {
+	src    io.Reader
+	config Config
+
+	sequenceNumber uint32
+	ciphers        []cipher.AEAD
+
+	pack   [headerSize + payloadSize + tagSize]byte
+	offset int
+}
+
+func (r *decryptedReader) Read(p []byte) (n int, err error) {
+	if r.offset > 0 {
+		remaining := header(r.pack[:]).Len() - r.offset
+		if len(p) < remaining {
+			n = copy(p, r.pack[headerSize+r.offset:headerSize+r.offset+len(p)])
+			r.offset += n
+			return
+		}
+		n = copy(p, r.pack[headerSize+r.offset:headerSize+r.offset+remaining])
+		p = p[remaining:]
+		r.offset = 0
+	}
+	for len(p) >= payloadSize {
+		if err = r.readHeader(); err != nil {
+			return n, err
+		}
+		nn, err := r.decrypt(p[:payloadSize])
+		if err != nil {
+			return n, err
+		}
+		p = p[nn:]
+		n += nn
+	}
+	if len(p) > 0 {
+		if err = r.readHeader(); err != nil {
+			return n, err
+		}
+		nn, err := r.decrypt(r.pack[headerSize:])
+		if err != nil {
+			return n, err
+		}
+		if nn < len(p) {
+			r.offset = copy(p, r.pack[headerSize:headerSize+nn])
+		} else {
+			r.offset = copy(p, r.pack[headerSize:headerSize+len(p)])
+		}
+		n += r.offset
+	}
+	return
+}
+
+func (r *decryptedReader) readHeader() error {
+	n, err := io.ReadFull(r.src, header(r.pack[:]))
+	if n != headerSize && err == io.ErrUnexpectedEOF {
+		return errMissingHeader
+	} else if err != nil {
+		return err
+	}
+	return nil
+}
+
+func (r *decryptedReader) decrypt(dst []byte) (n int, err error) {
+	header := header(r.pack[:])
+	if err = r.config.verifyHeader(header); err != nil {
+		return 0, err
+	}
+	if header.SequenceNumber() != r.sequenceNumber {
+		return 0, errPackageOutOfOrder
+	}
+	ciphertext := r.pack[headerSize : headerSize+header.Len()+tagSize]
+	n, err = io.ReadFull(r.src, ciphertext)
+	if err == io.EOF || err == io.ErrUnexpectedEOF {
+		return 0, errPayloadTooShort
+	} else if err != nil {
+		return 0, err
+	}
+	aeadCipher := r.ciphers[header.Cipher()]
+	plaintext, err := aeadCipher.Open(dst[:0], header[4:], ciphertext, header[:4])
+	if err != nil {
+		return 0, errTagMismatch
+	}
+	r.sequenceNumber++
+	return len(plaintext), nil
+}
diff --git a/vendor/github.com/minio/sio/writer.go b/vendor/github.com/minio/sio/writer.go
new file mode 100644
index 000000000..6292da6ce
--- /dev/null
+++ b/vendor/github.com/minio/sio/writer.go
@@ -0,0 +1,193 @@
+// Copyright (C) 2017 Minio Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sio
+
+import (
+	"crypto/cipher"
+	"io"
+)
+
+type decryptedWriter struct {
+	dst    io.Writer
+	config Config
+
+	sequenceNumber uint32
+	ciphers        []cipher.AEAD
+
+	pack   [headerSize + payloadSize + tagSize]byte
+	offset int
+}
+
+func (w *decryptedWriter) Write(p []byte) (n int, err error) {
+	if w.offset > 0 && w.offset < headerSize {
+		remaining := headerSize - w.offset
+		if len(p) < remaining {
+			n = copy(w.pack[w.offset:], p)
+			w.offset += n
+			return
+		}
+		n = copy(w.pack[w.offset:], p[:remaining])
+		p = p[remaining:]
+		w.offset += n
+	}
+	if w.offset >= headerSize {
+		remaining := headerSize + header(w.pack[:]).Len() + tagSize - w.offset
+		if len(p) < remaining {
+			nn := copy(w.pack[w.offset:], p)
+			w.offset += nn
+			return n + nn, err
+		}
+		n += copy(w.pack[w.offset:], p[:remaining])
+		if err = w.decrypt(w.pack[:]); err != nil {
+			return n, err
+		}
+		p = p[remaining:]
+		w.offset = 0
+	}
+	for len(p) > headerSize {
+		header := header(p)
+		if len(p) < headerSize+header.Len()+tagSize {
+			w.offset = copy(w.pack[:], p)
+			n += w.offset
+			return
+		}
+		if err = w.decrypt(p); err != nil {
+			return n, err
+		}
+		p = p[headerSize+header.Len()+tagSize:]
+		n += headerSize + header.Len() + tagSize
+	}
+	w.offset = copy(w.pack[:], p)
+	n += w.offset
+	return
+}
+
+func (w *decryptedWriter) Close() error {
+	if w.offset > 0 {
+		if w.offset < headerSize {
+			return errMissingHeader
+		}
+		if w.offset < headerSize+header(w.pack[:]).Len()+tagSize {
+			return errPayloadTooShort
+		}
+		if err := w.decrypt(w.pack[:]); err != nil {
+			return err
+		}
+	}
+	if dst, ok := w.dst.(io.Closer); ok {
+		return dst.Close()
+	}
+	return nil
+}
+
+func (w *decryptedWriter) decrypt(src []byte) error {
+	header := header(src)
+	if err := w.config.verifyHeader(header); err != nil {
+		return err
+	}
+	if header.SequenceNumber() != w.sequenceNumber {
+		return errPackageOutOfOrder
+	}
+
+	aeadCipher := w.ciphers[header.Cipher()]
+	plaintext, err := aeadCipher.Open(w.pack[headerSize:headerSize], header[4:headerSize], src[headerSize:headerSize+header.Len()+tagSize], header[:4])
+	if err != nil {
+		return errTagMismatch
+	}
+
+	n, err := w.dst.Write(plaintext)
+	if err != nil {
+		return err
+	}
+	if n != len(plaintext) {
+		return io.ErrShortWrite
+	}
+
+	w.sequenceNumber++
+	return nil
+}
+
+type encryptedWriter struct {
+	dst    io.Writer
+	config Config
+
+	nonce          [8]byte
+	sequenceNumber uint32
+	cipher         cipher.AEAD
+
+	pack   [headerSize + payloadSize + tagSize]byte
+	offset int
+}
+
+func (w *encryptedWriter) Write(p []byte) (n int, err error) {
+	if w.offset > 0 {
+		remaining := payloadSize - w.offset
+		if len(p) < remaining {
+			n = copy(w.pack[headerSize+w.offset:], p)
+			w.offset += n
+			return
+		}
+		n = copy(w.pack[headerSize+w.offset:], p[:remaining])
+		if err = w.encrypt(w.pack[headerSize : headerSize+payloadSize]); err != nil {
+			return
+		}
+		p = p[remaining:]
+		w.offset = 0
+	}
+	for len(p) >= payloadSize {
+		if err = w.encrypt(p[:payloadSize]); err != nil {
+			return
+		}
+		p = p[payloadSize:]
+		n += payloadSize
+	}
+	if len(p) > 0 {
+		w.offset = copy(w.pack[headerSize:], p)
+		n += w.offset
+	}
+	return
+}
+
+func (w *encryptedWriter) Close() error {
+	if w.offset > 0 {
+		return w.encrypt(w.pack[headerSize : headerSize+w.offset])
+	}
+	if dst, ok := w.dst.(io.Closer); ok {
+		return dst.Close()
+	}
+	return nil
+}
+
+func (w *encryptedWriter) encrypt(src []byte) error {
+	header := header(w.pack[:])
+	header.SetVersion(w.config.MaxVersion)
+	header.SetCipher(w.config.CipherSuites[0])
+	header.SetLen(len(src))
+	header.SetSequenceNumber(w.sequenceNumber)
+	header.SetNonce(w.nonce)
+
+	w.cipher.Seal(w.pack[headerSize:headerSize], header[4:headerSize], src, header[:4])
+
+	n, err := w.dst.Write(w.pack[:headerSize+len(src)+tagSize])
+	if err != nil {
+		return err
+	}
+	if n != headerSize+len(src)+tagSize {
+		return io.ErrShortWrite
+	}
+
+	w.sequenceNumber++
+	return nil
+}
diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE
new file mode 100644
index 000000000..6a66aea5e
--- /dev/null
+++ b/vendor/golang.org/x/crypto/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/golang.org/x/crypto/PATENTS b/vendor/golang.org/x/crypto/PATENTS
new file mode 100644
index 000000000..733099041
--- /dev/null
+++ b/vendor/golang.org/x/crypto/PATENTS
@@ -0,0 +1,22 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the Go project.
+
+Google hereby grants to You a perpetual, worldwide, non-exclusive,
+no-charge, royalty-free, irrevocable (except as stated in this section)
+patent license to make, have made, use, offer to sell, sell, import,
+transfer and otherwise run, modify and propagate the contents of this
+implementation of Go, where such license applies only to those patent
+claims, both currently owned or controlled by Google and acquired in
+the future, licensable by Google that are necessarily infringed by this
+implementation of Go.  This grant does not include claims that would be
+infringed only as a consequence of further modification of this
+implementation.  If you or your agent or exclusive licensee institute or
+order or agree to the institution of patent litigation against any
+entity (including a cross-claim or counterclaim in a lawsuit) alleging
+that this implementation of Go or any code incorporated within this
+implementation of Go constitutes direct or contributory patent
+infringement, or inducement of patent infringement, then any patent
+rights granted to you under this License for this implementation of Go
+shall terminate as of the date such litigation is filed.
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
new file mode 100644
index 000000000..3f0dcb9d8
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go
@@ -0,0 +1,83 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD as specified in RFC 7539.
+package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305"
+
+import (
+	"crypto/cipher"
+	"errors"
+)
+
+const (
+	// KeySize is the size of the key used by this AEAD, in bytes.
+	KeySize = 32
+	// NonceSize is the size of the nonce used with this AEAD, in bytes.
+	NonceSize = 12
+)
+
+type chacha20poly1305 struct {
+	key [32]byte
+}
+
+// New returns a ChaCha20-Poly1305 AEAD that uses the given, 256-bit key.
+func New(key []byte) (cipher.AEAD, error) {
+	if len(key) != KeySize {
+		return nil, errors.New("chacha20poly1305: bad key length")
+	}
+	ret := new(chacha20poly1305)
+	copy(ret.key[:], key)
+	return ret, nil
+}
+
+func (c *chacha20poly1305) NonceSize() int {
+	return NonceSize
+}
+
+func (c *chacha20poly1305) Overhead() int {
+	return 16
+}
+
+func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	if len(nonce) != NonceSize {
+		panic("chacha20poly1305: bad nonce length passed to Seal")
+	}
+
+	if uint64(len(plaintext)) > (1<<38)-64 {
+		panic("chacha20poly1305: plaintext too large")
+	}
+
+	return c.seal(dst, nonce, plaintext, additionalData)
+}
+
+var errOpen = errors.New("chacha20poly1305: message authentication failed")
+
+func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	if len(nonce) != NonceSize {
+		panic("chacha20poly1305: bad nonce length passed to Open")
+	}
+	if len(ciphertext) < 16 {
+		return nil, errOpen
+	}
+	if uint64(len(ciphertext)) > (1<<38)-48 {
+		panic("chacha20poly1305: ciphertext too large")
+	}
+
+	return c.open(dst, nonce, ciphertext, additionalData)
+}
+
+// sliceForAppend takes a slice and a requested number of bytes. It returns a
+// slice with the contents of the given slice followed by that many bytes and a
+// second slice that aliases into it and contains only the extra bytes. If the
+// original slice has sufficient capacity then no allocation is performed.
+func sliceForAppend(in []byte, n int) (head, tail []byte) {
+	if total := len(in) + n; cap(in) >= total {
+		head = in[:total]
+	} else {
+		head = make([]byte, total)
+		copy(head, in)
+	}
+	tail = head[len(in):]
+	return
+}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go
new file mode 100644
index 000000000..7cd7ad834
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go
@@ -0,0 +1,127 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build go1.7,amd64,!gccgo,!appengine
+
+package chacha20poly1305
+
+import "encoding/binary"
+
+//go:noescape
+func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool
+
+//go:noescape
+func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte)
+
+// cpuid is implemented in chacha20poly1305_amd64.s.
+func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+
+// xgetbv with ecx = 0 is implemented in chacha20poly1305_amd64.s.
+func xgetbv() (eax, edx uint32)
+
+var (
+	useASM  bool
+	useAVX2 bool
+)
+
+func init() {
+	detectCPUFeatures()
+}
+
+// detectCPUFeatures is used to detect if cpu instructions
+// used by the functions implemented in assembler in
+// chacha20poly1305_amd64.s are supported.
+func detectCPUFeatures() {
+	maxID, _, _, _ := cpuid(0, 0)
+	if maxID < 1 {
+		return
+	}
+
+	_, _, ecx1, _ := cpuid(1, 0)
+
+	haveSSSE3 := isSet(9, ecx1)
+	useASM = haveSSSE3
+
+	haveOSXSAVE := isSet(27, ecx1)
+
+	osSupportsAVX := false
+	// For XGETBV, OSXSAVE bit is required and sufficient.
+	if haveOSXSAVE {
+		eax, _ := xgetbv()
+		// Check if XMM and YMM registers have OS support.
+		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
+	}
+	haveAVX := isSet(28, ecx1) && osSupportsAVX
+
+	if maxID < 7 {
+		return
+	}
+
+	_, ebx7, _, _ := cpuid(7, 0)
+	haveAVX2 := isSet(5, ebx7) && haveAVX
+	haveBMI2 := isSet(8, ebx7)
+
+	useAVX2 = haveAVX2 && haveBMI2
+}
+
+// isSet checks if bit at bitpos is set in value.
+func isSet(bitpos uint, value uint32) bool {
+	return value&(1<<bitpos) != 0
+}
+
+// setupState writes a ChaCha20 input matrix to state. See
+// https://tools.ietf.org/html/rfc7539#section-2.3.
+func setupState(state *[16]uint32, key *[32]byte, nonce []byte) {
+	state[0] = 0x61707865
+	state[1] = 0x3320646e
+	state[2] = 0x79622d32
+	state[3] = 0x6b206574
+
+	state[4] = binary.LittleEndian.Uint32(key[:4])
+	state[5] = binary.LittleEndian.Uint32(key[4:8])
+	state[6] = binary.LittleEndian.Uint32(key[8:12])
+	state[7] = binary.LittleEndian.Uint32(key[12:16])
+	state[8] = binary.LittleEndian.Uint32(key[16:20])
+	state[9] = binary.LittleEndian.Uint32(key[20:24])
+	state[10] = binary.LittleEndian.Uint32(key[24:28])
+	state[11] = binary.LittleEndian.Uint32(key[28:32])
+
+	state[12] = 0
+	state[13] = binary.LittleEndian.Uint32(nonce[:4])
+	state[14] = binary.LittleEndian.Uint32(nonce[4:8])
+	state[15] = binary.LittleEndian.Uint32(nonce[8:12])
+}
+
+func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	if !useASM {
+		return c.sealGeneric(dst, nonce, plaintext, additionalData)
+	}
+
+	var state [16]uint32
+	setupState(&state, &c.key, nonce)
+
+	ret, out := sliceForAppend(dst, len(plaintext)+16)
+	chacha20Poly1305Seal(out[:], state[:], plaintext, additionalData)
+	return ret
+}
+
+func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	if !useASM {
+		return c.openGeneric(dst, nonce, ciphertext, additionalData)
+	}
+
+	var state [16]uint32
+	setupState(&state, &c.key, nonce)
+
+	ciphertext = ciphertext[:len(ciphertext)-16]
+	ret, out := sliceForAppend(dst, len(ciphertext))
+	if !chacha20Poly1305Open(out, state[:], ciphertext, additionalData) {
+		for i := range out {
+			out[i] = 0
+		}
+		return nil, errOpen
+	}
+
+	return ret, nil
+}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
new file mode 100644
index 000000000..1c57e3894
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -0,0 +1,2714 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+
+// +build go1.7,amd64,!gccgo,!appengine
+
+#include "textflag.h"
+// General register allocation
+#define oup DI
+#define inp SI
+#define inl BX
+#define adp CX // free to reuse, after we hash the additional data
+#define keyp R8 // free to reuse, when we copy the key to stack
+#define itr2 R9 // general iterator
+#define itr1 CX // general iterator
+#define acc0 R10
+#define acc1 R11
+#define acc2 R12
+#define t0 R13
+#define t1 R14
+#define t2 R15
+#define t3 R8
+// Register and stack allocation for the SSE code
+#define rStore (0*16)(BP)
+#define sStore (1*16)(BP)
+#define state1Store (2*16)(BP)
+#define state2Store (3*16)(BP)
+#define tmpStore (4*16)(BP)
+#define ctr0Store (5*16)(BP)
+#define ctr1Store (6*16)(BP)
+#define ctr2Store (7*16)(BP)
+#define ctr3Store (8*16)(BP)
+#define A0 X0
+#define A1 X1
+#define A2 X2
+#define B0 X3
+#define B1 X4
+#define B2 X5
+#define C0 X6
+#define C1 X7
+#define C2 X8
+#define D0 X9
+#define D1 X10
+#define D2 X11
+#define T0 X12
+#define T1 X13
+#define T2 X14
+#define T3 X15
+#define A3 T0
+#define B3 T1
+#define C3 T2
+#define D3 T3
+// Register and stack allocation for the AVX2 code
+#define rsStoreAVX2 (0*32)(BP)
+#define state1StoreAVX2 (1*32)(BP)
+#define state2StoreAVX2 (2*32)(BP)
+#define ctr0StoreAVX2 (3*32)(BP)
+#define ctr1StoreAVX2 (4*32)(BP)
+#define ctr2StoreAVX2 (5*32)(BP)
+#define ctr3StoreAVX2 (6*32)(BP)
+#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
+#define AA0 Y0
+#define AA1 Y5
+#define AA2 Y6
+#define AA3 Y7
+#define BB0 Y14
+#define BB1 Y9
+#define BB2 Y10
+#define BB3 Y11
+#define CC0 Y12
+#define CC1 Y13
+#define CC2 Y8
+#define CC3 Y15
+#define DD0 Y4
+#define DD1 Y1
+#define DD2 Y2
+#define DD3 Y3
+#define TT0 DD3
+#define TT1 AA3
+#define TT2 BB3
+#define TT3 CC3
+// ChaCha20 constants
+DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
+DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
+DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
+DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
+DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
+// <<< 16 with PSHUFB
+DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
+DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
+DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
+// <<< 8 with PSHUFB
+DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
+DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
+DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
+
+DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
+DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
+DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
+
+DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
+DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
+DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
+// Poly1305 key clamp
+DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
+
+DATA ·sseIncMask<>+0x00(SB)/8, $0x1
+DATA ·sseIncMask<>+0x08(SB)/8, $0x0
+// To load/store the last < 16 bytes in a buffer
+DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
+DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
+DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
+DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
+DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
+DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
+DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
+DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
+DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
+DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
+
+GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
+GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
+GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
+GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
+GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
+// No PALIGNR in Go ASM yet (but VPALIGNR is present).
+#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
+#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
+#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
+#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
+#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
+#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
+#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
+#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
+#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
+#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
+#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
+#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
+#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
+#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
+#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
+#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
+#define shiftC0Right shiftC0Left
+#define shiftC1Right shiftC1Left
+#define shiftC2Right shiftC2Left
+#define shiftC3Right shiftC3Left
+#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
+#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
+#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
+#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
+// Some macros
+#define chachaQR(A, B, C, D, T) \
+	PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D                            \
+	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
+	PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D                             \
+	PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
+
+#define chachaQR_AVX2(A, B, C, D, T) \
+	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D                         \
+	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
+	VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D                          \
+	VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
+
+#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
+#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
+#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
+#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
+#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
+
+#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
+#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
+#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
+
+#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
+#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
+// ----------------------------------------------------------------------------
+TEXT polyHashADInternal<>(SB), NOSPLIT, $0
+	// adp points to beginning of additional data
+	// itr2 holds ad length
+	XORQ acc0, acc0
+	XORQ acc1, acc1
+	XORQ acc2, acc2
+	CMPQ itr2, $13
+	JNE  hashADLoop
+
+openFastTLSAD:
+	// Special treatment for the TLS case of 13 bytes
+	MOVQ (adp), acc0
+	MOVQ 5(adp), acc1
+	SHRQ $24, acc1
+	MOVQ $1, acc2
+	polyMul
+	RET
+
+hashADLoop:
+	// Hash in 16 byte chunks
+	CMPQ itr2, $16
+	JB   hashADTail
+	polyAdd(0(adp))
+	LEAQ (1*16)(adp), adp
+	SUBQ $16, itr2
+	polyMul
+	JMP  hashADLoop
+
+hashADTail:
+	CMPQ itr2, $0
+	JE   hashADDone
+
+	// Hash last < 16 byte tail
+	XORQ t0, t0
+	XORQ t1, t1
+	XORQ t2, t2
+	ADDQ itr2, adp
+
+hashADTailLoop:
+	SHLQ $8, t1:t0
+	SHLQ $8, t0
+	MOVB -1(adp), t2
+	XORQ t2, t0
+	DECQ adp
+	DECQ itr2
+	JNE  hashADTailLoop
+
+hashADTailFinish:
+	ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+
+	// Finished AD
+hashADDone:
+	RET
+
+// ----------------------------------------------------------------------------
+// func chacha20Poly1305Open(dst, key, src, ad []byte) bool
+TEXT ·chacha20Poly1305Open(SB), 0, $288-97
+	// For aligned stack access
+	MOVQ SP, BP
+	ADDQ $32, BP
+	ANDQ $-32, BP
+	MOVQ dst+0(FP), oup
+	MOVQ key+24(FP), keyp
+	MOVQ src+48(FP), inp
+	MOVQ src_len+56(FP), inl
+	MOVQ ad+72(FP), adp
+
+	// Check for AVX2 support
+	CMPB ·useAVX2(SB), $1
+	JE   chacha20Poly1305Open_AVX2
+
+	// Special optimization, for very short buffers
+	CMPQ inl, $128
+	JBE  openSSE128 // About 16% faster
+
+	// For long buffers, prepare the poly key first
+	MOVOU ·chacha20Constants<>(SB), A0
+	MOVOU (1*16)(keyp), B0
+	MOVOU (2*16)(keyp), C0
+	MOVOU (3*16)(keyp), D0
+	MOVO  D0, T1
+
+	// Store state on stack for future use
+	MOVO B0, state1Store
+	MOVO C0, state2Store
+	MOVO D0, ctr3Store
+	MOVQ $10, itr2
+
+openSSEPreparePolyKey:
+	chachaQR(A0, B0, C0, D0, T0)
+	shiftB0Left;  shiftC0Left; shiftD0Left
+	chachaQR(A0, B0, C0, D0, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	DECQ          itr2
+	JNE           openSSEPreparePolyKey
+
+	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
+
+	// Clamp and store the key
+	PAND ·polyClampMask<>(SB), A0
+	MOVO A0, rStore; MOVO B0, sStore
+
+	// Hash AAD
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+
+openSSEMainLoop:
+	CMPQ inl, $256
+	JB   openSSEMainLoopDone
+
+	// Load state, increment counter blocks
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+	// Store counters
+	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+
+	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+	MOVQ $4, itr1
+	MOVQ inp, itr2
+
+openSSEInternalLoop:
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	polyAdd(0(itr2))
+	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
+	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
+	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
+	polyMulStage1
+	polyMulStage2
+	LEAQ          (2*8)(itr2), itr2
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	polyMulStage3
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	polyMulReduceStage
+	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+	DECQ          itr1
+	JGE           openSSEInternalLoop
+
+	polyAdd(0(itr2))
+	polyMul
+	LEAQ (2*8)(itr2), itr2
+
+	CMPQ itr1, $-6
+	JG   openSSEInternalLoop
+
+	// Add in the state
+	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+
+	// Load - xor - store
+	MOVO  D3, tmpStore
+	MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
+	MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
+	MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
+	MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
+	MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
+	MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
+	MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
+	MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
+	MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
+	MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
+	MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
+	MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
+	MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
+	MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
+	MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
+	MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
+	LEAQ  256(inp), inp
+	LEAQ  256(oup), oup
+	SUBQ  $256, inl
+	JMP   openSSEMainLoop
+
+openSSEMainLoopDone:
+	// Handle the various tail sizes efficiently
+	TESTQ inl, inl
+	JE    openSSEFinalize
+	CMPQ  inl, $64
+	JBE   openSSETail64
+	CMPQ  inl, $128
+	JBE   openSSETail128
+	CMPQ  inl, $192
+	JBE   openSSETail192
+	JMP   openSSETail256
+
+openSSEFinalize:
+	// Hash in the PT, AAD lengths
+	ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
+	polyMul
+
+	// Final reduce
+	MOVQ    acc0, t0
+	MOVQ    acc1, t1
+	MOVQ    acc2, t2
+	SUBQ    $-5, acc0
+	SBBQ    $-1, acc1
+	SBBQ    $3, acc2
+	CMOVQCS t0, acc0
+	CMOVQCS t1, acc1
+	CMOVQCS t2, acc2
+
+	// Add in the "s" part of the key
+	ADDQ 0+sStore, acc0
+	ADCQ 8+sStore, acc1
+
+	// Finally, constant time compare to the tag at the end of the message
+	XORQ    AX, AX
+	MOVQ    $1, DX
+	XORQ    (0*8)(inp), acc0
+	XORQ    (1*8)(inp), acc1
+	ORQ     acc1, acc0
+	CMOVQEQ DX, AX
+
+	// Return true iff tags are equal
+	MOVB AX, ret+96(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 129 bytes
+openSSE128:
+	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
+	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
+	MOVQ  $10, itr2
+
+openSSE128InnerCipherLoop:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left;  shiftB1Left; shiftB2Left
+	shiftC0Left;  shiftC1Left; shiftC2Left
+	shiftD0Left;  shiftD1Left; shiftD2Left
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftB1Right; shiftB2Right
+	shiftC0Right; shiftC1Right; shiftC2Right
+	shiftD0Right; shiftD1Right; shiftD2Right
+	DECQ          itr2
+	JNE           openSSE128InnerCipherLoop
+
+	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
+	PADDL T2, C1; PADDL T2, C2
+	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+
+	// Clamp and store the key
+	PAND  ·polyClampMask<>(SB), A0
+	MOVOU A0, rStore; MOVOU B0, sStore
+
+	// Hash
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+
+openSSE128Open:
+	CMPQ inl, $16
+	JB   openSSETail16
+	SUBQ $16, inl
+
+	// Load for hashing
+	polyAdd(0(inp))
+
+	// Load for decryption
+	MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
+	LEAQ  (1*16)(inp), inp
+	LEAQ  (1*16)(oup), oup
+	polyMul
+
+	// Shift the stream "left"
+	MOVO B1, A1
+	MOVO C1, B1
+	MOVO D1, C1
+	MOVO A2, D1
+	MOVO B2, A2
+	MOVO C2, B2
+	MOVO D2, C2
+	JMP  openSSE128Open
+
+openSSETail16:
+	TESTQ inl, inl
+	JE    openSSEFinalize
+
+	// We can safely load the CT from the end, because it is padded with the MAC
+	MOVQ   inl, itr2
+	SHLQ   $4, itr2
+	LEAQ   ·andMask<>(SB), t0
+	MOVOU  (inp), T0
+	ADDQ   inl, inp
+	PAND   -16(t0)(itr2*1), T0
+	MOVO   T0, 0+tmpStore
+	MOVQ   T0, t0
+	MOVQ   8+tmpStore, t1
+	PXOR   A1, T0
+
+	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
+openSSETail16Store:
+	MOVQ T0, t3
+	MOVB t3, (oup)
+	PSRLDQ $1, T0
+	INCQ   oup
+	DECQ   inl
+	JNE    openSSETail16Store
+	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+	JMP    openSSEFinalize
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of ciphertext
+openSSETail64:
+	// Need to decrypt up to 64 bytes - prepare single block
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+	XORQ itr2, itr2
+	MOVQ inl, itr1
+	CMPQ itr1, $16
+	JB   openSSETail64LoopB
+
+openSSETail64LoopA:
+	// Perform ChaCha rounds, while hashing the remaining input
+	polyAdd(0(inp)(itr2*1))
+	polyMul
+	SUBQ $16, itr1
+
+openSSETail64LoopB:
+	ADDQ          $16, itr2
+	chachaQR(A0, B0, C0, D0, T0)
+	shiftB0Left;  shiftC0Left; shiftD0Left
+	chachaQR(A0, B0, C0, D0, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+
+	CMPQ itr1, $16
+	JAE  openSSETail64LoopA
+
+	CMPQ itr2, $160
+	JNE  openSSETail64LoopB
+
+	PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
+
+openSSETail64DecLoop:
+	CMPQ  inl, $16
+	JB    openSSETail64DecLoopDone
+	SUBQ  $16, inl
+	MOVOU (inp), T0
+	PXOR  T0, A0
+	MOVOU A0, (oup)
+	LEAQ  16(inp), inp
+	LEAQ  16(oup), oup
+	MOVO  B0, A0
+	MOVO  C0, B0
+	MOVO  D0, C0
+	JMP   openSSETail64DecLoop
+
+openSSETail64DecLoopDone:
+	MOVO A0, A1
+	JMP  openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+openSSETail128:
+	// Need to decrypt up to 128 bytes - prepare two blocks
+	MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
+	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
+	XORQ itr2, itr2
+	MOVQ inl, itr1
+	ANDQ $-16, itr1
+
+openSSETail128LoopA:
+	// Perform ChaCha rounds, while hashing the remaining input
+	polyAdd(0(inp)(itr2*1))
+	polyMul
+
+openSSETail128LoopB:
+	ADDQ          $16, itr2
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+	shiftB0Left;  shiftC0Left; shiftD0Left
+	shiftB1Left;  shiftC1Left; shiftD1Left
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	shiftB1Right; shiftC1Right; shiftD1Right
+
+	CMPQ itr2, itr1
+	JB   openSSETail128LoopA
+
+	CMPQ itr2, $160
+	JNE  openSSETail128LoopB
+
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
+	PADDL state1Store, B0; PADDL state1Store, B1
+	PADDL state2Store, C0; PADDL state2Store, C1
+	PADDL ctr1Store, D0; PADDL ctr0Store, D1
+
+	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
+
+	SUBQ $64, inl
+	LEAQ 64(inp), inp
+	LEAQ 64(oup), oup
+	JMP  openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of ciphertext
+openSSETail192:
+	// Need to decrypt up to 192 bytes - prepare three blocks
+	MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
+	MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+	MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
+
+	MOVQ    inl, itr1
+	MOVQ    $160, itr2
+	CMPQ    itr1, $160
+	CMOVQGT itr2, itr1
+	ANDQ    $-16, itr1
+	XORQ    itr2, itr2
+
+openSSLTail192LoopA:
+	// Perform ChaCha rounds, while hashing the remaining input
+	polyAdd(0(inp)(itr2*1))
+	polyMul
+
+openSSLTail192LoopB:
+	ADDQ         $16, itr2
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left; shiftC0Left; shiftD0Left
+	shiftB1Left; shiftC1Left; shiftD1Left
+	shiftB2Left; shiftC2Left; shiftD2Left
+
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	shiftB1Right; shiftC1Right; shiftD1Right
+	shiftB2Right; shiftC2Right; shiftD2Right
+
+	CMPQ itr2, itr1
+	JB   openSSLTail192LoopA
+
+	CMPQ itr2, $160
+	JNE  openSSLTail192LoopB
+
+	CMPQ inl, $176
+	JB   openSSLTail192Store
+
+	polyAdd(160(inp))
+	polyMul
+
+	CMPQ inl, $192
+	JB   openSSLTail192Store
+
+	polyAdd(176(inp))
+	polyMul
+
+openSSLTail192Store:
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
+	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
+	PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
+
+	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+	PXOR  T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
+	MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
+
+	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
+	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+
+	SUBQ $128, inl
+	LEAQ 128(inp), inp
+	LEAQ 128(oup), oup
+	JMP  openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+openSSETail256:
+	// Need to decrypt up to 256 bytes - prepare four blocks
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+	// Store counters
+	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	XORQ itr2, itr2
+
+openSSETail256Loop:
+	// This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication
+	polyAdd(0(inp)(itr2*1))
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
+	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
+	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
+	polyMulStage1
+	polyMulStage2
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	polyMulStage3
+	polyMulReduceStage
+	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+	ADDQ          $2*8, itr2
+	CMPQ          itr2, $160
+	JB            openSSETail256Loop
+	MOVQ          inl, itr1
+	ANDQ          $-16, itr1
+
+openSSETail256HashLoop:
+	polyAdd(0(inp)(itr2*1))
+	polyMul
+	ADDQ $2*8, itr2
+	CMPQ itr2, itr1
+	JB   openSSETail256HashLoop
+
+	// Add in the state
+	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	MOVO  D3, tmpStore
+
+	// Load - xor - store
+	MOVOU (0*16)(inp), D3; PXOR D3, A0
+	MOVOU (1*16)(inp), D3; PXOR D3, B0
+	MOVOU (2*16)(inp), D3; PXOR D3, C0
+	MOVOU (3*16)(inp), D3; PXOR D3, D0
+	MOVOU A0, (0*16)(oup)
+	MOVOU B0, (1*16)(oup)
+	MOVOU C0, (2*16)(oup)
+	MOVOU D0, (3*16)(oup)
+	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
+	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
+	LEAQ  192(inp), inp
+	LEAQ  192(oup), oup
+	SUBQ  $192, inl
+	MOVO  A3, A0
+	MOVO  B3, B0
+	MOVO  C3, C0
+	MOVO  tmpStore, D0
+
+	JMP openSSETail64DecLoop
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+chacha20Poly1305Open_AVX2:
+	VZEROUPPER
+	VMOVDQU ·chacha20Constants<>(SB), AA0
+	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
+	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
+	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
+	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+
+	// Special optimization, for very short buffers
+	CMPQ inl, $192
+	JBE  openAVX2192
+	CMPQ inl, $320
+	JBE  openAVX2320
+
+	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
+	VMOVDQA BB0, state1StoreAVX2
+	VMOVDQA CC0, state2StoreAVX2
+	VMOVDQA DD0, ctr3StoreAVX2
+	MOVQ    $10, itr2
+
+openAVX2PreparePolyKey:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+	DECQ     itr2
+	JNE      openAVX2PreparePolyKey
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA0
+	VPADDD state1StoreAVX2, BB0, BB0
+	VPADDD state2StoreAVX2, CC0, CC0
+	VPADDD ctr3StoreAVX2, DD0, DD0
+
+	VPERM2I128 $0x02, AA0, BB0, TT0
+
+	// Clamp and store poly key
+	VPAND   ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA TT0, rsStoreAVX2
+
+	// Stream for the first 64 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+
+	// Hash AD + first 64 bytes
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+	XORQ itr1, itr1
+
+openAVX2InitialHash64:
+	polyAdd(0(inp)(itr1*1))
+	polyMulAVX2
+	ADDQ $16, itr1
+	CMPQ itr1, $64
+	JNE  openAVX2InitialHash64
+
+	// Decrypt the first 64 bytes
+	VPXOR   (0*32)(inp), AA0, AA0
+	VPXOR   (1*32)(inp), BB0, BB0
+	VMOVDQU AA0, (0*32)(oup)
+	VMOVDQU BB0, (1*32)(oup)
+	LEAQ    (2*32)(inp), inp
+	LEAQ    (2*32)(oup), oup
+	SUBQ    $64, inl
+
+openAVX2MainLoop:
+	CMPQ inl, $512
+	JB   openAVX2MainLoopDone
+
+	// Load state, increment counter blocks, store the incremented counters
+	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+	XORQ    itr1, itr1
+
+openAVX2InternalLoop:
+	// Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
+	// Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
+	polyAdd(0*8(inp)(itr1*1))
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	polyMulStage1_AVX2
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	polyMulStage2_AVX2
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyMulStage3_AVX2
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulReduceStage
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	polyAdd(2*8(inp)(itr1*1))
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	polyMulStage1_AVX2
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulStage2_AVX2
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	polyMulStage3_AVX2
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	polyMulReduceStage
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyAdd(4*8(inp)(itr1*1))
+	LEAQ     (6*8)(itr1), itr1
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulStage1_AVX2
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	polyMulStage2_AVX2
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	polyMulStage3_AVX2
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulReduceStage
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+	CMPQ     itr1, $480
+	JNE      openAVX2InternalLoop
+
+	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+	VMOVDQA CC3, tmpStoreAVX2
+
+	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
+	polyAdd(480(inp))
+	polyMulAVX2
+	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+	// and here
+	polyAdd(496(inp))
+	polyMulAVX2
+	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
+	LEAQ       (32*16)(inp), inp
+	LEAQ       (32*16)(oup), oup
+	SUBQ       $(32*16), inl
+	JMP        openAVX2MainLoop
+
+openAVX2MainLoopDone:
+	// Handle the various tail sizes efficiently
+	TESTQ inl, inl
+	JE    openSSEFinalize
+	CMPQ  inl, $128
+	JBE   openAVX2Tail128
+	CMPQ  inl, $256
+	JBE   openAVX2Tail256
+	CMPQ  inl, $384
+	JBE   openAVX2Tail384
+	JMP   openAVX2Tail512
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+openAVX2192:
+	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+	VMOVDQA AA0, AA1
+	VMOVDQA BB0, BB1
+	VMOVDQA CC0, CC1
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA AA0, AA2
+	VMOVDQA BB0, BB2
+	VMOVDQA CC0, CC2
+	VMOVDQA DD0, DD2
+	VMOVDQA DD1, TT3
+	MOVQ    $10, itr2
+
+openAVX2192InnerCipherLoop:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+	DECQ       itr2
+	JNE        openAVX2192InnerCipherLoop
+	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
+	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
+	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
+	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
+	VPERM2I128 $0x02, AA0, BB0, TT0
+
+	// Clamp and store poly key
+	VPAND   ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA TT0, rsStoreAVX2
+
+	// Stream for up to 192 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x02, AA1, BB1, CC0
+	VPERM2I128 $0x02, CC1, DD1, DD0
+	VPERM2I128 $0x13, AA1, BB1, AA1
+	VPERM2I128 $0x13, CC1, DD1, BB1
+
+openAVX2ShortOpen:
+	// Hash
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+
+openAVX2ShortOpenLoop:
+	CMPQ inl, $32
+	JB   openAVX2ShortTail32
+	SUBQ $32, inl
+
+	// Load for hashing
+	polyAdd(0*8(inp))
+	polyMulAVX2
+	polyAdd(2*8(inp))
+	polyMulAVX2
+
+	// Load for decryption
+	VPXOR   (inp), AA0, AA0
+	VMOVDQU AA0, (oup)
+	LEAQ    (1*32)(inp), inp
+	LEAQ    (1*32)(oup), oup
+
+	// Shift stream left
+	VMOVDQA BB0, AA0
+	VMOVDQA CC0, BB0
+	VMOVDQA DD0, CC0
+	VMOVDQA AA1, DD0
+	VMOVDQA BB1, AA1
+	VMOVDQA CC1, BB1
+	VMOVDQA DD1, CC1
+	VMOVDQA AA2, DD1
+	VMOVDQA BB2, AA2
+	JMP     openAVX2ShortOpenLoop
+
+openAVX2ShortTail32:
+	CMPQ    inl, $16
+	VMOVDQA A0, A1
+	JB      openAVX2ShortDone
+
+	SUBQ $16, inl
+
+	// Load for hashing
+	polyAdd(0*8(inp))
+	polyMulAVX2
+
+	// Load for decryption
+	VPXOR      (inp), A0, T0
+	VMOVDQU    T0, (oup)
+	LEAQ       (1*16)(inp), inp
+	LEAQ       (1*16)(oup), oup
+	VPERM2I128 $0x11, AA0, AA0, AA0
+	VMOVDQA    A0, A1
+
+openAVX2ShortDone:
+	VZEROUPPER
+	JMP openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+openAVX2320:
+	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
+	MOVQ    $10, itr2
+
+openAVX2320InnerCipherLoop:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+	DECQ     itr2
+	JNE      openAVX2320InnerCipherLoop
+
+	VMOVDQA ·chacha20Constants<>(SB), TT0
+	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
+	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
+	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
+	VMOVDQA ·avx2IncMask<>(SB), TT0
+	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
+	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
+	VPADDD  TT3, DD2, DD2
+
+	// Clamp and store poly key
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPAND      ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA    TT0, rsStoreAVX2
+
+	// Stream for up to 320 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x02, AA1, BB1, CC0
+	VPERM2I128 $0x02, CC1, DD1, DD0
+	VPERM2I128 $0x13, AA1, BB1, AA1
+	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x02, AA2, BB2, CC1
+	VPERM2I128 $0x02, CC2, DD2, DD1
+	VPERM2I128 $0x13, AA2, BB2, AA2
+	VPERM2I128 $0x13, CC2, DD2, BB2
+	JMP        openAVX2ShortOpen
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+openAVX2Tail128:
+	// Need to decrypt up to 128 bytes - prepare two blocks
+	VMOVDQA ·chacha20Constants<>(SB), AA1
+	VMOVDQA state1StoreAVX2, BB1
+	VMOVDQA state2StoreAVX2, CC1
+	VMOVDQA ctr3StoreAVX2, DD1
+	VPADDD  ·avx2IncMask<>(SB), DD1, DD1
+	VMOVDQA DD1, DD0
+
+	XORQ  itr2, itr2
+	MOVQ  inl, itr1
+	ANDQ  $-16, itr1
+	TESTQ itr1, itr1
+	JE    openAVX2Tail128LoopB
+
+openAVX2Tail128LoopA:
+	// Perform ChaCha rounds, while hashing the remaining input
+	polyAdd(0(inp)(itr2*1))
+	polyMulAVX2
+
+openAVX2Tail128LoopB:
+	ADDQ     $16, itr2
+	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $12, DD1, DD1, DD1
+	chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $4, DD1, DD1, DD1
+	CMPQ     itr2, itr1
+	JB       openAVX2Tail128LoopA
+	CMPQ     itr2, $160
+	JNE      openAVX2Tail128LoopB
+
+	VPADDD     ·chacha20Constants<>(SB), AA1, AA1
+	VPADDD     state1StoreAVX2, BB1, BB1
+	VPADDD     state2StoreAVX2, CC1, CC1
+	VPADDD     DD0, DD1, DD1
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+
+openAVX2TailLoop:
+	CMPQ inl, $32
+	JB   openAVX2Tail
+	SUBQ $32, inl
+
+	// Load for decryption
+	VPXOR   (inp), AA0, AA0
+	VMOVDQU AA0, (oup)
+	LEAQ    (1*32)(inp), inp
+	LEAQ    (1*32)(oup), oup
+	VMOVDQA BB0, AA0
+	VMOVDQA CC0, BB0
+	VMOVDQA DD0, CC0
+	JMP     openAVX2TailLoop
+
+openAVX2Tail:
+	CMPQ    inl, $16
+	VMOVDQA A0, A1
+	JB      openAVX2TailDone
+	SUBQ    $16, inl
+
+	// Load for decryption
+	VPXOR      (inp), A0, T0
+	VMOVDQU    T0, (oup)
+	LEAQ       (1*16)(inp), inp
+	LEAQ       (1*16)(oup), oup
+	VPERM2I128 $0x11, AA0, AA0, AA0
+	VMOVDQA    A0, A1
+
+openAVX2TailDone:
+	VZEROUPPER
+	JMP openSSETail16
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+openAVX2Tail256:
+	// Need to decrypt up to 256 bytes - prepare four blocks
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA DD0, TT1
+	VMOVDQA DD1, TT2
+
+	// Compute the number of iterations that will hash data
+	MOVQ    inl, tmpStoreAVX2
+	MOVQ    inl, itr1
+	SUBQ    $128, itr1
+	SHRQ    $4, itr1
+	MOVQ    $10, itr2
+	CMPQ    itr1, $10
+	CMOVQGT itr2, itr1
+	MOVQ    inp, inl
+	XORQ    itr2, itr2
+
+openAVX2Tail256LoopA:
+	polyAdd(0(inl))
+	polyMulAVX2
+	LEAQ 16(inl), inl
+
+	// Perform ChaCha rounds, while hashing the remaining input
+openAVX2Tail256LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+	INCQ     itr2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+	CMPQ     itr2, itr1
+	JB       openAVX2Tail256LoopA
+
+	CMPQ itr2, $10
+	JNE  openAVX2Tail256LoopB
+
+	MOVQ inl, itr2
+	SUBQ inp, inl
+	MOVQ inl, itr1
+	MOVQ tmpStoreAVX2, inl
+
+	// Hash the remainder of data (if any)
+openAVX2Tail256Hash:
+	ADDQ $16, itr1
+	CMPQ itr1, inl
+	JGT  openAVX2Tail256HashEnd
+	polyAdd (0(itr2))
+	polyMulAVX2
+	LEAQ 16(itr2), itr2
+	JMP  openAVX2Tail256Hash
+
+// Store 128 bytes safely, then go to store loop
+openAVX2Tail256HashEnd:
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
+	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
+	VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+
+	VPXOR   (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
+	VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
+	LEAQ    (4*32)(inp), inp
+	LEAQ    (4*32)(oup), oup
+	SUBQ    $4*32, inl
+
+	JMP openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext
+openAVX2Tail384:
+	// Need to decrypt up to 384 bytes - prepare six blocks
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
+	VPADDD  ·avx2IncMask<>(SB), DD1, DD2
+	VMOVDQA DD0, ctr0StoreAVX2
+	VMOVDQA DD1, ctr1StoreAVX2
+	VMOVDQA DD2, ctr2StoreAVX2
+
+	// Compute the number of iterations that will hash two blocks of data
+	MOVQ    inl, tmpStoreAVX2
+	MOVQ    inl, itr1
+	SUBQ    $256, itr1
+	SHRQ    $4, itr1
+	ADDQ    $6, itr1
+	MOVQ    $10, itr2
+	CMPQ    itr1, $10
+	CMOVQGT itr2, itr1
+	MOVQ    inp, inl
+	XORQ    itr2, itr2
+
+	// Perform ChaCha rounds, while hashing the remaining input
+openAVX2Tail384LoopB:
+	polyAdd(0(inl))
+	polyMulAVX2
+	LEAQ 16(inl), inl
+
+openAVX2Tail384LoopA:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+	polyAdd(0(inl))
+	polyMulAVX2
+	LEAQ     16(inl), inl
+	INCQ     itr2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+
+	CMPQ itr2, itr1
+	JB   openAVX2Tail384LoopB
+
+	CMPQ itr2, $10
+	JNE  openAVX2Tail384LoopA
+
+	MOVQ inl, itr2
+	SUBQ inp, inl
+	MOVQ inl, itr1
+	MOVQ tmpStoreAVX2, inl
+
+openAVX2Tail384Hash:
+	ADDQ $16, itr1
+	CMPQ itr1, inl
+	JGT  openAVX2Tail384HashEnd
+	polyAdd(0(itr2))
+	polyMulAVX2
+	LEAQ 16(itr2), itr2
+	JMP  openAVX2Tail384Hash
+
+// Store 256 bytes safely, then go to store loop
+openAVX2Tail384HashEnd:
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
+	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
+	VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
+	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
+	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
+	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
+	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+	LEAQ       (8*32)(inp), inp
+	LEAQ       (8*32)(oup), oup
+	SUBQ       $8*32, inl
+	JMP        openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext
+openAVX2Tail512:
+	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+	XORQ    itr1, itr1
+	MOVQ    inp, itr2
+
+openAVX2Tail512LoopB:
+	polyAdd(0(itr2))
+	polyMulAVX2
+	LEAQ (2*8)(itr2), itr2
+
+openAVX2Tail512LoopA:
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyAdd(0*8(itr2))
+	polyMulAVX2
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyAdd(2*8(itr2))
+	polyMulAVX2
+	LEAQ     (4*8)(itr2), itr2
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+	INCQ     itr1
+	CMPQ     itr1, $4
+	JLT      openAVX2Tail512LoopB
+
+	CMPQ itr1, $10
+	JNE  openAVX2Tail512LoopA
+
+	MOVQ inl, itr1
+	SUBQ $384, itr1
+	ANDQ $-16, itr1
+
+openAVX2Tail512HashLoop:
+	TESTQ itr1, itr1
+	JE    openAVX2Tail512HashEnd
+	polyAdd(0(itr2))
+	polyMulAVX2
+	LEAQ  16(itr2), itr2
+	SUBQ  $16, itr1
+	JMP   openAVX2Tail512HashLoop
+
+openAVX2Tail512HashEnd:
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+	VMOVDQA    CC3, tmpStoreAVX2
+	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+
+	LEAQ (12*32)(inp), inp
+	LEAQ (12*32)(oup), oup
+	SUBQ $12*32, inl
+
+	JMP openAVX2TailLoop
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// func chacha20Poly1305Seal(dst, key, src, ad []byte)
+TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
+	// For aligned stack access
+	MOVQ SP, BP
+	ADDQ $32, BP
+	ANDQ $-32, BP
+	MOVQ dst+0(FP), oup
+	MOVQ key+24(FP), keyp
+	MOVQ src+48(FP), inp
+	MOVQ src_len+56(FP), inl
+	MOVQ ad+72(FP), adp
+
+	CMPB ·useAVX2(SB), $1
+	JE   chacha20Poly1305Seal_AVX2
+
+	// Special optimization, for very short buffers
+	CMPQ inl, $128
+	JBE  sealSSE128 // About 15% faster
+
+	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
+	MOVOU ·chacha20Constants<>(SB), A0
+	MOVOU (1*16)(keyp), B0
+	MOVOU (2*16)(keyp), C0
+	MOVOU (3*16)(keyp), D0
+
+	// Store state on stack for future use
+	MOVO B0, state1Store
+	MOVO C0, state2Store
+
+	// Load state, increment counter blocks
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+	// Store counters
+	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+	MOVQ $10, itr2
+
+sealSSEIntroLoop:
+	MOVO         C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO         tmpStore, C3
+	MOVO         C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO         tmpStore, C1
+	shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
+	shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
+	shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
+
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+	DECQ          itr2
+	JNE           sealSSEIntroLoop
+
+	// Add in the state
+	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+	PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+	PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+
+	// Clamp and store the key
+	PAND ·polyClampMask<>(SB), A0
+	MOVO A0, rStore
+	MOVO B0, sStore
+
+	// Hash AAD
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+
+	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+	MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
+	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+	MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
+
+	MOVQ $128, itr1
+	SUBQ $128, inl
+	LEAQ 128(inp), inp
+
+	MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
+
+	CMPQ inl, $64
+	JBE  sealSSE128SealHash
+
+	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
+	MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
+
+	ADDQ $64, itr1
+	SUBQ $64, inl
+	LEAQ 64(inp), inp
+
+	MOVQ $2, itr1
+	MOVQ $8, itr2
+
+	CMPQ inl, $64
+	JBE  sealSSETail64
+	CMPQ inl, $128
+	JBE  sealSSETail128
+	CMPQ inl, $192
+	JBE  sealSSETail192
+
+sealSSEMainLoop:
+	// Load state, increment counter blocks
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
+
+	// Store counters
+	MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
+
+sealSSEInnerLoop:
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	polyAdd(0(oup))
+	shiftB0Left;  shiftB1Left; shiftB2Left; shiftB3Left
+	shiftC0Left;  shiftC1Left; shiftC2Left; shiftC3Left
+	shiftD0Left;  shiftD1Left; shiftD2Left; shiftD3Left
+	polyMulStage1
+	polyMulStage2
+	LEAQ          (2*8)(oup), oup
+	MOVO          C3, tmpStore
+	chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
+	MOVO          tmpStore, C3
+	MOVO          C1, tmpStore
+	polyMulStage3
+	chachaQR(A3, B3, C3, D3, C1)
+	MOVO          tmpStore, C1
+	polyMulReduceStage
+	shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
+	shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
+	shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
+	DECQ          itr2
+	JGE           sealSSEInnerLoop
+	polyAdd(0(oup))
+	polyMul
+	LEAQ          (2*8)(oup), oup
+	DECQ          itr1
+	JG            sealSSEInnerLoop
+
+	// Add in the state
+	PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
+	PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
+	PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
+	PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
+	MOVO  D3, tmpStore
+
+	// Load - xor - store
+	MOVOU (0*16)(inp), D3; PXOR D3, A0
+	MOVOU (1*16)(inp), D3; PXOR D3, B0
+	MOVOU (2*16)(inp), D3; PXOR D3, C0
+	MOVOU (3*16)(inp), D3; PXOR D3, D0
+	MOVOU A0, (0*16)(oup)
+	MOVOU B0, (1*16)(oup)
+	MOVOU C0, (2*16)(oup)
+	MOVOU D0, (3*16)(oup)
+	MOVO  tmpStore, D3
+
+	MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
+	PXOR  A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
+	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+	MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
+	PXOR  A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
+	MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
+	ADDQ  $192, inp
+	MOVQ  $192, itr1
+	SUBQ  $192, inl
+	MOVO  A3, A1
+	MOVO  B3, B1
+	MOVO  C3, C1
+	MOVO  D3, D1
+	CMPQ  inl, $64
+	JBE   sealSSE128SealHash
+	MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
+	PXOR  A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
+	MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
+	LEAQ  64(inp), inp
+	SUBQ  $64, inl
+	MOVQ  $6, itr1
+	MOVQ  $4, itr2
+	CMPQ  inl, $192
+	JG    sealSSEMainLoop
+
+	MOVQ  inl, itr1
+	TESTQ inl, inl
+	JE    sealSSE128SealHash
+	MOVQ  $6, itr1
+	CMPQ  inl, $64
+	JBE   sealSSETail64
+	CMPQ  inl, $128
+	JBE   sealSSETail128
+	JMP   sealSSETail192
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 64 bytes of plaintext
+sealSSETail64:
+	// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
+	MOVO  ·chacha20Constants<>(SB), A1
+	MOVO  state1Store, B1
+	MOVO  state2Store, C1
+	MOVO  ctr3Store, D1
+	PADDL ·sseIncMask<>(SB), D1
+	MOVO  D1, ctr0Store
+
+sealSSETail64LoopA:
+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealSSETail64LoopB:
+	chachaQR(A1, B1, C1, D1, T1)
+	shiftB1Left;  shiftC1Left; shiftD1Left
+	chachaQR(A1, B1, C1, D1, T1)
+	shiftB1Right; shiftC1Right; shiftD1Right
+	polyAdd(0(oup))
+	polyMul
+	LEAQ          16(oup), oup
+
+	DECQ itr1
+	JG   sealSSETail64LoopA
+
+	DECQ  itr2
+	JGE   sealSSETail64LoopB
+	PADDL ·chacha20Constants<>(SB), A1
+	PADDL state1Store, B1
+	PADDL state2Store, C1
+	PADDL ctr0Store, D1
+
+	JMP sealSSE128Seal
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of plaintext
+sealSSETail128:
+	// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+
+sealSSETail128LoopA:
+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealSSETail128LoopB:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+	shiftB0Left;  shiftC0Left; shiftD0Left
+	shiftB1Left;  shiftC1Left; shiftD1Left
+	polyAdd(0(oup))
+	polyMul
+	LEAQ          16(oup), oup
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	shiftB1Right; shiftC1Right; shiftD1Right
+
+	DECQ itr1
+	JG   sealSSETail128LoopA
+
+	DECQ itr2
+	JGE  sealSSETail128LoopB
+
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
+	PADDL state1Store, B0; PADDL state1Store, B1
+	PADDL state2Store, C0; PADDL state2Store, C1
+	PADDL ctr0Store, D0; PADDL ctr1Store, D1
+
+	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
+	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
+
+	MOVQ $64, itr1
+	LEAQ 64(inp), inp
+	SUBQ $64, inl
+
+	JMP sealSSE128SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 192 bytes of plaintext
+sealSSETail192:
+	// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
+	MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
+	MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
+	MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
+
+sealSSETail192LoopA:
+	// Perform ChaCha rounds, while hashing the previously encrypted ciphertext
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealSSETail192LoopB:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left; shiftC0Left; shiftD0Left
+	shiftB1Left; shiftC1Left; shiftD1Left
+	shiftB2Left; shiftC2Left; shiftD2Left
+
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftC0Right; shiftD0Right
+	shiftB1Right; shiftC1Right; shiftD1Right
+	shiftB2Right; shiftC2Right; shiftD2Right
+
+	DECQ itr1
+	JG   sealSSETail192LoopA
+
+	DECQ itr2
+	JGE  sealSSETail192LoopB
+
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
+	PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
+	PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
+
+	MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
+	PXOR  T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
+	MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
+	MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
+	PXOR  T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
+	MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
+
+	MOVO A2, A1
+	MOVO B2, B1
+	MOVO C2, C1
+	MOVO D2, D1
+	MOVQ $128, itr1
+	LEAQ 128(inp), inp
+	SUBQ $128, inl
+
+	JMP sealSSE128SealHash
+
+// ----------------------------------------------------------------------------
+// Special seal optimization for buffers smaller than 129 bytes
+sealSSE128:
+	// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
+	MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
+	MOVO  A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
+	MOVO  A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
+	MOVO  B0, T1; MOVO C0, T2; MOVO D1, T3
+	MOVQ  $10, itr2
+
+sealSSE128InnerCipherLoop:
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Left;  shiftB1Left; shiftB2Left
+	shiftC0Left;  shiftC1Left; shiftC2Left
+	shiftD0Left;  shiftD1Left; shiftD2Left
+	chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
+	shiftB0Right; shiftB1Right; shiftB2Right
+	shiftC0Right; shiftC1Right; shiftC2Right
+	shiftD0Right; shiftD1Right; shiftD2Right
+	DECQ          itr2
+	JNE           sealSSE128InnerCipherLoop
+
+	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
+	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
+	PADDL T2, C1; PADDL T2, C2
+	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
+	PAND  ·polyClampMask<>(SB), A0
+	MOVOU A0, rStore
+	MOVOU B0, sStore
+
+	// Hash
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+	XORQ itr1, itr1
+
+sealSSE128SealHash:
+	// itr1 holds the number of bytes encrypted but not yet hashed
+	CMPQ itr1, $16
+	JB   sealSSE128Seal
+	polyAdd(0(oup))
+	polyMul
+
+	SUBQ $16, itr1
+	ADDQ $16, oup
+
+	JMP sealSSE128SealHash
+
+sealSSE128Seal:
+	CMPQ inl, $16
+	JB   sealSSETail
+	SUBQ $16, inl
+
+	// Load for decryption
+	MOVOU (inp), T0
+	PXOR  T0, A1
+	MOVOU A1, (oup)
+	LEAQ  (1*16)(inp), inp
+	LEAQ  (1*16)(oup), oup
+
+	// Extract for hashing
+	MOVQ   A1, t0
+	PSRLDQ $8, A1
+	MOVQ A1, t1
+	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+
+	// Shift the stream "left"
+	MOVO B1, A1
+	MOVO C1, B1
+	MOVO D1, C1
+	MOVO A2, D1
+	MOVO B2, A2
+	MOVO C2, B2
+	MOVO D2, C2
+	JMP  sealSSE128Seal
+
+sealSSETail:
+	TESTQ inl, inl
+	JE    sealSSEFinalize
+
+	// We can only load the PT one byte at a time to avoid read after end of buffer
+	MOVQ inl, itr2
+	SHLQ $4, itr2
+	LEAQ ·andMask<>(SB), t0
+	MOVQ inl, itr1
+	LEAQ -1(inp)(inl*1), inp
+	XORQ t2, t2
+	XORQ t3, t3
+	XORQ AX, AX
+
+sealSSETailLoadLoop:
+	SHLQ $8, t2, t3
+	SHLQ $8, t2
+	MOVB (inp), AX
+	XORQ AX, t2
+	LEAQ   -1(inp), inp
+	DECQ   itr1
+	JNE    sealSSETailLoadLoop
+	MOVQ t2, 0+tmpStore
+	MOVQ t3, 8+tmpStore
+	PXOR 0+tmpStore, A1
+	MOVOU  A1, (oup)
+	MOVOU  -16(t0)(itr2*1), T0
+	PAND   T0, A1
+	MOVQ   A1, t0
+	PSRLDQ $8, A1
+	MOVQ   A1, t1
+	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
+	polyMul
+
+	ADDQ inl, oup
+
+sealSSEFinalize:
+	// Hash in the buffer lengths
+	ADDQ ad_len+80(FP), acc0
+	ADCQ src_len+56(FP), acc1
+	ADCQ $1, acc2
+	polyMul
+
+	// Final reduce
+	MOVQ    acc0, t0
+	MOVQ    acc1, t1
+	MOVQ    acc2, t2
+	SUBQ    $-5, acc0
+	SBBQ    $-1, acc1
+	SBBQ    $3, acc2
+	CMOVQCS t0, acc0
+	CMOVQCS t1, acc1
+	CMOVQCS t2, acc2
+
+	// Add in the "s" part of the key
+	ADDQ 0+sStore, acc0
+	ADCQ 8+sStore, acc1
+
+	// Finally store the tag at the end of the message
+	MOVQ acc0, (0*8)(oup)
+	MOVQ acc1, (1*8)(oup)
+	RET
+
+// ----------------------------------------------------------------------------
+// ------------------------- AVX2 Code ----------------------------------------
+chacha20Poly1305Seal_AVX2:
+	VZEROUPPER
+	VMOVDQU ·chacha20Constants<>(SB), AA0
+	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
+	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
+	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
+	VPADDD  ·avx2InitMask<>(SB), DD0, DD0
+
+	// Special optimizations, for very short buffers
+	CMPQ inl, $192
+	JBE  seal192AVX2 // 33% faster
+	CMPQ inl, $320
+	JBE  seal320AVX2 // 17% faster
+
+	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
+	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
+	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
+	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
+	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
+	VMOVDQA DD3, ctr3StoreAVX2
+	MOVQ    $10, itr2
+
+sealAVX2IntroLoop:
+	VMOVDQA CC3, tmpStoreAVX2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+	VMOVDQA tmpStoreAVX2, CC3
+	VMOVDQA CC1, tmpStoreAVX2
+	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+	VMOVDQA tmpStoreAVX2, CC1
+
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
+	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
+	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
+
+	VMOVDQA CC3, tmpStoreAVX2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+	VMOVDQA tmpStoreAVX2, CC3
+	VMOVDQA CC1, tmpStoreAVX2
+	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+	VMOVDQA tmpStoreAVX2, CC1
+
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
+	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
+	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
+	DECQ     itr2
+	JNE      sealAVX2IntroLoop
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+
+	VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
+	VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
+	VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
+
+	// Clamp and store poly key
+	VPAND   ·polyClampMask<>(SB), DD0, DD0
+	VMOVDQA DD0, rsStoreAVX2
+
+	// Hash AD
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+
+	// Can store at least 320 bytes
+	VPXOR   (0*32)(inp), AA0, AA0
+	VPXOR   (1*32)(inp), CC0, CC0
+	VMOVDQU AA0, (0*32)(oup)
+	VMOVDQU CC0, (1*32)(oup)
+
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	VPXOR      (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
+	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+	VPXOR      (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
+
+	MOVQ $320, itr1
+	SUBQ $320, inl
+	LEAQ 320(inp), inp
+
+	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
+	CMPQ       inl, $128
+	JBE        sealAVX2SealHash
+
+	VPXOR   (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
+	VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
+	SUBQ    $128, inl
+	LEAQ    128(inp), inp
+
+	MOVQ $8, itr1
+	MOVQ $2, itr2
+
+	CMPQ inl, $128
+	JBE  sealAVX2Tail128
+	CMPQ inl, $256
+	JBE  sealAVX2Tail256
+	CMPQ inl, $384
+	JBE  sealAVX2Tail384
+	CMPQ inl, $512
+	JBE  sealAVX2Tail512
+
+	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+
+	VMOVDQA CC3, tmpStoreAVX2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+	VMOVDQA tmpStoreAVX2, CC3
+	VMOVDQA CC1, tmpStoreAVX2
+	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+	VMOVDQA tmpStoreAVX2, CC1
+
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
+	VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
+	VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
+	VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
+
+	VMOVDQA CC3, tmpStoreAVX2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
+	VMOVDQA tmpStoreAVX2, CC3
+	VMOVDQA CC1, tmpStoreAVX2
+	chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
+	VMOVDQA tmpStoreAVX2, CC1
+
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
+	VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
+	VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
+	VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+
+	SUBQ $16, oup                  // Adjust the pointer
+	MOVQ $9, itr1
+	JMP  sealAVX2InternalLoopStart
+
+sealAVX2MainLoop:
+	// Load state, increment counter blocks, store the incremented counters
+	VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+	MOVQ    $10, itr1
+
+sealAVX2InternalLoop:
+	polyAdd(0*8(oup))
+	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	polyMulStage1_AVX2
+	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	polyMulStage2_AVX2
+	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyMulStage3_AVX2
+	VMOVDQA CC3, tmpStoreAVX2
+	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA tmpStoreAVX2, CC3
+	polyMulReduceStage
+
+sealAVX2InternalLoopStart:
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	polyAdd(2*8(oup))
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	polyMulStage1_AVX2
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulStage2_AVX2
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	polyMulStage3_AVX2
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	polyMulReduceStage
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyAdd(4*8(oup))
+	LEAQ     (6*8)(oup), oup
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulStage1_AVX2
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	polyMulStage2_AVX2
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	polyMulStage3_AVX2
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyMulReduceStage
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+	DECQ     itr1
+	JNE      sealAVX2InternalLoop
+
+	VPADDD  ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+	VPADDD  state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+	VPADDD  state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+	VPADDD  ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+	VMOVDQA CC3, tmpStoreAVX2
+
+	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	LEAQ       (4*8)(oup), oup
+	VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
+	VPXOR      (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
+	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
+	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+	// and here
+	polyAdd(-2*8(oup))
+	polyMulAVX2
+	VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
+	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+	VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+	VPXOR      (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
+	LEAQ       (32*16)(inp), inp
+	SUBQ       $(32*16), inl
+	CMPQ       inl, $512
+	JG         sealAVX2MainLoop
+
+	// Tail can only hash 480 bytes
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	polyAdd(2*8(oup))
+	polyMulAVX2
+	LEAQ 32(oup), oup
+
+	MOVQ $10, itr1
+	MOVQ $0, itr2
+	CMPQ inl, $128
+	JBE  sealAVX2Tail128
+	CMPQ inl, $256
+	JBE  sealAVX2Tail256
+	CMPQ inl, $384
+	JBE  sealAVX2Tail384
+	JMP  sealAVX2Tail512
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 193 bytes
+seal192AVX2:
+	// For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
+	VMOVDQA AA0, AA1
+	VMOVDQA BB0, BB1
+	VMOVDQA CC0, CC1
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA AA0, AA2
+	VMOVDQA BB0, BB2
+	VMOVDQA CC0, CC2
+	VMOVDQA DD0, DD2
+	VMOVDQA DD1, TT3
+	MOVQ    $10, itr2
+
+sealAVX2192InnerCipherLoop:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR   $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR   $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	VPALIGNR   $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR   $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR   $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+	DECQ       itr2
+	JNE        sealAVX2192InnerCipherLoop
+	VPADDD     AA2, AA0, AA0; VPADDD AA2, AA1, AA1
+	VPADDD     BB2, BB0, BB0; VPADDD BB2, BB1, BB1
+	VPADDD     CC2, CC0, CC0; VPADDD CC2, CC1, CC1
+	VPADDD     DD2, DD0, DD0; VPADDD TT3, DD1, DD1
+	VPERM2I128 $0x02, AA0, BB0, TT0
+
+	// Clamp and store poly key
+	VPAND   ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA TT0, rsStoreAVX2
+
+	// Stream for up to 192 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x02, AA1, BB1, CC0
+	VPERM2I128 $0x02, CC1, DD1, DD0
+	VPERM2I128 $0x13, AA1, BB1, AA1
+	VPERM2I128 $0x13, CC1, DD1, BB1
+
+sealAVX2ShortSeal:
+	// Hash aad
+	MOVQ ad_len+80(FP), itr2
+	CALL polyHashADInternal<>(SB)
+	XORQ itr1, itr1
+
+sealAVX2SealHash:
+	// itr1 holds the number of bytes encrypted but not yet hashed
+	CMPQ itr1, $16
+	JB   sealAVX2ShortSealLoop
+	polyAdd(0(oup))
+	polyMul
+	SUBQ $16, itr1
+	ADDQ $16, oup
+	JMP  sealAVX2SealHash
+
+sealAVX2ShortSealLoop:
+	CMPQ inl, $32
+	JB   sealAVX2ShortTail32
+	SUBQ $32, inl
+
+	// Load for encryption
+	VPXOR   (inp), AA0, AA0
+	VMOVDQU AA0, (oup)
+	LEAQ    (1*32)(inp), inp
+
+	// Now can hash
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	polyAdd(2*8(oup))
+	polyMulAVX2
+	LEAQ (1*32)(oup), oup
+
+	// Shift stream left
+	VMOVDQA BB0, AA0
+	VMOVDQA CC0, BB0
+	VMOVDQA DD0, CC0
+	VMOVDQA AA1, DD0
+	VMOVDQA BB1, AA1
+	VMOVDQA CC1, BB1
+	VMOVDQA DD1, CC1
+	VMOVDQA AA2, DD1
+	VMOVDQA BB2, AA2
+	JMP     sealAVX2ShortSealLoop
+
+sealAVX2ShortTail32:
+	CMPQ    inl, $16
+	VMOVDQA A0, A1
+	JB      sealAVX2ShortDone
+
+	SUBQ $16, inl
+
+	// Load for encryption
+	VPXOR   (inp), A0, T0
+	VMOVDQU T0, (oup)
+	LEAQ    (1*16)(inp), inp
+
+	// Hash
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	LEAQ       (1*16)(oup), oup
+	VPERM2I128 $0x11, AA0, AA0, AA0
+	VMOVDQA    A0, A1
+
+sealAVX2ShortDone:
+	VZEROUPPER
+	JMP sealSSETail
+
+// ----------------------------------------------------------------------------
+// Special optimization for buffers smaller than 321 bytes
+seal320AVX2:
+	// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
+	VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+	VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
+	MOVQ    $10, itr2
+
+sealAVX2320InnerCipherLoop:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+	DECQ     itr2
+	JNE      sealAVX2320InnerCipherLoop
+
+	VMOVDQA ·chacha20Constants<>(SB), TT0
+	VPADDD  TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
+	VPADDD  TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
+	VPADDD  TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
+	VMOVDQA ·avx2IncMask<>(SB), TT0
+	VPADDD  TT3, DD0, DD0; VPADDD TT0, TT3, TT3
+	VPADDD  TT3, DD1, DD1; VPADDD TT0, TT3, TT3
+	VPADDD  TT3, DD2, DD2
+
+	// Clamp and store poly key
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPAND      ·polyClampMask<>(SB), TT0, TT0
+	VMOVDQA    TT0, rsStoreAVX2
+
+	// Stream for up to 320 bytes
+	VPERM2I128 $0x13, AA0, BB0, AA0
+	VPERM2I128 $0x13, CC0, DD0, BB0
+	VPERM2I128 $0x02, AA1, BB1, CC0
+	VPERM2I128 $0x02, CC1, DD1, DD0
+	VPERM2I128 $0x13, AA1, BB1, AA1
+	VPERM2I128 $0x13, CC1, DD1, BB1
+	VPERM2I128 $0x02, AA2, BB2, CC1
+	VPERM2I128 $0x02, CC2, DD2, DD1
+	VPERM2I128 $0x13, AA2, BB2, AA2
+	VPERM2I128 $0x13, CC2, DD2, BB2
+	JMP        sealAVX2ShortSeal
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 128 bytes of ciphertext
+sealAVX2Tail128:
+	// Need to decrypt up to 128 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0
+	VMOVDQA state1StoreAVX2, BB0
+	VMOVDQA state2StoreAVX2, CC0
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
+	VMOVDQA DD0, DD1
+
+sealAVX2Tail128LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail128LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0
+	VPALIGNR $8, CC0, CC0, CC0
+	VPALIGNR $12, DD0, DD0, DD0
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ     32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0
+	VPALIGNR $8, CC0, CC0, CC0
+	VPALIGNR $4, DD0, DD0, DD0
+	DECQ     itr1
+	JG       sealAVX2Tail128LoopA
+	DECQ     itr2
+	JGE      sealAVX2Tail128LoopB
+
+	VPADDD ·chacha20Constants<>(SB), AA0, AA1
+	VPADDD state1StoreAVX2, BB0, BB1
+	VPADDD state2StoreAVX2, CC0, CC1
+	VPADDD DD1, DD0, DD1
+
+	VPERM2I128 $0x02, AA1, BB1, AA0
+	VPERM2I128 $0x02, CC1, DD1, BB0
+	VPERM2I128 $0x13, AA1, BB1, CC0
+	VPERM2I128 $0x13, CC1, DD1, DD0
+	JMP        sealAVX2ShortSealLoop
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 256 bytes of ciphertext
+sealAVX2Tail256:
+	// Need to decrypt up to 256 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
+	VMOVDQA DD0, TT1
+	VMOVDQA DD1, TT2
+
+sealAVX2Tail256LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail256LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ     32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
+	DECQ     itr1
+	JG       sealAVX2Tail256LoopA
+	DECQ     itr2
+	JGE      sealAVX2Tail256LoopB
+
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
+	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPERM2I128 $0x02, CC0, DD0, TT1
+	VPERM2I128 $0x13, AA0, BB0, TT2
+	VPERM2I128 $0x13, CC0, DD0, TT3
+	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+	MOVQ       $128, itr1
+	LEAQ       128(inp), inp
+	SUBQ       $128, inl
+	VPERM2I128 $0x02, AA1, BB1, AA0
+	VPERM2I128 $0x02, CC1, DD1, BB0
+	VPERM2I128 $0x13, AA1, BB1, CC0
+	VPERM2I128 $0x13, CC1, DD1, DD0
+
+	JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 384 bytes of ciphertext
+sealAVX2Tail384:
+	// Need to decrypt up to 384 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
+	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
+
+sealAVX2Tail384LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail384LoopB:
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	polyAdd(0(oup))
+	polyMul
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
+	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
+	polyAdd(16(oup))
+	polyMul
+	LEAQ     32(oup), oup
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
+	DECQ     itr1
+	JG       sealAVX2Tail384LoopA
+	DECQ     itr2
+	JGE      sealAVX2Tail384LoopB
+
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
+	VPADDD     TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
+	VPERM2I128 $0x02, AA0, BB0, TT0
+	VPERM2I128 $0x02, CC0, DD0, TT1
+	VPERM2I128 $0x13, AA0, BB0, TT2
+	VPERM2I128 $0x13, CC0, DD0, TT3
+	VPXOR      (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
+	VMOVDQU    TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
+	VPERM2I128 $0x02, AA1, BB1, TT0
+	VPERM2I128 $0x02, CC1, DD1, TT1
+	VPERM2I128 $0x13, AA1, BB1, TT2
+	VPERM2I128 $0x13, CC1, DD1, TT3
+	VPXOR      (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
+	VMOVDQU    TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
+	MOVQ       $256, itr1
+	LEAQ       256(inp), inp
+	SUBQ       $256, inl
+	VPERM2I128 $0x02, AA2, BB2, AA0
+	VPERM2I128 $0x02, CC2, DD2, BB0
+	VPERM2I128 $0x13, AA2, BB2, CC0
+	VPERM2I128 $0x13, CC2, DD2, DD0
+
+	JMP sealAVX2SealHash
+
+// ----------------------------------------------------------------------------
+// Special optimization for the last 512 bytes of ciphertext
+sealAVX2Tail512:
+	// Need to decrypt up to 512 bytes - prepare two blocks
+	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
+	// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
+	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
+	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
+	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
+	VMOVDQA ctr3StoreAVX2, DD0
+	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
+	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
+
+sealAVX2Tail512LoopA:
+	polyAdd(0(oup))
+	polyMul
+	LEAQ 16(oup), oup
+
+sealAVX2Tail512LoopB:
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	polyAdd(0*8(oup))
+	polyMulAVX2
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	polyAdd(2*8(oup))
+	polyMulAVX2
+	LEAQ     (4*8)(oup), oup
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPADDD   BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
+	VPXOR    AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
+	VPSHUFB  ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
+	VPADDD   DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
+	VPXOR    CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
+	VMOVDQA  CC3, tmpStoreAVX2
+	VPSLLD   $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
+	VPSLLD   $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
+	VPSLLD   $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
+	VPSLLD   $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
+	VMOVDQA  tmpStoreAVX2, CC3
+	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
+	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
+	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
+
+	DECQ itr1
+	JG   sealAVX2Tail512LoopA
+	DECQ itr2
+	JGE  sealAVX2Tail512LoopB
+
+	VPADDD     ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
+	VPADDD     state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
+	VPADDD     state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
+	VPADDD     ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
+	VMOVDQA    CC3, tmpStoreAVX2
+	VPERM2I128 $0x02, AA0, BB0, CC3
+	VPXOR      (0*32)(inp), CC3, CC3
+	VMOVDQU    CC3, (0*32)(oup)
+	VPERM2I128 $0x02, CC0, DD0, CC3
+	VPXOR      (1*32)(inp), CC3, CC3
+	VMOVDQU    CC3, (1*32)(oup)
+	VPERM2I128 $0x13, AA0, BB0, CC3
+	VPXOR      (2*32)(inp), CC3, CC3
+	VMOVDQU    CC3, (2*32)(oup)
+	VPERM2I128 $0x13, CC0, DD0, CC3
+	VPXOR      (3*32)(inp), CC3, CC3
+	VMOVDQU    CC3, (3*32)(oup)
+
+	VPERM2I128 $0x02, AA1, BB1, AA0
+	VPERM2I128 $0x02, CC1, DD1, BB0
+	VPERM2I128 $0x13, AA1, BB1, CC0
+	VPERM2I128 $0x13, CC1, DD1, DD0
+	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
+
+	VPERM2I128 $0x02, AA2, BB2, AA0
+	VPERM2I128 $0x02, CC2, DD2, BB0
+	VPERM2I128 $0x13, AA2, BB2, CC0
+	VPERM2I128 $0x13, CC2, DD2, DD0
+	VPXOR      (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
+	VMOVDQU    AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
+
+	MOVQ       $384, itr1
+	LEAQ       384(inp), inp
+	SUBQ       $384, inl
+	VPERM2I128 $0x02, AA3, BB3, AA0
+	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
+	VPERM2I128 $0x13, AA3, BB3, CC0
+	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
+
+	JMP sealAVX2SealHash
+
+// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), NOSPLIT, $0-24
+	MOVL eaxArg+0(FP), AX
+	MOVL ecxArg+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func xgetbv() (eax, edx uint32)
+TEXT ·xgetbv(SB),NOSPLIT,$0-8
+	MOVL $0, CX
+	XGETBV
+	MOVL AX, eax+0(FP)
+	MOVL DX, edx+4(FP)
+	RET
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
new file mode 100644
index 000000000..f7e4bfb1c
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go
@@ -0,0 +1,70 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package chacha20poly1305
+
+import (
+	"encoding/binary"
+
+	"golang.org/x/crypto/chacha20poly1305/internal/chacha20"
+	"golang.org/x/crypto/poly1305"
+)
+
+func roundTo16(n int) int {
+	return 16 * ((n + 15) / 16)
+}
+
+func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte {
+	var counter [16]byte
+	copy(counter[4:], nonce)
+
+	var polyKey [32]byte
+	chacha20.XORKeyStream(polyKey[:], polyKey[:], &counter, &c.key)
+
+	ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize)
+	counter[0] = 1
+	chacha20.XORKeyStream(out, plaintext, &counter, &c.key)
+
+	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8)
+	copy(polyInput, additionalData)
+	copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)])
+	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
+	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext)))
+
+	var tag [poly1305.TagSize]byte
+	poly1305.Sum(&tag, polyInput, &polyKey)
+	copy(out[len(plaintext):], tag[:])
+
+	return ret
+}
+
+func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	var tag [poly1305.TagSize]byte
+	copy(tag[:], ciphertext[len(ciphertext)-16:])
+	ciphertext = ciphertext[:len(ciphertext)-16]
+
+	var counter [16]byte
+	copy(counter[4:], nonce)
+
+	var polyKey [32]byte
+	chacha20.XORKeyStream(polyKey[:], polyKey[:], &counter, &c.key)
+
+	polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8)
+	copy(polyInput, additionalData)
+	copy(polyInput[roundTo16(len(additionalData)):], ciphertext)
+	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData)))
+	binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext)))
+
+	ret, out := sliceForAppend(dst, len(ciphertext))
+	if !poly1305.Verify(&tag, polyInput, &polyKey) {
+		for i := range out {
+			out[i] = 0
+		}
+		return nil, errOpen
+	}
+
+	counter[0] = 1
+	chacha20.XORKeyStream(out, ciphertext, &counter, &c.key)
+	return ret, nil
+}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go
new file mode 100644
index 000000000..4c2eb703c
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go
@@ -0,0 +1,15 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 !go1.7 gccgo appengine
+
+package chacha20poly1305
+
+func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte {
+	return c.sealGeneric(dst, nonce, plaintext, additionalData)
+}
+
+func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) {
+	return c.openGeneric(dst, nonce, ciphertext, additionalData)
+}
diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go
new file mode 100644
index 000000000..f9e8a3a2f
--- /dev/null
+++ b/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go
@@ -0,0 +1,199 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package ChaCha20 implements the core ChaCha20 function as specified in https://tools.ietf.org/html/rfc7539#section-2.3.
+package chacha20
+
+import "encoding/binary"
+
+const rounds = 20
+
+// core applies the ChaCha20 core function to 16-byte input in, 32-byte key k,
+// and 16-byte constant c, and puts the result into 64-byte array out.
+func core(out *[64]byte, in *[16]byte, k *[32]byte) {
+	j0 := uint32(0x61707865)
+	j1 := uint32(0x3320646e)
+	j2 := uint32(0x79622d32)
+	j3 := uint32(0x6b206574)
+	j4 := binary.LittleEndian.Uint32(k[0:4])
+	j5 := binary.LittleEndian.Uint32(k[4:8])
+	j6 := binary.LittleEndian.Uint32(k[8:12])
+	j7 := binary.LittleEndian.Uint32(k[12:16])
+	j8 := binary.LittleEndian.Uint32(k[16:20])
+	j9 := binary.LittleEndian.Uint32(k[20:24])
+	j10 := binary.LittleEndian.Uint32(k[24:28])
+	j11 := binary.LittleEndian.Uint32(k[28:32])
+	j12 := binary.LittleEndian.Uint32(in[0:4])
+	j13 := binary.LittleEndian.Uint32(in[4:8])
+	j14 := binary.LittleEndian.Uint32(in[8:12])
+	j15 := binary.LittleEndian.Uint32(in[12:16])
+
+	x0, x1, x2, x3, x4, x5, x6, x7 := j0, j1, j2, j3, j4, j5, j6, j7
+	x8, x9, x10, x11, x12, x13, x14, x15 := j8, j9, j10, j11, j12, j13, j14, j15
+
+	for i := 0; i < rounds; i += 2 {
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 16) | (x12 >> (16))
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 12) | (x4 >> (20))
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 8) | (x12 >> (24))
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 7) | (x4 >> (25))
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 16) | (x13 >> 16)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 12) | (x5 >> 20)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 8) | (x13 >> 24)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 7) | (x5 >> 25)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 16) | (x14 >> 16)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 12) | (x6 >> 20)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 8) | (x14 >> 24)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 7) | (x6 >> 25)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 16) | (x15 >> 16)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 12) | (x7 >> 20)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 8) | (x15 >> 24)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 7) | (x7 >> 25)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 16) | (x15 >> 16)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 12) | (x5 >> 20)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 8) | (x15 >> 24)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 7) | (x5 >> 25)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 16) | (x12 >> 16)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 12) | (x6 >> 20)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 8) | (x12 >> 24)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 7) | (x6 >> 25)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 16) | (x13 >> 16)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 12) | (x7 >> 20)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 8) | (x13 >> 24)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 7) | (x7 >> 25)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 16) | (x14 >> 16)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 12) | (x4 >> 20)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 8) | (x14 >> 24)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 7) | (x4 >> 25)
+	}
+
+	x0 += j0
+	x1 += j1
+	x2 += j2
+	x3 += j3
+	x4 += j4
+	x5 += j5
+	x6 += j6
+	x7 += j7
+	x8 += j8
+	x9 += j9
+	x10 += j10
+	x11 += j11
+	x12 += j12
+	x13 += j13
+	x14 += j14
+	x15 += j15
+
+	binary.LittleEndian.PutUint32(out[0:4], x0)
+	binary.LittleEndian.PutUint32(out[4:8], x1)
+	binary.LittleEndian.PutUint32(out[8:12], x2)
+	binary.LittleEndian.PutUint32(out[12:16], x3)
+	binary.LittleEndian.PutUint32(out[16:20], x4)
+	binary.LittleEndian.PutUint32(out[20:24], x5)
+	binary.LittleEndian.PutUint32(out[24:28], x6)
+	binary.LittleEndian.PutUint32(out[28:32], x7)
+	binary.LittleEndian.PutUint32(out[32:36], x8)
+	binary.LittleEndian.PutUint32(out[36:40], x9)
+	binary.LittleEndian.PutUint32(out[40:44], x10)
+	binary.LittleEndian.PutUint32(out[44:48], x11)
+	binary.LittleEndian.PutUint32(out[48:52], x12)
+	binary.LittleEndian.PutUint32(out[52:56], x13)
+	binary.LittleEndian.PutUint32(out[56:60], x14)
+	binary.LittleEndian.PutUint32(out[60:64], x15)
+}
+
+// XORKeyStream crypts bytes from in to out using the given key and counters.
+// In and out may be the same slice but otherwise should not overlap. Counter
+// contains the raw ChaCha20 counter bytes (i.e. block counter followed by
+// nonce).
+func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) {
+	var block [64]byte
+	var counterCopy [16]byte
+	copy(counterCopy[:], counter[:])
+
+	for len(in) >= 64 {
+		core(&block, &counterCopy, key)
+		for i, x := range block {
+			out[i] = in[i] ^ x
+		}
+		u := uint32(1)
+		for i := 0; i < 4; i++ {
+			u += uint32(counterCopy[i])
+			counterCopy[i] = byte(u)
+			u >>= 8
+		}
+		in = in[64:]
+		out = out[64:]
+	}
+
+	if len(in) > 0 {
+		core(&block, &counterCopy, key)
+		for i, v := range in {
+			out[i] = v ^ block[i]
+		}
+	}
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go
new file mode 100644
index 000000000..f562fa571
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go
@@ -0,0 +1,33 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package poly1305 implements Poly1305 one-time message authentication code as
+specified in https://cr.yp.to/mac/poly1305-20050329.pdf.
+
+Poly1305 is a fast, one-time authentication function. It is infeasible for an
+attacker to generate an authenticator for a message without the key. However, a
+key must only be used for a single message. Authenticating two different
+messages with the same key allows an attacker to forge authenticators for other
+messages with the same key.
+
+Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was
+used with a fixed key in order to generate one-time keys from an nonce.
+However, in this package AES isn't used and the one-time key is specified
+directly.
+*/
+package poly1305 // import "golang.org/x/crypto/poly1305"
+
+import "crypto/subtle"
+
+// TagSize is the size, in bytes, of a poly1305 authenticator.
+const TagSize = 16
+
+// Verify returns true if mac is a valid authenticator for m with the given
+// key.
+func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
+	var tmp [16]byte
+	Sum(&tmp, m, key)
+	return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
new file mode 100644
index 000000000..4dd72fe79
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@@ -0,0 +1,22 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!gccgo,!appengine
+
+package poly1305
+
+// This function is implemented in sum_amd64.s
+//go:noescape
+func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+
+// Sum generates an authenticator for m using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[16]byte, m []byte, key *[32]byte) {
+	var mPtr *byte
+	if len(m) > 0 {
+		mPtr = &m[0]
+	}
+	poly1305(out, mPtr, uint64(len(m)), key)
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
new file mode 100644
index 000000000..2edae6382
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
@@ -0,0 +1,125 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64,!gccgo,!appengine
+
+#include "textflag.h"
+
+#define POLY1305_ADD(msg, h0, h1, h2) \
+	ADDQ 0(msg), h0;  \
+	ADCQ 8(msg), h1;  \
+	ADCQ $1, h2;      \
+	LEAQ 16(msg), msg
+
+#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
+	MOVQ  r0, AX;                  \
+	MULQ  h0;                      \
+	MOVQ  AX, t0;                  \
+	MOVQ  DX, t1;                  \
+	MOVQ  r0, AX;                  \
+	MULQ  h1;                      \
+	ADDQ  AX, t1;                  \
+	ADCQ  $0, DX;                  \
+	MOVQ  r0, t2;                  \
+	IMULQ h2, t2;                  \
+	ADDQ  DX, t2;                  \
+	                               \
+	MOVQ  r1, AX;                  \
+	MULQ  h0;                      \
+	ADDQ  AX, t1;                  \
+	ADCQ  $0, DX;                  \
+	MOVQ  DX, h0;                  \
+	MOVQ  r1, t3;                  \
+	IMULQ h2, t3;                  \
+	MOVQ  r1, AX;                  \
+	MULQ  h1;                      \
+	ADDQ  AX, t2;                  \
+	ADCQ  DX, t3;                  \
+	ADDQ  h0, t2;                  \
+	ADCQ  $0, t3;                  \
+	                               \
+	MOVQ  t0, h0;                  \
+	MOVQ  t1, h1;                  \
+	MOVQ  t2, h2;                  \
+	ANDQ  $3, h2;                  \
+	MOVQ  t2, t0;                  \
+	ANDQ  $0xFFFFFFFFFFFFFFFC, t0; \
+	ADDQ  t0, h0;                  \
+	ADCQ  t3, h1;                  \
+	ADCQ  $0, h2;                  \
+	SHRQ  $2, t3, t2;              \
+	SHRQ  $2, t3;                  \
+	ADDQ  t2, h0;                  \
+	ADCQ  t3, h1;                  \
+	ADCQ  $0, h2
+
+DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
+DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
+GLOBL ·poly1305Mask<>(SB), RODATA, $16
+
+// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key)
+TEXT ·poly1305(SB), $0-32
+	MOVQ out+0(FP), DI
+	MOVQ m+8(FP), SI
+	MOVQ mlen+16(FP), R15
+	MOVQ key+24(FP), AX
+
+	MOVQ 0(AX), R11
+	MOVQ 8(AX), R12
+	ANDQ ·poly1305Mask<>(SB), R11   // r0
+	ANDQ ·poly1305Mask<>+8(SB), R12 // r1
+	XORQ R8, R8                    // h0
+	XORQ R9, R9                    // h1
+	XORQ R10, R10                  // h2
+
+	CMPQ R15, $16
+	JB   bytes_between_0_and_15
+
+loop:
+	POLY1305_ADD(SI, R8, R9, R10)
+
+multiply:
+	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
+	SUBQ $16, R15
+	CMPQ R15, $16
+	JAE  loop
+
+bytes_between_0_and_15:
+	TESTQ R15, R15
+	JZ    done
+	MOVQ  $1, BX
+	XORQ  CX, CX
+	XORQ  R13, R13
+	ADDQ  R15, SI
+
+flush_buffer:
+	SHLQ $8, BX, CX
+	SHLQ $8, BX
+	MOVB -1(SI), R13
+	XORQ R13, BX
+	DECQ SI
+	DECQ R15
+	JNZ  flush_buffer
+
+	ADDQ BX, R8
+	ADCQ CX, R9
+	ADCQ $0, R10
+	MOVQ $16, R15
+	JMP  multiply
+
+done:
+	MOVQ    R8, AX
+	MOVQ    R9, BX
+	SUBQ    $0xFFFFFFFFFFFFFFFB, AX
+	SBBQ    $0xFFFFFFFFFFFFFFFF, BX
+	SBBQ    $3, R10
+	CMOVQCS R8, AX
+	CMOVQCS R9, BX
+	MOVQ    key+24(FP), R8
+	ADDQ    16(R8), AX
+	ADCQ    24(R8), BX
+
+	MOVQ AX, 0(DI)
+	MOVQ BX, 8(DI)
+	RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.go b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
new file mode 100644
index 000000000..5dc321c2f
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm,!gccgo,!appengine,!nacl
+
+package poly1305
+
+// This function is implemented in sum_arm.s
+//go:noescape
+func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
+
+// Sum generates an authenticator for m using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[16]byte, m []byte, key *[32]byte) {
+	var mPtr *byte
+	if len(m) > 0 {
+		mPtr = &m[0]
+	}
+	poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
+}
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.s b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
new file mode 100644
index 000000000..f70b4ac48
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
@@ -0,0 +1,427 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm,!gccgo,!appengine,!nacl
+
+#include "textflag.h"
+
+// This code was translated into a form compatible with 5a from the public
+// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
+
+DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
+DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
+DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
+DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
+DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
+GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
+
+// Warning: the linker may use R11 to synthesize certain instructions. Please
+// take care and verify that no synthetic instructions use it.
+
+TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
+	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
+	// might look like it's only 60 bytes of space but the final four bytes
+	// will be written by another function.) We need to skip over four
+	// bytes of stack because that's saving the value of 'g'.
+	ADD       $4, R13, R8
+	MOVM.IB   [R4-R7], (R8)
+	MOVM.IA.W (R1), [R2-R5]
+	MOVW      $·poly1305_init_constants_armv6<>(SB), R7
+	MOVW      R2, R8
+	MOVW      R2>>26, R9
+	MOVW      R3>>20, g
+	MOVW      R4>>14, R11
+	MOVW      R5>>8, R12
+	ORR       R3<<6, R9, R9
+	ORR       R4<<12, g, g
+	ORR       R5<<18, R11, R11
+	MOVM.IA   (R7), [R2-R6]
+	AND       R8, R2, R2
+	AND       R9, R3, R3
+	AND       g, R4, R4
+	AND       R11, R5, R5
+	AND       R12, R6, R6
+	MOVM.IA.W [R2-R6], (R0)
+	EOR       R2, R2, R2
+	EOR       R3, R3, R3
+	EOR       R4, R4, R4
+	EOR       R5, R5, R5
+	EOR       R6, R6, R6
+	MOVM.IA.W [R2-R6], (R0)
+	MOVM.IA.W (R1), [R2-R5]
+	MOVM.IA   [R2-R6], (R0)
+	ADD       $20, R13, R0
+	MOVM.DA   (R0), [R4-R7]
+	RET
+
+#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
+	MOVBU (offset+0)(Rsrc), Rtmp; \
+	MOVBU Rtmp, (offset+0)(Rdst); \
+	MOVBU (offset+1)(Rsrc), Rtmp; \
+	MOVBU Rtmp, (offset+1)(Rdst); \
+	MOVBU (offset+2)(Rsrc), Rtmp; \
+	MOVBU Rtmp, (offset+2)(Rdst); \
+	MOVBU (offset+3)(Rsrc), Rtmp; \
+	MOVBU Rtmp, (offset+3)(Rdst)
+
+TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
+	// Needs 24 bytes of stack for saved registers and then 88 bytes of
+	// scratch space after that. We assume that 24 bytes at (R13) have
+	// already been used: four bytes for the link register saved in the
+	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
+	// in that function and 16 bytes of scratch space used around
+	// poly1305_finish_ext_armv6_skip1.
+	ADD     $24, R13, R12
+	MOVM.IB [R4-R8, R14], (R12)
+	MOVW    R0, 88(R13)
+	MOVW    R1, 92(R13)
+	MOVW    R2, 96(R13)
+	MOVW    R1, R14
+	MOVW    R2, R12
+	MOVW    56(R0), R8
+	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
+	EOR     R6, R6, R6
+	MOVW.EQ $(1<<24), R6
+	MOVW    R6, 84(R13)
+	ADD     $116, R13, g
+	MOVM.IA (R0), [R0-R9]
+	MOVM.IA [R0-R4], (g)
+	CMP     $16, R12
+	BLO     poly1305_blocks_armv6_done
+
+poly1305_blocks_armv6_mainloop:
+	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
+	BEQ     poly1305_blocks_armv6_mainloop_aligned
+	ADD     $100, R13, g
+	MOVW_UNALIGNED(R14, g, R0, 0)
+	MOVW_UNALIGNED(R14, g, R0, 4)
+	MOVW_UNALIGNED(R14, g, R0, 8)
+	MOVW_UNALIGNED(R14, g, R0, 12)
+	MOVM.IA (g), [R0-R3]
+	ADD     $16, R14
+	B       poly1305_blocks_armv6_mainloop_loaded
+
+poly1305_blocks_armv6_mainloop_aligned:
+	MOVM.IA.W (R14), [R0-R3]
+
+poly1305_blocks_armv6_mainloop_loaded:
+	MOVW    R0>>26, g
+	MOVW    R1>>20, R11
+	MOVW    R2>>14, R12
+	MOVW    R14, 92(R13)
+	MOVW    R3>>8, R4
+	ORR     R1<<6, g, g
+	ORR     R2<<12, R11, R11
+	ORR     R3<<18, R12, R12
+	BIC     $0xfc000000, R0, R0
+	BIC     $0xfc000000, g, g
+	MOVW    84(R13), R3
+	BIC     $0xfc000000, R11, R11
+	BIC     $0xfc000000, R12, R12
+	ADD     R0, R5, R5
+	ADD     g, R6, R6
+	ORR     R3, R4, R4
+	ADD     R11, R7, R7
+	ADD     $116, R13, R14
+	ADD     R12, R8, R8
+	ADD     R4, R9, R9
+	MOVM.IA (R14), [R0-R4]
+	MULLU   R4, R5, (R11, g)
+	MULLU   R3, R5, (R14, R12)
+	MULALU  R3, R6, (R11, g)
+	MULALU  R2, R6, (R14, R12)
+	MULALU  R2, R7, (R11, g)
+	MULALU  R1, R7, (R14, R12)
+	ADD     R4<<2, R4, R4
+	ADD     R3<<2, R3, R3
+	MULALU  R1, R8, (R11, g)
+	MULALU  R0, R8, (R14, R12)
+	MULALU  R0, R9, (R11, g)
+	MULALU  R4, R9, (R14, R12)
+	MOVW    g, 76(R13)
+	MOVW    R11, 80(R13)
+	MOVW    R12, 68(R13)
+	MOVW    R14, 72(R13)
+	MULLU   R2, R5, (R11, g)
+	MULLU   R1, R5, (R14, R12)
+	MULALU  R1, R6, (R11, g)
+	MULALU  R0, R6, (R14, R12)
+	MULALU  R0, R7, (R11, g)
+	MULALU  R4, R7, (R14, R12)
+	ADD     R2<<2, R2, R2
+	ADD     R1<<2, R1, R1
+	MULALU  R4, R8, (R11, g)
+	MULALU  R3, R8, (R14, R12)
+	MULALU  R3, R9, (R11, g)
+	MULALU  R2, R9, (R14, R12)
+	MOVW    g, 60(R13)
+	MOVW    R11, 64(R13)
+	MOVW    R12, 52(R13)
+	MOVW    R14, 56(R13)
+	MULLU   R0, R5, (R11, g)
+	MULALU  R4, R6, (R11, g)
+	MULALU  R3, R7, (R11, g)
+	MULALU  R2, R8, (R11, g)
+	MULALU  R1, R9, (R11, g)
+	ADD     $52, R13, R0
+	MOVM.IA (R0), [R0-R7]
+	MOVW    g>>26, R12
+	MOVW    R4>>26, R14
+	ORR     R11<<6, R12, R12
+	ORR     R5<<6, R14, R14
+	BIC     $0xfc000000, g, g
+	BIC     $0xfc000000, R4, R4
+	ADD.S   R12, R0, R0
+	ADC     $0, R1, R1
+	ADD.S   R14, R6, R6
+	ADC     $0, R7, R7
+	MOVW    R0>>26, R12
+	MOVW    R6>>26, R14
+	ORR     R1<<6, R12, R12
+	ORR     R7<<6, R14, R14
+	BIC     $0xfc000000, R0, R0
+	BIC     $0xfc000000, R6, R6
+	ADD     R14<<2, R14, R14
+	ADD.S   R12, R2, R2
+	ADC     $0, R3, R3
+	ADD     R14, g, g
+	MOVW    R2>>26, R12
+	MOVW    g>>26, R14
+	ORR     R3<<6, R12, R12
+	BIC     $0xfc000000, g, R5
+	BIC     $0xfc000000, R2, R7
+	ADD     R12, R4, R4
+	ADD     R14, R0, R0
+	MOVW    R4>>26, R12
+	BIC     $0xfc000000, R4, R8
+	ADD     R12, R6, R9
+	MOVW    96(R13), R12
+	MOVW    92(R13), R14
+	MOVW    R0, R6
+	CMP     $32, R12
+	SUB     $16, R12, R12
+	MOVW    R12, 96(R13)
+	BHS     poly1305_blocks_armv6_mainloop
+
+poly1305_blocks_armv6_done:
+	MOVW    88(R13), R12
+	MOVW    R5, 20(R12)
+	MOVW    R6, 24(R12)
+	MOVW    R7, 28(R12)
+	MOVW    R8, 32(R12)
+	MOVW    R9, 36(R12)
+	ADD     $48, R13, R0
+	MOVM.DA (R0), [R4-R8, R14]
+	RET
+
+#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+	MOVBU.P 1(Rsrc), Rtmp; \
+	MOVBU.P Rtmp, 1(Rdst); \
+	MOVBU.P 1(Rsrc), Rtmp; \
+	MOVBU.P Rtmp, 1(Rdst)
+
+#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
+	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
+	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
+
+// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
+TEXT ·poly1305_auth_armv6(SB), $196-16
+	// The value 196, just above, is the sum of 64 (the size of the context
+	// structure) and 132 (the amount of stack needed).
+	//
+	// At this point, the stack pointer (R13) has been moved down. It
+	// points to the saved link register and there's 196 bytes of free
+	// space above it.
+	//
+	// The stack for this function looks like:
+	//
+	// +---------------------
+	// |
+	// | 64 bytes of context structure
+	// |
+	// +---------------------
+	// |
+	// | 112 bytes for poly1305_blocks_armv6
+	// |
+	// +---------------------
+	// | 16 bytes of final block, constructed at
+	// | poly1305_finish_ext_armv6_skip8
+	// +---------------------
+	// | four bytes of saved 'g'
+	// +---------------------
+	// | lr, saved by prelude    <- R13 points here
+	// +---------------------
+	MOVW g, 4(R13)
+
+	MOVW out+0(FP), R4
+	MOVW m+4(FP), R5
+	MOVW mlen+8(FP), R6
+	MOVW key+12(FP), R7
+
+	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
+	MOVW R7, R1
+
+	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
+	// that's ok because none of the other values have been written yet.
+	BL    poly1305_init_ext_armv6<>(SB)
+	BIC.S $15, R6, R2
+	BEQ   poly1305_auth_armv6_noblocks
+	ADD   $136, R13, R0
+	MOVW  R5, R1
+	ADD   R2, R5, R5
+	SUB   R2, R6, R6
+	BL    poly1305_blocks_armv6<>(SB)
+
+poly1305_auth_armv6_noblocks:
+	ADD  $136, R13, R0
+	MOVW R5, R1
+	MOVW R6, R2
+	MOVW R4, R3
+
+	MOVW  R0, R5
+	MOVW  R1, R6
+	MOVW  R2, R7
+	MOVW  R3, R8
+	AND.S R2, R2, R2
+	BEQ   poly1305_finish_ext_armv6_noremaining
+	EOR   R0, R0
+	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
+	MOVW  R0, (R9)
+	MOVW  R0, 4(R9)
+	MOVW  R0, 8(R9)
+	MOVW  R0, 12(R9)
+	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
+	BEQ   poly1305_finish_ext_armv6_aligned
+	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
+	BEQ   poly1305_finish_ext_armv6_skip8
+	MOVWP_UNALIGNED(R1, R9, g)
+	MOVWP_UNALIGNED(R1, R9, g)
+
+poly1305_finish_ext_armv6_skip8:
+	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
+	BEQ  poly1305_finish_ext_armv6_skip4
+	MOVWP_UNALIGNED(R1, R9, g)
+
+poly1305_finish_ext_armv6_skip4:
+	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
+	BEQ  poly1305_finish_ext_armv6_skip2
+	MOVHUP_UNALIGNED(R1, R9, g)
+	B    poly1305_finish_ext_armv6_skip2
+
+poly1305_finish_ext_armv6_aligned:
+	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
+	BEQ       poly1305_finish_ext_armv6_skip8_aligned
+	MOVM.IA.W (R1), [g-R11]
+	MOVM.IA.W [g-R11], (R9)
+
+poly1305_finish_ext_armv6_skip8_aligned:
+	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
+	BEQ    poly1305_finish_ext_armv6_skip4_aligned
+	MOVW.P 4(R1), g
+	MOVW.P g, 4(R9)
+
+poly1305_finish_ext_armv6_skip4_aligned:
+	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
+	BEQ     poly1305_finish_ext_armv6_skip2
+	MOVHU.P 2(R1), g
+	MOVH.P  g, 2(R9)
+
+poly1305_finish_ext_armv6_skip2:
+	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
+	BEQ     poly1305_finish_ext_armv6_skip1
+	MOVBU.P 1(R1), g
+	MOVBU.P g, 1(R9)
+
+poly1305_finish_ext_armv6_skip1:
+	MOVW  $1, R11
+	MOVBU R11, 0(R9)
+	MOVW  R11, 56(R5)
+	MOVW  R5, R0
+	ADD   $8, R13, R1
+	MOVW  $16, R2
+	BL    poly1305_blocks_armv6<>(SB)
+
+poly1305_finish_ext_armv6_noremaining:
+	MOVW      20(R5), R0
+	MOVW      24(R5), R1
+	MOVW      28(R5), R2
+	MOVW      32(R5), R3
+	MOVW      36(R5), R4
+	MOVW      R4>>26, R12
+	BIC       $0xfc000000, R4, R4
+	ADD       R12<<2, R12, R12
+	ADD       R12, R0, R0
+	MOVW      R0>>26, R12
+	BIC       $0xfc000000, R0, R0
+	ADD       R12, R1, R1
+	MOVW      R1>>26, R12
+	BIC       $0xfc000000, R1, R1
+	ADD       R12, R2, R2
+	MOVW      R2>>26, R12
+	BIC       $0xfc000000, R2, R2
+	ADD       R12, R3, R3
+	MOVW      R3>>26, R12
+	BIC       $0xfc000000, R3, R3
+	ADD       R12, R4, R4
+	ADD       $5, R0, R6
+	MOVW      R6>>26, R12
+	BIC       $0xfc000000, R6, R6
+	ADD       R12, R1, R7
+	MOVW      R7>>26, R12
+	BIC       $0xfc000000, R7, R7
+	ADD       R12, R2, g
+	MOVW      g>>26, R12
+	BIC       $0xfc000000, g, g
+	ADD       R12, R3, R11
+	MOVW      $-(1<<26), R12
+	ADD       R11>>26, R12, R12
+	BIC       $0xfc000000, R11, R11
+	ADD       R12, R4, R9
+	MOVW      R9>>31, R12
+	SUB       $1, R12
+	AND       R12, R6, R6
+	AND       R12, R7, R7
+	AND       R12, g, g
+	AND       R12, R11, R11
+	AND       R12, R9, R9
+	MVN       R12, R12
+	AND       R12, R0, R0
+	AND       R12, R1, R1
+	AND       R12, R2, R2
+	AND       R12, R3, R3
+	AND       R12, R4, R4
+	ORR       R6, R0, R0
+	ORR       R7, R1, R1
+	ORR       g, R2, R2
+	ORR       R11, R3, R3
+	ORR       R9, R4, R4
+	ORR       R1<<26, R0, R0
+	MOVW      R1>>6, R1
+	ORR       R2<<20, R1, R1
+	MOVW      R2>>12, R2
+	ORR       R3<<14, R2, R2
+	MOVW      R3>>18, R3
+	ORR       R4<<8, R3, R3
+	MOVW      40(R5), R6
+	MOVW      44(R5), R7
+	MOVW      48(R5), g
+	MOVW      52(R5), R11
+	ADD.S     R6, R0, R0
+	ADC.S     R7, R1, R1
+	ADC.S     g, R2, R2
+	ADC.S     R11, R3, R3
+	MOVM.IA   [R0-R3], (R8)
+	MOVW      R5, R12
+	EOR       R0, R0, R0
+	EOR       R1, R1, R1
+	EOR       R2, R2, R2
+	EOR       R3, R3, R3
+	EOR       R4, R4, R4
+	EOR       R5, R5, R5
+	EOR       R6, R6, R6
+	EOR       R7, R7, R7
+	MOVM.IA.W [R0-R7], (R12)
+	MOVM.IA   [R0-R7], (R12)
+	MOVW      4(R13), g
+	RET
diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ref.go b/vendor/golang.org/x/crypto/poly1305/sum_ref.go
new file mode 100644
index 000000000..b2805a5ca
--- /dev/null
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ref.go
@@ -0,0 +1,141 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm gccgo appengine nacl
+
+package poly1305
+
+import "encoding/binary"
+
+// Sum generates an authenticator for msg using a one-time key and puts the
+// 16-byte result into out. Authenticating two different messages with the same
+// key allows an attacker to forge messages at will.
+func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
+	var (
+		h0, h1, h2, h3, h4 uint32 // the hash accumulators
+		r0, r1, r2, r3, r4 uint64 // the r part of the key
+	)
+
+	r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff)
+	r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03)
+	r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff)
+	r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff)
+	r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff)
+
+	R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5
+
+	for len(msg) >= TagSize {
+		// h += msg
+		h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff
+		h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff
+		h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff
+		h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff
+		h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24)
+
+		// h *= r
+		d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
+		d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
+		d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
+		d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
+		d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
+
+		// h %= p
+		h0 = uint32(d0) & 0x3ffffff
+		h1 = uint32(d1) & 0x3ffffff
+		h2 = uint32(d2) & 0x3ffffff
+		h3 = uint32(d3) & 0x3ffffff
+		h4 = uint32(d4) & 0x3ffffff
+
+		h0 += uint32(d4>>26) * 5
+		h1 += h0 >> 26
+		h0 = h0 & 0x3ffffff
+
+		msg = msg[TagSize:]
+	}
+
+	if len(msg) > 0 {
+		var block [TagSize]byte
+		off := copy(block[:], msg)
+		block[off] = 0x01
+
+		// h += msg
+		h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff
+		h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff
+		h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff
+		h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff
+		h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8)
+
+		// h *= r
+		d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1)
+		d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2)
+		d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3)
+		d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4)
+		d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0)
+
+		// h %= p
+		h0 = uint32(d0) & 0x3ffffff
+		h1 = uint32(d1) & 0x3ffffff
+		h2 = uint32(d2) & 0x3ffffff
+		h3 = uint32(d3) & 0x3ffffff
+		h4 = uint32(d4) & 0x3ffffff
+
+		h0 += uint32(d4>>26) * 5
+		h1 += h0 >> 26
+		h0 = h0 & 0x3ffffff
+	}
+
+	// h %= p reduction
+	h2 += h1 >> 26
+	h1 &= 0x3ffffff
+	h3 += h2 >> 26
+	h2 &= 0x3ffffff
+	h4 += h3 >> 26
+	h3 &= 0x3ffffff
+	h0 += 5 * (h4 >> 26)
+	h4 &= 0x3ffffff
+	h1 += h0 >> 26
+	h0 &= 0x3ffffff
+
+	// h - p
+	t0 := h0 + 5
+	t1 := h1 + (t0 >> 26)
+	t2 := h2 + (t1 >> 26)
+	t3 := h3 + (t2 >> 26)
+	t4 := h4 + (t3 >> 26) - (1 << 26)
+	t0 &= 0x3ffffff
+	t1 &= 0x3ffffff
+	t2 &= 0x3ffffff
+	t3 &= 0x3ffffff
+
+	// select h if h < p else h - p
+	t_mask := (t4 >> 31) - 1
+	h_mask := ^t_mask
+	h0 = (h0 & h_mask) | (t0 & t_mask)
+	h1 = (h1 & h_mask) | (t1 & t_mask)
+	h2 = (h2 & h_mask) | (t2 & t_mask)
+	h3 = (h3 & h_mask) | (t3 & t_mask)
+	h4 = (h4 & h_mask) | (t4 & t_mask)
+
+	// h %= 2^128
+	h0 |= h1 << 26
+	h1 = ((h1 >> 6) | (h2 << 20))
+	h2 = ((h2 >> 12) | (h3 << 14))
+	h3 = ((h3 >> 18) | (h4 << 8))
+
+	// s: the s part of the key
+	// tag = (h + s) % (2^128)
+	t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:]))
+	h0 = uint32(t)
+	t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32)
+	h1 = uint32(t)
+	t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32)
+	h2 = uint32(t)
+	t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32)
+	h3 = uint32(t)
+
+	binary.LittleEndian.PutUint32(out[0:], h0)
+	binary.LittleEndian.PutUint32(out[4:], h1)
+	binary.LittleEndian.PutUint32(out[8:], h2)
+	binary.LittleEndian.PutUint32(out[12:], h3)
+}
diff --git a/vendor/vendor.json b/vendor/vendor.json
index be3ae0ed5..f539fc07d 100644
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -389,6 +389,12 @@
 			"revision": "43ed500fe4d485d97534014d9f98521216240002",
 			"revisionTime": "2017-08-28T17:39:33Z"
 		},
+		{
+			"checksumSHA1": "/VYXTlcksV92qL4Avr5S3NJ/Fxs=",
+			"path": "github.com/minio/sio",
+			"revision": "d8be2518a912f0db0dd17dc55f46da4360ee60d8",
+			"revisionTime": "2017-09-06T19:57:40Z"
+		},
 		{
 			"checksumSHA1": "zvQr4zOz1/g/Fui6co0sctxrJ28=",
 			"path": "github.com/nats-io/go-nats",
@@ -488,6 +494,24 @@
 			"revision": "3b8db5e93c4c02efbc313e17b2e796b0914a01fb",
 			"revisionTime": "2016-12-15T19:56:52Z"
 		},
+		{
+			"checksumSHA1": "CvMkf3KUUGUVHibg6G/zI7XtVbM=",
+			"path": "golang.org/x/crypto/chacha20poly1305",
+			"revision": "81e90905daefcd6fd217b62423c0908922eadb30",
+			"revisionTime": "2017-08-25T20:24:07Z"
+		},
+		{
+			"checksumSHA1": "8f1lWFSOLu7O6Jqk/e+ydH1yoms=",
+			"path": "golang.org/x/crypto/chacha20poly1305/internal/chacha20",
+			"revision": "81e90905daefcd6fd217b62423c0908922eadb30",
+			"revisionTime": "2017-08-25T20:24:07Z"
+		},
+		{
+			"checksumSHA1": "kVKE0OX1Xdw5mG7XKT86DLLKE2I=",
+			"path": "golang.org/x/crypto/poly1305",
+			"revision": "81e90905daefcd6fd217b62423c0908922eadb30",
+			"revisionTime": "2017-08-25T20:24:07Z"
+		},
 		{
 			"path": "golang.org/x/net/context",
 			"revision": "a728288923b47049b2ce791836767ffbe964a5bd",