From ca6b4773ed438e46720e34b27bb90600416014f4 Mon Sep 17 00:00:00 2001 From: Andreas Auernhammer Date: Wed, 8 Nov 2017 00:18:59 +0100 Subject: [PATCH] add SSE-C support for HEAD, GET, PUT (#4894) This change adds server-side-encryption support for HEAD, GET and PUT operations. This PR only addresses single-part PUTs and GETs without HTTP ranges. Further this change adds the concept of reserved object metadata which is required to make encrypted objects tamper-proof and provide API compatibility to AWS S3. This PR adds the following reserved metadata entries: - X-Minio-Internal-Server-Side-Encryption-Iv ('guarantees' tamper-proof property) - X-Minio-Internal-Server-Side-Encryption-Kdf (makes Key-MAC computation negotiable in future) - X-Minio-Internal-Server-Side-Encryption-Key-Mac (provides AWS S3 API compatibility) The prefix `X-Minio_Internal` specifies an internal metadata entry which must not send to clients. All client requests containing a metadata key starting with `X-Minio-Internal` must also rejected. This is implemented by a generic-handler. This PR implements SSE-C separated from client-side-encryption (CSE). This cannot decrypt server-side-encrypted objects on the client-side. However, clients can encrypted the same object with CSE and SSE-C. This PR does not address: - SSE-C Copy and Copy part - SSE-C GET with HTTP ranges - SSE-C multipart PUT - SSE-C Gateway Each point must be addressed in a separate PR. Added to vendor dir: - x/crypto/chacha20poly1305 - x/crypto/poly1305 - github.com/minio/sio --- appveyor.yml | 16 +- cmd/api-errors.go | 88 +- cmd/api-errors_test.go | 150 +- cmd/encryption-v1.go | 300 ++ cmd/encryption-v1_test.go | 390 +++ cmd/gateway-gcs.go | 1 - cmd/gateway-handlers.go | 38 +- cmd/generic-handlers.go | 34 + cmd/generic-handlers_test.go | 35 + cmd/object-handlers.go | 143 +- cmd/routers.go | 3 + pkg/ioutil/ioutil.go | 61 + pkg/ioutil/ioutil_test.go | 39 + vendor/github.com/minio/sio/DARE.md | 223 ++ vendor/github.com/minio/sio/LICENSE | 202 ++ vendor/github.com/minio/sio/README.md | 66 + vendor/github.com/minio/sio/dare.go | 277 ++ vendor/github.com/minio/sio/header.go | 39 + vendor/github.com/minio/sio/reader.go | 169 + vendor/github.com/minio/sio/writer.go | 193 ++ vendor/golang.org/x/crypto/LICENSE | 27 + vendor/golang.org/x/crypto/PATENTS | 22 + .../chacha20poly1305/chacha20poly1305.go | 83 + .../chacha20poly1305_amd64.go | 127 + .../chacha20poly1305/chacha20poly1305_amd64.s | 2714 +++++++++++++++++ .../chacha20poly1305_generic.go | 70 + .../chacha20poly1305_noasm.go | 15 + .../internal/chacha20/chacha_generic.go | 199 ++ .../golang.org/x/crypto/poly1305/poly1305.go | 33 + .../golang.org/x/crypto/poly1305/sum_amd64.go | 22 + .../golang.org/x/crypto/poly1305/sum_amd64.s | 125 + .../golang.org/x/crypto/poly1305/sum_arm.go | 22 + vendor/golang.org/x/crypto/poly1305/sum_arm.s | 427 +++ .../golang.org/x/crypto/poly1305/sum_ref.go | 141 + vendor/vendor.json | 24 + 35 files changed, 6321 insertions(+), 197 deletions(-) create mode 100644 cmd/encryption-v1.go create mode 100644 cmd/encryption-v1_test.go create mode 100644 pkg/ioutil/ioutil.go create mode 100644 pkg/ioutil/ioutil_test.go create mode 100644 vendor/github.com/minio/sio/DARE.md create mode 100644 vendor/github.com/minio/sio/LICENSE create mode 100644 vendor/github.com/minio/sio/README.md create mode 100644 vendor/github.com/minio/sio/dare.go create mode 100644 vendor/github.com/minio/sio/header.go create mode 100644 vendor/github.com/minio/sio/reader.go create mode 100644 vendor/github.com/minio/sio/writer.go create mode 100644 vendor/golang.org/x/crypto/LICENSE create mode 100644 vendor/golang.org/x/crypto/PATENTS create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go create mode 100644 vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go create mode 100644 vendor/golang.org/x/crypto/poly1305/poly1305.go create mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.go create mode 100644 vendor/golang.org/x/crypto/poly1305/sum_amd64.s create mode 100644 vendor/golang.org/x/crypto/poly1305/sum_arm.go create mode 100644 vendor/golang.org/x/crypto/poly1305/sum_arm.s create mode 100644 vendor/golang.org/x/crypto/poly1305/sum_ref.go diff --git a/appveyor.yml b/appveyor.yml index c6ecd38ba..e88d6ed1c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -40,17 +40,19 @@ test_script: - mkdir build\coverage - go test -v -timeout 17m -race github.com/minio/minio/cmd... - go test -v -race github.com/minio/minio/pkg... - - go test -v -timeout 17m -coverprofile=build\coverage\coverage.txt -covermode=atomic github.com/minio/minio/cmd + # FIXME(aead): enable codecov after issue https://github.com/golang/go/issues/18468 is solved. + # - go test -v -timeout 17m -coverprofile=build\coverage\coverage.txt -covermode=atomic github.com/minio/minio/cmd - ps: Update-AppveyorTest "Unit Tests" -Outcome Passed after_test: - - go tool cover -html=build\coverage\coverage.txt -o build\coverage\coverage.html - - ps: Push-AppveyorArtifact build\coverage\coverage.txt - - ps: Push-AppveyorArtifact build\coverage\coverage.html + # FIXME(aead): enable codecov after issue https://github.com/golang/go/issues/18468 is solved. + # - go tool cover -html=build\coverage\coverage.txt -o build\coverage\coverage.html + # - ps: Push-AppveyorArtifact build\coverage\coverage.txt + # - ps: Push-AppveyorArtifact build\coverage\coverage.html # Upload coverage report. - - "SET PATH=C:\\Python34;C:\\Python34\\Scripts;%PATH%" - - pip install codecov - - codecov -X gcov -f "build\coverage\coverage.txt" + # - "SET PATH=C:\\Python34;C:\\Python34\\Scripts;%PATH%" + # - pip install codecov + # - codecov -X gcov -f "build\coverage\coverage.txt" # to disable deployment deploy: off diff --git a/cmd/api-errors.go b/cmd/api-errors.go index 339545afb..e3bb31d52 100644 --- a/cmd/api-errors.go +++ b/cmd/api-errors.go @@ -122,6 +122,17 @@ const ( ErrUnsupportedMetadata // Add new error codes here. + // Server-Side-Encryption (with Customer provided key) related API errors. + + ErrInsecureSSECustomerRequest + ErrSSEEncryptedObject + ErrInvalidEncryptionParameters + ErrInvalidSSECustomerAlgorithm + ErrInvalidSSECustomerKey + ErrMissingSSECustomerKey + ErrMissingSSECustomerKeyMD5 + ErrSSECustomerKeyMD5Mismatch + // Bucket notification related errors. ErrEventNotification ErrARNNotification @@ -159,6 +170,7 @@ const ( ErrAdminConfigNoQuorum ErrAdminCredentialsMismatch ErrInsecureClientRequest + ErrObjectTampered ) // error code to APIError structure, these fields carry respective @@ -574,6 +586,51 @@ var errorCodeResponse = map[APIErrorCode]APIError{ Description: "Range specified is not valid for source object", HTTPStatusCode: http.StatusBadRequest, }, + ErrMetadataTooLarge: { + Code: "InvalidArgument", + Description: "Your metadata headers exceed the maximum allowed metadata size.", + HTTPStatusCode: http.StatusBadRequest, + }, + ErrInsecureSSECustomerRequest: { + Code: "InvalidRequest", + Description: errInsecureSSERequest.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrSSEEncryptedObject: { + Code: "InvalidRequest", + Description: errEncryptedObject.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrInvalidEncryptionParameters: { + Code: "InvalidRequest", + Description: "The encryption parameters are not applicable to this object.", + HTTPStatusCode: http.StatusBadRequest, + }, + ErrInvalidSSECustomerAlgorithm: { + Code: "InvalidArgument", + Description: errInvalidSSEAlgorithm.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrInvalidSSECustomerKey: { + Code: "InvalidArgument", + Description: errInvalidSSEKey.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrMissingSSECustomerKey: { + Code: "InvalidArgument", + Description: errMissingSSEKey.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrMissingSSECustomerKeyMD5: { + Code: "InvalidArgument", + Description: errMissingSSEKeyMD5.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, + ErrSSECustomerKeyMD5Mismatch: { + Code: "InvalidArgument", + Description: errSSEKeyMD5Mismatch.Error(), + HTTPStatusCode: http.StatusBadRequest, + }, /// S3 extensions. ErrContentSHA256Mismatch: { @@ -653,11 +710,6 @@ var errorCodeResponse = map[APIErrorCode]APIError{ Description: "A timeout occurred while trying to lock a resource", HTTPStatusCode: http.StatusRequestTimeout, }, - ErrMetadataTooLarge: { - Code: "InvalidArgument", - Description: "Your metadata headers exceed the maximum allowed metadata size.", - HTTPStatusCode: http.StatusBadRequest, - }, ErrUnsupportedMetadata: { Code: "InvalidArgument", Description: "Your metadata headers are not supported.", @@ -668,6 +720,11 @@ var errorCodeResponse = map[APIErrorCode]APIError{ Description: "All parts except the last part should be of the same size.", HTTPStatusCode: http.StatusBadRequest, }, + ErrObjectTampered: { + Code: "XMinioObjectTampered", + Description: errObjectTampered.Error(), + HTTPStatusCode: http.StatusPartialContent, + }, // Add your error structure here. } @@ -699,6 +756,27 @@ func toAPIErrorCode(err error) (apiErr APIErrorCode) { return apiErr } + switch err { // SSE errors + case errInsecureSSERequest: + return ErrInsecureSSECustomerRequest + case errInvalidSSEAlgorithm: + return ErrInvalidSSECustomerAlgorithm + case errInvalidSSEKey: + return ErrInvalidSSECustomerKey + case errMissingSSEKey: + return ErrMissingSSECustomerKey + case errMissingSSEKeyMD5: + return ErrMissingSSECustomerKeyMD5 + case errSSEKeyMD5Mismatch: + return ErrSSECustomerKeyMD5Mismatch + case errObjectTampered: + return ErrObjectTampered + case errEncryptedObject: + return ErrSSEEncryptedObject + case errSSEKeyMismatch: + return ErrAccessDenied // no access without correct key + } + switch err.(type) { case StorageFull: apiErr = ErrStorageFull diff --git a/cmd/api-errors_test.go b/cmd/api-errors_test.go index 88a73d7aa..f92395126 100644 --- a/cmd/api-errors_test.go +++ b/cmd/api-errors_test.go @@ -23,116 +23,48 @@ import ( "github.com/minio/minio/pkg/hash" ) +var toAPIErrorCodeTests = []struct { + err error + errCode APIErrorCode +}{ + {err: hash.BadDigest{}, errCode: ErrBadDigest}, + {err: hash.SHA256Mismatch{}, errCode: ErrContentSHA256Mismatch}, + {err: IncompleteBody{}, errCode: ErrIncompleteBody}, + {err: ObjectExistsAsDirectory{}, errCode: ErrObjectExistsAsDirectory}, + {err: BucketNameInvalid{}, errCode: ErrInvalidBucketName}, + {err: BucketExists{}, errCode: ErrBucketAlreadyOwnedByYou}, + {err: ObjectNotFound{}, errCode: ErrNoSuchKey}, + {err: ObjectNameInvalid{}, errCode: ErrInvalidObjectName}, + {err: InvalidUploadID{}, errCode: ErrNoSuchUpload}, + {err: InvalidPart{}, errCode: ErrInvalidPart}, + {err: InsufficientReadQuorum{}, errCode: ErrReadQuorum}, + {err: InsufficientWriteQuorum{}, errCode: ErrWriteQuorum}, + {err: UnsupportedDelimiter{}, errCode: ErrNotImplemented}, + {err: InvalidMarkerPrefixCombination{}, errCode: ErrNotImplemented}, + {err: InvalidUploadIDKeyCombination{}, errCode: ErrNotImplemented}, + {err: MalformedUploadID{}, errCode: ErrNoSuchUpload}, + {err: PartTooSmall{}, errCode: ErrEntityTooSmall}, + {err: BucketNotEmpty{}, errCode: ErrBucketNotEmpty}, + {err: BucketNotFound{}, errCode: ErrNoSuchBucket}, + {err: StorageFull{}, errCode: ErrStorageFull}, + {err: NotImplemented{}, errCode: ErrNotImplemented}, + {err: errSignatureMismatch, errCode: ErrSignatureDoesNotMatch}, + + // SSE-C errors + {err: errInsecureSSERequest, errCode: ErrInsecureSSECustomerRequest}, + {err: errInvalidSSEAlgorithm, errCode: ErrInvalidSSECustomerAlgorithm}, + {err: errMissingSSEKey, errCode: ErrMissingSSECustomerKey}, + {err: errInvalidSSEKey, errCode: ErrInvalidSSECustomerKey}, + {err: errMissingSSEKeyMD5, errCode: ErrMissingSSECustomerKeyMD5}, + {err: errSSEKeyMD5Mismatch, errCode: ErrSSECustomerKeyMD5Mismatch}, + {err: errObjectTampered, errCode: ErrObjectTampered}, + + {err: nil, errCode: ErrNone}, + {err: errors.New("Custom error"), errCode: ErrInternalError}, // Case where err type is unknown. +} + func TestAPIErrCode(t *testing.T) { - testCases := []struct { - err error - errCode APIErrorCode - }{ - // Valid cases. - { - hash.BadDigest{}, - ErrBadDigest, - }, - { - hash.SHA256Mismatch{}, - ErrContentSHA256Mismatch, - }, - { - IncompleteBody{}, - ErrIncompleteBody, - }, - { - ObjectExistsAsDirectory{}, - ErrObjectExistsAsDirectory, - }, - { - BucketNameInvalid{}, - ErrInvalidBucketName, - }, - { - BucketExists{}, - ErrBucketAlreadyOwnedByYou, - }, - { - ObjectNotFound{}, - ErrNoSuchKey, - }, - { - ObjectNameInvalid{}, - ErrInvalidObjectName, - }, - { - InvalidUploadID{}, - ErrNoSuchUpload, - }, - { - InvalidPart{}, - ErrInvalidPart, - }, - { - InsufficientReadQuorum{}, - ErrReadQuorum, - }, - { - InsufficientWriteQuorum{}, - ErrWriteQuorum, - }, - { - UnsupportedDelimiter{}, - ErrNotImplemented, - }, - { - InvalidMarkerPrefixCombination{}, - ErrNotImplemented, - }, - { - InvalidUploadIDKeyCombination{}, - ErrNotImplemented, - }, - { - MalformedUploadID{}, - ErrNoSuchUpload, - }, - { - PartTooSmall{}, - ErrEntityTooSmall, - }, - { - BucketNotEmpty{}, - ErrBucketNotEmpty, - }, - { - BucketNotFound{}, - ErrNoSuchBucket, - }, - { - StorageFull{}, - ErrStorageFull, - }, - { - NotImplemented{}, - ErrNotImplemented, - }, - { - errSignatureMismatch, - ErrSignatureDoesNotMatch, - }, // End of all valid cases. - - // Case where err is nil. - { - nil, - ErrNone, - }, - - // Case where err type is unknown. - { - errors.New("Custom error"), - ErrInternalError, - }, - } - - // Validate all the errors with their API error codes. - for i, testCase := range testCases { + for i, testCase := range toAPIErrorCodeTests { errCode := toAPIErrorCode(testCase.err) if errCode != testCase.errCode { t.Errorf("Test %d: Expected error code %d, got %d", i+1, testCase.errCode, errCode) diff --git a/cmd/encryption-v1.go b/cmd/encryption-v1.go new file mode 100644 index 000000000..06156b1c9 --- /dev/null +++ b/cmd/encryption-v1.go @@ -0,0 +1,300 @@ +/* + * Minio Cloud Storage, (C) 2017 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +import ( + "bytes" + "crypto/hmac" + "crypto/md5" + "crypto/rand" + "encoding/base64" + "errors" + "io" + "net/http" + + sha256 "github.com/minio/sha256-simd" + "github.com/minio/sio" +) + +var ( + // AWS errors for invalid SSE-C requests. + errInsecureSSERequest = errors.New("Requests specifying Server Side Encryption with Customer provided keys must be made over a secure connection") + errEncryptedObject = errors.New("The object was stored using a form of Server Side Encryption. The correct parameters must be provided to retrieve the object") + errInvalidSSEAlgorithm = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide a valid encryption algorithm") + errMissingSSEKey = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide an appropriate secret key") + errInvalidSSEKey = errors.New("The secret key was invalid for the specified algorithm") + errMissingSSEKeyMD5 = errors.New("Requests specifying Server Side Encryption with Customer provided keys must provide the client calculated MD5 of the secret key") + errSSEKeyMD5Mismatch = errors.New("The calculated MD5 hash of the key did not match the hash that was provided") + errSSEKeyMismatch = errors.New("The client provided key does not match the key provided when the object was encrypted") // this msg is not shown to the client + + // Additional Minio errors for SSE-C requests. + errObjectTampered = errors.New("The requested object was modified and may be compromised") +) + +const ( + // SSECustomerAlgorithm is the AWS SSE-C algorithm HTTP header key. + SSECustomerAlgorithm = "X-Amz-Server-Side-Encryption-Customer-Algorithm" + // SSECustomerKey is the AWS SSE-C encryption key HTTP header key. + SSECustomerKey = "X-Amz-Server-Side-Encryption-Customer-Key" + // SSECustomerKeyMD5 is the AWS SSE-C encryption key MD5 HTTP header key. + SSECustomerKeyMD5 = "X-Amz-Server-Side-Encryption-Customer-Key-MD5" +) + +const ( + // SSECustomerKeySize is the size of valid client provided encryption keys in bytes. + // Currently AWS supports only AES256. So the SSE-C key size is fixed to 32 bytes. + SSECustomerKeySize = 32 + + // SSECustomerAlgorithmAES256 the only valid S3 SSE-C encryption algorithm identifier. + SSECustomerAlgorithmAES256 = "AES256" +) + +// SSE-C key derivation: +// H: Hash function, M: MAC function +// +// key := 32 bytes # client provided key +// r := H(random(32 bytes)) # saved as object metadata [ServerSideEncryptionIV] +// key_mac := M(H(key), r) # saved as object metadata [ServerSideEncryptionKeyMAC] +// enc_key := M(key, key_mac) +// +// +// SSE-C key verification: +// H: Hash function, M: MAC function +// +// key := 32 bytes # client provided key +// r := object metadata [ServerSideEncryptionIV] +// key_mac := object metadata [ServerSideEncryptionKeyMAC] +// key_mac' := M(H(key), r) +// +// check: key_mac != key_mac' => fail with invalid key +// +// enc_key := M(key, key_mac') +const ( + // ServerSideEncryptionIV is a 32 byte randomly generated IV used to derive an + // unique encryption key from the client provided key. The combination of this value + // and the client-provided key must be unique to provide the DARE tamper-proof property. + ServerSideEncryptionIV = ReservedMetadataPrefix + "Server-Side-Encryption-Iv" + + // ServerSideEncryptionKDF is the combination of a hash and MAC function used to derive + // the SSE-C encryption key from the user-provided key. + ServerSideEncryptionKDF = ReservedMetadataPrefix + "Server-Side-Encryption-Kdf" + + // ServerSideEncryptionKeyMAC is the MAC of the hash of the client-provided key and the + // X-Minio-Server-Side-Encryption-Iv. This value must be used to verify that the client + // provided the correct key to follow S3 spec. + ServerSideEncryptionKeyMAC = ReservedMetadataPrefix + "Server-Side-Encryption-Key-Mac" +) + +// SSEKeyDerivationHmacSha256 specifies SHA-256 as hash function and HMAC-SHA256 as MAC function +// as the functions used to derive the SSE-C encryption keys from the client-provided key. +const SSEKeyDerivationHmacSha256 = "HMAC-SHA256" + +// IsSSECustomerRequest returns true if the given HTTP header +// contains server-side-encryption with customer provided key fields. +func IsSSECustomerRequest(header http.Header) bool { + return header.Get(SSECustomerAlgorithm) != "" || header.Get(SSECustomerKey) != "" || header.Get(SSECustomerKeyMD5) != "" +} + +// ParseSSECustomerRequest parses the SSE-C header fields of the provided request. +// It returns the client provided key on success. +func ParseSSECustomerRequest(r *http.Request) (key []byte, err error) { + if !globalIsSSL { // minio only supports HTTP or HTTPS requests not both at the same time + // we cannot use r.TLS == nil here because Go's http implementation reflects on + // the net.Conn and sets the TLS field of http.Request only if it's an tls.Conn. + // Minio uses a BufConn (wrapping a tls.Conn) so the type check within the http package + // will always fail -> r.TLS is always nil even for TLS requests. + return nil, errInsecureSSERequest + } + header := r.Header + if algorithm := header.Get(SSECustomerAlgorithm); algorithm != SSECustomerAlgorithmAES256 { + return nil, errInvalidSSEAlgorithm + } + if header.Get(SSECustomerKey) == "" { + return nil, errMissingSSEKey + } + if header.Get(SSECustomerKeyMD5) == "" { + return nil, errMissingSSEKeyMD5 + } + + key, err = base64.StdEncoding.DecodeString(header.Get(SSECustomerKey)) + if err != nil { + return nil, errInvalidSSEKey + } + header.Del(SSECustomerKey) // make sure we do not save the key by accident + + if len(key) != SSECustomerKeySize { + return nil, errInvalidSSEKey + } + + keyMD5, err := base64.StdEncoding.DecodeString(header.Get(SSECustomerKeyMD5)) + if err != nil { + return nil, errSSEKeyMD5Mismatch + } + if md5Sum := md5.Sum(key); !bytes.Equal(md5Sum[:], keyMD5) { + return nil, errSSEKeyMD5Mismatch + } + return key, nil +} + +// EncryptRequest takes the client provided content and encrypts the data +// with the client provided key. It also marks the object as client-side-encrypted +// and sets the correct headers. +func EncryptRequest(content io.Reader, r *http.Request, metadata map[string]string) (io.Reader, error) { + key, err := ParseSSECustomerRequest(r) + if err != nil { + return nil, err + } + delete(metadata, SSECustomerKey) // make sure we do not save the key by accident + + // security notice: + // Reusing a tuple (nonce, client provided key) will produce the same encryption key + // twice and breaks the tamper-proof property. However objects are still confidential. + // Therefore the nonce must be unique but need not to be undistinguishable from true + // randomness. + nonce := make([]byte, 32) // generate random nonce to derive encryption key + if _, err = io.ReadFull(rand.Reader, nonce); err != nil { + return nil, err + } + iv := sha256.Sum256(nonce) // hash output to not reveal any stat. weaknesses of the PRNG + + keyHash := sha256.Sum256(key) // derive MAC of the client-provided key + mac := hmac.New(sha256.New, keyHash[:]) + mac.Write(iv[:]) + keyMAC := mac.Sum(nil) + + mac = hmac.New(sha256.New, key) // derive encryption key + mac.Write(keyMAC) + reader, err := sio.EncryptReader(content, sio.Config{Key: mac.Sum(nil)}) + if err != nil { + return nil, errInvalidSSEKey + } + + metadata[ServerSideEncryptionIV] = base64.StdEncoding.EncodeToString(iv[:]) + metadata[ServerSideEncryptionKDF] = SSEKeyDerivationHmacSha256 + metadata[ServerSideEncryptionKeyMAC] = base64.StdEncoding.EncodeToString(keyMAC) + return reader, nil +} + +// DecryptRequest decrypts the object with the client provided key. It also removes +// the client-side-encryption metadata from the object and sets the correct headers. +func DecryptRequest(client io.Writer, r *http.Request, metadata map[string]string) (io.WriteCloser, error) { + key, err := ParseSSECustomerRequest(r) + if err != nil { + return nil, err + } + delete(metadata, SSECustomerKey) // make sure we do not save the key by accident + + if metadata[ServerSideEncryptionKDF] != SSEKeyDerivationHmacSha256 { // currently HMAC-SHA256 is the only option + return nil, errObjectTampered + } + nonce, err := base64.StdEncoding.DecodeString(metadata[ServerSideEncryptionIV]) + if err != nil || len(nonce) != 32 { + return nil, errObjectTampered + } + keyMAC, err := base64.StdEncoding.DecodeString(metadata[ServerSideEncryptionKeyMAC]) + if err != nil || len(keyMAC) != 32 { + return nil, errObjectTampered + } + + keyHash := sha256.Sum256(key) // verify that client provided correct key + mac := hmac.New(sha256.New, keyHash[:]) + mac.Write(nonce) + if !hmac.Equal(keyMAC, mac.Sum(nil)) { + return nil, errSSEKeyMismatch // client-provided key is wrong or object metadata was modified + } + + mac = hmac.New(sha256.New, key) // derive decryption key + mac.Write(keyMAC) + writer, err := sio.DecryptWriter(client, sio.Config{Key: mac.Sum(nil)}) + if err != nil { + return nil, errInvalidSSEKey + } + + delete(metadata, ServerSideEncryptionIV) + delete(metadata, ServerSideEncryptionKDF) + delete(metadata, ServerSideEncryptionKeyMAC) + return writer, nil +} + +// IsEncrypted returns true if the object is marked as encrypted. +func (o *ObjectInfo) IsEncrypted() bool { + if _, ok := o.UserDefined[ServerSideEncryptionIV]; ok { + return true + } + if _, ok := o.UserDefined[ServerSideEncryptionKDF]; ok { + return true + } + if _, ok := o.UserDefined[ServerSideEncryptionKeyMAC]; ok { + return true + } + return false +} + +// DecryptedSize returns the size of the object after decryption in bytes. +// It returns an error if the object is not encrypted or marked as encrypted +// but has an invalid size. +// DecryptedSize panics if the referred object is not encrypted. +func (o *ObjectInfo) DecryptedSize() (int64, error) { + if !o.IsEncrypted() { + panic("cannot compute decrypted size of an object which is not encrypted") + } + if o.Size == 0 { + return o.Size, nil + } + size := (o.Size / (32 + 64*1024)) * (64 * 1024) + if mod := o.Size % (32 + 64*1024); mod > 0 { + if mod < 33 { + return -1, errObjectTampered // object is not 0 size but smaller than the smallest valid encrypted object + } + size += mod - 32 + } + return size, nil +} + +// EncryptedSize returns the size of the object after encryption. +// An encrypted object is always larger than a plain object +// except for zero size objects. +func (o *ObjectInfo) EncryptedSize() int64 { + size := (o.Size / (64 * 1024)) * (32 + 64*1024) + if mod := o.Size % (64 * 1024); mod > 0 { + size += mod + 32 + } + return size +} + +// DecryptObjectInfo tries to decrypt the provided object if it is encrypted. +// It fails if the object is encrypted and the HTTP headers don't contain +// SSE-C headers or the object is not encrypted but SSE-C headers are provided. (AWS behavior) +// DecryptObjectInfo returns 'ErrNone' if the object is not encrypted or the +// decryption succeeded. +// +// DecryptObjectInfo also returns whether the object is encrypted or not. +func DecryptObjectInfo(info *ObjectInfo, headers http.Header) (apiErr APIErrorCode, encrypted bool) { + if apiErr, encrypted = ErrNone, info.IsEncrypted(); !encrypted && IsSSECustomerRequest(headers) { + apiErr = ErrInvalidEncryptionParameters + } else if encrypted { + if !IsSSECustomerRequest(headers) { + apiErr = ErrSSEEncryptedObject + return + } + var err error + if info.Size, err = info.DecryptedSize(); err != nil { + apiErr = toAPIErrorCode(err) + } + } + return +} diff --git a/cmd/encryption-v1_test.go b/cmd/encryption-v1_test.go new file mode 100644 index 000000000..a7824c7b5 --- /dev/null +++ b/cmd/encryption-v1_test.go @@ -0,0 +1,390 @@ +/* + * Minio Cloud Storage, (C) 2017 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cmd + +import ( + "bytes" + "net/http" + "testing" +) + +var isSSECustomerRequestTests = []struct { + headers map[string]string + sseRequest bool +}{ + {headers: map[string]string{SSECustomerAlgorithm: "AES256", SSECustomerKey: "key", SSECustomerKeyMD5: "md5"}, sseRequest: true}, // 0 + {headers: map[string]string{SSECustomerAlgorithm: "AES256"}, sseRequest: true}, // 1 + {headers: map[string]string{SSECustomerKey: "key"}, sseRequest: true}, // 2 + {headers: map[string]string{SSECustomerKeyMD5: "md5"}, sseRequest: true}, // 3 + {headers: map[string]string{}, sseRequest: false}, // 4 + {headers: map[string]string{SSECustomerAlgorithm + " ": "AES256", " " + SSECustomerKey: "key", SSECustomerKeyMD5 + " ": "md5"}, sseRequest: false}, // 5 + {headers: map[string]string{SSECustomerAlgorithm: "", SSECustomerKey: "", SSECustomerKeyMD5: ""}, sseRequest: false}, // 6 +} + +func TestIsSSECustomerRequest(t *testing.T) { + for i, test := range isSSECustomerRequestTests { + headers := http.Header{} + for k, v := range test.headers { + headers.Set(k, v) + } + if IsSSECustomerRequest(headers) != test.sseRequest { + t.Errorf("Test %d: Expected IsSSECustomerRequest to return %v", i, test.sseRequest) + } + } +} + +var parseSSECustomerRequestTests = []struct { + headers map[string]string + useTLS bool + err error +}{ + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 0 + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + useTLS: true, err: nil, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 1 + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + useTLS: false, err: errInsecureSSERequest, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES 256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", // 2 + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + useTLS: true, err: errInvalidSSEAlgorithm, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "NjE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 3 + SSECustomerKeyMD5: "H+jq/LwEOEO90YtiTuNFVw==", + }, + useTLS: true, err: errSSEKeyMD5Mismatch, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: " jE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 4 + SSECustomerKeyMD5: "H+jq/LwEOEO90YtiTuNFVw==", + }, + useTLS: true, err: errInvalidSSEKey, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "NjE0SL87s+ZhYtaTrg5eI5cjhCQLGPVMKenPG2bCJFw=", // 5 + SSECustomerKeyMD5: " +jq/LwEOEO90YtiTuNFVw==", + }, + useTLS: true, err: errSSEKeyMD5Mismatch, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "vFQ9ScFOF6Tu/BfzMS+rVMvlZGJHi5HmGJenJfrfKI45", // 6 + SSECustomerKeyMD5: "9KPgDdZNTHimuYCwnJTp5g==", + }, + useTLS: true, err: errInvalidSSEKey, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "", // 7 + SSECustomerKeyMD5: "9KPgDdZNTHimuYCwnJTp5g==", + }, + useTLS: true, err: errMissingSSEKey, + }, + { + headers: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "vFQ9ScFOF6Tu/BfzMS+rVMvlZGJHi5HmGJenJfrfKI45", // 8 + SSECustomerKeyMD5: "", + }, + useTLS: true, err: errMissingSSEKeyMD5, + }, +} + +func TestParseSSECustomerRequest(t *testing.T) { + defer func(flag bool) { globalIsSSL = flag }(globalIsSSL) + for i, test := range parseSSECustomerRequestTests { + headers := http.Header{} + for k, v := range test.headers { + headers.Set(k, v) + } + request := &http.Request{} + request.Header = headers + globalIsSSL = test.useTLS + + _, err := ParseSSECustomerRequest(request) + if err != test.err { + t.Errorf("Test %d: Parse returned: %v want: %v", i, err, test.err) + } + key := request.Header.Get(SSECustomerKey) + if (err == nil || err == errSSEKeyMD5Mismatch) && key != "" { + t.Errorf("Test %d: Client key survived parsing - found key: %v", i, key) + } + } +} + +var encryptedSizeTests = []struct { + size, encsize int64 +}{ + {size: 0, encsize: 0}, // 0 + {size: 1, encsize: 33}, // 1 + {size: 1024, encsize: 1024 + 32}, // 2 + {size: 2 * 64 * 1024, encsize: 2 * (64*1024 + 32)}, // 3 + {size: 100*64*1024 + 1, encsize: 100*(64*1024+32) + 33}, // 4 + {size: 64*1024 + 1, encsize: (64*1024 + 32) + 33}, // 5 + {size: 5 * 1024 * 1024 * 1024, encsize: 81920 * (64*1024 + 32)}, // 6 +} + +func TestEncryptedSize(t *testing.T) { + for i, test := range encryptedSizeTests { + objInfo := ObjectInfo{Size: test.size} + if size := objInfo.EncryptedSize(); test.encsize != size { + t.Errorf("Test %d: got encrypted size: #%d want: #%d", i, size, test.encsize) + } + } +} + +var decryptSSECustomerObjectInfoTests = []struct { + encsize, size int64 + err error +}{ + {encsize: 0, size: 0, err: nil}, // 0 + {encsize: 33, size: 1, err: nil}, // 1 + {encsize: 1024 + 32, size: 1024, err: nil}, // 2 + {encsize: 2 * (64*1024 + 32), size: 2 * 64 * 1024, err: nil}, // 3 + {encsize: 100*(64*1024+32) + 33, size: 100*64*1024 + 1, err: nil}, // 4 + {encsize: (64*1024 + 32) + 33, size: 64*1024 + 1, err: nil}, // 5 + {encsize: 81920 * (64*1024 + 32), size: 5 * 1024 * 1024 * 1024, err: nil}, // 6 + {encsize: 0, size: 0, err: nil}, // 7 + {encsize: 64*1024 + 32 + 31, size: 0, err: errObjectTampered}, // 8 +} + +func TestDecryptedSize(t *testing.T) { + for i, test := range decryptSSECustomerObjectInfoTests { + objInfo := ObjectInfo{Size: test.encsize} + objInfo.UserDefined = map[string]string{ + ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256, + } + + size, err := objInfo.DecryptedSize() + if err != test.err || (size != test.size && err == nil) { + t.Errorf("Test %d: decryption returned: %v want: %v", i, err, test.err) + } + if err == nil && size != test.size { + t.Errorf("Test %d: got decrypted size: #%d want: #%d", i, size, test.size) + } + } +} + +var encryptRequestTests = []struct { + header map[string]string + metadata map[string]string +}{ + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{}, + }, + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{ + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + }, + }, +} + +func TestEncryptRequest(t *testing.T) { + defer func(flag bool) { globalIsSSL = flag }(globalIsSSL) + globalIsSSL = true + for i, test := range encryptRequestTests { + content := bytes.NewReader(make([]byte, 64)) + req := &http.Request{Header: http.Header{}} + for k, v := range test.header { + req.Header.Set(k, v) + } + _, err := EncryptRequest(content, req, test.metadata) + if err != nil { + t.Fatalf("Test %d: Failed to encrypt request: %v", i, err) + } + if key, ok := test.metadata[SSECustomerKey]; ok { + t.Errorf("Test %d: Client provided key survived in metadata - key: %s", i, key) + } + if kdf, ok := test.metadata[ServerSideEncryptionKDF]; !ok { + t.Errorf("Test %d: ServerSideEncryptionKDF must be part of metadata: %v", i, kdf) + } + if iv, ok := test.metadata[ServerSideEncryptionIV]; !ok { + t.Errorf("Test %d: ServerSideEncryptionIV must be part of metadata: %v", i, iv) + } + if mac, ok := test.metadata[ServerSideEncryptionKeyMAC]; !ok { + t.Errorf("Test %d: ServerSideEncryptionKeyMAC must be part of metadata: %v", i, mac) + } + } +} + +var decryptRequestTests = []struct { + header map[string]string + metadata map[string]string + shouldFail bool +}{ + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{ + ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256, + ServerSideEncryptionIV: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=", + }, + shouldFail: false, + }, + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{ + ServerSideEncryptionKDF: "HMAC-SHA3", + ServerSideEncryptionIV: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=", + }, + shouldFail: true, + }, + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{ + ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256, + ServerSideEncryptionIV: "RrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hcs4rR9z/CUuPqu5N4=", + }, + shouldFail: true, + }, + { + header: map[string]string{ + SSECustomerAlgorithm: "AES256", + SSECustomerKey: "XAm0dRrJsEsyPb1UuFNezv1bl9hxuYsgUVC/MUctE2k=", + SSECustomerKeyMD5: "bY4wkxQejw9mUJfo72k53A==", + }, + metadata: map[string]string{ + ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256, + ServerSideEncryptionIV: "XAm0dRrJsEsyPb1UuFNezv1bl9ehxuYsgUVC/MUctE2k=", + ServerSideEncryptionKeyMAC: "SY5E9AvI2tI7/nUrUAssIGE32Hds4rR9z/CUuPqu5N4=", + }, + shouldFail: true, + }, +} + +func TestDecryptRequest(t *testing.T) { + defer func(flag bool) { globalIsSSL = flag }(globalIsSSL) + globalIsSSL = true + for i, test := range decryptRequestTests { + client := bytes.NewBuffer(nil) + req := &http.Request{Header: http.Header{}} + for k, v := range test.header { + req.Header.Set(k, v) + } + _, err := DecryptRequest(client, req, test.metadata) + if err != nil && !test.shouldFail { + t.Fatalf("Test %d: Failed to encrypt request: %v", i, err) + } + if key, ok := test.metadata[SSECustomerKey]; ok { + t.Errorf("Test %d: Client provided key survived in metadata - key: %s", i, key) + } + if kdf, ok := test.metadata[ServerSideEncryptionKDF]; ok && !test.shouldFail { + t.Errorf("Test %d: ServerSideEncryptionKDF should not be part of metadata: %v", i, kdf) + } + if iv, ok := test.metadata[ServerSideEncryptionIV]; ok && !test.shouldFail { + t.Errorf("Test %d: ServerSideEncryptionIV should not be part of metadata: %v", i, iv) + } + if mac, ok := test.metadata[ServerSideEncryptionKeyMAC]; ok && !test.shouldFail { + t.Errorf("Test %d: ServerSideEncryptionKeyMAC should not be part of metadata: %v", i, mac) + } + } +} + +var decryptObjectInfoTests = []struct { + info ObjectInfo + headers http.Header + expErr APIErrorCode +}{ + { + info: ObjectInfo{Size: 100}, + headers: http.Header{}, + expErr: ErrNone, + }, + { + info: ObjectInfo{Size: 100, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}}, + headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}}, + expErr: ErrNone, + }, + { + info: ObjectInfo{Size: 0, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}}, + headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}}, + expErr: ErrNone, + }, + { + info: ObjectInfo{Size: 100, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}}, + headers: http.Header{}, + expErr: ErrSSEEncryptedObject, + }, + { + info: ObjectInfo{Size: 100, UserDefined: map[string]string{}}, + headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}}, + expErr: ErrInvalidEncryptionParameters, + }, + { + info: ObjectInfo{Size: 31, UserDefined: map[string]string{ServerSideEncryptionKDF: SSEKeyDerivationHmacSha256}}, + headers: http.Header{SSECustomerAlgorithm: []string{SSECustomerAlgorithmAES256}}, + expErr: ErrObjectTampered, + }, +} + +func TestDecryptObjectInfo(t *testing.T) { + for i, test := range decryptObjectInfoTests { + if err, encrypted := DecryptObjectInfo(&test.info, test.headers); err != test.expErr { + t.Errorf("Test %d: Decryption returned wrong error code: got %d , want %d", i, err, test.expErr) + } else if enc := test.info.IsEncrypted(); encrypted && enc != encrypted { + t.Errorf("Test %d: Decryption thinks object is encrypted but it is not", i) + } else if !encrypted && enc != encrypted { + t.Errorf("Test %d: Decryption thinks object is not encrypted but it is", i) + } + } +} diff --git a/cmd/gateway-gcs.go b/cmd/gateway-gcs.go index ca359f0c1..3c48c5ece 100644 --- a/cmd/gateway-gcs.go +++ b/cmd/gateway-gcs.go @@ -958,7 +958,6 @@ func (l *gcsGateway) PutObjectPart(bucket string, key string, uploadID string, p } // Make sure to close the object writer upon success. w.Close() - return PartInfo{ PartNumber: partNumber, ETag: etag, diff --git a/cmd/gateway-handlers.go b/cmd/gateway-handlers.go index d7933b8d5..edbfb90e0 100644 --- a/cmd/gateway-handlers.go +++ b/cmd/gateway-handlers.go @@ -18,7 +18,7 @@ package cmd import ( "io" - "io/ioutil" + goioutil "io/ioutil" "net" "net/http" "strconv" @@ -29,6 +29,7 @@ import ( router "github.com/gorilla/mux" "github.com/minio/minio-go/pkg/policy" "github.com/minio/minio/pkg/hash" + "github.com/minio/minio/pkg/ioutil" ) // GetObjectHandler - GET Object @@ -118,32 +119,19 @@ func (api gatewayAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Re startOffset = hrange.offsetBegin length = hrange.getLength() } - // Indicates if any data was written to the http.ResponseWriter - dataWritten := false - // io.Writer type which keeps track if any data was written. - writer := funcToWriter(func(p []byte) (int, error) { - if !dataWritten { - // Set headers on the first write. - // Set standard object headers. - setObjectHeaders(w, objInfo, hrange) - - // Set any additional requested response headers. - setHeadGetRespHeaders(w, r.URL.Query()) - - dataWritten = true - } - return w.Write(p) - }) getObject := objectAPI.GetObject if reqAuthType == authTypeAnonymous { getObject = objectAPI.AnonGetObject } + setObjectHeaders(w, objInfo, hrange) + setHeadGetRespHeaders(w, r.URL.Query()) + httpWriter := ioutil.WriteOnClose(w) // Reads the object at startOffset and writes to mw. - if err = getObject(bucket, object, startOffset, length, writer); err != nil { + if err = getObject(bucket, object, startOffset, length, httpWriter); err != nil { errorIf(err, "Unable to write to client.") - if !dataWritten { + if !httpWriter.HasWritten() { // Error response only if no data has been written to client yet. i.e if // partial data has already been written before an error // occurred then no point in setting StatusCode and @@ -152,11 +140,11 @@ func (api gatewayAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Re } return } - if !dataWritten { - // If ObjectAPI.GetObject did not return error and no data has - // been written it would mean that it is a 0-byte object. - // call wrter.Write(nil) to set appropriate headers. - writer.Write(nil) + if err = httpWriter.Close(); err != nil { + if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet + writeErrorResponse(w, toAPIErrorCode(err), r.URL) + return + } } // Get host and port from Request.RemoteAddr. @@ -472,7 +460,7 @@ func (api gatewayAPIHandlers) PutBucketPolicyHandler(w http.ResponseWriter, r *h // Read access policy up to maxAccessPolicySize. // http://docs.aws.amazon.com/AmazonS3/latest/dev/access-policy-language-overview.html // bucket policies are limited to 20KB in size, using a limit reader. - policyBytes, err := ioutil.ReadAll(io.LimitReader(r.Body, maxAccessPolicySize)) + policyBytes, err := goioutil.ReadAll(io.LimitReader(r.Body, maxAccessPolicySize)) if err != nil { errorIf(err, "Unable to read from client.") writeErrorResponse(w, toAPIErrorCode(err), r.URL) diff --git a/cmd/generic-handlers.go b/cmd/generic-handlers.go index ac1a6b06c..97f19ad4a 100644 --- a/cmd/generic-handlers.go +++ b/cmd/generic-handlers.go @@ -108,6 +108,40 @@ func isHTTPHeaderSizeTooLarge(header http.Header) bool { return false } +// ReservedMetadataPrefix is the prefix of a metadata key which +// is reserved and for internal use only. +const ReservedMetadataPrefix = "X-Minio-Internal-" + +type reservedMetadataHandler struct { + http.Handler +} + +func filterReservedMetadata(h http.Handler) http.Handler { + return reservedMetadataHandler{h} +} + +// ServeHTTP fails if the request contains at least one reserved header which +// would be treated as metadata. +func (h reservedMetadataHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + if containsReservedMetadata(r.Header) { + writeErrorResponse(w, ErrUnsupportedMetadata, r.URL) + return + } + h.Handler.ServeHTTP(w, r) +} + +// containsReservedMetadata returns true if the http.Header contains +// keys which are treated as metadata but are reserved for internal use +// and must not set by clients +func containsReservedMetadata(header http.Header) bool { + for key := range header { + if strings.HasPrefix(key, ReservedMetadataPrefix) { + return true + } + } + return false +} + // Reserved bucket. const ( minioReservedBucket = "minio" diff --git a/cmd/generic-handlers_test.go b/cmd/generic-handlers_test.go index bda87767f..a93dfee16 100644 --- a/cmd/generic-handlers_test.go +++ b/cmd/generic-handlers_test.go @@ -144,3 +144,38 @@ func TestIsHTTPHeaderSizeTooLarge(t *testing.T) { } } } + +var containsReservedMetadataTests = []struct { + header http.Header + shouldFail bool +}{ + { + header: http.Header{"X-Minio-Key": []string{"value"}}, + }, + { + header: http.Header{ServerSideEncryptionIV: []string{"iv"}}, + shouldFail: true, + }, + { + header: http.Header{ServerSideEncryptionKDF: []string{SSEKeyDerivationHmacSha256}}, + shouldFail: true, + }, + { + header: http.Header{ServerSideEncryptionKeyMAC: []string{"mac"}}, + shouldFail: true, + }, + { + header: http.Header{ReservedMetadataPrefix + "Key": []string{"value"}}, + shouldFail: true, + }, +} + +func TestContainsReservedMetadata(t *testing.T) { + for i, test := range containsReservedMetadataTests { + if contains := containsReservedMetadata(test.header); contains && !test.shouldFail { + t.Errorf("Test %d: contains reserved header but should not fail", i) + } else if !contains && test.shouldFail { + t.Errorf("Test %d: does not contain reserved header but failed", i) + } + } +} diff --git a/cmd/object-handlers.go b/cmd/object-handlers.go index d94cf199a..186ec56d6 100644 --- a/cmd/object-handlers.go +++ b/cmd/object-handlers.go @@ -19,7 +19,8 @@ package cmd import ( "encoding/hex" "encoding/xml" - "io/ioutil" + "io" + goioutil "io/ioutil" "net" "net/http" "net/url" @@ -28,6 +29,7 @@ import ( mux "github.com/gorilla/mux" "github.com/minio/minio/pkg/hash" + "github.com/minio/minio/pkg/ioutil" ) // supportedHeadGetReqParams - supported request parameters for GET and HEAD presigned request. @@ -82,13 +84,6 @@ func errAllowableObjectNotFound(bucket string, r *http.Request) APIErrorCode { return ErrNoSuchKey } -// Simple way to convert a func to io.Writer type. -type funcToWriter func([]byte) (int, error) - -func (f funcToWriter) Write(p []byte) (int, error) { - return f(p) -} - // GetObjectHandler - GET Object // ---------- // This implementation of the GET operation retrieves object. To use GET, @@ -129,6 +124,10 @@ func (api objectAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Req writeErrorResponse(w, apiErr, r.URL) return } + if apiErr, _ := DecryptObjectInfo(&objInfo, r.Header); apiErr != ErrNone { + writeErrorResponse(w, apiErr, r.URL) + return + } // Get request range. var hrange *httpRange @@ -160,40 +159,41 @@ func (api objectAPIHandlers) GetObjectHandler(w http.ResponseWriter, r *http.Req length = hrange.getLength() } - // Indicates if any data was written to the http.ResponseWriter - dataWritten := false - // io.Writer type which keeps track if any data was written. - writer := funcToWriter(func(p []byte) (int, error) { - if !dataWritten { - // Set headers on the first write. - // Set standard object headers. - setObjectHeaders(w, objInfo, hrange) - - // Set any additional requested response headers. - setHeadGetRespHeaders(w, r.URL.Query()) - - dataWritten = true + var writer io.Writer + writer = w + if IsSSECustomerRequest(r.Header) { + writer, err = DecryptRequest(writer, r, objInfo.UserDefined) + if err != nil { + writeErrorResponse(w, toAPIErrorCode(err), r.URL) + return } - return w.Write(p) - }) + w.Header().Set(SSECustomerAlgorithm, r.Header.Get(SSECustomerAlgorithm)) + w.Header().Set(SSECustomerKeyMD5, r.Header.Get(SSECustomerKeyMD5)) + if startOffset != 0 || length < objInfo.Size { + writeErrorResponse(w, ErrNotImplemented, r.URL) // SSE-C requests with HTTP range are not supported yet + return + } + length = objInfo.EncryptedSize() + } + + setObjectHeaders(w, objInfo, hrange) + setHeadGetRespHeaders(w, r.URL.Query()) + + httpWriter := ioutil.WriteOnClose(writer) // Reads the object at startOffset and writes to mw. - if err = objectAPI.GetObject(bucket, object, startOffset, length, writer); err != nil { + if err = objectAPI.GetObject(bucket, object, startOffset, length, httpWriter); err != nil { errorIf(err, "Unable to write to client.") - if !dataWritten { - // Error response only if no data has been written to client yet. i.e if - // partial data has already been written before an error - // occurred then no point in setting StatusCode and - // sending error XML. + if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet writeErrorResponse(w, toAPIErrorCode(err), r.URL) } return } - if !dataWritten { - // If ObjectAPI.GetObject did not return error and no data has - // been written it would mean that it is a 0-byte object. - // call wrter.Write(nil) to set appropriate headers. - writer.Write(nil) + if err = httpWriter.Close(); err != nil { + if !httpWriter.HasWritten() { // write error response only if no data has been written to client yet + writeErrorResponse(w, toAPIErrorCode(err), r.URL) + return + } } // Get host and port from Request.RemoteAddr. @@ -252,6 +252,15 @@ func (api objectAPIHandlers) HeadObjectHandler(w http.ResponseWriter, r *http.Re writeErrorResponseHeadersOnly(w, apiErr) return } + if apiErr, encrypted := DecryptObjectInfo(&objInfo, r.Header); apiErr != ErrNone { + writeErrorResponse(w, apiErr, r.URL) + return + } else if encrypted { + if _, err = DecryptRequest(w, r, objInfo.UserDefined); err != nil { + writeErrorResponse(w, ErrSSEEncryptedObject, r.URL) + return + } + } // Validate pre-conditions if any. if checkPreconditions(w, r, objInfo) { @@ -530,9 +539,10 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req var ( md5hex = hex.EncodeToString(md5Bytes) sha256hex = "" - reader = r.Body + reader io.Reader + s3Err APIErrorCode ) - + reader = r.Body switch rAuthType { default: // For all unknown auth types return error. @@ -541,31 +551,29 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req case authTypeAnonymous: // http://docs.aws.amazon.com/AmazonS3/latest/dev/using-with-s3-actions.html sourceIP := getSourceIPAddress(r) - if s3Error := enforceBucketPolicy(bucket, "s3:PutObject", r.URL.Path, - r.Referer(), sourceIP, r.URL.Query()); s3Error != ErrNone { - writeErrorResponse(w, s3Error, r.URL) + if s3Err = enforceBucketPolicy(bucket, "s3:PutObject", r.URL.Path, r.Referer(), sourceIP, r.URL.Query()); s3Err != ErrNone { + writeErrorResponse(w, s3Err, r.URL) return } case authTypeStreamingSigned: // Initialize stream signature verifier. - var s3Error APIErrorCode - reader, s3Error = newSignV4ChunkedReader(r) - if s3Error != ErrNone { + reader, s3Err = newSignV4ChunkedReader(r) + if s3Err != ErrNone { errorIf(errSignatureMismatch, "%s", dumpRequest(r)) - writeErrorResponse(w, s3Error, r.URL) + writeErrorResponse(w, s3Err, r.URL) return } case authTypeSignedV2, authTypePresignedV2: - s3Error := isReqAuthenticatedV2(r) - if s3Error != ErrNone { + s3Err = isReqAuthenticatedV2(r) + if s3Err != ErrNone { errorIf(errSignatureMismatch, "%s", dumpRequest(r)) - writeErrorResponse(w, s3Error, r.URL) + writeErrorResponse(w, s3Err, r.URL) return } case authTypePresigned, authTypeSigned: - if s3Error := reqSignatureV4Verify(r, serverConfig.GetRegion()); s3Error != ErrNone { + if s3Err = reqSignatureV4Verify(r, serverConfig.GetRegion()); s3Err != ErrNone { errorIf(errSignatureMismatch, "%s", dumpRequest(r)) - writeErrorResponse(w, s3Error, r.URL) + writeErrorResponse(w, s3Err, r.URL) return } if !skipContentSha256Cksum(r) { @@ -579,7 +587,20 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req return } - // Create the object.. + if IsSSECustomerRequest(r.Header) { // handle SSE-C requests + reader, err = EncryptRequest(hashReader, r, metadata) + if err != nil { + writeErrorResponse(w, toAPIErrorCode(err), r.URL) + return + } + info := ObjectInfo{Size: size} + hashReader, err = hash.NewReader(reader, info.EncryptedSize(), "", "") // do not try to verify encrypted content + if err != nil { + writeErrorResponse(w, toAPIErrorCode(err), r.URL) + return + } + } + objInfo, err := objectAPI.PutObject(bucket, object, hashReader, metadata) if err != nil { errorIf(err, "Unable to create an object. %s", r.URL.Path) @@ -587,6 +608,10 @@ func (api objectAPIHandlers) PutObjectHandler(w http.ResponseWriter, r *http.Req return } w.Header().Set("ETag", "\""+objInfo.ETag+"\"") + if IsSSECustomerRequest(r.Header) { + w.Header().Set(SSECustomerAlgorithm, r.Header.Get(SSECustomerAlgorithm)) + w.Header().Set(SSECustomerKeyMD5, r.Header.Get(SSECustomerKeyMD5)) + } writeSuccessResponseHeadersOnly(w) // Get host and port from Request.RemoteAddr. @@ -627,6 +652,12 @@ func (api objectAPIHandlers) NewMultipartUploadHandler(w http.ResponseWriter, r return } + if IsSSECustomerRequest(r.Header) { // handle SSE-C requests + // SSE-C is not implemented for multipart operations yet + writeErrorResponse(w, ErrNotImplemented, r.URL) + return + } + // Extract metadata that needs to be saved. metadata, err := extractMetadataFromHeader(r.Header) if err != nil { @@ -666,6 +697,12 @@ func (api objectAPIHandlers) CopyObjectPartHandler(w http.ResponseWriter, r *htt return } + if IsSSECustomerRequest(r.Header) { // handle SSE-C requests + // SSE-C is not implemented for multipart operations yet + writeErrorResponse(w, ErrNotImplemented, r.URL) + return + } + // Copy source path. cpSrcPath, err := url.QueryUnescape(r.Header.Get("X-Amz-Copy-Source")) if err != nil { @@ -784,6 +821,12 @@ func (api objectAPIHandlers) PutObjectPartHandler(w http.ResponseWriter, r *http return } + if IsSSECustomerRequest(r.Header) { // handle SSE-C requests + // SSE-C is not implemented for multipart operations yet + writeErrorResponse(w, ErrNotImplemented, r.URL) + return + } + /// if Content-Length is unknown/missing, throw away size := r.ContentLength @@ -977,7 +1020,7 @@ func (api objectAPIHandlers) CompleteMultipartUploadHandler(w http.ResponseWrite // Get upload id. uploadID, _, _, _ := getObjectResources(r.URL.Query()) - completeMultipartBytes, err := ioutil.ReadAll(r.Body) + completeMultipartBytes, err := goioutil.ReadAll(r.Body) if err != nil { errorIf(err, "Unable to complete multipart upload.") writeErrorResponse(w, ErrInternalError, r.URL) diff --git a/cmd/routers.go b/cmd/routers.go index 3326d8591..6f2a55cb1 100644 --- a/cmd/routers.go +++ b/cmd/routers.go @@ -112,6 +112,9 @@ func configureServerHandler(endpoints EndpointList) (http.Handler, error) { // routes them accordingly. Client receives a HTTP error for // invalid/unsupported signatures. setAuthHandler, + // filters HTTP headers which are treated as metadata and are reserved + // for internal use only. + filterReservedMetadata, // Add new handlers here. } diff --git a/pkg/ioutil/ioutil.go b/pkg/ioutil/ioutil.go new file mode 100644 index 000000000..27605e6bb --- /dev/null +++ b/pkg/ioutil/ioutil.go @@ -0,0 +1,61 @@ +/* + * Minio Cloud Storage, (C) 2017 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Package ioutil implements some I/O utility functions which are not covered +// by the standard library. +package ioutil + +import ( + "io" +) + +// WriteOnCloser implements io.WriteCloser and always +// exectues at least one write operation if it is closed. +// +// This can be useful within the context of HTTP. At least +// one write operation must happen to send the HTTP headers +// to the peer. +type WriteOnCloser struct { + io.Writer + hasWritten bool +} + +func (w *WriteOnCloser) Write(p []byte) (int, error) { + w.hasWritten = true + return w.Writer.Write(p) +} + +// Close closes the WriteOnCloser. It behaves like io.Closer. +func (w *WriteOnCloser) Close() error { + if !w.hasWritten { + _, err := w.Write(nil) + if err != nil { + return err + } + } + if closer, ok := w.Writer.(io.Closer); ok { + return closer.Close() + } + return nil +} + +// HasWritten returns true if at least one write operation was performed. +func (w *WriteOnCloser) HasWritten() bool { return w.hasWritten } + +// WriteOnClose takes an io.Writer and returns an ioutil.WriteOnCloser. +func WriteOnClose(w io.Writer) *WriteOnCloser { + return &WriteOnCloser{w, false} +} diff --git a/pkg/ioutil/ioutil_test.go b/pkg/ioutil/ioutil_test.go new file mode 100644 index 000000000..3b6d6dc1e --- /dev/null +++ b/pkg/ioutil/ioutil_test.go @@ -0,0 +1,39 @@ +/* + * Minio Cloud Storage, (C) 2017 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ioutil + +import ( + goioutil "io/ioutil" + "testing" +) + +func TestCloseOnWriter(t *testing.T) { + writer := WriteOnClose(goioutil.Discard) + if writer.HasWritten() { + t.Error("WriteOnCloser must not be marked as HasWritten") + } + writer.Write(nil) + if !writer.HasWritten() { + t.Error("WriteOnCloser must be marked as HasWritten") + } + + writer = WriteOnClose(goioutil.Discard) + writer.Close() + if !writer.HasWritten() { + t.Error("WriteOnCloser must be marked as HasWritten") + } +} diff --git a/vendor/github.com/minio/sio/DARE.md b/vendor/github.com/minio/sio/DARE.md new file mode 100644 index 000000000..67c2ab2f8 --- /dev/null +++ b/vendor/github.com/minio/sio/DARE.md @@ -0,0 +1,223 @@ +# Data At Rest Encryption (DARE) - Version 1.0 + +**This is a draft** + +## 1. Introduction + +This document describes the Data At Rest Encryption (DARE) format for encrypting +data in a tamper-resistant way. DARE is designed to securely encrypt data stored +on (untrusted) storage providers. + +## 2. Overview + +DARE specifies how to split an arbitrary data stream into small chunks (packages) +and concatenate them into a tamper-proof chain. Tamper-proof means that an attacker +is not able to: + - decrypt one or more packages. + - modify the content of one or more packages. + - reorder/rearrange one or more packages. + +An attacker is defined as somebody who has full access to the encrypted data +but not to the encryption key. An attacker can also act as storage provider. + +### 2.1 Cryptographic Notation + +DARE will use the following notations: + - The set **{a,b}** means select **one** of the provided values **a**, **b**. + - The concatenation of the byte sequences **a** and **b** is **a || b**. + - The function **len(seq)** returns the length of a byte sequence **seq** in bytes. + - The index access **seq[i]** accesses one byte at index **i** of the sequence **seq**. + - The range access **seq[i : j]** accesses a range of bytes starting at **i** (inclusive) + and ending at **j** (exclusive). + - The compare functions **a == b => f** and **a != b => f** succeed when **a** + is equal to **b** and **a** is not equal to **b** respectively and execute the command **f**. + - The function **CTC(a, b)** returns **1** only if **a** and **b** are equal, 0 otherwise. + CTC compares both values in **constant time**. + - **ENC(key, nonce, plaintext, addData)** represents the byte sequence which is + the output from an AEAD cipher authenticating the *addData*, encrypting and + authenticating the *plaintext* with the secret encryption *key* and the *nonce*. + - **DEC(key, nonce, ciphertext, addData)** represents the byte sequence which is + the output from an AEAD cipher verifying the integrity of the *ciphertext* & + *addData* and decrypting the *ciphertext* with the secret encryption *key* and + the *nonce*. The decryption **always** fails if the integrity check fails. + +All numbers must be converted into byte sequences by using the little endian byte +order. An AEAD cipher will be either AES-256_GCM or CHACHA20_POLY1305. + +## 2.2 Keys + +Both ciphers - AES-256_GCM and CHACHA20_POLY1305 - require a 32 byte key. The key +**must** be unique for one encrypted data stream. Reusing a key **compromises** +some security properties provided by DARE. See Appendix A for recommendations +about generating keys and preventing key reuse. + +## 2.3 Errors + +DARE defines the following errors: + - **err_unsupported_version**: Indicates that the header version is not supported. + - **err_unsupported_cipher**: Indicates that the cipher suite is not supported. + - **err_missing_header**: Indicates that the payload header is missing or incomplete. + - **err_payload_too_short**: Indicates that the actual payload size is smaller than the + payload size field of the header. + - **err_package_out_of_order**: Indicates that the sequence number of the package does + not match the expected sequence number. + - **err_tag_mismatch**: Indicates that the tag of the package does not match the tag + computed while decrypting the package. + +## 3. Package Format + +DARE splits an arbitrary data stream into a sequence of packages. Each package is +encrypted separately. A package consists of a header, a payload and an authentication +tag. + +Header | Payload | Tag +---------|----------------|--------- +16 bytes | 1 byte - 64 KB | 16 bytes + +The header contains information about the package. It consists of: + +Version | Cipher suite | Payload size | Sequence number | nonce +--------|--------------|------------------|------------------|--------- +1 byte | 1 byte | 2 bytes / uint16 | 4 bytes / uint32 | 8 bytes + +The first byte specifies the version of the format and is equal to 0x10 for DARE +version 1.0. The second byte specifies the cipher used to encrypt the package. + +Cipher | Value +------------------|------- +AES-256_GCM | 0x00 +CHACHA20_POLY1305 | 0x01 + +The payload size is an uint16 number. The real payload size is defined as the payload +size field as uint32 + 1. This ensures that the payload can be exactly 64 KB long and +prevents empty packages without a payload. + +The sequence number is an uint32 number identifying the package within a sequence of +packages. It is a monotonically increasing number. The sequence number **must** be 0 for +the first package and **must** be incremented for every subsequent package. The +sequence number of the n-th package is n-1. This means a sequence of packages can consist +of 2 ^ 32 packages and each package can hold up to 64 KB data. The maximum size +of a data stream is limited by `64 KB * 2^32 = 256 TB`. This should be sufficient +for current use cases. However, if necessary, the maximum size of a data stream can increased +in the future by slightly tweaking the header (with a new version). + +The nonce **should** be a random value for each data stream and **should** be kept constant +for all its packages. Even if a key is accidentally used +twice to encrypt two different data streams an attacker should not be able to decrypt one +of those data streams. However, an attacker is always able to exchange corresponding packages +between the streams whenever a key is reused. DARE is only tamper-proof when the encryption +key is unique. See Appendix A. + +The payload contains the encrypted data. It must be at least 1 byte long and can contain a maximum of 64 KB. + +The authentication tag is generated by the AEAD cipher while encrypting and authenticating the +package. The authentication tag **must** always be verified while decrypting the package. +Decrypted content **must never** be returned before the authentication tag is successfully +verified. + +## 4. Encryption + +DARE encrypts every package separately. The header version, cipher suite and nonce **should** +be the same for all encrypted packages of one data stream. It is **recommended** to not change +this values within one sequence of packages. The nonce **should** be generated randomly once +at the beginning of the encryption process and repeated in every header. See Appendix B for +recommendations about generating random numbers. + +The sequence number is the sequence number of the previous package plus 1. The sequence number +**must** be a monotonically increasing number within one sequence of packages. The sequence number +of the first package is **always** 0. + +The payload field is the length of the plaintext in bytes minus 1. The encryption process is +defined as following: + +``` +header[0] = 0x10 +header[1] = {AES-256_GCM, CHACHA20_POLY1305} +header[2:4] = little_endian( len(plaintext) - 1 ) +header[4:8] = little_endian( sequence_number ) +header[8:16] = nonce + +payload || tag = ENC(key, header[4:16], plaintext, header[0:4]) + +sequence_number = sequence_number + 1 +``` + +## 5. Decryption + +DARE decrypts every package separately. Every package **must** be successfully decrypted **before** +plaintext is returned. The decryption happens in three steps: + +1. Verify that the header version is correct (`header[0] == 0x10`) and the cipher suite is supported. +2. Verify that the sequence number of the packages matches the expected sequence number. It is required + to save the first expected sequence number at the beginning of the decryption process. After every + successfully decrypted package this sequence number is incremented by 1. The sequence number of + all packages **must** match the saved / expected number. +3. Verify that the authentication tag at the end of the package is equal to the authentication tag + computed while decrypting the package. This **must** happen in constant time. + +The decryption is defined as following: + +``` +header[0] != 0x10 => err_unsupported_version +header[1] != {AES-256_GCM,CHACHA20_POLY1305} => err_unsupported_cipher +little_endian_uint32(header[4:8]) != expected_sequence_number => err_package_out_of_order + +payload_size := little_endian_uint32(header[2:4]) + 1 +plaintext || tag := DEC(key, header[4:16], ciphertext, header[0:4]) + +CTC(ciphertext[len(plaintext) : len(plaintext) + 16], tag) != 1 => err_tag_mismatch + +expected_sequence_number = expected_sequence_number + 1 +``` + +## Security + +DARE provides confidentiality and integrity of the encrypted data as long as the encryption key +is never reused. This means that a **different** encryption key **must** be used for every data +stream. See Appendix A for recommendations. + +If the same encryption key is used to encrypt two different data streams, an attacker is able to +exchange packages with the same sequence number. This means that the attacker is able to replace +any package of a sequence with another package as long as: + - Both packages are encrypted with the same key. + - The sequence numbers of both packages are equal. + +If two data streams are encrypted with the same key the attacker will not be able to decrypt any +package of those streams without breaking the cipher as long as the nonces are different. To be +more precise the attacker may only be able to decrypt a package if: + - There is another package encrypted with the same key. + - The sequence number and nonce of those two packages (encrypted with the same key) are equal. + +As long as the nonce of a sequence of packages differs from every other nonce (and the nonce is +repeated within one sequence - which is **recommended**) the attacker will not be able to decrypt +any package. It is not required that the nonce is indistinguishable from a truly random bit sequence. +It is sufficient when the nonces differ from each other in at least one bit. + +## Appendices + +### Appendix A - Key Derivation from a Master Key + +DARE needs a unique encryption key per data stream. The best approach to ensure that the keys +are unique is to derive every encryption key from a master key. Therefore a key derivation function +(KDF) - e.g. HKDF, BLAKE2X or HChaCha20 - can be used. The master key itself may be derived from +a password using functions like Argon2 or scrypt. Deriving those keys is the responsibility of the +users of DARE. + +It is **not recommended** to derive encryption keys from a master key and an identifier (like the +file path). If a different data stream is stored under the same identifier - e.g. overwriting the +data - the derived key would be the same for both streams. + +Instead encryption keys should be derived from a master key and a random value. It is not required +that the random value is indistinguishable from a truly random bit sequence. The random value **must** +be unique but need not be secret - depending on the security properties of the KDF. + +To keep this simple: The combination of master key and random value used to derive the encryption key +must be unique all the time. + +### Appendix B - Generating random values + +DARE does not require random values which are indistinguishable from a truly random bit sequence. +However, a random value **must** never be repeated. Therefore it is **recommended** to use a +cryptographically secure pseudorandom number generator (CSPRNG) to generate random values. Many +operating systems and cryptographic libraries already provide appropriate PRNG implementations. +These implementations should always be preferred over crafting a new one. diff --git a/vendor/github.com/minio/sio/LICENSE b/vendor/github.com/minio/sio/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/vendor/github.com/minio/sio/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/minio/sio/README.md b/vendor/github.com/minio/sio/README.md new file mode 100644 index 000000000..1ad7a6ff4 --- /dev/null +++ b/vendor/github.com/minio/sio/README.md @@ -0,0 +1,66 @@ +[![Godoc Reference](https://godoc.org/github.com/minio/sio?status.svg)](https://godoc.org/github.com/minio/sio) +[![Travis CI](https://travis-ci.org/minio/sio.svg?branch=master)](https://travis-ci.org/minio/sio) +[![Go Report Card](https://goreportcard.com/badge/minio/sio)](https://goreportcard.com/report/minio/sio) + +# Secure IO +## Go implementation of the Data At Rest Encryption (DARE) format. + +## Introduction + +It is a common problem to store data securely - especially on untrusted remote storage. +One solution to this problem is cryptography. Before data is stored it is encrypted +to ensure that the data is confidential. Unfortunately encrypting data is not enough to +prevent more sophisticated attacks. Anyone who has access to the stored data can try to +manipulate the data - even if the data is encrypted. + +To prevent these kinds of attacks the data must be encrypted in a tamper-resistant way. +This means an attacker should not be able to: + - Read the stored data - this is achieved by modern encryption algorithms. + - Modify the data by changing parts of the encrypted data. + - Rearrange or reorder parts of the encrypted data. + +Authenticated encryption schemes (AE) - like AES-GCM or ChaCha20-Poly1305 - encrypt and +authenticate data. Any modification to the encrypted data (ciphertext) is detected while +decrypting the data. But even an AE scheme alone is not sufficiently enough to prevent all +kinds of data manipulation. + +All modern AE schemes produce an authentication tag which is verified after the ciphertext +is decrypted. If a large amount of data is decrypted it is not always possible to buffer +all decrypted data until the authentication tag is verified. Returning unauthenticated +data has the same issues like encrypting data without authentication. + +Splitting the data into small chunks fixes the problem of deferred authentication checks +but introduces a new one. The chunks can be reordered - e.g. exchanging chunk 1 and 2 - +because every chunk is encrypted separately. Therefore the order of the chunks must be +encoded somehow into the chunks itself to be able to detect rearranging any number of +chunks. + +This project specifies a [format](https://github.com/minio/sio/blob/master/DARE.md) for +en/decrypting an arbitrary data stream and gives some [recommendations](https://github.com/minio/sio/blob/master/DARE.md#appendices) +about how to use and implement data at rest encryption (DARE). Additionally this project +provides a reference implementation in Go. + +## Applications + +DARE is designed with simplicity and efficiency in mind. It combines modern AE schemes +with a very simple reorder protection mechanism to build a tamper-resistant encryption +scheme. DARE can be used to encrypt files, backups and even large object storage systems. + +Its main properties are: + - Security and high performance by relying on modern AEAD ciphers + - Small overhead - encryption increases the amount of data by ~0.05% + - Support for long data streams - up to 256 TB under the same key + - Random access - arbitrary sequences / ranges can be decrypted independently + +**Install:** `go get -u github.com/minio/sio` + +DARE and `github.com/minio/sio` are finalized and can be used in production. + +## Performance + +Cipher | 8 KB | 64 KB | 512 KB | 1 MB +----------------- | -------- | --------- | --------- | -------- +AES_256_GCM | 90 MB/s | 1.96 GB/s | 2.64 GB/s | 2.83 GB/s +CHACHA20_POLY1305 | 97 MB/s | 1.23 GB/s | 1.54 GB/s | 1.57 GB/s + +*On i7-6500U 2 x 2.5 GHz | Linux 4.10.0-32-generic | Go 1.8.3 | AES-NI & AVX2* diff --git a/vendor/github.com/minio/sio/dare.go b/vendor/github.com/minio/sio/dare.go new file mode 100644 index 000000000..1de0677b8 --- /dev/null +++ b/vendor/github.com/minio/sio/dare.go @@ -0,0 +1,277 @@ +// Copyright (C) 2017 Minio Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sio implements the DARE format. It provides an API +// for secure en/decrypting IO operations. +package sio // import "github.com/minio/sio" + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/rand" + "errors" + "io" + + "golang.org/x/crypto/chacha20poly1305" +) + +// Version10 specifies version 1.0 +const Version10 byte = 0x10 + +const ( + // AES_256_GCM specifies the cipher suite AES-GCM with 256 bit keys. + AES_256_GCM byte = iota + // CHACHA20_POLY1305 specifies the cipher suite ChaCha20Poly1305 with 256 bit keys. + CHACHA20_POLY1305 +) + +const ( + headerSize = 16 + payloadSize = 64 * 1024 + tagSize = 16 +) + +var ( + errMissingHeader = errors.New("sio: incomplete header") + errPayloadTooShort = errors.New("sio: payload too short") + errPackageOutOfOrder = errors.New("sio: sequence number mismatch") + errTagMismatch = errors.New("sio: authentication failed") + errUnsupportedVersion = errors.New("sio: unsupported version") + errUnsupportedCipher = errors.New("sio: unsupported cipher suite") +) + +var newAesGcm = func(key []byte) (cipher.AEAD, error) { + aes256, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + return cipher.NewGCM(aes256) +} + +var supportedCiphers = [...]func([]byte) (cipher.AEAD, error){ + AES_256_GCM: newAesGcm, + CHACHA20_POLY1305: chacha20poly1305.New, +} + +// Encrypt reads from src until it encounters an io.EOF and encrypts all received +// data. The encrypted data is written to dst. It returns the number of bytes +// encrypted and the first error encountered while encrypting, if any. +// +// Encrypt returns the number of bytes written to dst. +func Encrypt(dst io.Writer, src io.Reader, config Config) (n int64, err error) { + encReader, err := EncryptReader(src, config) + if err != nil { + return + } + return io.CopyBuffer(dst, encReader, make([]byte, payloadSize)) +} + +// Decrypt reads from src until it encounters an io.EOF and decrypts all received +// data. The decrypted data is written to dst. It returns the number of bytes +// decrypted and the first error encountered while decrypting, if any. +// +// Decrypt returns the number of bytes written to dst. Decrypt only writes data to +// dst if the data was decrypted successfully. +func Decrypt(dst io.Writer, src io.Reader, config Config) (n int64, err error) { + decReader, err := DecryptReader(src, config) + if err != nil { + return + } + return io.CopyBuffer(dst, decReader, make([]byte, headerSize+payloadSize+tagSize)) +} + +// EncryptReader wraps the given src and returns an io.Reader which encrypts +// all received data. EncryptReader returns an error if the provided encryption +// configuration is invalid. +func EncryptReader(src io.Reader, config Config) (io.Reader, error) { + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + cipher, err := supportedCiphers[config.CipherSuites[0]](config.Key) + if err != nil { + return nil, err + } + nonce, err := config.generateNonce() + if err != nil { + return nil, err + } + return &encryptedReader{ + src: src, + config: config, + nonce: nonce, + cipher: cipher, + sequenceNumber: config.SequenceNumber, + }, nil +} + +// DecryptReader wraps the given src and returns an io.Reader which decrypts +// all received data. DecryptReader returns an error if the provided decryption +// configuration is invalid. +func DecryptReader(src io.Reader, config Config) (io.Reader, error) { + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + ciphers, err := config.createCiphers() + if err != nil { + return nil, err + } + return &decryptedReader{ + src: src, + config: config, + ciphers: ciphers, + sequenceNumber: config.SequenceNumber, + }, nil +} + +// EncryptWriter wraps the given dst and returns an io.WriteCloser which +// encrypts all data written to it. EncryptWriter returns an error if the +// provided decryption configuration is invalid. +// +// The returned io.WriteCloser must be closed successfully to finalize the +// encryption process. +func EncryptWriter(dst io.Writer, config Config) (io.WriteCloser, error) { + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + cipher, err := supportedCiphers[config.CipherSuites[0]](config.Key) + if err != nil { + return nil, err + } + nonce, err := config.generateNonce() + if err != nil { + return nil, err + } + return &encryptedWriter{ + dst: dst, + config: config, + nonce: nonce, + cipher: cipher, + sequenceNumber: config.SequenceNumber, + }, nil +} + +// DecryptWriter wraps the given dst and returns an io.WriteCloser which +// decrypts all data written to it. DecryptWriter returns an error if the +// provided decryption configuration is invalid. +// +// The returned io.WriteCloser must be closed successfully to finalize the +// decryption process. +func DecryptWriter(dst io.Writer, config Config) (io.WriteCloser, error) { + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + ciphers, err := config.createCiphers() + if err != nil { + return nil, err + } + return &decryptedWriter{ + dst: dst, + config: config, + ciphers: ciphers, + sequenceNumber: config.SequenceNumber, + }, nil +} + +func setConfigDefaults(config *Config) error { + if config.MinVersion > Version10 { + return errors.New("sio: unknown minimum version") + } + if config.MaxVersion > Version10 { + return errors.New("dare: unknown maximum version") + } + if len(config.Key) != 32 { + return errors.New("sio: invalid key size") + } + if len(config.CipherSuites) > 2 { + return errors.New("sio: too many cipher suites") + } + for _, c := range config.CipherSuites { + if int(c) >= len(supportedCiphers) { + return errors.New("sio: unknown cipher suite") + } + } + if config.MinVersion < Version10 { + config.MinVersion = Version10 + } + if config.MaxVersion < Version10 { + config.MaxVersion = config.MinVersion + } + if len(config.CipherSuites) == 0 { + config.CipherSuites = []byte{AES_256_GCM, CHACHA20_POLY1305} + } + if config.Rand == nil { + config.Rand = rand.Reader + } + return nil +} + +// Config contains the format configuration. The only field +// which must always be set manually is the secret key. +type Config struct { + // The minimal supported version of the format. If + // not set the default value is used. + MinVersion byte + // The highest supported version of the format. If + // not set the default value is used. + MaxVersion byte + // A list of supported cipher suites. If not set the + // default value is used. + CipherSuites []byte + // The secret encryption key. It must be 32 bytes long. + Key []byte + // The first expected sequence number. It should only + // be set manually when appending data to an existing + // sequence of packages or decrypting a range within + // a sequence of packages. + SequenceNumber uint32 + // The RNG used to generate random values. If not set + // the default value (crypto/rand.Reader) is used. + Rand io.Reader +} + +func (c *Config) verifyHeader(header headerV10) error { + if version := header.Version(); version < c.MinVersion || version > c.MaxVersion { + return errUnsupportedVersion + } + if !c.isCipherSuiteSupported(header.Cipher()) { + return errUnsupportedCipher + } + return nil +} + +func (c *Config) createCiphers() ([]cipher.AEAD, error) { + ciphers := make([]cipher.AEAD, len(supportedCiphers)) + for _, v := range c.CipherSuites { + aeadCipher, err := supportedCiphers[v](c.Key) + if err != nil { + return nil, err + } + ciphers[v] = aeadCipher + } + return ciphers, nil +} + +func (c *Config) generateNonce() (nonce [8]byte, err error) { + _, err = io.ReadFull(c.Rand, nonce[:]) + return nonce, err +} + +func (c *Config) isCipherSuiteSupported(suite byte) bool { + for _, c := range c.CipherSuites { + if suite == c { + return true + } + } + return false +} diff --git a/vendor/github.com/minio/sio/header.go b/vendor/github.com/minio/sio/header.go new file mode 100644 index 000000000..ddaeeff1c --- /dev/null +++ b/vendor/github.com/minio/sio/header.go @@ -0,0 +1,39 @@ +// Copyright (C) 2017 Minio Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sio + +import "encoding/binary" + +type headerV10 []byte + +func header(b []byte) headerV10 { return headerV10(b[:headerSize]) } + +func (h headerV10) Version() byte { return h[0] } + +func (h headerV10) Cipher() byte { return h[1] } + +func (h headerV10) Len() int { return int(binary.LittleEndian.Uint16(h[2:])) + 1 } + +func (h headerV10) SequenceNumber() uint32 { return binary.LittleEndian.Uint32(h[4:]) } + +func (h headerV10) SetVersion(version byte) { h[0] = version } + +func (h headerV10) SetCipher(suite byte) { h[1] = suite } + +func (h headerV10) SetLen(length int) { binary.LittleEndian.PutUint16(h[2:], uint16(length-1)) } + +func (h headerV10) SetSequenceNumber(num uint32) { binary.LittleEndian.PutUint32(h[4:], num) } + +func (h headerV10) SetNonce(nonce [8]byte) { copy(h[8:], nonce[:]) } diff --git a/vendor/github.com/minio/sio/reader.go b/vendor/github.com/minio/sio/reader.go new file mode 100644 index 000000000..ef35a3774 --- /dev/null +++ b/vendor/github.com/minio/sio/reader.go @@ -0,0 +1,169 @@ +// Copyright (C) 2017 Minio Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sio + +import ( + "crypto/cipher" + "io" +) + +type encryptedReader struct { + src io.Reader + config Config + + sequenceNumber uint32 + nonce [8]byte + cipher cipher.AEAD + + pack [headerSize + payloadSize + tagSize]byte + offset int +} + +func (r *encryptedReader) Read(p []byte) (n int, err error) { + if r.offset > 0 { + remaining := headerSize + header(r.pack[:]).Len() + tagSize - r.offset + if len(p) < remaining { + n = copy(p, r.pack[r.offset:r.offset+len(p)]) + r.offset += n + return + } + n = copy(p, r.pack[r.offset:r.offset+remaining]) + p = p[remaining:] + r.offset = 0 + } + for len(p) >= headerSize+payloadSize+tagSize { + nn, err := io.ReadFull(r.src, r.pack[headerSize:headerSize+payloadSize]) + if err != nil && err != io.ErrUnexpectedEOF { + return n, err + } + r.encrypt(p, nn) + n += headerSize + nn + tagSize + p = p[headerSize+nn+tagSize:] + } + if len(p) > 0 { + nn, err := io.ReadFull(r.src, r.pack[headerSize:headerSize+payloadSize]) + if err != nil && err != io.ErrUnexpectedEOF { + return n, err + } + r.encrypt(r.pack[:], nn) + if headerSize+nn+tagSize < len(p) { + r.offset = copy(p, r.pack[:headerSize+nn+tagSize]) + } else { + r.offset = copy(p, r.pack[:len(p)]) + } + n += r.offset + } + return +} + +func (r *encryptedReader) encrypt(dst []byte, length int) { + header := header(dst) + header.SetVersion(r.config.MaxVersion) + header.SetCipher(r.config.CipherSuites[0]) + header.SetLen(length) + header.SetSequenceNumber(r.sequenceNumber) + header.SetNonce(r.nonce) + + copy(dst[:headerSize], header) + r.cipher.Seal(dst[headerSize:headerSize], header[4:headerSize], r.pack[headerSize:headerSize+length], header[:4]) + + r.sequenceNumber++ +} + +type decryptedReader struct { + src io.Reader + config Config + + sequenceNumber uint32 + ciphers []cipher.AEAD + + pack [headerSize + payloadSize + tagSize]byte + offset int +} + +func (r *decryptedReader) Read(p []byte) (n int, err error) { + if r.offset > 0 { + remaining := header(r.pack[:]).Len() - r.offset + if len(p) < remaining { + n = copy(p, r.pack[headerSize+r.offset:headerSize+r.offset+len(p)]) + r.offset += n + return + } + n = copy(p, r.pack[headerSize+r.offset:headerSize+r.offset+remaining]) + p = p[remaining:] + r.offset = 0 + } + for len(p) >= payloadSize { + if err = r.readHeader(); err != nil { + return n, err + } + nn, err := r.decrypt(p[:payloadSize]) + if err != nil { + return n, err + } + p = p[nn:] + n += nn + } + if len(p) > 0 { + if err = r.readHeader(); err != nil { + return n, err + } + nn, err := r.decrypt(r.pack[headerSize:]) + if err != nil { + return n, err + } + if nn < len(p) { + r.offset = copy(p, r.pack[headerSize:headerSize+nn]) + } else { + r.offset = copy(p, r.pack[headerSize:headerSize+len(p)]) + } + n += r.offset + } + return +} + +func (r *decryptedReader) readHeader() error { + n, err := io.ReadFull(r.src, header(r.pack[:])) + if n != headerSize && err == io.ErrUnexpectedEOF { + return errMissingHeader + } else if err != nil { + return err + } + return nil +} + +func (r *decryptedReader) decrypt(dst []byte) (n int, err error) { + header := header(r.pack[:]) + if err = r.config.verifyHeader(header); err != nil { + return 0, err + } + if header.SequenceNumber() != r.sequenceNumber { + return 0, errPackageOutOfOrder + } + ciphertext := r.pack[headerSize : headerSize+header.Len()+tagSize] + n, err = io.ReadFull(r.src, ciphertext) + if err == io.EOF || err == io.ErrUnexpectedEOF { + return 0, errPayloadTooShort + } else if err != nil { + return 0, err + } + aeadCipher := r.ciphers[header.Cipher()] + plaintext, err := aeadCipher.Open(dst[:0], header[4:], ciphertext, header[:4]) + if err != nil { + return 0, errTagMismatch + } + r.sequenceNumber++ + return len(plaintext), nil +} diff --git a/vendor/github.com/minio/sio/writer.go b/vendor/github.com/minio/sio/writer.go new file mode 100644 index 000000000..6292da6ce --- /dev/null +++ b/vendor/github.com/minio/sio/writer.go @@ -0,0 +1,193 @@ +// Copyright (C) 2017 Minio Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sio + +import ( + "crypto/cipher" + "io" +) + +type decryptedWriter struct { + dst io.Writer + config Config + + sequenceNumber uint32 + ciphers []cipher.AEAD + + pack [headerSize + payloadSize + tagSize]byte + offset int +} + +func (w *decryptedWriter) Write(p []byte) (n int, err error) { + if w.offset > 0 && w.offset < headerSize { + remaining := headerSize - w.offset + if len(p) < remaining { + n = copy(w.pack[w.offset:], p) + w.offset += n + return + } + n = copy(w.pack[w.offset:], p[:remaining]) + p = p[remaining:] + w.offset += n + } + if w.offset >= headerSize { + remaining := headerSize + header(w.pack[:]).Len() + tagSize - w.offset + if len(p) < remaining { + nn := copy(w.pack[w.offset:], p) + w.offset += nn + return n + nn, err + } + n += copy(w.pack[w.offset:], p[:remaining]) + if err = w.decrypt(w.pack[:]); err != nil { + return n, err + } + p = p[remaining:] + w.offset = 0 + } + for len(p) > headerSize { + header := header(p) + if len(p) < headerSize+header.Len()+tagSize { + w.offset = copy(w.pack[:], p) + n += w.offset + return + } + if err = w.decrypt(p); err != nil { + return n, err + } + p = p[headerSize+header.Len()+tagSize:] + n += headerSize + header.Len() + tagSize + } + w.offset = copy(w.pack[:], p) + n += w.offset + return +} + +func (w *decryptedWriter) Close() error { + if w.offset > 0 { + if w.offset < headerSize { + return errMissingHeader + } + if w.offset < headerSize+header(w.pack[:]).Len()+tagSize { + return errPayloadTooShort + } + if err := w.decrypt(w.pack[:]); err != nil { + return err + } + } + if dst, ok := w.dst.(io.Closer); ok { + return dst.Close() + } + return nil +} + +func (w *decryptedWriter) decrypt(src []byte) error { + header := header(src) + if err := w.config.verifyHeader(header); err != nil { + return err + } + if header.SequenceNumber() != w.sequenceNumber { + return errPackageOutOfOrder + } + + aeadCipher := w.ciphers[header.Cipher()] + plaintext, err := aeadCipher.Open(w.pack[headerSize:headerSize], header[4:headerSize], src[headerSize:headerSize+header.Len()+tagSize], header[:4]) + if err != nil { + return errTagMismatch + } + + n, err := w.dst.Write(plaintext) + if err != nil { + return err + } + if n != len(plaintext) { + return io.ErrShortWrite + } + + w.sequenceNumber++ + return nil +} + +type encryptedWriter struct { + dst io.Writer + config Config + + nonce [8]byte + sequenceNumber uint32 + cipher cipher.AEAD + + pack [headerSize + payloadSize + tagSize]byte + offset int +} + +func (w *encryptedWriter) Write(p []byte) (n int, err error) { + if w.offset > 0 { + remaining := payloadSize - w.offset + if len(p) < remaining { + n = copy(w.pack[headerSize+w.offset:], p) + w.offset += n + return + } + n = copy(w.pack[headerSize+w.offset:], p[:remaining]) + if err = w.encrypt(w.pack[headerSize : headerSize+payloadSize]); err != nil { + return + } + p = p[remaining:] + w.offset = 0 + } + for len(p) >= payloadSize { + if err = w.encrypt(p[:payloadSize]); err != nil { + return + } + p = p[payloadSize:] + n += payloadSize + } + if len(p) > 0 { + w.offset = copy(w.pack[headerSize:], p) + n += w.offset + } + return +} + +func (w *encryptedWriter) Close() error { + if w.offset > 0 { + return w.encrypt(w.pack[headerSize : headerSize+w.offset]) + } + if dst, ok := w.dst.(io.Closer); ok { + return dst.Close() + } + return nil +} + +func (w *encryptedWriter) encrypt(src []byte) error { + header := header(w.pack[:]) + header.SetVersion(w.config.MaxVersion) + header.SetCipher(w.config.CipherSuites[0]) + header.SetLen(len(src)) + header.SetSequenceNumber(w.sequenceNumber) + header.SetNonce(w.nonce) + + w.cipher.Seal(w.pack[headerSize:headerSize], header[4:headerSize], src, header[:4]) + + n, err := w.dst.Write(w.pack[:headerSize+len(src)+tagSize]) + if err != nil { + return err + } + if n != headerSize+len(src)+tagSize { + return io.ErrShortWrite + } + + w.sequenceNumber++ + return nil +} diff --git a/vendor/golang.org/x/crypto/LICENSE b/vendor/golang.org/x/crypto/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/vendor/golang.org/x/crypto/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/golang.org/x/crypto/PATENTS b/vendor/golang.org/x/crypto/PATENTS new file mode 100644 index 000000000..733099041 --- /dev/null +++ b/vendor/golang.org/x/crypto/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. + +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go new file mode 100644 index 000000000..3f0dcb9d8 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305.go @@ -0,0 +1,83 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package chacha20poly1305 implements the ChaCha20-Poly1305 AEAD as specified in RFC 7539. +package chacha20poly1305 // import "golang.org/x/crypto/chacha20poly1305" + +import ( + "crypto/cipher" + "errors" +) + +const ( + // KeySize is the size of the key used by this AEAD, in bytes. + KeySize = 32 + // NonceSize is the size of the nonce used with this AEAD, in bytes. + NonceSize = 12 +) + +type chacha20poly1305 struct { + key [32]byte +} + +// New returns a ChaCha20-Poly1305 AEAD that uses the given, 256-bit key. +func New(key []byte) (cipher.AEAD, error) { + if len(key) != KeySize { + return nil, errors.New("chacha20poly1305: bad key length") + } + ret := new(chacha20poly1305) + copy(ret.key[:], key) + return ret, nil +} + +func (c *chacha20poly1305) NonceSize() int { + return NonceSize +} + +func (c *chacha20poly1305) Overhead() int { + return 16 +} + +func (c *chacha20poly1305) Seal(dst, nonce, plaintext, additionalData []byte) []byte { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Seal") + } + + if uint64(len(plaintext)) > (1<<38)-64 { + panic("chacha20poly1305: plaintext too large") + } + + return c.seal(dst, nonce, plaintext, additionalData) +} + +var errOpen = errors.New("chacha20poly1305: message authentication failed") + +func (c *chacha20poly1305) Open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + if len(nonce) != NonceSize { + panic("chacha20poly1305: bad nonce length passed to Open") + } + if len(ciphertext) < 16 { + return nil, errOpen + } + if uint64(len(ciphertext)) > (1<<38)-48 { + panic("chacha20poly1305: ciphertext too large") + } + + return c.open(dst, nonce, ciphertext, additionalData) +} + +// sliceForAppend takes a slice and a requested number of bytes. It returns a +// slice with the contents of the given slice followed by that many bytes and a +// second slice that aliases into it and contains only the extra bytes. If the +// original slice has sufficient capacity then no allocation is performed. +func sliceForAppend(in []byte, n int) (head, tail []byte) { + if total := len(in) + n; cap(in) >= total { + head = in[:total] + } else { + head = make([]byte, total) + copy(head, in) + } + tail = head[len(in):] + return +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go new file mode 100644 index 000000000..7cd7ad834 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.go @@ -0,0 +1,127 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.7,amd64,!gccgo,!appengine + +package chacha20poly1305 + +import "encoding/binary" + +//go:noescape +func chacha20Poly1305Open(dst []byte, key []uint32, src, ad []byte) bool + +//go:noescape +func chacha20Poly1305Seal(dst []byte, key []uint32, src, ad []byte) + +// cpuid is implemented in chacha20poly1305_amd64.s. +func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) + +// xgetbv with ecx = 0 is implemented in chacha20poly1305_amd64.s. +func xgetbv() (eax, edx uint32) + +var ( + useASM bool + useAVX2 bool +) + +func init() { + detectCPUFeatures() +} + +// detectCPUFeatures is used to detect if cpu instructions +// used by the functions implemented in assembler in +// chacha20poly1305_amd64.s are supported. +func detectCPUFeatures() { + maxID, _, _, _ := cpuid(0, 0) + if maxID < 1 { + return + } + + _, _, ecx1, _ := cpuid(1, 0) + + haveSSSE3 := isSet(9, ecx1) + useASM = haveSSSE3 + + haveOSXSAVE := isSet(27, ecx1) + + osSupportsAVX := false + // For XGETBV, OSXSAVE bit is required and sufficient. + if haveOSXSAVE { + eax, _ := xgetbv() + // Check if XMM and YMM registers have OS support. + osSupportsAVX = isSet(1, eax) && isSet(2, eax) + } + haveAVX := isSet(28, ecx1) && osSupportsAVX + + if maxID < 7 { + return + } + + _, ebx7, _, _ := cpuid(7, 0) + haveAVX2 := isSet(5, ebx7) && haveAVX + haveBMI2 := isSet(8, ebx7) + + useAVX2 = haveAVX2 && haveBMI2 +} + +// isSet checks if bit at bitpos is set in value. +func isSet(bitpos uint, value uint32) bool { + return value&(1<+0x00(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 +// <<< 16 with PSHUFB +DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A +DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 +DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A +// <<< 8 with PSHUFB +DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B +DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 +DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B + +DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 +DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 +DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 + +DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 +DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 +DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 +// Poly1305 key clamp +DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF +DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF + +DATA ·sseIncMask<>+0x00(SB)/8, $0x1 +DATA ·sseIncMask<>+0x08(SB)/8, $0x0 +// To load/store the last < 16 bytes in a buffer +DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 +DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff +DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff +DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff + +GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 +GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 +GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 +GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 +GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 +// No PALIGNR in Go ASM yet (but VPALIGNR is present). +#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 +#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 +#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 +#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 +#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 +#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 +#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 +#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 +#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 +#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 +#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 +#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 +#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 +#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 +#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 +#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 +#define shiftC0Right shiftC0Left +#define shiftC1Right shiftC1Left +#define shiftC2Right shiftC2Left +#define shiftC3Right shiftC3Left +#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 +#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 +#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 +#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 +// Some macros +#define chachaQR(A, B, C, D, T) \ + PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ + PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B + +#define chachaQR_AVX2(A, B, C, D, T) \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ + VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ + VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B + +#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 +#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX +#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 +#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 + +#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 +#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 +#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 + +#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage +#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage +// ---------------------------------------------------------------------------- +TEXT polyHashADInternal<>(SB), NOSPLIT, $0 + // adp points to beginning of additional data + // itr2 holds ad length + XORQ acc0, acc0 + XORQ acc1, acc1 + XORQ acc2, acc2 + CMPQ itr2, $13 + JNE hashADLoop + +openFastTLSAD: + // Special treatment for the TLS case of 13 bytes + MOVQ (adp), acc0 + MOVQ 5(adp), acc1 + SHRQ $24, acc1 + MOVQ $1, acc2 + polyMul + RET + +hashADLoop: + // Hash in 16 byte chunks + CMPQ itr2, $16 + JB hashADTail + polyAdd(0(adp)) + LEAQ (1*16)(adp), adp + SUBQ $16, itr2 + polyMul + JMP hashADLoop + +hashADTail: + CMPQ itr2, $0 + JE hashADDone + + // Hash last < 16 byte tail + XORQ t0, t0 + XORQ t1, t1 + XORQ t2, t2 + ADDQ itr2, adp + +hashADTailLoop: + SHLQ $8, t1:t0 + SHLQ $8, t0 + MOVB -1(adp), t2 + XORQ t2, t0 + DECQ adp + DECQ itr2 + JNE hashADTailLoop + +hashADTailFinish: + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Finished AD +hashADDone: + RET + +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Open(dst, key, src, ad []byte) bool +TEXT ·chacha20Poly1305Open(SB), 0, $288-97 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + // Check for AVX2 support + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Open_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE openSSE128 // About 16% faster + + // For long buffers, prepare the poly key first + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + MOVO D0, T1 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + MOVO D0, ctr3Store + MOVQ $10, itr2 + +openSSEPreparePolyKey: + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + DECQ itr2 + JNE openSSEPreparePolyKey + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore; MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSEMainLoop: + CMPQ inl, $256 + JB openSSEMainLoopDone + + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $4, itr1 + MOVQ inp, itr2 + +openSSEInternalLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(itr2)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(itr2), itr2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr1 + JGE openSSEInternalLoop + + polyAdd(0(itr2)) + polyMul + LEAQ (2*8)(itr2), itr2 + + CMPQ itr1, $-6 + JG openSSEInternalLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Load - xor - store + MOVO D3, tmpStore + MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) + MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) + MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) + MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) + MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) + MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) + MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) + MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) + MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) + MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) + MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) + MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) + MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) + MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) + LEAQ 256(inp), inp + LEAQ 256(oup), oup + SUBQ $256, inl + JMP openSSEMainLoop + +openSSEMainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $64 + JBE openSSETail64 + CMPQ inl, $128 + JBE openSSETail128 + CMPQ inl, $192 + JBE openSSETail192 + JMP openSSETail256 + +openSSEFinalize: + // Hash in the PT, AAD lengths + ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally, constant time compare to the tag at the end of the message + XORQ AX, AX + MOVQ $1, DX + XORQ (0*8)(inp), acc0 + XORQ (1*8)(inp), acc1 + ORQ acc1, acc0 + CMOVQEQ DX, AX + + // Return true iff tags are equal + MOVB AX, ret+96(FP) + RET + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 129 bytes +openSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +openSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE openSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore; MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openSSE128Open: + CMPQ inl, $16 + JB openSSETail16 + SUBQ $16, inl + + // Load for hashing + polyAdd(0(inp)) + + // Load for decryption + MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP openSSE128Open + +openSSETail16: + TESTQ inl, inl + JE openSSEFinalize + + // We can safely load the CT from the end, because it is padded with the MAC + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVOU (inp), T0 + ADDQ inl, inp + PAND -16(t0)(itr2*1), T0 + MOVO T0, 0+tmpStore + MOVQ T0, t0 + MOVQ 8+tmpStore, t1 + PXOR A1, T0 + + // We can only store one byte at a time, since plaintext can be shorter than 16 bytes +openSSETail16Store: + MOVQ T0, t3 + MOVB t3, (oup) + PSRLDQ $1, T0 + INCQ oup + DECQ inl + JNE openSSETail16Store + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + JMP openSSEFinalize + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of ciphertext +openSSETail64: + // Need to decrypt up to 64 bytes - prepare single block + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + XORQ itr2, itr2 + MOVQ inl, itr1 + CMPQ itr1, $16 + JB openSSETail64LoopB + +openSSETail64LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + SUBQ $16, itr1 + +openSSETail64LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0) + shiftB0Left; shiftC0Left; shiftD0Left + chachaQR(A0, B0, C0, D0, T0) + shiftB0Right; shiftC0Right; shiftD0Right + + CMPQ itr1, $16 + JAE openSSETail64LoopA + + CMPQ itr2, $160 + JNE openSSETail64LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + +openSSETail64DecLoop: + CMPQ inl, $16 + JB openSSETail64DecLoopDone + SUBQ $16, inl + MOVOU (inp), T0 + PXOR T0, A0 + MOVOU A0, (oup) + LEAQ 16(inp), inp + LEAQ 16(oup), oup + MOVO B0, A0 + MOVO C0, B0 + MOVO D0, C0 + JMP openSSETail64DecLoop + +openSSETail64DecLoopDone: + MOVO A0, A1 + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openSSETail128: + // Need to decrypt up to 128 bytes - prepare two blocks + MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSETail128LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + CMPQ itr2, itr1 + JB openSSETail128LoopA + + CMPQ itr2, $160 + JNE openSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr1Store, D0; PADDL ctr0Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + + SUBQ $64, inl + LEAQ 64(inp), inp + LEAQ 64(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of ciphertext +openSSETail192: + // Need to decrypt up to 192 bytes - prepare three blocks + MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store + MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store + + MOVQ inl, itr1 + MOVQ $160, itr2 + CMPQ itr1, $160 + CMOVQGT itr2, itr1 + ANDQ $-16, itr1 + XORQ itr2, itr2 + +openSSLTail192LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMul + +openSSLTail192LoopB: + ADDQ $16, itr2 + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + CMPQ itr2, itr1 + JB openSSLTail192LoopA + + CMPQ itr2, $160 + JNE openSSLTail192LoopB + + CMPQ inl, $176 + JB openSSLTail192Store + + polyAdd(160(inp)) + polyMul + + CMPQ inl, $192 + JB openSSLTail192Store + + polyAdd(176(inp)) + polyMul + +openSSLTail192Store: + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 + MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) + + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + SUBQ $128, inl + LEAQ 128(inp), inp + LEAQ 128(oup), oup + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openSSETail256: + // Need to decrypt up to 256 bytes - prepare four blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + XORQ itr2, itr2 + +openSSETail256Loop: + // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication + polyAdd(0(inp)(itr2*1)) + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulStage3 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + ADDQ $2*8, itr2 + CMPQ itr2, $160 + JB openSSETail256Loop + MOVQ inl, itr1 + ANDQ $-16, itr1 + +openSSETail256HashLoop: + polyAdd(0(inp)(itr2*1)) + polyMul + ADDQ $2*8, itr2 + CMPQ itr2, itr1 + JB openSSETail256HashLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + LEAQ 192(inp), inp + LEAQ 192(oup), oup + SUBQ $192, inl + MOVO A3, A0 + MOVO B3, B0 + MOVO C3, C0 + MOVO tmpStore, D0 + + JMP openSSETail64DecLoop + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Open_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimization, for very short buffers + CMPQ inl, $192 + JBE openAVX2192 + CMPQ inl, $320 + JBE openAVX2320 + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, state2StoreAVX2 + VMOVDQA DD0, ctr3StoreAVX2 + MOVQ $10, itr2 + +openAVX2PreparePolyKey: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + DECQ itr2 + JNE openAVX2PreparePolyKey + + VPADDD ·chacha20Constants<>(SB), AA0, AA0 + VPADDD state1StoreAVX2, BB0, BB0 + VPADDD state2StoreAVX2, CC0, CC0 + VPADDD ctr3StoreAVX2, DD0, DD0 + + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for the first 64 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + + // Hash AD + first 64 bytes + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +openAVX2InitialHash64: + polyAdd(0(inp)(itr1*1)) + polyMulAVX2 + ADDQ $16, itr1 + CMPQ itr1, $64 + JNE openAVX2InitialHash64 + + // Decrypt the first 64 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), BB0, BB0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU BB0, (1*32)(oup) + LEAQ (2*32)(inp), inp + LEAQ (2*32)(oup), oup + SUBQ $64, inl + +openAVX2MainLoop: + CMPQ inl, $512 + JB openAVX2MainLoopDone + + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + +openAVX2InternalLoop: + // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications + // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext + polyAdd(0*8(inp)(itr1*1)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(inp)(itr1*1)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(inp)(itr1*1)) + LEAQ (6*8)(itr1), itr1 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + CMPQ itr1, $480 + JNE openAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(480(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(496(inp)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + LEAQ (32*16)(oup), oup + SUBQ $(32*16), inl + JMP openAVX2MainLoop + +openAVX2MainLoopDone: + // Handle the various tail sizes efficiently + TESTQ inl, inl + JE openSSEFinalize + CMPQ inl, $128 + JBE openAVX2Tail128 + CMPQ inl, $256 + JBE openAVX2Tail256 + CMPQ inl, $384 + JBE openAVX2Tail384 + JMP openAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +openAVX2192: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +openAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE openAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +openAVX2ShortOpen: + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + +openAVX2ShortOpenLoop: + CMPQ inl, $32 + JB openAVX2ShortTail32 + SUBQ $32, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + polyAdd(2*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP openAVX2ShortOpenLoop + +openAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2ShortDone + + SUBQ $16, inl + + // Load for hashing + polyAdd(0*8(inp)) + polyMulAVX2 + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2ShortDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +openAVX2320: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +openAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE openAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP openAVX2ShortOpen + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +openAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD1 + VMOVDQA DD1, DD0 + + XORQ itr2, itr2 + MOVQ inl, itr1 + ANDQ $-16, itr1 + TESTQ itr1, itr1 + JE openAVX2Tail128LoopB + +openAVX2Tail128LoopA: + // Perform ChaCha rounds, while hashing the remaining input + polyAdd(0(inp)(itr2*1)) + polyMulAVX2 + +openAVX2Tail128LoopB: + ADDQ $16, itr2 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail128LoopA + CMPQ itr2, $160 + JNE openAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC1, CC1 + VPADDD DD0, DD1, DD1 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + +openAVX2TailLoop: + CMPQ inl, $32 + JB openAVX2Tail + SUBQ $32, inl + + // Load for decryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + LEAQ (1*32)(oup), oup + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + JMP openAVX2TailLoop + +openAVX2Tail: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB openAVX2TailDone + SUBQ $16, inl + + // Load for decryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +openAVX2TailDone: + VZEROUPPER + JMP openSSETail16 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +openAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare four blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + + // Compute the number of iterations that will hash data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $128, itr1 + SHRQ $4, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + +openAVX2Tail256LoopA: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + CMPQ itr2, itr1 + JB openAVX2Tail256LoopA + + CMPQ itr2, $10 + JNE openAVX2Tail256LoopB + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + + // Hash the remainder of data (if any) +openAVX2Tail256Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail256HashEnd + polyAdd (0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail256Hash + +// Store 128 bytes safely, then go to store loop +openAVX2Tail256HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + + VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 + VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) + LEAQ (4*32)(inp), inp + LEAQ (4*32)(oup), oup + SUBQ $4*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +openAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare six blocks + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, ctr0StoreAVX2 + VMOVDQA DD1, ctr1StoreAVX2 + VMOVDQA DD2, ctr2StoreAVX2 + + // Compute the number of iterations that will hash two blocks of data + MOVQ inl, tmpStoreAVX2 + MOVQ inl, itr1 + SUBQ $256, itr1 + SHRQ $4, itr1 + ADDQ $6, itr1 + MOVQ $10, itr2 + CMPQ itr1, $10 + CMOVQGT itr2, itr1 + MOVQ inp, inl + XORQ itr2, itr2 + + // Perform ChaCha rounds, while hashing the remaining input +openAVX2Tail384LoopB: + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + +openAVX2Tail384LoopA: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + polyAdd(0(inl)) + polyMulAVX2 + LEAQ 16(inl), inl + INCQ itr2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + + CMPQ itr2, itr1 + JB openAVX2Tail384LoopB + + CMPQ itr2, $10 + JNE openAVX2Tail384LoopA + + MOVQ inl, itr2 + SUBQ inp, inl + MOVQ inl, itr1 + MOVQ tmpStoreAVX2, inl + +openAVX2Tail384Hash: + ADDQ $16, itr1 + CMPQ itr1, inl + JGT openAVX2Tail384HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + JMP openAVX2Tail384Hash + +// Store 256 bytes safely, then go to store loop +openAVX2Tail384HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + LEAQ (8*32)(inp), inp + LEAQ (8*32)(oup), oup + SUBQ $8*32, inl + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +openAVX2Tail512: + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + XORQ itr1, itr1 + MOVQ inp, itr2 + +openAVX2Tail512LoopB: + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ (2*8)(itr2), itr2 + +openAVX2Tail512LoopA: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(itr2)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(itr2)) + polyMulAVX2 + LEAQ (4*8)(itr2), itr2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + INCQ itr1 + CMPQ itr1, $4 + JLT openAVX2Tail512LoopB + + CMPQ itr1, $10 + JNE openAVX2Tail512LoopA + + MOVQ inl, itr1 + SUBQ $384, itr1 + ANDQ $-16, itr1 + +openAVX2Tail512HashLoop: + TESTQ itr1, itr1 + JE openAVX2Tail512HashEnd + polyAdd(0(itr2)) + polyMulAVX2 + LEAQ 16(itr2), itr2 + SUBQ $16, itr1 + JMP openAVX2Tail512HashLoop + +openAVX2Tail512HashEnd: + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + LEAQ (12*32)(inp), inp + LEAQ (12*32)(oup), oup + SUBQ $12*32, inl + + JMP openAVX2TailLoop + +// ---------------------------------------------------------------------------- +// ---------------------------------------------------------------------------- +// func chacha20Poly1305Seal(dst, key, src, ad []byte) +TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 + // For aligned stack access + MOVQ SP, BP + ADDQ $32, BP + ANDQ $-32, BP + MOVQ dst+0(FP), oup + MOVQ key+24(FP), keyp + MOVQ src+48(FP), inp + MOVQ src_len+56(FP), inl + MOVQ ad+72(FP), adp + + CMPB ·useAVX2(SB), $1 + JE chacha20Poly1305Seal_AVX2 + + // Special optimization, for very short buffers + CMPQ inl, $128 + JBE sealSSE128 // About 15% faster + + // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration + MOVOU ·chacha20Constants<>(SB), A0 + MOVOU (1*16)(keyp), B0 + MOVOU (2*16)(keyp), C0 + MOVOU (3*16)(keyp), D0 + + // Store state on stack for future use + MOVO B0, state1Store + MOVO C0, state2Store + + // Load state, increment counter blocks + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVQ $10, itr2 + +sealSSEIntroLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JNE sealSSEIntroLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + + // Clamp and store the key + PAND ·polyClampMask<>(SB), A0 + MOVO A0, rStore + MOVO B0, sStore + + // Hash AAD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) + + MOVQ $128, itr1 + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 + + CMPQ inl, $64 + JBE sealSSE128SealHash + + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) + + ADDQ $64, itr1 + SUBQ $64, inl + LEAQ 64(inp), inp + + MOVQ $2, itr1 + MOVQ $8, itr2 + + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + CMPQ inl, $192 + JBE sealSSETail192 + +sealSSEMainLoop: + // Load state, increment counter blocks + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + + // Store counters + MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + +sealSSEInnerLoop: + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyAdd(0(oup)) + shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left + shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left + shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left + polyMulStage1 + polyMulStage2 + LEAQ (2*8)(oup), oup + MOVO C3, tmpStore + chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) + MOVO tmpStore, C3 + MOVO C1, tmpStore + polyMulStage3 + chachaQR(A3, B3, C3, D3, C1) + MOVO tmpStore, C1 + polyMulReduceStage + shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right + shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right + shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right + DECQ itr2 + JGE sealSSEInnerLoop + polyAdd(0(oup)) + polyMul + LEAQ (2*8)(oup), oup + DECQ itr1 + JG sealSSEInnerLoop + + // Add in the state + PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 + PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 + PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 + PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + MOVO D3, tmpStore + + // Load - xor - store + MOVOU (0*16)(inp), D3; PXOR D3, A0 + MOVOU (1*16)(inp), D3; PXOR D3, B0 + MOVOU (2*16)(inp), D3; PXOR D3, C0 + MOVOU (3*16)(inp), D3; PXOR D3, D0 + MOVOU A0, (0*16)(oup) + MOVOU B0, (1*16)(oup) + MOVOU C0, (2*16)(oup) + MOVOU D0, (3*16)(oup) + MOVO tmpStore, D3 + + MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 + PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 + PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 + MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) + ADDQ $192, inp + MOVQ $192, itr1 + SUBQ $192, inl + MOVO A3, A1 + MOVO B3, B1 + MOVO C3, C1 + MOVO D3, D1 + CMPQ inl, $64 + JBE sealSSE128SealHash + MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 + PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 + MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) + LEAQ 64(inp), inp + SUBQ $64, inl + MOVQ $6, itr1 + MOVQ $4, itr2 + CMPQ inl, $192 + JG sealSSEMainLoop + + MOVQ inl, itr1 + TESTQ inl, inl + JE sealSSE128SealHash + MOVQ $6, itr1 + CMPQ inl, $64 + JBE sealSSETail64 + CMPQ inl, $128 + JBE sealSSETail128 + JMP sealSSETail192 + +// ---------------------------------------------------------------------------- +// Special optimization for the last 64 bytes of plaintext +sealSSETail64: + // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A1 + MOVO state1Store, B1 + MOVO state2Store, C1 + MOVO ctr3Store, D1 + PADDL ·sseIncMask<>(SB), D1 + MOVO D1, ctr0Store + +sealSSETail64LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail64LoopB: + chachaQR(A1, B1, C1, D1, T1) + shiftB1Left; shiftC1Left; shiftD1Left + chachaQR(A1, B1, C1, D1, T1) + shiftB1Right; shiftC1Right; shiftD1Right + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + DECQ itr1 + JG sealSSETail64LoopA + + DECQ itr2 + JGE sealSSETail64LoopB + PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B1 + PADDL state2Store, C1 + PADDL ctr0Store, D1 + + JMP sealSSE128Seal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of plaintext +sealSSETail128: + // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + +sealSSETail128LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail128LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + + DECQ itr1 + JG sealSSETail128LoopA + + DECQ itr2 + JGE sealSSETail128LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 + PADDL state1Store, B0; PADDL state1Store, B1 + PADDL state2Store, C0; PADDL state2Store, C1 + PADDL ctr0Store, D0; PADDL ctr1Store, D1 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + + MOVQ $64, itr1 + LEAQ 64(inp), inp + SUBQ $64, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 192 bytes of plaintext +sealSSETail192: + // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes + MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + +sealSSETail192LoopA: + // Perform ChaCha rounds, while hashing the previously encrypted ciphertext + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealSSETail192LoopB: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftC0Left; shiftD0Left + shiftB1Left; shiftC1Left; shiftD1Left + shiftB2Left; shiftC2Left; shiftD2Left + + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftC0Right; shiftD0Right + shiftB1Right; shiftC1Right; shiftD1Right + shiftB2Right; shiftC2Right; shiftD2Right + + DECQ itr1 + JG sealSSETail192LoopA + + DECQ itr2 + JGE sealSSETail192LoopB + + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 + PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 + PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 + + MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 + PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 + MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) + MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 + PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 + MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) + + MOVO A2, A1 + MOVO B2, B1 + MOVO C2, C1 + MOVO D2, D1 + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + + JMP sealSSE128SealHash + +// ---------------------------------------------------------------------------- +// Special seal optimization for buffers smaller than 129 bytes +sealSSE128: + // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks + MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 + MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 + MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 + MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 + MOVQ $10, itr2 + +sealSSE128InnerCipherLoop: + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Left; shiftB1Left; shiftB2Left + shiftC0Left; shiftC1Left; shiftC2Left + shiftD0Left; shiftD1Left; shiftD2Left + chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) + shiftB0Right; shiftB1Right; shiftB2Right + shiftC0Right; shiftC1Right; shiftC2Right + shiftD0Right; shiftD1Right; shiftD2Right + DECQ itr2 + JNE sealSSE128InnerCipherLoop + + // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded + PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 + PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 + PADDL T2, C1; PADDL T2, C2 + PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PAND ·polyClampMask<>(SB), A0 + MOVOU A0, rStore + MOVOU B0, sStore + + // Hash + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealSSE128SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealSSE128Seal + polyAdd(0(oup)) + polyMul + + SUBQ $16, itr1 + ADDQ $16, oup + + JMP sealSSE128SealHash + +sealSSE128Seal: + CMPQ inl, $16 + JB sealSSETail + SUBQ $16, inl + + // Load for decryption + MOVOU (inp), T0 + PXOR T0, A1 + MOVOU A1, (oup) + LEAQ (1*16)(inp), inp + LEAQ (1*16)(oup), oup + + // Extract for hashing + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + // Shift the stream "left" + MOVO B1, A1 + MOVO C1, B1 + MOVO D1, C1 + MOVO A2, D1 + MOVO B2, A2 + MOVO C2, B2 + MOVO D2, C2 + JMP sealSSE128Seal + +sealSSETail: + TESTQ inl, inl + JE sealSSEFinalize + + // We can only load the PT one byte at a time to avoid read after end of buffer + MOVQ inl, itr2 + SHLQ $4, itr2 + LEAQ ·andMask<>(SB), t0 + MOVQ inl, itr1 + LEAQ -1(inp)(inl*1), inp + XORQ t2, t2 + XORQ t3, t3 + XORQ AX, AX + +sealSSETailLoadLoop: + SHLQ $8, t2, t3 + SHLQ $8, t2 + MOVB (inp), AX + XORQ AX, t2 + LEAQ -1(inp), inp + DECQ itr1 + JNE sealSSETailLoadLoop + MOVQ t2, 0+tmpStore + MOVQ t3, 8+tmpStore + PXOR 0+tmpStore, A1 + MOVOU A1, (oup) + MOVOU -16(t0)(itr2*1), T0 + PAND T0, A1 + MOVQ A1, t0 + PSRLDQ $8, A1 + MOVQ A1, t1 + ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 + polyMul + + ADDQ inl, oup + +sealSSEFinalize: + // Hash in the buffer lengths + ADDQ ad_len+80(FP), acc0 + ADCQ src_len+56(FP), acc1 + ADCQ $1, acc2 + polyMul + + // Final reduce + MOVQ acc0, t0 + MOVQ acc1, t1 + MOVQ acc2, t2 + SUBQ $-5, acc0 + SBBQ $-1, acc1 + SBBQ $3, acc2 + CMOVQCS t0, acc0 + CMOVQCS t1, acc1 + CMOVQCS t2, acc2 + + // Add in the "s" part of the key + ADDQ 0+sStore, acc0 + ADCQ 8+sStore, acc1 + + // Finally store the tag at the end of the message + MOVQ acc0, (0*8)(oup) + MOVQ acc1, (1*8)(oup) + RET + +// ---------------------------------------------------------------------------- +// ------------------------- AVX2 Code ---------------------------------------- +chacha20Poly1305Seal_AVX2: + VZEROUPPER + VMOVDQU ·chacha20Constants<>(SB), AA0 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 + BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 + BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 + VPADDD ·avx2InitMask<>(SB), DD0, DD0 + + // Special optimizations, for very short buffers + CMPQ inl, $192 + JBE seal192AVX2 // 33% faster + CMPQ inl, $320 + JBE seal320AVX2 // 17% faster + + // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream + VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 + VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 + VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr2 + +sealAVX2IntroLoop: + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr2 + JNE sealAVX2IntroLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + + VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 + VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key + VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), DD0, DD0 + VMOVDQA DD0, rsStoreAVX2 + + // Hash AD + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + + // Can store at least 320 bytes + VPXOR (0*32)(inp), AA0, AA0 + VPXOR (1*32)(inp), CC0, CC0 + VMOVDQU AA0, (0*32)(oup) + VMOVDQU CC0, (1*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 + VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 + VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) + + MOVQ $320, itr1 + SUBQ $320, inl + LEAQ 320(inp), inp + + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 + CMPQ inl, $128 + JBE sealAVX2SealHash + + VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 + VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) + SUBQ $128, inl + LEAQ 128(inp), inp + + MOVQ $8, itr1 + MOVQ $2, itr2 + + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + CMPQ inl, $512 + JBE sealAVX2Tail512 + + // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 + VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 + VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 + VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 + + VMOVDQA CC3, tmpStoreAVX2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) + VMOVDQA tmpStoreAVX2, CC3 + VMOVDQA CC1, tmpStoreAVX2 + chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) + VMOVDQA tmpStoreAVX2, CC1 + + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 + VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 + VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 + VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + + SUBQ $16, oup // Adjust the pointer + MOVQ $9, itr1 + JMP sealAVX2InternalLoopStart + +sealAVX2MainLoop: + // Load state, increment counter blocks, store the incremented counters + VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + MOVQ $10, itr1 + +sealAVX2InternalLoop: + polyAdd(0*8(oup)) + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage1_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulStage2_AVX2 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyMulStage3_AVX2 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + +sealAVX2InternalLoopStart: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + polyAdd(2*8(oup)) + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage1_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage2_AVX2 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + polyMulStage3_AVX2 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + polyMulReduceStage + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(4*8(oup)) + LEAQ (6*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulStage1_AVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + polyMulStage2_AVX2 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + polyMulStage3_AVX2 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyMulReduceStage + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + DECQ itr1 + JNE sealAVX2InternalLoop + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + + // We only hashed 480 of the 512 bytes available - hash the remaining 32 here + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 + VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 + VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + // and here + polyAdd(-2*8(oup)) + polyMulAVX2 + VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 + VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) + LEAQ (32*16)(inp), inp + SUBQ $(32*16), inl + CMPQ inl, $512 + JG sealAVX2MainLoop + + // Tail can only hash 480 bytes + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ 32(oup), oup + + MOVQ $10, itr1 + MOVQ $0, itr2 + CMPQ inl, $128 + JBE sealAVX2Tail128 + CMPQ inl, $256 + JBE sealAVX2Tail256 + CMPQ inl, $384 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 193 bytes +seal192AVX2: + // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks + VMOVDQA AA0, AA1 + VMOVDQA BB0, BB1 + VMOVDQA CC0, CC1 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2 + VMOVDQA BB0, BB2 + VMOVDQA CC0, CC2 + VMOVDQA DD0, DD2 + VMOVDQA DD1, TT3 + MOVQ $10, itr2 + +sealAVX2192InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr2 + JNE sealAVX2192InnerCipherLoop + VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 + VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 + VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 + VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + + // Clamp and store poly key + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 192 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + +sealAVX2ShortSeal: + // Hash aad + MOVQ ad_len+80(FP), itr2 + CALL polyHashADInternal<>(SB) + XORQ itr1, itr1 + +sealAVX2SealHash: + // itr1 holds the number of bytes encrypted but not yet hashed + CMPQ itr1, $16 + JB sealAVX2ShortSealLoop + polyAdd(0(oup)) + polyMul + SUBQ $16, itr1 + ADDQ $16, oup + JMP sealAVX2SealHash + +sealAVX2ShortSealLoop: + CMPQ inl, $32 + JB sealAVX2ShortTail32 + SUBQ $32, inl + + // Load for encryption + VPXOR (inp), AA0, AA0 + VMOVDQU AA0, (oup) + LEAQ (1*32)(inp), inp + + // Now can hash + polyAdd(0*8(oup)) + polyMulAVX2 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (1*32)(oup), oup + + // Shift stream left + VMOVDQA BB0, AA0 + VMOVDQA CC0, BB0 + VMOVDQA DD0, CC0 + VMOVDQA AA1, DD0 + VMOVDQA BB1, AA1 + VMOVDQA CC1, BB1 + VMOVDQA DD1, CC1 + VMOVDQA AA2, DD1 + VMOVDQA BB2, AA2 + JMP sealAVX2ShortSealLoop + +sealAVX2ShortTail32: + CMPQ inl, $16 + VMOVDQA A0, A1 + JB sealAVX2ShortDone + + SUBQ $16, inl + + // Load for encryption + VPXOR (inp), A0, T0 + VMOVDQU T0, (oup) + LEAQ (1*16)(inp), inp + + // Hash + polyAdd(0*8(oup)) + polyMulAVX2 + LEAQ (1*16)(oup), oup + VPERM2I128 $0x11, AA0, AA0, AA0 + VMOVDQA A0, A1 + +sealAVX2ShortDone: + VZEROUPPER + JMP sealSSETail + +// ---------------------------------------------------------------------------- +// Special optimization for buffers smaller than 321 bytes +seal320AVX2: + // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks + VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 + MOVQ $10, itr2 + +sealAVX2320InnerCipherLoop: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr2 + JNE sealAVX2320InnerCipherLoop + + VMOVDQA ·chacha20Constants<>(SB), TT0 + VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 + VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 + VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 + VMOVDQA ·avx2IncMask<>(SB), TT0 + VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 + VPADDD TT3, DD2, DD2 + + // Clamp and store poly key + VPERM2I128 $0x02, AA0, BB0, TT0 + VPAND ·polyClampMask<>(SB), TT0, TT0 + VMOVDQA TT0, rsStoreAVX2 + + // Stream for up to 320 bytes + VPERM2I128 $0x13, AA0, BB0, AA0 + VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x02, AA1, BB1, CC0 + VPERM2I128 $0x02, CC1, DD1, DD0 + VPERM2I128 $0x13, AA1, BB1, AA1 + VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x02, AA2, BB2, CC1 + VPERM2I128 $0x02, CC2, DD2, DD1 + VPERM2I128 $0x13, AA2, BB2, AA2 + VPERM2I128 $0x13, CC2, DD2, BB2 + JMP sealAVX2ShortSeal + +// ---------------------------------------------------------------------------- +// Special optimization for the last 128 bytes of ciphertext +sealAVX2Tail128: + // Need to decrypt up to 128 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0 + VMOVDQA state1StoreAVX2, BB0 + VMOVDQA state2StoreAVX2, CC0 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VMOVDQA DD0, DD1 + +sealAVX2Tail128LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail128LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $12, DD0, DD0, DD0 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0 + VPALIGNR $8, CC0, CC0, CC0 + VPALIGNR $4, DD0, DD0, DD0 + DECQ itr1 + JG sealAVX2Tail128LoopA + DECQ itr2 + JGE sealAVX2Tail128LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA1 + VPADDD state1StoreAVX2, BB0, BB1 + VPADDD state2StoreAVX2, CC0, CC1 + VPADDD DD1, DD0, DD1 + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + JMP sealAVX2ShortSealLoop + +// ---------------------------------------------------------------------------- +// Special optimization for the last 256 bytes of ciphertext +sealAVX2Tail256: + // Need to decrypt up to 256 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD1 + VMOVDQA DD0, TT1 + VMOVDQA DD1, TT2 + +sealAVX2Tail256LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail256LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 + DECQ itr1 + JG sealAVX2Tail256LoopA + DECQ itr2 + JGE sealAVX2Tail256LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + MOVQ $128, itr1 + LEAQ 128(inp), inp + SUBQ $128, inl + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 384 bytes of ciphertext +sealAVX2Tail384: + // Need to decrypt up to 384 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 + VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + +sealAVX2Tail384LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail384LoopB: + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(0(oup)) + polyMul + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 + chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) + polyAdd(16(oup)) + polyMul + LEAQ 32(oup), oup + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 + DECQ itr1 + JG sealAVX2Tail384LoopA + DECQ itr2 + JGE sealAVX2Tail384LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 + VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 + VPERM2I128 $0x02, AA0, BB0, TT0 + VPERM2I128 $0x02, CC0, DD0, TT1 + VPERM2I128 $0x13, AA0, BB0, TT2 + VPERM2I128 $0x13, CC0, DD0, TT3 + VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 + VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) + VPERM2I128 $0x02, AA1, BB1, TT0 + VPERM2I128 $0x02, CC1, DD1, TT1 + VPERM2I128 $0x13, AA1, BB1, TT2 + VPERM2I128 $0x13, CC1, DD1, TT3 + VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 + VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) + MOVQ $256, itr1 + LEAQ 256(inp), inp + SUBQ $256, inl + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + + JMP sealAVX2SealHash + +// ---------------------------------------------------------------------------- +// Special optimization for the last 512 bytes of ciphertext +sealAVX2Tail512: + // Need to decrypt up to 512 bytes - prepare two blocks + // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed + // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed + VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 + VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 + VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 + VMOVDQA ctr3StoreAVX2, DD0 + VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 + VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + +sealAVX2Tail512LoopA: + polyAdd(0(oup)) + polyMul + LEAQ 16(oup), oup + +sealAVX2Tail512LoopB: + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + polyAdd(0*8(oup)) + polyMulAVX2 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + polyAdd(2*8(oup)) + polyMulAVX2 + LEAQ (4*8)(oup), oup + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 + VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 + VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 + VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 + VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 + VMOVDQA CC3, tmpStoreAVX2 + VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 + VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 + VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 + VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 + VMOVDQA tmpStoreAVX2, CC3 + VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 + VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 + VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 + + DECQ itr1 + JG sealAVX2Tail512LoopA + DECQ itr2 + JGE sealAVX2Tail512LoopB + + VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 + VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 + VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 + VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 + VMOVDQA CC3, tmpStoreAVX2 + VPERM2I128 $0x02, AA0, BB0, CC3 + VPXOR (0*32)(inp), CC3, CC3 + VMOVDQU CC3, (0*32)(oup) + VPERM2I128 $0x02, CC0, DD0, CC3 + VPXOR (1*32)(inp), CC3, CC3 + VMOVDQU CC3, (1*32)(oup) + VPERM2I128 $0x13, AA0, BB0, CC3 + VPXOR (2*32)(inp), CC3, CC3 + VMOVDQU CC3, (2*32)(oup) + VPERM2I128 $0x13, CC0, DD0, CC3 + VPXOR (3*32)(inp), CC3, CC3 + VMOVDQU CC3, (3*32)(oup) + + VPERM2I128 $0x02, AA1, BB1, AA0 + VPERM2I128 $0x02, CC1, DD1, BB0 + VPERM2I128 $0x13, AA1, BB1, CC0 + VPERM2I128 $0x13, CC1, DD1, DD0 + VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 + VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + + VPERM2I128 $0x02, AA2, BB2, AA0 + VPERM2I128 $0x02, CC2, DD2, BB0 + VPERM2I128 $0x13, AA2, BB2, CC0 + VPERM2I128 $0x13, CC2, DD2, DD0 + VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 + VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) + + MOVQ $384, itr1 + LEAQ 384(inp), inp + SUBQ $384, inl + VPERM2I128 $0x02, AA3, BB3, AA0 + VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 + VPERM2I128 $0x13, AA3, BB3, CC0 + VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 + + JMP sealAVX2SealHash + +// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuid(SB), NOSPLIT, $0-24 + MOVL eaxArg+0(FP), AX + MOVL ecxArg+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv() (eax, edx uint32) +TEXT ·xgetbv(SB),NOSPLIT,$0-8 + MOVL $0, CX + XGETBV + MOVL AX, eax+0(FP) + MOVL DX, edx+4(FP) + RET diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go new file mode 100644 index 000000000..f7e4bfb1c --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_generic.go @@ -0,0 +1,70 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package chacha20poly1305 + +import ( + "encoding/binary" + + "golang.org/x/crypto/chacha20poly1305/internal/chacha20" + "golang.org/x/crypto/poly1305" +) + +func roundTo16(n int) int { + return 16 * ((n + 15) / 16) +} + +func (c *chacha20poly1305) sealGeneric(dst, nonce, plaintext, additionalData []byte) []byte { + var counter [16]byte + copy(counter[4:], nonce) + + var polyKey [32]byte + chacha20.XORKeyStream(polyKey[:], polyKey[:], &counter, &c.key) + + ret, out := sliceForAppend(dst, len(plaintext)+poly1305.TagSize) + counter[0] = 1 + chacha20.XORKeyStream(out, plaintext, &counter, &c.key) + + polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(plaintext))+8+8) + copy(polyInput, additionalData) + copy(polyInput[roundTo16(len(additionalData)):], out[:len(plaintext)]) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData))) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(plaintext))) + + var tag [poly1305.TagSize]byte + poly1305.Sum(&tag, polyInput, &polyKey) + copy(out[len(plaintext):], tag[:]) + + return ret +} + +func (c *chacha20poly1305) openGeneric(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + var tag [poly1305.TagSize]byte + copy(tag[:], ciphertext[len(ciphertext)-16:]) + ciphertext = ciphertext[:len(ciphertext)-16] + + var counter [16]byte + copy(counter[4:], nonce) + + var polyKey [32]byte + chacha20.XORKeyStream(polyKey[:], polyKey[:], &counter, &c.key) + + polyInput := make([]byte, roundTo16(len(additionalData))+roundTo16(len(ciphertext))+8+8) + copy(polyInput, additionalData) + copy(polyInput[roundTo16(len(additionalData)):], ciphertext) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-16:], uint64(len(additionalData))) + binary.LittleEndian.PutUint64(polyInput[len(polyInput)-8:], uint64(len(ciphertext))) + + ret, out := sliceForAppend(dst, len(ciphertext)) + if !poly1305.Verify(&tag, polyInput, &polyKey) { + for i := range out { + out[i] = 0 + } + return nil, errOpen + } + + counter[0] = 1 + chacha20.XORKeyStream(out, ciphertext, &counter, &c.key) + return ret, nil +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go new file mode 100644 index 000000000..4c2eb703c --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_noasm.go @@ -0,0 +1,15 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64 !go1.7 gccgo appengine + +package chacha20poly1305 + +func (c *chacha20poly1305) seal(dst, nonce, plaintext, additionalData []byte) []byte { + return c.sealGeneric(dst, nonce, plaintext, additionalData) +} + +func (c *chacha20poly1305) open(dst, nonce, ciphertext, additionalData []byte) ([]byte, error) { + return c.openGeneric(dst, nonce, ciphertext, additionalData) +} diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go b/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go new file mode 100644 index 000000000..f9e8a3a2f --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20poly1305/internal/chacha20/chacha_generic.go @@ -0,0 +1,199 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package ChaCha20 implements the core ChaCha20 function as specified in https://tools.ietf.org/html/rfc7539#section-2.3. +package chacha20 + +import "encoding/binary" + +const rounds = 20 + +// core applies the ChaCha20 core function to 16-byte input in, 32-byte key k, +// and 16-byte constant c, and puts the result into 64-byte array out. +func core(out *[64]byte, in *[16]byte, k *[32]byte) { + j0 := uint32(0x61707865) + j1 := uint32(0x3320646e) + j2 := uint32(0x79622d32) + j3 := uint32(0x6b206574) + j4 := binary.LittleEndian.Uint32(k[0:4]) + j5 := binary.LittleEndian.Uint32(k[4:8]) + j6 := binary.LittleEndian.Uint32(k[8:12]) + j7 := binary.LittleEndian.Uint32(k[12:16]) + j8 := binary.LittleEndian.Uint32(k[16:20]) + j9 := binary.LittleEndian.Uint32(k[20:24]) + j10 := binary.LittleEndian.Uint32(k[24:28]) + j11 := binary.LittleEndian.Uint32(k[28:32]) + j12 := binary.LittleEndian.Uint32(in[0:4]) + j13 := binary.LittleEndian.Uint32(in[4:8]) + j14 := binary.LittleEndian.Uint32(in[8:12]) + j15 := binary.LittleEndian.Uint32(in[12:16]) + + x0, x1, x2, x3, x4, x5, x6, x7 := j0, j1, j2, j3, j4, j5, j6, j7 + x8, x9, x10, x11, x12, x13, x14, x15 := j8, j9, j10, j11, j12, j13, j14, j15 + + for i := 0; i < rounds; i += 2 { + x0 += x4 + x12 ^= x0 + x12 = (x12 << 16) | (x12 >> (16)) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 12) | (x4 >> (20)) + x0 += x4 + x12 ^= x0 + x12 = (x12 << 8) | (x12 >> (24)) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 7) | (x4 >> (25)) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 16) | (x13 >> 16) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 12) | (x5 >> 20) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 8) | (x13 >> 24) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 7) | (x5 >> 25) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 16) | (x14 >> 16) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 12) | (x6 >> 20) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 8) | (x14 >> 24) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 7) | (x6 >> 25) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 16) | (x15 >> 16) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 12) | (x7 >> 20) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 8) | (x15 >> 24) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 7) | (x7 >> 25) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 16) | (x15 >> 16) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 12) | (x5 >> 20) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 8) | (x15 >> 24) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 7) | (x5 >> 25) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 16) | (x12 >> 16) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 12) | (x6 >> 20) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 8) | (x12 >> 24) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 7) | (x6 >> 25) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 16) | (x13 >> 16) + x8 += x13 + x7 ^= x8 + x7 = (x7 << 12) | (x7 >> 20) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 8) | (x13 >> 24) + x8 += x13 + x7 ^= x8 + x7 = (x7 << 7) | (x7 >> 25) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 16) | (x14 >> 16) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 12) | (x4 >> 20) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 8) | (x14 >> 24) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 7) | (x4 >> 25) + } + + x0 += j0 + x1 += j1 + x2 += j2 + x3 += j3 + x4 += j4 + x5 += j5 + x6 += j6 + x7 += j7 + x8 += j8 + x9 += j9 + x10 += j10 + x11 += j11 + x12 += j12 + x13 += j13 + x14 += j14 + x15 += j15 + + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x4) + binary.LittleEndian.PutUint32(out[20:24], x5) + binary.LittleEndian.PutUint32(out[24:28], x6) + binary.LittleEndian.PutUint32(out[28:32], x7) + binary.LittleEndian.PutUint32(out[32:36], x8) + binary.LittleEndian.PutUint32(out[36:40], x9) + binary.LittleEndian.PutUint32(out[40:44], x10) + binary.LittleEndian.PutUint32(out[44:48], x11) + binary.LittleEndian.PutUint32(out[48:52], x12) + binary.LittleEndian.PutUint32(out[52:56], x13) + binary.LittleEndian.PutUint32(out[56:60], x14) + binary.LittleEndian.PutUint32(out[60:64], x15) +} + +// XORKeyStream crypts bytes from in to out using the given key and counters. +// In and out may be the same slice but otherwise should not overlap. Counter +// contains the raw ChaCha20 counter bytes (i.e. block counter followed by +// nonce). +func XORKeyStream(out, in []byte, counter *[16]byte, key *[32]byte) { + var block [64]byte + var counterCopy [16]byte + copy(counterCopy[:], counter[:]) + + for len(in) >= 64 { + core(&block, &counterCopy, key) + for i, x := range block { + out[i] = in[i] ^ x + } + u := uint32(1) + for i := 0; i < 4; i++ { + u += uint32(counterCopy[i]) + counterCopy[i] = byte(u) + u >>= 8 + } + in = in[64:] + out = out[64:] + } + + if len(in) > 0 { + core(&block, &counterCopy, key) + for i, v := range in { + out[i] = v ^ block[i] + } + } +} diff --git a/vendor/golang.org/x/crypto/poly1305/poly1305.go b/vendor/golang.org/x/crypto/poly1305/poly1305.go new file mode 100644 index 000000000..f562fa571 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go @@ -0,0 +1,33 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package poly1305 implements Poly1305 one-time message authentication code as +specified in https://cr.yp.to/mac/poly1305-20050329.pdf. + +Poly1305 is a fast, one-time authentication function. It is infeasible for an +attacker to generate an authenticator for a message without the key. However, a +key must only be used for a single message. Authenticating two different +messages with the same key allows an attacker to forge authenticators for other +messages with the same key. + +Poly1305 was originally coupled with AES in order to make Poly1305-AES. AES was +used with a fixed key in order to generate one-time keys from an nonce. +However, in this package AES isn't used and the one-time key is specified +directly. +*/ +package poly1305 // import "golang.org/x/crypto/poly1305" + +import "crypto/subtle" + +// TagSize is the size, in bytes, of a poly1305 authenticator. +const TagSize = 16 + +// Verify returns true if mac is a valid authenticator for m with the given +// key. +func Verify(mac *[16]byte, m []byte, key *[32]byte) bool { + var tmp [16]byte + Sum(&tmp, m, key) + return subtle.ConstantTimeCompare(tmp[:], mac[:]) == 1 +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go new file mode 100644 index 000000000..4dd72fe79 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go @@ -0,0 +1,22 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64,!gccgo,!appengine + +package poly1305 + +// This function is implemented in sum_amd64.s +//go:noescape +func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte) + +// Sum generates an authenticator for m using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + var mPtr *byte + if len(m) > 0 { + mPtr = &m[0] + } + poly1305(out, mPtr, uint64(len(m)), key) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s new file mode 100644 index 000000000..2edae6382 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s @@ -0,0 +1,125 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64,!gccgo,!appengine + +#include "textflag.h" + +#define POLY1305_ADD(msg, h0, h1, h2) \ + ADDQ 0(msg), h0; \ + ADCQ 8(msg), h1; \ + ADCQ $1, h2; \ + LEAQ 16(msg), msg + +#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ + MOVQ r0, AX; \ + MULQ h0; \ + MOVQ AX, t0; \ + MOVQ DX, t1; \ + MOVQ r0, AX; \ + MULQ h1; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ r0, t2; \ + IMULQ h2, t2; \ + ADDQ DX, t2; \ + \ + MOVQ r1, AX; \ + MULQ h0; \ + ADDQ AX, t1; \ + ADCQ $0, DX; \ + MOVQ DX, h0; \ + MOVQ r1, t3; \ + IMULQ h2, t3; \ + MOVQ r1, AX; \ + MULQ h1; \ + ADDQ AX, t2; \ + ADCQ DX, t3; \ + ADDQ h0, t2; \ + ADCQ $0, t3; \ + \ + MOVQ t0, h0; \ + MOVQ t1, h1; \ + MOVQ t2, h2; \ + ANDQ $3, h2; \ + MOVQ t2, t0; \ + ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ + ADDQ t0, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2; \ + SHRQ $2, t3, t2; \ + SHRQ $2, t3; \ + ADDQ t2, h0; \ + ADCQ t3, h1; \ + ADCQ $0, h2 + +DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF +DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC +GLOBL ·poly1305Mask<>(SB), RODATA, $16 + +// func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]key) +TEXT ·poly1305(SB), $0-32 + MOVQ out+0(FP), DI + MOVQ m+8(FP), SI + MOVQ mlen+16(FP), R15 + MOVQ key+24(FP), AX + + MOVQ 0(AX), R11 + MOVQ 8(AX), R12 + ANDQ ·poly1305Mask<>(SB), R11 // r0 + ANDQ ·poly1305Mask<>+8(SB), R12 // r1 + XORQ R8, R8 // h0 + XORQ R9, R9 // h1 + XORQ R10, R10 // h2 + + CMPQ R15, $16 + JB bytes_between_0_and_15 + +loop: + POLY1305_ADD(SI, R8, R9, R10) + +multiply: + POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) + SUBQ $16, R15 + CMPQ R15, $16 + JAE loop + +bytes_between_0_and_15: + TESTQ R15, R15 + JZ done + MOVQ $1, BX + XORQ CX, CX + XORQ R13, R13 + ADDQ R15, SI + +flush_buffer: + SHLQ $8, BX, CX + SHLQ $8, BX + MOVB -1(SI), R13 + XORQ R13, BX + DECQ SI + DECQ R15 + JNZ flush_buffer + + ADDQ BX, R8 + ADCQ CX, R9 + ADCQ $0, R10 + MOVQ $16, R15 + JMP multiply + +done: + MOVQ R8, AX + MOVQ R9, BX + SUBQ $0xFFFFFFFFFFFFFFFB, AX + SBBQ $0xFFFFFFFFFFFFFFFF, BX + SBBQ $3, R10 + CMOVQCS R8, AX + CMOVQCS R9, BX + MOVQ key+24(FP), R8 + ADDQ 16(R8), AX + ADCQ 24(R8), BX + + MOVQ AX, 0(DI) + MOVQ BX, 8(DI) + RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.go b/vendor/golang.org/x/crypto/poly1305/sum_arm.go new file mode 100644 index 000000000..5dc321c2f --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.go @@ -0,0 +1,22 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm,!gccgo,!appengine,!nacl + +package poly1305 + +// This function is implemented in sum_arm.s +//go:noescape +func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte) + +// Sum generates an authenticator for m using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[16]byte, m []byte, key *[32]byte) { + var mPtr *byte + if len(m) > 0 { + mPtr = &m[0] + } + poly1305_auth_armv6(out, mPtr, uint32(len(m)), key) +} diff --git a/vendor/golang.org/x/crypto/poly1305/sum_arm.s b/vendor/golang.org/x/crypto/poly1305/sum_arm.s new file mode 100644 index 000000000..f70b4ac48 --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.s @@ -0,0 +1,427 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build arm,!gccgo,!appengine,!nacl + +#include "textflag.h" + +// This code was translated into a form compatible with 5a from the public +// domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305. + +DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff +DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03 +DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff +DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff +DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff +GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20 + +// Warning: the linker may use R11 to synthesize certain instructions. Please +// take care and verify that no synthetic instructions use it. + +TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 + // Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It + // might look like it's only 60 bytes of space but the final four bytes + // will be written by another function.) We need to skip over four + // bytes of stack because that's saving the value of 'g'. + ADD $4, R13, R8 + MOVM.IB [R4-R7], (R8) + MOVM.IA.W (R1), [R2-R5] + MOVW $·poly1305_init_constants_armv6<>(SB), R7 + MOVW R2, R8 + MOVW R2>>26, R9 + MOVW R3>>20, g + MOVW R4>>14, R11 + MOVW R5>>8, R12 + ORR R3<<6, R9, R9 + ORR R4<<12, g, g + ORR R5<<18, R11, R11 + MOVM.IA (R7), [R2-R6] + AND R8, R2, R2 + AND R9, R3, R3 + AND g, R4, R4 + AND R11, R5, R5 + AND R12, R6, R6 + MOVM.IA.W [R2-R6], (R0) + EOR R2, R2, R2 + EOR R3, R3, R3 + EOR R4, R4, R4 + EOR R5, R5, R5 + EOR R6, R6, R6 + MOVM.IA.W [R2-R6], (R0) + MOVM.IA.W (R1), [R2-R5] + MOVM.IA [R2-R6], (R0) + ADD $20, R13, R0 + MOVM.DA (R0), [R4-R7] + RET + +#define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \ + MOVBU (offset+0)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+0)(Rdst); \ + MOVBU (offset+1)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+1)(Rdst); \ + MOVBU (offset+2)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+2)(Rdst); \ + MOVBU (offset+3)(Rsrc), Rtmp; \ + MOVBU Rtmp, (offset+3)(Rdst) + +TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0 + // Needs 24 bytes of stack for saved registers and then 88 bytes of + // scratch space after that. We assume that 24 bytes at (R13) have + // already been used: four bytes for the link register saved in the + // prelude of poly1305_auth_armv6, four bytes for saving the value of g + // in that function and 16 bytes of scratch space used around + // poly1305_finish_ext_armv6_skip1. + ADD $24, R13, R12 + MOVM.IB [R4-R8, R14], (R12) + MOVW R0, 88(R13) + MOVW R1, 92(R13) + MOVW R2, 96(R13) + MOVW R1, R14 + MOVW R2, R12 + MOVW 56(R0), R8 + WORD $0xe1180008 // TST R8, R8 not working see issue 5921 + EOR R6, R6, R6 + MOVW.EQ $(1<<24), R6 + MOVW R6, 84(R13) + ADD $116, R13, g + MOVM.IA (R0), [R0-R9] + MOVM.IA [R0-R4], (g) + CMP $16, R12 + BLO poly1305_blocks_armv6_done + +poly1305_blocks_armv6_mainloop: + WORD $0xe31e0003 // TST R14, #3 not working see issue 5921 + BEQ poly1305_blocks_armv6_mainloop_aligned + ADD $100, R13, g + MOVW_UNALIGNED(R14, g, R0, 0) + MOVW_UNALIGNED(R14, g, R0, 4) + MOVW_UNALIGNED(R14, g, R0, 8) + MOVW_UNALIGNED(R14, g, R0, 12) + MOVM.IA (g), [R0-R3] + ADD $16, R14 + B poly1305_blocks_armv6_mainloop_loaded + +poly1305_blocks_armv6_mainloop_aligned: + MOVM.IA.W (R14), [R0-R3] + +poly1305_blocks_armv6_mainloop_loaded: + MOVW R0>>26, g + MOVW R1>>20, R11 + MOVW R2>>14, R12 + MOVW R14, 92(R13) + MOVW R3>>8, R4 + ORR R1<<6, g, g + ORR R2<<12, R11, R11 + ORR R3<<18, R12, R12 + BIC $0xfc000000, R0, R0 + BIC $0xfc000000, g, g + MOVW 84(R13), R3 + BIC $0xfc000000, R11, R11 + BIC $0xfc000000, R12, R12 + ADD R0, R5, R5 + ADD g, R6, R6 + ORR R3, R4, R4 + ADD R11, R7, R7 + ADD $116, R13, R14 + ADD R12, R8, R8 + ADD R4, R9, R9 + MOVM.IA (R14), [R0-R4] + MULLU R4, R5, (R11, g) + MULLU R3, R5, (R14, R12) + MULALU R3, R6, (R11, g) + MULALU R2, R6, (R14, R12) + MULALU R2, R7, (R11, g) + MULALU R1, R7, (R14, R12) + ADD R4<<2, R4, R4 + ADD R3<<2, R3, R3 + MULALU R1, R8, (R11, g) + MULALU R0, R8, (R14, R12) + MULALU R0, R9, (R11, g) + MULALU R4, R9, (R14, R12) + MOVW g, 76(R13) + MOVW R11, 80(R13) + MOVW R12, 68(R13) + MOVW R14, 72(R13) + MULLU R2, R5, (R11, g) + MULLU R1, R5, (R14, R12) + MULALU R1, R6, (R11, g) + MULALU R0, R6, (R14, R12) + MULALU R0, R7, (R11, g) + MULALU R4, R7, (R14, R12) + ADD R2<<2, R2, R2 + ADD R1<<2, R1, R1 + MULALU R4, R8, (R11, g) + MULALU R3, R8, (R14, R12) + MULALU R3, R9, (R11, g) + MULALU R2, R9, (R14, R12) + MOVW g, 60(R13) + MOVW R11, 64(R13) + MOVW R12, 52(R13) + MOVW R14, 56(R13) + MULLU R0, R5, (R11, g) + MULALU R4, R6, (R11, g) + MULALU R3, R7, (R11, g) + MULALU R2, R8, (R11, g) + MULALU R1, R9, (R11, g) + ADD $52, R13, R0 + MOVM.IA (R0), [R0-R7] + MOVW g>>26, R12 + MOVW R4>>26, R14 + ORR R11<<6, R12, R12 + ORR R5<<6, R14, R14 + BIC $0xfc000000, g, g + BIC $0xfc000000, R4, R4 + ADD.S R12, R0, R0 + ADC $0, R1, R1 + ADD.S R14, R6, R6 + ADC $0, R7, R7 + MOVW R0>>26, R12 + MOVW R6>>26, R14 + ORR R1<<6, R12, R12 + ORR R7<<6, R14, R14 + BIC $0xfc000000, R0, R0 + BIC $0xfc000000, R6, R6 + ADD R14<<2, R14, R14 + ADD.S R12, R2, R2 + ADC $0, R3, R3 + ADD R14, g, g + MOVW R2>>26, R12 + MOVW g>>26, R14 + ORR R3<<6, R12, R12 + BIC $0xfc000000, g, R5 + BIC $0xfc000000, R2, R7 + ADD R12, R4, R4 + ADD R14, R0, R0 + MOVW R4>>26, R12 + BIC $0xfc000000, R4, R8 + ADD R12, R6, R9 + MOVW 96(R13), R12 + MOVW 92(R13), R14 + MOVW R0, R6 + CMP $32, R12 + SUB $16, R12, R12 + MOVW R12, 96(R13) + BHS poly1305_blocks_armv6_mainloop + +poly1305_blocks_armv6_done: + MOVW 88(R13), R12 + MOVW R5, 20(R12) + MOVW R6, 24(R12) + MOVW R7, 28(R12) + MOVW R8, 32(R12) + MOVW R9, 36(R12) + ADD $48, R13, R0 + MOVM.DA (R0), [R4-R8, R14] + RET + +#define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \ + MOVBU.P 1(Rsrc), Rtmp; \ + MOVBU.P Rtmp, 1(Rdst); \ + MOVBU.P 1(Rsrc), Rtmp; \ + MOVBU.P Rtmp, 1(Rdst) + +#define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \ + MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \ + MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) + +// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key) +TEXT ·poly1305_auth_armv6(SB), $196-16 + // The value 196, just above, is the sum of 64 (the size of the context + // structure) and 132 (the amount of stack needed). + // + // At this point, the stack pointer (R13) has been moved down. It + // points to the saved link register and there's 196 bytes of free + // space above it. + // + // The stack for this function looks like: + // + // +--------------------- + // | + // | 64 bytes of context structure + // | + // +--------------------- + // | + // | 112 bytes for poly1305_blocks_armv6 + // | + // +--------------------- + // | 16 bytes of final block, constructed at + // | poly1305_finish_ext_armv6_skip8 + // +--------------------- + // | four bytes of saved 'g' + // +--------------------- + // | lr, saved by prelude <- R13 points here + // +--------------------- + MOVW g, 4(R13) + + MOVW out+0(FP), R4 + MOVW m+4(FP), R5 + MOVW mlen+8(FP), R6 + MOVW key+12(FP), R7 + + ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112 + MOVW R7, R1 + + // poly1305_init_ext_armv6 will write to the stack from R13+4, but + // that's ok because none of the other values have been written yet. + BL poly1305_init_ext_armv6<>(SB) + BIC.S $15, R6, R2 + BEQ poly1305_auth_armv6_noblocks + ADD $136, R13, R0 + MOVW R5, R1 + ADD R2, R5, R5 + SUB R2, R6, R6 + BL poly1305_blocks_armv6<>(SB) + +poly1305_auth_armv6_noblocks: + ADD $136, R13, R0 + MOVW R5, R1 + MOVW R6, R2 + MOVW R4, R3 + + MOVW R0, R5 + MOVW R1, R6 + MOVW R2, R7 + MOVW R3, R8 + AND.S R2, R2, R2 + BEQ poly1305_finish_ext_armv6_noremaining + EOR R0, R0 + ADD $8, R13, R9 // 8 = offset to 16 byte scratch space + MOVW R0, (R9) + MOVW R0, 4(R9) + MOVW R0, 8(R9) + MOVW R0, 12(R9) + WORD $0xe3110003 // TST R1, #3 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_aligned + WORD $0xe3120008 // TST R2, #8 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip8 + MOVWP_UNALIGNED(R1, R9, g) + MOVWP_UNALIGNED(R1, R9, g) + +poly1305_finish_ext_armv6_skip8: + WORD $0xe3120004 // TST $4, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip4 + MOVWP_UNALIGNED(R1, R9, g) + +poly1305_finish_ext_armv6_skip4: + WORD $0xe3120002 // TST $2, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip2 + MOVHUP_UNALIGNED(R1, R9, g) + B poly1305_finish_ext_armv6_skip2 + +poly1305_finish_ext_armv6_aligned: + WORD $0xe3120008 // TST R2, #8 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip8_aligned + MOVM.IA.W (R1), [g-R11] + MOVM.IA.W [g-R11], (R9) + +poly1305_finish_ext_armv6_skip8_aligned: + WORD $0xe3120004 // TST $4, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip4_aligned + MOVW.P 4(R1), g + MOVW.P g, 4(R9) + +poly1305_finish_ext_armv6_skip4_aligned: + WORD $0xe3120002 // TST $2, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip2 + MOVHU.P 2(R1), g + MOVH.P g, 2(R9) + +poly1305_finish_ext_armv6_skip2: + WORD $0xe3120001 // TST $1, R2 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip1 + MOVBU.P 1(R1), g + MOVBU.P g, 1(R9) + +poly1305_finish_ext_armv6_skip1: + MOVW $1, R11 + MOVBU R11, 0(R9) + MOVW R11, 56(R5) + MOVW R5, R0 + ADD $8, R13, R1 + MOVW $16, R2 + BL poly1305_blocks_armv6<>(SB) + +poly1305_finish_ext_armv6_noremaining: + MOVW 20(R5), R0 + MOVW 24(R5), R1 + MOVW 28(R5), R2 + MOVW 32(R5), R3 + MOVW 36(R5), R4 + MOVW R4>>26, R12 + BIC $0xfc000000, R4, R4 + ADD R12<<2, R12, R12 + ADD R12, R0, R0 + MOVW R0>>26, R12 + BIC $0xfc000000, R0, R0 + ADD R12, R1, R1 + MOVW R1>>26, R12 + BIC $0xfc000000, R1, R1 + ADD R12, R2, R2 + MOVW R2>>26, R12 + BIC $0xfc000000, R2, R2 + ADD R12, R3, R3 + MOVW R3>>26, R12 + BIC $0xfc000000, R3, R3 + ADD R12, R4, R4 + ADD $5, R0, R6 + MOVW R6>>26, R12 + BIC $0xfc000000, R6, R6 + ADD R12, R1, R7 + MOVW R7>>26, R12 + BIC $0xfc000000, R7, R7 + ADD R12, R2, g + MOVW g>>26, R12 + BIC $0xfc000000, g, g + ADD R12, R3, R11 + MOVW $-(1<<26), R12 + ADD R11>>26, R12, R12 + BIC $0xfc000000, R11, R11 + ADD R12, R4, R9 + MOVW R9>>31, R12 + SUB $1, R12 + AND R12, R6, R6 + AND R12, R7, R7 + AND R12, g, g + AND R12, R11, R11 + AND R12, R9, R9 + MVN R12, R12 + AND R12, R0, R0 + AND R12, R1, R1 + AND R12, R2, R2 + AND R12, R3, R3 + AND R12, R4, R4 + ORR R6, R0, R0 + ORR R7, R1, R1 + ORR g, R2, R2 + ORR R11, R3, R3 + ORR R9, R4, R4 + ORR R1<<26, R0, R0 + MOVW R1>>6, R1 + ORR R2<<20, R1, R1 + MOVW R2>>12, R2 + ORR R3<<14, R2, R2 + MOVW R3>>18, R3 + ORR R4<<8, R3, R3 + MOVW 40(R5), R6 + MOVW 44(R5), R7 + MOVW 48(R5), g + MOVW 52(R5), R11 + ADD.S R6, R0, R0 + ADC.S R7, R1, R1 + ADC.S g, R2, R2 + ADC.S R11, R3, R3 + MOVM.IA [R0-R3], (R8) + MOVW R5, R12 + EOR R0, R0, R0 + EOR R1, R1, R1 + EOR R2, R2, R2 + EOR R3, R3, R3 + EOR R4, R4, R4 + EOR R5, R5, R5 + EOR R6, R6, R6 + EOR R7, R7, R7 + MOVM.IA.W [R0-R7], (R12) + MOVM.IA [R0-R7], (R12) + MOVW 4(R13), g + RET diff --git a/vendor/golang.org/x/crypto/poly1305/sum_ref.go b/vendor/golang.org/x/crypto/poly1305/sum_ref.go new file mode 100644 index 000000000..b2805a5ca --- /dev/null +++ b/vendor/golang.org/x/crypto/poly1305/sum_ref.go @@ -0,0 +1,141 @@ +// Copyright 2012 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64,!arm gccgo appengine nacl + +package poly1305 + +import "encoding/binary" + +// Sum generates an authenticator for msg using a one-time key and puts the +// 16-byte result into out. Authenticating two different messages with the same +// key allows an attacker to forge messages at will. +func Sum(out *[TagSize]byte, msg []byte, key *[32]byte) { + var ( + h0, h1, h2, h3, h4 uint32 // the hash accumulators + r0, r1, r2, r3, r4 uint64 // the r part of the key + ) + + r0 = uint64(binary.LittleEndian.Uint32(key[0:]) & 0x3ffffff) + r1 = uint64((binary.LittleEndian.Uint32(key[3:]) >> 2) & 0x3ffff03) + r2 = uint64((binary.LittleEndian.Uint32(key[6:]) >> 4) & 0x3ffc0ff) + r3 = uint64((binary.LittleEndian.Uint32(key[9:]) >> 6) & 0x3f03fff) + r4 = uint64((binary.LittleEndian.Uint32(key[12:]) >> 8) & 0x00fffff) + + R1, R2, R3, R4 := r1*5, r2*5, r3*5, r4*5 + + for len(msg) >= TagSize { + // h += msg + h0 += binary.LittleEndian.Uint32(msg[0:]) & 0x3ffffff + h1 += (binary.LittleEndian.Uint32(msg[3:]) >> 2) & 0x3ffffff + h2 += (binary.LittleEndian.Uint32(msg[6:]) >> 4) & 0x3ffffff + h3 += (binary.LittleEndian.Uint32(msg[9:]) >> 6) & 0x3ffffff + h4 += (binary.LittleEndian.Uint32(msg[12:]) >> 8) | (1 << 24) + + // h *= r + d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1) + d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2) + d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3) + d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4) + d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0) + + // h %= p + h0 = uint32(d0) & 0x3ffffff + h1 = uint32(d1) & 0x3ffffff + h2 = uint32(d2) & 0x3ffffff + h3 = uint32(d3) & 0x3ffffff + h4 = uint32(d4) & 0x3ffffff + + h0 += uint32(d4>>26) * 5 + h1 += h0 >> 26 + h0 = h0 & 0x3ffffff + + msg = msg[TagSize:] + } + + if len(msg) > 0 { + var block [TagSize]byte + off := copy(block[:], msg) + block[off] = 0x01 + + // h += msg + h0 += binary.LittleEndian.Uint32(block[0:]) & 0x3ffffff + h1 += (binary.LittleEndian.Uint32(block[3:]) >> 2) & 0x3ffffff + h2 += (binary.LittleEndian.Uint32(block[6:]) >> 4) & 0x3ffffff + h3 += (binary.LittleEndian.Uint32(block[9:]) >> 6) & 0x3ffffff + h4 += (binary.LittleEndian.Uint32(block[12:]) >> 8) + + // h *= r + d0 := (uint64(h0) * r0) + (uint64(h1) * R4) + (uint64(h2) * R3) + (uint64(h3) * R2) + (uint64(h4) * R1) + d1 := (d0 >> 26) + (uint64(h0) * r1) + (uint64(h1) * r0) + (uint64(h2) * R4) + (uint64(h3) * R3) + (uint64(h4) * R2) + d2 := (d1 >> 26) + (uint64(h0) * r2) + (uint64(h1) * r1) + (uint64(h2) * r0) + (uint64(h3) * R4) + (uint64(h4) * R3) + d3 := (d2 >> 26) + (uint64(h0) * r3) + (uint64(h1) * r2) + (uint64(h2) * r1) + (uint64(h3) * r0) + (uint64(h4) * R4) + d4 := (d3 >> 26) + (uint64(h0) * r4) + (uint64(h1) * r3) + (uint64(h2) * r2) + (uint64(h3) * r1) + (uint64(h4) * r0) + + // h %= p + h0 = uint32(d0) & 0x3ffffff + h1 = uint32(d1) & 0x3ffffff + h2 = uint32(d2) & 0x3ffffff + h3 = uint32(d3) & 0x3ffffff + h4 = uint32(d4) & 0x3ffffff + + h0 += uint32(d4>>26) * 5 + h1 += h0 >> 26 + h0 = h0 & 0x3ffffff + } + + // h %= p reduction + h2 += h1 >> 26 + h1 &= 0x3ffffff + h3 += h2 >> 26 + h2 &= 0x3ffffff + h4 += h3 >> 26 + h3 &= 0x3ffffff + h0 += 5 * (h4 >> 26) + h4 &= 0x3ffffff + h1 += h0 >> 26 + h0 &= 0x3ffffff + + // h - p + t0 := h0 + 5 + t1 := h1 + (t0 >> 26) + t2 := h2 + (t1 >> 26) + t3 := h3 + (t2 >> 26) + t4 := h4 + (t3 >> 26) - (1 << 26) + t0 &= 0x3ffffff + t1 &= 0x3ffffff + t2 &= 0x3ffffff + t3 &= 0x3ffffff + + // select h if h < p else h - p + t_mask := (t4 >> 31) - 1 + h_mask := ^t_mask + h0 = (h0 & h_mask) | (t0 & t_mask) + h1 = (h1 & h_mask) | (t1 & t_mask) + h2 = (h2 & h_mask) | (t2 & t_mask) + h3 = (h3 & h_mask) | (t3 & t_mask) + h4 = (h4 & h_mask) | (t4 & t_mask) + + // h %= 2^128 + h0 |= h1 << 26 + h1 = ((h1 >> 6) | (h2 << 20)) + h2 = ((h2 >> 12) | (h3 << 14)) + h3 = ((h3 >> 18) | (h4 << 8)) + + // s: the s part of the key + // tag = (h + s) % (2^128) + t := uint64(h0) + uint64(binary.LittleEndian.Uint32(key[16:])) + h0 = uint32(t) + t = uint64(h1) + uint64(binary.LittleEndian.Uint32(key[20:])) + (t >> 32) + h1 = uint32(t) + t = uint64(h2) + uint64(binary.LittleEndian.Uint32(key[24:])) + (t >> 32) + h2 = uint32(t) + t = uint64(h3) + uint64(binary.LittleEndian.Uint32(key[28:])) + (t >> 32) + h3 = uint32(t) + + binary.LittleEndian.PutUint32(out[0:], h0) + binary.LittleEndian.PutUint32(out[4:], h1) + binary.LittleEndian.PutUint32(out[8:], h2) + binary.LittleEndian.PutUint32(out[12:], h3) +} diff --git a/vendor/vendor.json b/vendor/vendor.json index be3ae0ed5..f539fc07d 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -389,6 +389,12 @@ "revision": "43ed500fe4d485d97534014d9f98521216240002", "revisionTime": "2017-08-28T17:39:33Z" }, + { + "checksumSHA1": "/VYXTlcksV92qL4Avr5S3NJ/Fxs=", + "path": "github.com/minio/sio", + "revision": "d8be2518a912f0db0dd17dc55f46da4360ee60d8", + "revisionTime": "2017-09-06T19:57:40Z" + }, { "checksumSHA1": "zvQr4zOz1/g/Fui6co0sctxrJ28=", "path": "github.com/nats-io/go-nats", @@ -488,6 +494,24 @@ "revision": "3b8db5e93c4c02efbc313e17b2e796b0914a01fb", "revisionTime": "2016-12-15T19:56:52Z" }, + { + "checksumSHA1": "CvMkf3KUUGUVHibg6G/zI7XtVbM=", + "path": "golang.org/x/crypto/chacha20poly1305", + "revision": "81e90905daefcd6fd217b62423c0908922eadb30", + "revisionTime": "2017-08-25T20:24:07Z" + }, + { + "checksumSHA1": "8f1lWFSOLu7O6Jqk/e+ydH1yoms=", + "path": "golang.org/x/crypto/chacha20poly1305/internal/chacha20", + "revision": "81e90905daefcd6fd217b62423c0908922eadb30", + "revisionTime": "2017-08-25T20:24:07Z" + }, + { + "checksumSHA1": "kVKE0OX1Xdw5mG7XKT86DLLKE2I=", + "path": "golang.org/x/crypto/poly1305", + "revision": "81e90905daefcd6fd217b62423c0908922eadb30", + "revisionTime": "2017-08-25T20:24:07Z" + }, { "path": "golang.org/x/net/context", "revision": "a728288923b47049b2ce791836767ffbe964a5bd",