minio/pkg/s3select/output.go
Harshavardhana 7e1661f4fa Performance improvements to SELECT API on certain query operations (#6752)
This improves the performance of certain queries dramatically,
such as 'count(*)' etc.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (wall-clock time drops from ~42.5s to ~17.6s). This
PR avoids a lot of type conversions and instead relies on raw sequences of
data, interpreting them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
2018-11-14 15:55:10 -08:00

461 lines
16 KiB
Go

/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// DO NOT EDIT THIS PACKAGE DIRECTLY: This follows the protocol defined by
// AmazonS3 found at
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
// Consult the Spec before making direct edits.
package s3select
import (
"bytes"
"encoding/binary"
"hash/crc32"
)
// recordHeaders is the pre-encoded header block of a "Records" event,
// assigned once in init from writeRecordHeader.
// Notation used below: name length | value type | value length | value.
//   :event-type   (11) | 7 | 7  | "Records"
//   :content-type (13) | 7 | 24 | "application/octet-stream"
//   :message-type (13) | 7 | 5  | "event"
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var recordHeaders []byte
// endHeaders is the pre-encoded header block of an "End" event, assigned once
// in init from writeEndHeader.
//   :event-type   (11) | 7 | 3 | "End"
//   :message-type (13) | 7 | 5 | "event"
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var endHeaders []byte
// contHeaders is the pre-encoded header block of a "Cont" (continuation)
// event, assigned once in init from writeContHeader.
//   :event-type   (11) | 7 | 4 | "Cont"
//   :message-type (13) | 7 | 5 | "event"
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var contHeaders []byte
// statHeaders is the pre-encoded header block of a "Stats" event, assigned
// once in init from writeStatHeader. The trailing number is the encoded size
// of that header in bytes.
//   :event-type   (11) | 7 | 5 | "Stats"    -> 20
//   :content-type (13) | 7 | 8 | "text/xml" -> 25
//   :message-type (13) | 7 | 5 | "event"    -> 22
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var statHeaders []byte
// progressHeaders is the pre-encoded header block of a "Progress" event,
// assigned once in init from writeProgressHeader. The trailing number is the
// encoded size of that header in bytes.
//   :event-type   (11) | 7 | 8 | "Progress" -> 23
//   :content-type (13) | 7 | 8 | "text/xml" -> 25
//   :message-type (13) | 7 | 5 | "event"    -> 22
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var progressHeaders []byte
// errHdrLen is the length of the non-variable portion of the error headers,
// i.e. everything except the bytes of the error code and error message
// values themselves. It is set in init and used by writeErrorMessage.
// The headers of an "error" message are:
//   :error-code    (11) | 7 | DEFINED | "DEFINED"
//   :error-message (14) | 7 | DEFINED | "DEFINED"
//   :message-type  (13) | 7 | 5       | "error"
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
var errHdrLen int
func init() {
recordHeaders = writeRecordHeader()
endHeaders = writeEndHeader()
contHeaders = writeContHeader()
statHeaders = writeStatHeader()
progressHeaders = writeProgressHeader()
errHdrLen = 55
}
// encodeHeaderStringValue encodes a header value: a 2-byte big-endian length
// followed by the raw bytes of s. The length is taken as uint16(len(s)).
func encodeHeaderStringValue(s string) []byte {
	const prefixLen = 2
	encoded := make([]byte, prefixLen+len(s))
	binary.BigEndian.PutUint16(encoded, uint16(len(s)))
	copy(encoded[prefixLen:], s)
	return encoded
}
// encodeHeaderStringName encodes a header name: a single length byte
// (byte(len(s))) followed by the raw bytes of s.
func encodeHeaderStringName(s string) []byte {
	encoded := make([]byte, 1+len(s))
	encoded[0] = byte(len(s))
	copy(encoded[1:], s)
	return encoded
}
// encodeNumber returns a slice of lenBytes bytes whose first byte is n and
// whose remaining bytes are zero. lenBytes must be at least 1.
func encodeNumber(n byte, lenBytes int) []byte {
	encoded := make([]byte, lenBytes)
	encoded[0] = n
	return encoded
}
// writePayloadSize returns the 4-byte big-endian total message length:
// payload size plus header length plus 16 bytes of framing overhead.
func writePayloadSize(payloadSize int, headerLength int) []byte {
	var encoded [4]byte
	binary.BigEndian.PutUint32(encoded[:], uint32(payloadSize+headerLength+16))
	return encoded[:]
}
// writeHeaderSize returns headerLength encoded as a 4-byte big-endian
// unsigned integer, the header-size field of the message prelude.
func writeHeaderSize(headerLength int) []byte {
	var encoded [4]byte
	binary.BigEndian.PutUint32(encoded[:], uint32(headerLength))
	return encoded[:]
}
// writeCRC returns the IEEE CRC-32 checksum of buffer as 4 big-endian bytes.
// It is used for both the prelude CRC and the trailing message CRC.
func writeCRC(buffer []byte) []byte {
	var encoded [4]byte
	binary.BigEndian.PutUint32(encoded[:], crc32.ChecksumIEEE(buffer))
	return encoded[:]
}
// writePayload returns the bytes of myPayload as a freshly allocated slice,
// for those messages that carry a payload.
//
// The string is converted exactly once; the result does not alias the string
// and may be modified by the caller.
func writePayload(myPayload string) []byte {
	return []byte(myPayload)
}
// writeRecordHeader builds the encoded header block of a "Records" event.
//
// Three headers are emitted in this order, each as an encoded name, the
// value-type byte 7 and an encoded string value:
//
//	:event-type   = "Records"
//	:content-type = "application/octet-stream"
//	:message-type = "event"
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeRecordHeader() []byte {
	fields := [...][2]string{
		{":event-type", "Records"},
		{":content-type", "application/octet-stream"},
		{":message-type", "event"},
	}
	var hdr bytes.Buffer
	for _, f := range fields {
		hdr.Write(encodeHeaderStringName(f[0]))  // header name
		hdr.Write(encodeNumber(7, 1))            // header type
		hdr.Write(encodeHeaderStringValue(f[1])) // header value length and value
	}
	return hdr.Bytes()
}
// writeEndHeader builds the encoded header block of an "End" event.
//
// Two headers are emitted in this order, each as an encoded name, the
// value-type byte 7 and an encoded string value:
//
//	:event-type   = "End"
//	:message-type = "event"
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeEndHeader() []byte {
	fields := [...][2]string{
		{":event-type", "End"},
		{":message-type", "event"},
	}
	var hdr bytes.Buffer
	for _, f := range fields {
		hdr.Write(encodeHeaderStringName(f[0]))  // header name
		hdr.Write(encodeNumber(7, 1))            // header type
		hdr.Write(encodeHeaderStringValue(f[1])) // header value length and value
	}
	return hdr.Bytes()
}
// writeContHeader builds the encoded header block of a "Cont" (continuation)
// event.
//
// Two headers are emitted in this order, each as an encoded name, the
// value-type byte 7 and an encoded string value:
//
//	:event-type   = "Cont"
//	:message-type = "event"
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeContHeader() []byte {
	fields := [...][2]string{
		{":event-type", "Cont"},
		{":message-type", "event"},
	}
	var hdr bytes.Buffer
	for _, f := range fields {
		hdr.Write(encodeHeaderStringName(f[0]))  // header name
		hdr.Write(encodeNumber(7, 1))            // header type
		hdr.Write(encodeHeaderStringValue(f[1])) // header value length and value
	}
	return hdr.Bytes()
}
// writeStatHeader builds the encoded header block of a "Stats" event.
//
// Three headers are emitted in this order, each as an encoded name, the
// value-type byte 7 and an encoded string value:
//
//	:event-type   = "Stats"
//	:content-type = "text/xml"
//	:message-type = "event"
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeStatHeader() []byte {
	fields := [...][2]string{
		{":event-type", "Stats"},
		{":content-type", "text/xml"},
		{":message-type", "event"},
	}
	var hdr bytes.Buffer
	for _, f := range fields {
		hdr.Write(encodeHeaderStringName(f[0]))  // header name
		hdr.Write(encodeNumber(7, 1))            // header type
		hdr.Write(encodeHeaderStringValue(f[1])) // header value length and value
	}
	return hdr.Bytes()
}
// writeProgressHeader builds the encoded header block of a "Progress" event.
//
// Three headers are emitted in this order, each as an encoded name, the
// value-type byte 7 and an encoded string value:
//
//	:event-type   = "Progress"
//	:content-type = "text/xml"
//	:message-type = "event"
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeProgressHeader() []byte {
	fields := [...][2]string{
		{":event-type", "Progress"},
		{":content-type", "text/xml"},
		{":message-type", "event"},
	}
	var hdr bytes.Buffer
	for _, f := range fields {
		hdr.Write(encodeHeaderStringName(f[0]))  // header name
		hdr.Write(encodeNumber(7, 1))            // header type
		hdr.Write(encodeHeaderStringValue(f[1])) // header value length and value
	}
	return hdr.Bytes()
}
// writeRecordMessage appends one complete "Records" event message carrying
// payload to currentMessage and returns that same buffer.
//
// Bytes written, in order:
//
//	total message length (4) | header length (4) | prelude CRC (4) |
//	recordHeaders | payload | message CRC (4)
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeRecordMessage(payload string, currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()
	headerLen := len(recordHeaders)
	// Writes the total size of the message.
	currentMessage.Write(writePayloadSize(len(payload), headerLen))
	// Writes the total size of the header.
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	currentMessage.Write(recordHeaders)
	// The payload is written verbatim after the headers.
	currentMessage.Write(writePayload(payload))
	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}
// writeContinuationMessage appends one complete "Cont" (continuation) event
// message to currentMessage and returns that same buffer. The message has no
// payload.
//
// Bytes written, in order:
//
//	total message length (4) | header length (4) | prelude CRC (4) |
//	contHeaders | message CRC (4)
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeContinuationMessage(currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()
	headerLen := len(contHeaders)
	currentMessage.Write(writePayloadSize(0, headerLen))
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	currentMessage.Write(contHeaders)
	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}
// writeEndMessage appends one complete "End" event message to currentMessage
// and returns that same buffer. The message has no payload.
//
// Bytes written, in order:
//
//	total message length (4) | header length (4) | prelude CRC (4) |
//	endHeaders | message CRC (4)
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeEndMessage(currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()
	headerLen := len(endHeaders)
	currentMessage.Write(writePayloadSize(0, headerLen))
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	currentMessage.Write(endHeaders)
	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}
// writeStatMessage appends one complete "Stats" event message carrying
// payload to currentMessage and returns that same buffer.
//
// Bytes written, in order:
//
//	total message length (4) | header length (4) | prelude CRC (4) |
//	statHeaders | payload | message CRC (4)
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeStatMessage(payload string, currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()
	headerLen := len(statHeaders)
	currentMessage.Write(writePayloadSize(len(payload), headerLen))
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	currentMessage.Write(statHeaders)
	// The payload is written verbatim after the headers.
	currentMessage.Write(writePayload(payload))
	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}
// writeProgressMessage appends one complete "Progress" event message carrying
// payload to currentMessage and returns that same buffer.
//
// Bytes written, in order:
//
//	total message length (4) | header length (4) | prelude CRC (4) |
//	progressHeaders | payload | message CRC (4)
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeProgressMessage(payload string, currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()
	headerLen := len(progressHeaders)
	currentMessage.Write(writePayloadSize(len(payload), headerLen))
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	currentMessage.Write(progressHeaders)
	// The payload is written verbatim after the headers.
	currentMessage.Write(writePayload(payload))
	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}
// writeErrorMessage appends one complete "error" message describing
// errorMessage to currentMessage and returns that same buffer. The message
// has no payload; the error is carried entirely in the headers.
//
// Headers written (name length | value type | value):
//
//	:error-code    (11) | 7 | errorCodeResponse[errorMessage]
//	:error-message (14) | 7 | errorMessage.Error()
//	:message-type  (13) | 7 | "error"
//
// Both CRCs are computed only over the bytes appended by this call, so the
// message is valid even if currentMessage already holds unread data.
//
// This is predefined from AMZ protocol found here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
func writeErrorMessage(errorMessage error, currentMessage *bytes.Buffer) *bytes.Buffer {
	// Offset of this message within the unread portion of the buffer.
	start := currentMessage.Len()

	// Resolve the variable header values once so the declared header length
	// and the bytes actually written always come from the same strings.
	errCode := errorCodeResponse[errorMessage]
	errText := errorMessage.Error()

	// errHdrLen accounts for the fixed bytes of all three headers.
	headerLen := errHdrLen + len(errCode) + len(errText)
	currentMessage.Write(writePayloadSize(0, headerLen))
	currentMessage.Write(writeHeaderSize(headerLen))
	// Prelude CRC: covers the 8 prelude bytes of this message only.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))

	// :error-code header: name, type, value length and value.
	currentMessage.Write(encodeHeaderStringName(":error-code"))
	currentMessage.Write(encodeNumber(7, 1))
	currentMessage.Write(encodeHeaderStringValue(errCode))

	// :error-message header: name, type, value length and value.
	currentMessage.Write(encodeHeaderStringName(":error-message"))
	currentMessage.Write(encodeNumber(7, 1))
	currentMessage.Write(encodeHeaderStringValue(errText))

	// :message-type header: name, type, value length and value.
	currentMessage.Write(encodeHeaderStringName(":message-type"))
	currentMessage.Write(encodeNumber(7, 1))
	currentMessage.Write(encodeHeaderStringValue("error"))

	// Message CRC: covers everything written for this message so far.
	currentMessage.Write(writeCRC(currentMessage.Bytes()[start:]))
	return currentMessage
}