mirror of
https://github.com/minio/minio.git
synced 2024-12-26 15:15:55 -05:00
ddea0bdf11
``` CSV parsing, BEFORE: BenchmarkReaderBasic-12 2842 407533 ns/op 397860 B/op 957 allocs/op BenchmarkReaderReplace-12 2718 429914 ns/op 397844 B/op 957 allocs/op BenchmarkReaderReplaceTwo-12 2718 435556 ns/op 397855 B/op 957 allocs/op BenchmarkAggregateCount_100K-12 171 6798974 ns/op 16667102 B/op 308077 allocs/op BenchmarkAggregateCount_1M-12 19 65657411 ns/op 168057743 B/op 3146610 allocs/op BenchmarkSelectAll_10M-12 1 20882119900 ns/op 2758799896 B/op 41978762 allocs/op CSV parsing, AFTER: BenchmarkReaderBasic-12 3721 312549 ns/op 101920 B/op 338 allocs/op BenchmarkReaderReplace-12 3776 318810 ns/op 101993 B/op 340 allocs/op BenchmarkReaderReplaceTwo-12 3610 330967 ns/op 102012 B/op 341 allocs/op BenchmarkAggregateCount_100K-12 295 4149588 ns/op 3553623 B/op 103261 allocs/op BenchmarkAggregateCount_1M-12 30 37746503 ns/op 33827931 B/op 1049435 allocs/op BenchmarkSelectAll_10M-12 1 17608495800 ns/op 1416504040 B/op 21007082 allocs/op ~ benchcmp old.txt new.txt benchmark old ns/op new ns/op delta BenchmarkReaderBasic-12 407533 312549 -23.31% BenchmarkReaderReplace-12 429914 318810 -25.84% BenchmarkReaderReplaceTwo-12 435556 330967 -24.01% BenchmarkAggregateCount_100K-12 6798974 4149588 -38.97% BenchmarkAggregateCount_1M-12 65657411 37746503 -42.51% BenchmarkSelectAll_10M-12 20882119900 17608495800 -15.68% benchmark old allocs new allocs delta BenchmarkReaderBasic-12 957 338 -64.68% BenchmarkReaderReplace-12 957 340 -64.47% BenchmarkReaderReplaceTwo-12 957 341 -64.37% BenchmarkAggregateCount_100K-12 308077 103261 -66.48% BenchmarkAggregateCount_1M-12 3146610 1049435 -66.65% BenchmarkSelectAll_10M-12 41978762 21007082 -49.96% benchmark old bytes new bytes delta BenchmarkReaderBasic-12 397860 101920 -74.38% BenchmarkReaderReplace-12 397844 101993 -74.36% BenchmarkReaderReplaceTwo-12 397855 102012 -74.36% BenchmarkAggregateCount_100K-12 16667102 3553623 -78.68% BenchmarkAggregateCount_1M-12 168057743 33827931 -79.87% BenchmarkSelectAll_10M-12 2758799896 1416504040 
-48.66% ``` ``` BenchmarkReaderHuge/97K-12 2200 540840 ns/op 184.32 MB/s 1604450 B/op 687 allocs/op BenchmarkReaderHuge/194K-12 1522 752257 ns/op 265.04 MB/s 2143135 B/op 1335 allocs/op BenchmarkReaderHuge/389K-12 1190 947858 ns/op 420.69 MB/s 3221831 B/op 2630 allocs/op BenchmarkReaderHuge/778K-12 806 1472486 ns/op 541.61 MB/s 5201856 B/op 5187 allocs/op BenchmarkReaderHuge/1557K-12 426 2575269 ns/op 619.36 MB/s 9101330 B/op 10233 allocs/op BenchmarkReaderHuge/3115K-12 286 4034656 ns/op 790.66 MB/s 12397968 B/op 16099 allocs/op BenchmarkReaderHuge/6230K-12 172 6830563 ns/op 934.05 MB/s 16008416 B/op 26844 allocs/op BenchmarkReaderHuge/12461K-12 100 11409467 ns/op 1118.39 MB/s 22655163 B/op 48107 allocs/op BenchmarkReaderHuge/24922K-12 66 19780395 ns/op 1290.19 MB/s 35158559 B/op 90216 allocs/op BenchmarkReaderHuge/49844K-12 34 37282559 ns/op 1369.03 MB/s 60528624 B/op 174497 allocs/op ```
476 lines
12 KiB
Go
476 lines
12 KiB
Go
/*
|
|
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package s3select
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"encoding/xml"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"strings"
|
|
"sync"
|
|
|
|
"github.com/minio/minio/pkg/s3select/csv"
|
|
"github.com/minio/minio/pkg/s3select/json"
|
|
"github.com/minio/minio/pkg/s3select/parquet"
|
|
"github.com/minio/minio/pkg/s3select/sql"
|
|
)
|
|
|
|
type recordReader interface {
|
|
// Read a record.
|
|
// dst is optional but will be used if valid.
|
|
Read(dst sql.Record) (sql.Record, error)
|
|
Close() error
|
|
}
|
|
|
|
// Names of the serialization formats. They are stored in the format field
// of InputSerialization and OutputSerialization once the request XML has
// been decoded.
const (
	// csvFormat selects CSV serialization.
	csvFormat = "csv"

	// jsonFormat selects JSON serialization.
	jsonFormat = "json"

	// parquetFormat selects Apache Parquet serialization (input only).
	parquetFormat = "parquet"
)
|
|
|
|
// CompressionType - represents value inside <CompressionType/> in request XML.
type CompressionType string

// Compression types accepted in a request. Values are kept in lower case.
const (
	noneType  = CompressionType("none")
	gzipType  = CompressionType("gzip")
	bzip2Type = CompressionType("bzip2")
)
|
|
|
|
// maxRecordSize is the largest marshaled record, in bytes, that Evaluate
// will send: 1 MiB.
const maxRecordSize = 1 << 20
|
|
|
|
// bufPool holds reusable *bytes.Buffer values used for marshaled records.
var bufPool = sync.Pool{
	New: func() interface{} {
		return &bytes.Buffer{}
	},
}
|
|
|
|
// bufioWriterPool holds reusable *bufio.Writer values.
var bufioWriterPool = sync.Pool{
	New: func() interface{} {
		// The writer is created against ioutil.Discard only as a
		// placeholder; the real destination is installed with Reset()
		// before the writer is used.
		w := bufio.NewWriter(ioutil.Discard)
		return w
	},
}
|
|
|
|
// UnmarshalXML - decodes XML data.
|
|
func (c *CompressionType) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
|
var s string
|
|
if err := d.DecodeElement(&s, &start); err != nil {
|
|
return errMalformedXML(err)
|
|
}
|
|
|
|
parsedType := CompressionType(strings.ToLower(s))
|
|
if s == "" {
|
|
parsedType = noneType
|
|
}
|
|
|
|
switch parsedType {
|
|
case noneType, gzipType, bzip2Type:
|
|
default:
|
|
return errInvalidCompressionFormat(fmt.Errorf("invalid compression format '%v'", s))
|
|
}
|
|
|
|
*c = parsedType
|
|
return nil
|
|
}
|
|
|
|
// InputSerialization - represents elements inside <InputSerialization/> in request XML.
|
|
type InputSerialization struct {
|
|
CompressionType CompressionType `xml:"CompressionType"`
|
|
CSVArgs csv.ReaderArgs `xml:"CSV"`
|
|
JSONArgs json.ReaderArgs `xml:"JSON"`
|
|
ParquetArgs parquet.ReaderArgs `xml:"Parquet"`
|
|
unmarshaled bool
|
|
format string
|
|
}
|
|
|
|
// IsEmpty - returns whether input serialization is empty or not.
|
|
func (input *InputSerialization) IsEmpty() bool {
|
|
return !input.unmarshaled
|
|
}
|
|
|
|
// UnmarshalXML - decodes XML data.
|
|
func (input *InputSerialization) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
|
// Make subtype to avoid recursive UnmarshalXML().
|
|
type subInputSerialization InputSerialization
|
|
parsedInput := subInputSerialization{}
|
|
if err := d.DecodeElement(&parsedInput, &start); err != nil {
|
|
return errMalformedXML(err)
|
|
}
|
|
|
|
// If no compression is specified, set to noneType
|
|
if parsedInput.CompressionType == CompressionType("") {
|
|
parsedInput.CompressionType = noneType
|
|
}
|
|
|
|
found := 0
|
|
if !parsedInput.CSVArgs.IsEmpty() {
|
|
parsedInput.format = csvFormat
|
|
found++
|
|
}
|
|
if !parsedInput.JSONArgs.IsEmpty() {
|
|
parsedInput.format = jsonFormat
|
|
found++
|
|
}
|
|
if !parsedInput.ParquetArgs.IsEmpty() {
|
|
if parsedInput.CompressionType != "" && parsedInput.CompressionType != noneType {
|
|
return errInvalidRequestParameter(fmt.Errorf("CompressionType must be NONE for Parquet format"))
|
|
}
|
|
|
|
parsedInput.format = parquetFormat
|
|
found++
|
|
}
|
|
|
|
if found != 1 {
|
|
return errInvalidDataSource(nil)
|
|
}
|
|
|
|
*input = InputSerialization(parsedInput)
|
|
input.unmarshaled = true
|
|
return nil
|
|
}
|
|
|
|
// OutputSerialization - represents elements inside <OutputSerialization/> in request XML.
|
|
type OutputSerialization struct {
|
|
CSVArgs csv.WriterArgs `xml:"CSV"`
|
|
JSONArgs json.WriterArgs `xml:"JSON"`
|
|
unmarshaled bool
|
|
format string
|
|
}
|
|
|
|
// IsEmpty - returns whether output serialization is empty or not.
|
|
func (output *OutputSerialization) IsEmpty() bool {
|
|
return !output.unmarshaled
|
|
}
|
|
|
|
// UnmarshalXML - decodes XML data.
|
|
func (output *OutputSerialization) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
|
// Make subtype to avoid recursive UnmarshalXML().
|
|
type subOutputSerialization OutputSerialization
|
|
parsedOutput := subOutputSerialization{}
|
|
if err := d.DecodeElement(&parsedOutput, &start); err != nil {
|
|
return errMalformedXML(err)
|
|
}
|
|
|
|
found := 0
|
|
if !parsedOutput.CSVArgs.IsEmpty() {
|
|
parsedOutput.format = csvFormat
|
|
found++
|
|
}
|
|
if !parsedOutput.JSONArgs.IsEmpty() {
|
|
parsedOutput.format = jsonFormat
|
|
found++
|
|
}
|
|
if found != 1 {
|
|
return errObjectSerializationConflict(fmt.Errorf("either CSV or JSON should be present in OutputSerialization"))
|
|
}
|
|
|
|
*output = OutputSerialization(parsedOutput)
|
|
output.unmarshaled = true
|
|
return nil
|
|
}
|
|
|
|
// RequestProgress - represents elements inside <RequestProgress/> in request XML.
type RequestProgress struct {
	// Enabled is decoded from the <Enabled/> element.
	Enabled bool `xml:"Enabled"`
}
|
|
|
|
// S3Select - filters the contents on a simple structured query language (SQL) statement. It
|
|
// represents elements inside <SelectRequest/> in request XML specified in detail at
|
|
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html.
|
|
type S3Select struct {
|
|
XMLName xml.Name `xml:"SelectRequest"`
|
|
Expression string `xml:"Expression"`
|
|
ExpressionType string `xml:"ExpressionType"`
|
|
Input InputSerialization `xml:"InputSerialization"`
|
|
Output OutputSerialization `xml:"OutputSerialization"`
|
|
Progress RequestProgress `xml:"RequestProgress"`
|
|
|
|
statement *sql.SelectStatement
|
|
progressReader *progressReader
|
|
recordReader recordReader
|
|
}
|
|
|
|
// legacyXMLName is the older root element name that S3Select.UnmarshalXML
// accepts in place of SelectRequest.
var legacyXMLName = "SelectObjectContentRequest"
|
|
|
|
// UnmarshalXML - decodes XML data.
|
|
func (s3Select *S3Select) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
|
// S3 also supports the older SelectObjectContentRequest tag,
|
|
// though it is no longer found in documentation. This is
|
|
// checked and renamed below to allow older clients to also
|
|
// work.
|
|
if start.Name.Local == legacyXMLName {
|
|
start.Name = xml.Name{Space: "", Local: "SelectRequest"}
|
|
}
|
|
|
|
// Make subtype to avoid recursive UnmarshalXML().
|
|
type subS3Select S3Select
|
|
parsedS3Select := subS3Select{}
|
|
if err := d.DecodeElement(&parsedS3Select, &start); err != nil {
|
|
if _, ok := err.(*s3Error); ok {
|
|
return err
|
|
}
|
|
|
|
return errMalformedXML(err)
|
|
}
|
|
|
|
parsedS3Select.ExpressionType = strings.ToLower(parsedS3Select.ExpressionType)
|
|
if parsedS3Select.ExpressionType != "sql" {
|
|
return errInvalidExpressionType(fmt.Errorf("invalid expression type '%v'", parsedS3Select.ExpressionType))
|
|
}
|
|
|
|
if parsedS3Select.Input.IsEmpty() {
|
|
return errMissingRequiredParameter(fmt.Errorf("InputSerialization must be provided"))
|
|
}
|
|
|
|
if parsedS3Select.Output.IsEmpty() {
|
|
return errMissingRequiredParameter(fmt.Errorf("OutputSerialization must be provided"))
|
|
}
|
|
|
|
statement, err := sql.ParseSelectStatement(parsedS3Select.Expression)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
parsedS3Select.statement = &statement
|
|
|
|
*s3Select = S3Select(parsedS3Select)
|
|
return nil
|
|
}
|
|
|
|
func (s3Select *S3Select) outputRecord() sql.Record {
|
|
switch s3Select.Output.format {
|
|
case csvFormat:
|
|
return csv.NewRecord()
|
|
case jsonFormat:
|
|
return json.NewRecord(sql.SelectFmtJSON)
|
|
}
|
|
|
|
panic(fmt.Errorf("unknown output format '%v'", s3Select.Output.format))
|
|
}
|
|
|
|
func (s3Select *S3Select) getProgress() (bytesScanned, bytesProcessed int64) {
|
|
if s3Select.progressReader != nil {
|
|
return s3Select.progressReader.Stats()
|
|
}
|
|
|
|
return -1, -1
|
|
}
|
|
|
|
// Open - opens S3 object by using callback for SQL selection query.
|
|
// Currently CSV, JSON and Apache Parquet formats are supported.
|
|
func (s3Select *S3Select) Open(getReader func(offset, length int64) (io.ReadCloser, error)) error {
|
|
switch s3Select.Input.format {
|
|
case csvFormat:
|
|
rc, err := getReader(0, -1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s3Select.progressReader, err = newProgressReader(rc, s3Select.Input.CompressionType)
|
|
if err != nil {
|
|
rc.Close()
|
|
return err
|
|
}
|
|
|
|
s3Select.recordReader, err = csv.NewReader(s3Select.progressReader, &s3Select.Input.CSVArgs)
|
|
if err != nil {
|
|
rc.Close()
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
case jsonFormat:
|
|
rc, err := getReader(0, -1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s3Select.progressReader, err = newProgressReader(rc, s3Select.Input.CompressionType)
|
|
if err != nil {
|
|
rc.Close()
|
|
return err
|
|
}
|
|
|
|
s3Select.recordReader = json.NewReader(s3Select.progressReader, &s3Select.Input.JSONArgs)
|
|
return nil
|
|
case parquetFormat:
|
|
var err error
|
|
s3Select.recordReader, err = parquet.NewReader(getReader, &s3Select.Input.ParquetArgs)
|
|
return err
|
|
}
|
|
|
|
panic(fmt.Errorf("unknown input format '%v'", s3Select.Input.format))
|
|
}
|
|
|
|
func (s3Select *S3Select) marshal(buf *bytes.Buffer, record sql.Record) error {
|
|
switch s3Select.Output.format {
|
|
case csvFormat:
|
|
// Use bufio Writer to prevent csv.Writer from allocating a new buffer.
|
|
bufioWriter := bufioWriterPool.Get().(*bufio.Writer)
|
|
defer func() {
|
|
bufioWriter.Reset(ioutil.Discard)
|
|
bufioWriterPool.Put(bufioWriter)
|
|
}()
|
|
|
|
bufioWriter.Reset(buf)
|
|
err := record.WriteCSV(bufioWriter, []rune(s3Select.Output.CSVArgs.FieldDelimiter)[0])
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
buf.Truncate(buf.Len() - 1)
|
|
buf.WriteString(s3Select.Output.CSVArgs.RecordDelimiter)
|
|
|
|
return nil
|
|
case jsonFormat:
|
|
err := record.WriteJSON(buf)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
buf.Truncate(buf.Len() - 1)
|
|
buf.WriteString(s3Select.Output.JSONArgs.RecordDelimiter)
|
|
|
|
return nil
|
|
}
|
|
|
|
panic(fmt.Errorf("unknown output format '%v'", s3Select.Output.format))
|
|
}
|
|
|
|
// Evaluate - filters and sends records read from opened reader as per select statement to http response writer.
|
|
func (s3Select *S3Select) Evaluate(w http.ResponseWriter) {
|
|
getProgressFunc := s3Select.getProgress
|
|
if !s3Select.Progress.Enabled {
|
|
getProgressFunc = nil
|
|
}
|
|
writer := newMessageWriter(w, getProgressFunc)
|
|
|
|
var inputRecord sql.Record
|
|
var outputRecord sql.Record
|
|
var err error
|
|
sendRecord := func() bool {
|
|
if outputRecord == nil {
|
|
return true
|
|
}
|
|
|
|
buf := bufPool.Get().(*bytes.Buffer)
|
|
buf.Reset()
|
|
|
|
if err = s3Select.marshal(buf, outputRecord); err != nil {
|
|
bufPool.Put(buf)
|
|
return false
|
|
}
|
|
|
|
if buf.Len() > maxRecordSize {
|
|
writer.FinishWithError("OverMaxRecordSize", "The length of a record in the input or result is greater than maxCharsPerRecord of 1 MB.")
|
|
bufPool.Put(buf)
|
|
return false
|
|
}
|
|
|
|
if err = writer.SendRecord(buf); err != nil {
|
|
// FIXME: log this error.
|
|
err = nil
|
|
bufPool.Put(buf)
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
var rec sql.Record
|
|
for {
|
|
if s3Select.statement.LimitReached() {
|
|
if err = writer.Finish(s3Select.getProgress()); err != nil {
|
|
// FIXME: log this error.
|
|
err = nil
|
|
}
|
|
break
|
|
}
|
|
|
|
if rec, err = s3Select.recordReader.Read(rec); err != nil {
|
|
if err != io.EOF {
|
|
break
|
|
}
|
|
|
|
if s3Select.statement.IsAggregated() {
|
|
outputRecord = s3Select.outputRecord()
|
|
if err = s3Select.statement.AggregateResult(outputRecord); err != nil {
|
|
break
|
|
}
|
|
|
|
if !sendRecord() {
|
|
break
|
|
}
|
|
}
|
|
|
|
if err = writer.Finish(s3Select.getProgress()); err != nil {
|
|
// FIXME: log this error.
|
|
err = nil
|
|
}
|
|
break
|
|
}
|
|
|
|
if inputRecord, err = s3Select.statement.EvalFrom(s3Select.Input.format, rec); err != nil {
|
|
break
|
|
}
|
|
|
|
if s3Select.statement.IsAggregated() {
|
|
if err = s3Select.statement.AggregateRow(inputRecord); err != nil {
|
|
break
|
|
}
|
|
} else {
|
|
outputRecord = s3Select.outputRecord()
|
|
if outputRecord, err = s3Select.statement.Eval(inputRecord, outputRecord); err != nil {
|
|
break
|
|
}
|
|
|
|
if !sendRecord() {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
_ = writer.FinishWithError("InternalError", err.Error())
|
|
}
|
|
}
|
|
|
|
// Close - closes opened S3 object.
|
|
func (s3Select *S3Select) Close() error {
|
|
return s3Select.recordReader.Close()
|
|
}
|
|
|
|
// NewS3Select - creates new S3Select by given request XML reader.
|
|
func NewS3Select(r io.Reader) (*S3Select, error) {
|
|
s3Select := &S3Select{}
|
|
if err := xml.NewDecoder(r).Decode(s3Select); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return s3Select, nil
|
|
}
|