speed up the performance of s3select on csv (#7945)

This commit is contained in:
Yao Zongyou
2019-08-31 15:07:40 +08:00
committed by Harshavardhana
parent fa3546bb03
commit ec9bfd3aef
8 changed files with 128 additions and 77 deletions

View File

@@ -17,11 +17,15 @@
package s3select
import (
"bufio"
"bytes"
"encoding/xml"
"fmt"
"io"
"io/ioutil"
"net/http"
"strings"
"sync"
"github.com/minio/minio/pkg/s3select/csv"
"github.com/minio/minio/pkg/s3select/json"
@@ -53,6 +57,20 @@ const (
maxRecordSize = 1 << 20 // 1 MiB
)
var bufPool = sync.Pool{
New: func() interface{} {
return new(bytes.Buffer)
},
}
var bufioWriterPool = sync.Pool{
New: func() interface{} {
// ioutil.Discard is just used to create the writer. Actual destination
// writer is set later by Reset() before using it.
return bufio.NewWriter(ioutil.Discard)
},
}
// UnmarshalXML - decodes XML data.
func (c *CompressionType) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
var s string
@@ -306,22 +324,36 @@ func (s3Select *S3Select) Open(getReader func(offset, length int64) (io.ReadClos
panic(fmt.Errorf("unknown input format '%v'", s3Select.Input.format))
}
func (s3Select *S3Select) marshal(record sql.Record) ([]byte, error) {
func (s3Select *S3Select) marshal(buf *bytes.Buffer, record sql.Record) error {
switch s3Select.Output.format {
case csvFormat:
data, err := record.MarshalCSV([]rune(s3Select.Output.CSVArgs.FieldDelimiter)[0])
// Use bufio Writer to prevent csv.Writer from allocating a new buffer.
bufioWriter := bufioWriterPool.Get().(*bufio.Writer)
defer func() {
bufioWriter.Reset(ioutil.Discard)
bufioWriterPool.Put(bufioWriter)
}()
bufioWriter.Reset(buf)
err := record.WriteCSV(bufioWriter, []rune(s3Select.Output.CSVArgs.FieldDelimiter)[0])
if err != nil {
return nil, err
return err
}
return append(data, []byte(s3Select.Output.CSVArgs.RecordDelimiter)...), nil
buf.Truncate(buf.Len() - 1)
buf.WriteString(s3Select.Output.CSVArgs.RecordDelimiter)
return nil
case jsonFormat:
data, err := record.MarshalJSON()
err := record.WriteJSON(buf)
if err != nil {
return nil, err
return err
}
return append(data, []byte(s3Select.Output.JSONArgs.RecordDelimiter)...), nil
buf.Truncate(buf.Len() - 1)
buf.WriteString(s3Select.Output.JSONArgs.RecordDelimiter)
return nil
}
panic(fmt.Errorf("unknown output format '%v'", s3Select.Output.format))
@@ -338,24 +370,29 @@ func (s3Select *S3Select) Evaluate(w http.ResponseWriter) {
var inputRecord sql.Record
var outputRecord sql.Record
var err error
var data []byte
sendRecord := func() bool {
if outputRecord == nil {
return true
}
if data, err = s3Select.marshal(outputRecord); err != nil {
buf := bufPool.Get().(*bytes.Buffer)
buf.Reset()
if err = s3Select.marshal(buf, outputRecord); err != nil {
bufPool.Put(buf)
return false
}
if len(data) > maxRecordSize {
if buf.Len() > maxRecordSize {
writer.FinishWithError("OverMaxRecordSize", "The length of a record in the input or result is greater than maxCharsPerRecord of 1 MB.")
bufPool.Put(buf)
return false
}
if err = writer.SendRecord(data); err != nil {
if err = writer.SendRecord(buf); err != nil {
// FIXME: log this error.
err = nil
bufPool.Put(buf)
return false
}