mirror of
https://github.com/minio/minio.git
synced 2025-11-22 18:47:43 -05:00
Performance improvements to SELECT API on certain query operations (#6752)
This dramatically improves the performance of certain queries, such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real 0m42.464s
user 0m0.071s
sys 0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real 0m17.603s
user 0m0.093s
sys 0m0.008s
```

That is roughly a 2.4x speedup in wall-clock time (42.5s down to 17.6s).

This PR avoids a lot of type conversions; instead it relies on raw byte sequences of data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs      new allocs      delta
BenchmarkSQLAggregate_100K-4     2366            485             -79.50%
BenchmarkSQLAggregate_1M-4       47455492        21462860        -54.77%
BenchmarkSQLAggregate_2M-4       95163637        43110771        -54.70%
BenchmarkSQLAggregate_10M-4      476959550       216906510       -54.52%

benchmark                        old bytes       new bytes       delta
BenchmarkSQLAggregate_100K-4     1233079         1086024         -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536       -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168      -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992      -78.36%
```
This commit is contained in:
committed by
kannappanr
parent
f9779b24ad
commit
7e1661f4fa
@@ -23,6 +23,8 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/tidwall/sjson"
|
||||
|
||||
"github.com/minio/minio/pkg/ioutil"
|
||||
"github.com/minio/minio/pkg/s3select/format"
|
||||
)
|
||||
@@ -96,7 +98,6 @@ func New(opts *Options) (format.Select, error) {
|
||||
reader.stats.BytesScanned = opts.StreamSize
|
||||
reader.stats.BytesProcessed = 0
|
||||
reader.stats.BytesReturned = 0
|
||||
|
||||
reader.firstRow = nil
|
||||
|
||||
reader.reader.FieldsPerRecord = -1
|
||||
@@ -120,7 +121,14 @@ func New(opts *Options) (format.Select, error) {
|
||||
|
||||
// Replace the spaces in column names with underscores
|
||||
func cleanHeader(columns []string) []string {
|
||||
for i := 0; i < len(columns); i++ {
|
||||
for i := range columns {
|
||||
// Even if header names are specified, some CSVs
|
||||
// might have a mix of empty and non-empty column
|
||||
// header names. In such a scenario we substitute
|
||||
// an indexed name for each empty one.
|
||||
if columns[i] == "" {
|
||||
columns[i] = "_" + strconv.Itoa(i)
|
||||
}
|
||||
columns[i] = strings.Replace(columns[i], " ", "_", -1)
|
||||
}
|
||||
return columns
|
||||
@@ -137,15 +145,14 @@ func (reader *cinput) readHeader() error {
|
||||
}
|
||||
reader.header = cleanHeader(reader.firstRow)
|
||||
reader.firstRow = nil
|
||||
reader.minOutputLength = len(reader.header)
|
||||
} else {
|
||||
reader.firstRow, readErr = reader.reader.Read()
|
||||
reader.header = make([]string, len(reader.firstRow))
|
||||
for i := 0; i < reader.minOutputLength; i++ {
|
||||
reader.header[i] = strconv.Itoa(i)
|
||||
for i := range reader.firstRow {
|
||||
reader.header[i] = "_" + strconv.Itoa(i)
|
||||
}
|
||||
|
||||
}
|
||||
reader.minOutputLength = len(reader.header)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -155,33 +162,24 @@ func (reader *cinput) Progress() bool {
|
||||
}
|
||||
|
||||
// UpdateBytesProcessed - populates the bytes Processed
|
||||
func (reader *cinput) UpdateBytesProcessed(record map[string]interface{}) {
|
||||
// Convert map to slice of values.
|
||||
values := []string{}
|
||||
for _, value := range record {
|
||||
values = append(values, value.(string))
|
||||
}
|
||||
|
||||
reader.stats.BytesProcessed += int64(len(values))
|
||||
func (reader *cinput) UpdateBytesProcessed(size int64) {
|
||||
reader.stats.BytesProcessed += size
|
||||
|
||||
}
|
||||
|
||||
// Read the file and returns map[string]interface{}
|
||||
func (reader *cinput) Read() (map[string]interface{}, error) {
|
||||
record := make(map[string]interface{})
|
||||
// Read returns byte sequence
|
||||
func (reader *cinput) Read() ([]byte, error) {
|
||||
dec := reader.readRecord()
|
||||
if dec != nil {
|
||||
if reader.options.HasHeader {
|
||||
columns := reader.header
|
||||
for i, value := range dec {
|
||||
record[columns[i]] = value
|
||||
}
|
||||
} else {
|
||||
for i, value := range dec {
|
||||
record["_"+strconv.Itoa(i)] = value
|
||||
var data []byte
|
||||
var err error
|
||||
for i, value := range dec {
|
||||
data, err = sjson.SetBytes(data, reader.header[i], value)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return record, nil
|
||||
return data, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
@@ -17,11 +17,10 @@
|
||||
package json
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"bufio"
|
||||
"encoding/xml"
|
||||
"io"
|
||||
|
||||
jsoniter "github.com/json-iterator/go"
|
||||
"github.com/minio/minio/pkg/s3select/format"
|
||||
)
|
||||
|
||||
@@ -57,7 +56,7 @@ type Options struct {
|
||||
// jinput represents a record producing input from a formatted file or pipe.
|
||||
type jinput struct {
|
||||
options *Options
|
||||
reader *jsoniter.Decoder
|
||||
reader *bufio.Reader
|
||||
firstRow []string
|
||||
header []string
|
||||
minOutputLength int
|
||||
@@ -75,7 +74,7 @@ type jinput struct {
|
||||
func New(opts *Options) (format.Select, error) {
|
||||
reader := &jinput{
|
||||
options: opts,
|
||||
reader: jsoniter.NewDecoder(opts.ReadFrom),
|
||||
reader: bufio.NewReader(opts.ReadFrom),
|
||||
}
|
||||
reader.stats.BytesScanned = opts.StreamSize
|
||||
reader.stats.BytesProcessed = 0
|
||||
@@ -90,26 +89,21 @@ func (reader *jinput) Progress() bool {
|
||||
}
|
||||
|
||||
// UpdateBytesProcessed - populates the bytes Processed
|
||||
func (reader *jinput) UpdateBytesProcessed(record map[string]interface{}) {
|
||||
out, _ := json.Marshal(record)
|
||||
reader.stats.BytesProcessed += int64(len(out))
|
||||
func (reader *jinput) UpdateBytesProcessed(size int64) {
|
||||
reader.stats.BytesProcessed += size
|
||||
}
|
||||
|
||||
// Read the file and returns map[string]interface{}
|
||||
func (reader *jinput) Read() (map[string]interface{}, error) {
|
||||
dec := reader.reader
|
||||
var record interface{}
|
||||
for {
|
||||
err := dec.Decode(&record)
|
||||
// Read returns the next newline-terminated chunk of the input as raw bytes
|
||||
func (reader *jinput) Read() ([]byte, error) {
|
||||
data, err := reader.reader.ReadBytes('\n')
|
||||
if err != nil {
|
||||
if err == io.EOF || err == io.ErrClosedPipe {
|
||||
break
|
||||
err = nil
|
||||
} else {
|
||||
err = format.ErrJSONParsingError
|
||||
}
|
||||
if err != nil {
|
||||
return nil, format.ErrJSONParsingError
|
||||
}
|
||||
return record.(map[string]interface{}), nil
|
||||
}
|
||||
return nil, nil
|
||||
return data, err
|
||||
}
|
||||
|
||||
// OutputFieldDelimiter - returns the delimiter specified in input request
|
||||
|
||||
@@ -22,11 +22,11 @@ import "encoding/xml"
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
|
||||
type Select interface {
|
||||
Type() Type
|
||||
Read() (map[string]interface{}, error)
|
||||
Read() ([]byte, error)
|
||||
Header() []string
|
||||
HasHeader() bool
|
||||
OutputFieldDelimiter() string
|
||||
UpdateBytesProcessed(record map[string]interface{})
|
||||
UpdateBytesProcessed(int64)
|
||||
Expression() string
|
||||
UpdateBytesReturned(int64)
|
||||
CreateStatXML() (string, error)
|
||||
|
||||
Reference in New Issue
Block a user