mirror of
https://github.com/minio/minio.git
synced 2025-11-22 10:37:42 -05:00
Performance improvements to SELECT API on certain query operations (#6752)
This improves the performance of certain queries dramatically, such as 'count(*)' etc. Without this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m42.464s user 0m0.071s sys 0m0.010s ``` With this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m17.603s user 0m0.093s sys 0m0.008s ``` Almost a 250% improvement in performance. This PR avoids a lot of type conversions and instead relies on raw sequences of data and interprets them lazily. ``` benchcmp old new benchmark old ns/op new ns/op delta BenchmarkSQLAggregate_100K-4 551213 259782 -52.87% BenchmarkSQLAggregate_1M-4 6981901985 2432413729 -65.16% BenchmarkSQLAggregate_2M-4 13511978488 4536903552 -66.42% BenchmarkSQLAggregate_10M-4 68427084908 23266283336 -66.00% benchmark old allocs new allocs delta BenchmarkSQLAggregate_100K-4 2366 485 -79.50% BenchmarkSQLAggregate_1M-4 47455492 21462860 -54.77% BenchmarkSQLAggregate_2M-4 95163637 43110771 -54.70% BenchmarkSQLAggregate_10M-4 476959550 216906510 -54.52% benchmark old bytes new bytes delta BenchmarkSQLAggregate_100K-4 1233079 1086024 -11.93% BenchmarkSQLAggregate_1M-4 2607984120 557038536 -78.64% BenchmarkSQLAggregate_2M-4 5254103616 1128149168 -78.53% BenchmarkSQLAggregate_10M-4 26443524872 5722715992 -78.36% ```
This commit is contained in:
committed by
kannappanr
parent
f9779b24ad
commit
7e1661f4fa
@@ -23,6 +23,8 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/tidwall/sjson"
|
||||
|
||||
"github.com/minio/minio/pkg/ioutil"
|
||||
"github.com/minio/minio/pkg/s3select/format"
|
||||
)
|
||||
@@ -96,7 +98,6 @@ func New(opts *Options) (format.Select, error) {
|
||||
reader.stats.BytesScanned = opts.StreamSize
|
||||
reader.stats.BytesProcessed = 0
|
||||
reader.stats.BytesReturned = 0
|
||||
|
||||
reader.firstRow = nil
|
||||
|
||||
reader.reader.FieldsPerRecord = -1
|
||||
@@ -120,7 +121,14 @@ func New(opts *Options) (format.Select, error) {
|
||||
|
||||
// Replace the spaces in column names with underscores
|
||||
func cleanHeader(columns []string) []string {
|
||||
for i := 0; i < len(columns); i++ {
|
||||
for i := range columns {
|
||||
// Even if a header row is specified, some CSVs
|
||||
// may have a mix of empty and non-empty
|
||||
// column header names. For empty names we
|
||||
// prepare an indexed value.
|
||||
if columns[i] == "" {
|
||||
columns[i] = "_" + strconv.Itoa(i)
|
||||
}
|
||||
columns[i] = strings.Replace(columns[i], " ", "_", -1)
|
||||
}
|
||||
return columns
|
||||
@@ -137,15 +145,14 @@ func (reader *cinput) readHeader() error {
|
||||
}
|
||||
reader.header = cleanHeader(reader.firstRow)
|
||||
reader.firstRow = nil
|
||||
reader.minOutputLength = len(reader.header)
|
||||
} else {
|
||||
reader.firstRow, readErr = reader.reader.Read()
|
||||
reader.header = make([]string, len(reader.firstRow))
|
||||
for i := 0; i < reader.minOutputLength; i++ {
|
||||
reader.header[i] = strconv.Itoa(i)
|
||||
for i := range reader.firstRow {
|
||||
reader.header[i] = "_" + strconv.Itoa(i)
|
||||
}
|
||||
|
||||
}
|
||||
reader.minOutputLength = len(reader.header)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -155,33 +162,24 @@ func (reader *cinput) Progress() bool {
|
||||
}
|
||||
|
||||
// UpdateBytesProcessed - populates the bytes Processed
|
||||
func (reader *cinput) UpdateBytesProcessed(record map[string]interface{}) {
|
||||
// Convert map to slice of values.
|
||||
values := []string{}
|
||||
for _, value := range record {
|
||||
values = append(values, value.(string))
|
||||
}
|
||||
|
||||
reader.stats.BytesProcessed += int64(len(values))
|
||||
func (reader *cinput) UpdateBytesProcessed(size int64) {
|
||||
reader.stats.BytesProcessed += size
|
||||
|
||||
}
|
||||
|
||||
// Reads the file and returns map[string]interface{}
|
||||
func (reader *cinput) Read() (map[string]interface{}, error) {
|
||||
record := make(map[string]interface{})
|
||||
// Read returns byte sequence
|
||||
func (reader *cinput) Read() ([]byte, error) {
|
||||
dec := reader.readRecord()
|
||||
if dec != nil {
|
||||
if reader.options.HasHeader {
|
||||
columns := reader.header
|
||||
for i, value := range dec {
|
||||
record[columns[i]] = value
|
||||
}
|
||||
} else {
|
||||
for i, value := range dec {
|
||||
record["_"+strconv.Itoa(i)] = value
|
||||
var data []byte
|
||||
var err error
|
||||
for i, value := range dec {
|
||||
data, err = sjson.SetBytes(data, reader.header[i], value)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return record, nil
|
||||
return data, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user