Performance improvements to SELECT API on certain query operations (#6752)

This improves the performance of certain queries dramatically,
such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Almost a 2.4x improvement in performance (42.5s down to 17.6s).
This PR avoids a lot of type conversions and instead relies on raw
sequences of data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -23,6 +23,8 @@ import (
"strconv"
"strings"
"github.com/tidwall/sjson"
"github.com/minio/minio/pkg/ioutil"
"github.com/minio/minio/pkg/s3select/format"
)
@@ -96,7 +98,6 @@ func New(opts *Options) (format.Select, error) {
reader.stats.BytesScanned = opts.StreamSize
reader.stats.BytesProcessed = 0
reader.stats.BytesReturned = 0
reader.firstRow = nil
reader.reader.FieldsPerRecord = -1
@@ -120,7 +121,14 @@ func New(opts *Options) (format.Select, error) {
// Replace the spaces in columnnames with underscores
func cleanHeader(columns []string) []string {
for i := 0; i < len(columns); i++ {
for i := range columns {
// Even if a header name is specified, some CSVs
// might have a mix of empty and non-empty column
// header names. In such a scenario we prepare an
// indexed value.
if columns[i] == "" {
columns[i] = "_" + strconv.Itoa(i)
}
columns[i] = strings.Replace(columns[i], " ", "_", -1)
}
return columns
@@ -137,15 +145,14 @@ func (reader *cinput) readHeader() error {
}
reader.header = cleanHeader(reader.firstRow)
reader.firstRow = nil
reader.minOutputLength = len(reader.header)
} else {
reader.firstRow, readErr = reader.reader.Read()
reader.header = make([]string, len(reader.firstRow))
for i := 0; i < reader.minOutputLength; i++ {
reader.header[i] = strconv.Itoa(i)
for i := range reader.firstRow {
reader.header[i] = "_" + strconv.Itoa(i)
}
}
reader.minOutputLength = len(reader.header)
return nil
}
@@ -155,33 +162,24 @@ func (reader *cinput) Progress() bool {
}
// UpdateBytesProcessed - populates the bytes Processed
func (reader *cinput) UpdateBytesProcessed(record map[string]interface{}) {
// Convert map to slice of values.
values := []string{}
for _, value := range record {
values = append(values, value.(string))
}
reader.stats.BytesProcessed += int64(len(values))
func (reader *cinput) UpdateBytesProcessed(size int64) {
reader.stats.BytesProcessed += size
}
// Read the file and returns map[string]interface{}
func (reader *cinput) Read() (map[string]interface{}, error) {
record := make(map[string]interface{})
// Read returns byte sequence
func (reader *cinput) Read() ([]byte, error) {
dec := reader.readRecord()
if dec != nil {
if reader.options.HasHeader {
columns := reader.header
for i, value := range dec {
record[columns[i]] = value
}
} else {
for i, value := range dec {
record["_"+strconv.Itoa(i)] = value
var data []byte
var err error
for i, value := range dec {
data, err = sjson.SetBytes(data, reader.header[i], value)
if err != nil {
return nil, err
}
}
return record, nil
return data, nil
}
return nil, nil
}