Performance improvements to SELECT API on certain query operations (#6752)

This dramatically improves the performance of certain queries,
such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (42.5s down to 17.6s of wall-clock time). This PR
avoids a lot of type conversions; instead it relies on raw sequences of
data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -17,11 +17,10 @@
package json
import (
"encoding/json"
"bufio"
"encoding/xml"
"io"
jsoniter "github.com/json-iterator/go"
"github.com/minio/minio/pkg/s3select/format"
)
@@ -57,7 +56,7 @@ type Options struct {
// jinput represents a record producing input from a formatted file or pipe.
type jinput struct {
options *Options
reader *jsoniter.Decoder
reader *bufio.Reader
firstRow []string
header []string
minOutputLength int
@@ -75,7 +74,7 @@ type jinput struct {
func New(opts *Options) (format.Select, error) {
reader := &jinput{
options: opts,
reader: jsoniter.NewDecoder(opts.ReadFrom),
reader: bufio.NewReader(opts.ReadFrom),
}
reader.stats.BytesScanned = opts.StreamSize
reader.stats.BytesProcessed = 0
@@ -90,26 +89,21 @@ func (reader *jinput) Progress() bool {
}
// UpdateBytesProcessed - populates the bytes Processed
func (reader *jinput) UpdateBytesProcessed(record map[string]interface{}) {
out, _ := json.Marshal(record)
reader.stats.BytesProcessed += int64(len(out))
func (reader *jinput) UpdateBytesProcessed(size int64) {
reader.stats.BytesProcessed += size
}
// Read the file and returns map[string]interface{}
func (reader *jinput) Read() (map[string]interface{}, error) {
dec := reader.reader
var record interface{}
for {
err := dec.Decode(&record)
// Read reads the input up to and including the next '\n' and returns the raw bytes.
func (reader *jinput) Read() ([]byte, error) {
data, err := reader.reader.ReadBytes('\n')
if err != nil {
if err == io.EOF || err == io.ErrClosedPipe {
break
err = nil
} else {
err = format.ErrJSONParsingError
}
if err != nil {
return nil, format.ErrJSONParsingError
}
return record.(map[string]interface{}), nil
}
return nil, nil
return data, err
}
// OutputFieldDelimiter - returns the delimiter specified in input request