Performance improvements to SELECT API on certain query operations (#6752)

This improves the performance of certain queries dramatically,
such as 'count(*)' etc.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Almost a 250% improvement in performance. This PR avoids a lot of type
conversions and instead relies on raw sequences of data and interprets
them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -19,17 +19,17 @@ package s3select
import (
"bytes"
"compress/bzip2"
"compress/gzip"
"io"
"net/http"
"strings"
"time"
humanize "github.com/dustin/go-humanize"
"github.com/klauspost/pgzip"
"github.com/minio/minio/pkg/s3select/format"
"github.com/minio/minio/pkg/s3select/format/csv"
"github.com/minio/minio/pkg/s3select/format/json"
humanize "github.com/dustin/go-humanize"
)
const (
@@ -40,18 +40,6 @@ const (
continuationTime time.Duration = 5 * time.Second
)
// ParseSelectTokens tokenizes the select query into required Columns, Alias, limit value
// where clause, aggregate functions, myFunctions, error.
type ParseSelectTokens struct {
reqCols []string
alias string
myLimit int64
whereClause interface{}
aggFunctionNames []string
myFuncs *SelectFuncs
myErr error
}
// Row is a Struct for keeping track of key aspects of a row.
type Row struct {
record string
@@ -60,7 +48,7 @@ type Row struct {
// This function replaces "",'' with `` for the select parser
func cleanExpr(expr string) string {
r := strings.NewReplacer("\"", "`", "'", "`")
r := strings.NewReplacer("\"", "`")
return r.Replace(expr)
}
@@ -68,7 +56,7 @@ func cleanExpr(expr string) string {
func New(reader io.Reader, size int64, req ObjectSelectRequest) (s3s format.Select, err error) {
switch req.InputSerialization.CompressionType {
case SelectCompressionGZIP:
if reader, err = gzip.NewReader(reader); err != nil {
if reader, err = pgzip.NewReader(reader); err != nil {
return nil, format.ErrTruncatedInput
}
case SelectCompressionBZIP:
@@ -119,7 +107,7 @@ func New(reader io.Reader, size int64, req ObjectSelectRequest) (s3s format.Sele
// response writer in a streaming fashion so that the client can actively use
// the results before the query is finally finished executing. The
func Execute(writer io.Writer, f format.Select) error {
myRow := make(chan Row, 1000)
rowCh := make(chan Row)
curBuf := bytes.NewBuffer(make([]byte, humanize.MiByte))
curBuf.Reset()
progressTicker := time.NewTicker(progressTime)
@@ -127,10 +115,10 @@ func Execute(writer io.Writer, f format.Select) error {
defer progressTicker.Stop()
defer continuationTimer.Stop()
go runSelectParser(f, myRow)
go runSelectParser(f, rowCh)
for {
select {
case row, ok := <-myRow:
case row, ok := <-rowCh:
if ok && row.err != nil {
_, err := writeErrorMessage(row.err, curBuf).WriteTo(writer)
flusher, okFlush := writer.(http.Flusher)
@@ -141,7 +129,7 @@ func Execute(writer io.Writer, f format.Select) error {
return err
}
curBuf.Reset()
close(myRow)
close(rowCh)
return nil
} else if ok {
_, err := writeRecordMessage(row.record, curBuf).WriteTo(writer)