Performance improvements to SELECT API on certain query operations (#6752)

This dramatically improves the performance of certain queries,
such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (42.5s down to 17.6s wall-clock time). This PR
avoids a lot of type conversions; instead it works on raw sequences of
data and interprets them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -29,6 +29,7 @@ import (
"github.com/minio/minio/pkg/event"
"github.com/minio/minio/pkg/hash"
"github.com/minio/minio/pkg/s3select"
"github.com/minio/minio/pkg/s3select/format"
)
// APIError structure
@@ -1655,7 +1656,8 @@ func toAPIErrorCode(ctx context.Context, err error) (apiErr APIErrorCode) {
apiErr = ErrEvaluatorBindingDoesNotExist
case s3select.ErrMissingHeaders:
apiErr = ErrMissingHeaders
case format.ErrParseInvalidPathComponent:
apiErr = ErrMissingHeaders
}
// Compression errors

View File

@@ -230,9 +230,7 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
}
if selectReq.InputSerialization.JSON != nil {
if selectReq.InputSerialization.JSON.Type != s3select.JSONTypeDocument &&
selectReq.InputSerialization.JSON.Type != s3select.JSONLinesType &&
selectReq.InputSerialization.JSON.Type != "" {
if selectReq.InputSerialization.JSON.Type != s3select.JSONLinesType {
writeErrorResponse(w, ErrInvalidJSONType, r.URL)
return
}
@@ -255,7 +253,16 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
reader := readahead.NewReader(gr)
defer reader.Close()
s3s, err := s3select.New(reader, objInfo.GetActualSize(), selectReq)
size := objInfo.Size
if objInfo.IsCompressed() {
size = objInfo.GetActualSize()
if size < 0 {
writeErrorResponse(w, toAPIErrorCode(ctx, errInvalidDecompressedSize), r.URL)
return
}
}
s3s, err := s3select.New(reader, size, selectReq)
if err != nil {
writeErrorResponse(w, toAPIErrorCode(ctx, err), r.URL)
return