Performance improvements to SELECT API on certain query operations (#6752)

This dramatically improves the performance of certain queries,
such as 'count(*)'.

Without this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m42.464s
user	0m0.071s
sys	0m0.010s
```

With this PR
```
~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz
2173762

real	0m17.603s
user	0m0.093s
sys	0m0.008s
```

Roughly a 2.4x speedup (about 58% less wall-clock time). This PR avoids a
lot of type conversions and instead operates on the raw bytes of each
record, interpreting them lazily.

```
benchcmp old new
benchmark                        old ns/op       new ns/op       delta
BenchmarkSQLAggregate_100K-4     551213          259782          -52.87%
BenchmarkSQLAggregate_1M-4       6981901985      2432413729      -65.16%
BenchmarkSQLAggregate_2M-4       13511978488     4536903552      -66.42%
BenchmarkSQLAggregate_10M-4      68427084908     23266283336     -66.00%

benchmark                        old allocs     new allocs     delta
BenchmarkSQLAggregate_100K-4     2366           485            -79.50%
BenchmarkSQLAggregate_1M-4       47455492       21462860       -54.77%
BenchmarkSQLAggregate_2M-4       95163637       43110771       -54.70%
BenchmarkSQLAggregate_10M-4      476959550      216906510      -54.52%

benchmark                        old bytes       new bytes      delta
BenchmarkSQLAggregate_100K-4     1233079         1086024        -11.93%
BenchmarkSQLAggregate_1M-4       2607984120      557038536      -78.64%
BenchmarkSQLAggregate_2M-4       5254103616      1128149168     -78.53%
BenchmarkSQLAggregate_10M-4      26443524872     5722715992     -78.36%
```
This commit is contained in:
Harshavardhana
2018-11-14 15:55:10 -08:00
committed by kannappanr
parent f9779b24ad
commit 7e1661f4fa
108 changed files with 640 additions and 12237 deletions

View File

@@ -19,13 +19,15 @@ package s3select
import (
"strings"
"github.com/minio/minio/pkg/s3select/format"
"github.com/tidwall/gjson"
"github.com/xwb1989/sqlparser"
"github.com/minio/minio/pkg/s3select/format"
)
// stringOps is a function which handles the case in a clause if there is a need
// to perform a string function
func stringOps(myFunc *sqlparser.FuncExpr, record string, myReturnVal string) string {
// stringOps is a function which handles the case in a clause
// if there is a need to perform a string function
func stringOps(myFunc *sqlparser.FuncExpr, record []byte, myReturnVal string) string {
var value string
funcName := myFunc.Name.CompliantName()
switch tempArg := myFunc.Exprs[0].(type) {
@@ -33,29 +35,29 @@ func stringOps(myFunc *sqlparser.FuncExpr, record string, myReturnVal string) st
switch col := tempArg.Expr.(type) {
case *sqlparser.FuncExpr:
// myReturnVal is actually the tail recursive value being used in the eval func.
return applyStrFunc(myReturnVal, funcName)
return applyStrFunc(gjson.Parse(myReturnVal), funcName)
case *sqlparser.ColName:
value = applyStrFunc(jsonValue(col.Name.CompliantName(), record), funcName)
value = applyStrFunc(gjson.GetBytes(record, col.Name.CompliantName()), funcName)
case *sqlparser.SQLVal:
value = applyStrFunc(string(col.Val), funcName)
value = applyStrFunc(gjson.ParseBytes(col.Val), funcName)
}
}
return value
}
// coalOps is a function which decomposes a COALESCE func expr into its struct.
func coalOps(myFunc *sqlparser.FuncExpr, record string, myReturnVal string) string {
func coalOps(myFunc *sqlparser.FuncExpr, record []byte, myReturnVal string) string {
myArgs := make([]string, len(myFunc.Exprs))
for i := 0; i < len(myFunc.Exprs); i++ {
switch tempArg := myFunc.Exprs[i].(type) {
for i, expr := range myFunc.Exprs {
switch tempArg := expr.(type) {
case *sqlparser.AliasedExpr:
switch col := tempArg.Expr.(type) {
case *sqlparser.FuncExpr:
// myReturnVal is actually the tail recursive value being used in the eval func.
return myReturnVal
case *sqlparser.ColName:
myArgs[i] = jsonValue(col.Name.CompliantName(), record)
myArgs[i] = gjson.GetBytes(record, col.Name.CompliantName()).String()
case *sqlparser.SQLVal:
myArgs[i] = string(col.Val)
}
@@ -65,54 +67,47 @@ func coalOps(myFunc *sqlparser.FuncExpr, record string, myReturnVal string) stri
}
// nullOps is a function which decomposes a NullIf func expr into its struct.
func nullOps(myFunc *sqlparser.FuncExpr, record string, myReturnVal string) string {
func nullOps(myFunc *sqlparser.FuncExpr, record []byte, myReturnVal string) string {
myArgs := make([]string, 2)
for i := 0; i < len(myFunc.Exprs); i++ {
switch tempArg := myFunc.Exprs[i].(type) {
for i, expr := range myFunc.Exprs {
switch tempArg := expr.(type) {
case *sqlparser.AliasedExpr:
switch col := tempArg.Expr.(type) {
case *sqlparser.FuncExpr:
return myReturnVal
case *sqlparser.ColName:
myArgs[i] = jsonValue(col.Name.CompliantName(), record)
myArgs[i] = gjson.GetBytes(record, col.Name.CompliantName()).String()
case *sqlparser.SQLVal:
myArgs[i] = string(col.Val)
}
}
}
return processNullIf(myArgs)
if myArgs[0] == myArgs[1] {
return ""
}
return myArgs[0]
}
// isValidString is a function that ensures the current index is one with a
// StrFunc
// isValidFunc reports whether index is present in myList, i.e. whether
// the current index is one with a StrFunc
func isValidFunc(myList []int, index int) bool {
if myList == nil {
return false
}
for i := 0; i < len(myList); i++ {
if myList[i] == index {
for _, i := range myList {
if i == index {
return true
}
}
return false
}
// processNullIf is a function that evaluates a given NULLIF clause.
//
// nullStore holds the two already-resolved NULLIF arguments as strings; only
// indexes 0 and 1 are read, so the slice must contain at least two elements.
// It returns the empty string when both arguments are equal, and the first
// argument otherwise.
func processNullIf(nullStore []string) string {
nullValOne := nullStore[0]
nullValTwo := nullStore[1]
// Equal arguments collapse to the empty string.
if nullValOne == nullValTwo {
return ""
}
return nullValOne
}
// processCoalNoIndex is a function which evaluates a given COALESCE clause.
func processCoalNoIndex(coalStore []string) string {
for i := 0; i < len(coalStore); i++ {
if coalStore[i] != "null" && coalStore[i] != "missing" && coalStore[i] != "" {
return coalStore[i]
for _, coal := range coalStore {
if coal != "null" && coal != "missing" && coal != "" {
return coal
}
}
return "null"
@@ -120,15 +115,15 @@ func processCoalNoIndex(coalStore []string) string {
// evaluateFuncExpr is a function that allows for tail recursive evaluation of
// nested function expressions
func evaluateFuncExpr(myVal *sqlparser.FuncExpr, myReturnVal string, myRecord string) string {
func evaluateFuncExpr(myVal *sqlparser.FuncExpr, myReturnVal string, record []byte) string {
if myVal == nil {
return myReturnVal
}
// retrieve all the relevant arguments of the function
var mySubFunc []*sqlparser.FuncExpr
mySubFunc = make([]*sqlparser.FuncExpr, len(myVal.Exprs))
for i := 0; i < len(myVal.Exprs); i++ {
switch col := myVal.Exprs[i].(type) {
for i, expr := range myVal.Exprs {
switch col := expr.(type) {
case *sqlparser.AliasedExpr:
switch temp := col.Expr.(type) {
case *sqlparser.FuncExpr:
@@ -141,19 +136,19 @@ func evaluateFuncExpr(myVal *sqlparser.FuncExpr, myReturnVal string, myRecord st
for i := 0; i < len(mySubFunc); i++ {
if supportedString(myVal.Name.CompliantName()) {
if mySubFunc != nil {
return stringOps(myVal, myRecord, evaluateFuncExpr(mySubFunc[i], myReturnVal, myRecord))
return stringOps(myVal, record, evaluateFuncExpr(mySubFunc[i], myReturnVal, record))
}
return stringOps(myVal, myRecord, myReturnVal)
return stringOps(myVal, record, myReturnVal)
} else if strings.ToUpper(myVal.Name.CompliantName()) == "NULLIF" {
if mySubFunc != nil {
return nullOps(myVal, myRecord, evaluateFuncExpr(mySubFunc[i], myReturnVal, myRecord))
return nullOps(myVal, record, evaluateFuncExpr(mySubFunc[i], myReturnVal, record))
}
return nullOps(myVal, myRecord, myReturnVal)
return nullOps(myVal, record, myReturnVal)
} else if strings.ToUpper(myVal.Name.CompliantName()) == "COALESCE" {
if mySubFunc != nil {
return coalOps(myVal, myRecord, evaluateFuncExpr(mySubFunc[i], myReturnVal, myRecord))
return coalOps(myVal, record, evaluateFuncExpr(mySubFunc[i], myReturnVal, record))
}
return coalOps(myVal, myRecord, myReturnVal)
return coalOps(myVal, record, myReturnVal)
}
}
return ""
@@ -167,8 +162,8 @@ func evaluateFuncErr(myVal *sqlparser.FuncExpr, reader format.Select) error {
if !supportedFunc(myVal.Name.CompliantName()) {
return ErrUnsupportedSQLOperation
}
for i := 0; i < len(myVal.Exprs); i++ {
switch tempArg := myVal.Exprs[i].(type) {
for _, expr := range myVal.Exprs {
switch tempArg := expr.(type) {
case *sqlparser.StarExpr:
return ErrParseUnsupportedCallWithStar
case *sqlparser.AliasedExpr:
@@ -188,29 +183,31 @@ func evaluateFuncErr(myVal *sqlparser.FuncExpr, reader format.Select) error {
}
// evaluateIsExpr is a function for evaluating expressions of the form "column is ...."
func evaluateIsExpr(myFunc *sqlparser.IsExpr, row string, alias string) (bool, error) {
operator := myFunc.Operator
var myVal string
switch myIs := myFunc.Expr.(type) {
// case for literal val
case *sqlparser.SQLVal:
myVal = string(myIs.Val)
// case for nested func val
case *sqlparser.FuncExpr:
myVal = evaluateFuncExpr(myIs, "", row)
// case for col val
case *sqlparser.ColName:
myVal = jsonValue(myIs.Name.CompliantName(), row)
func evaluateIsExpr(myFunc *sqlparser.IsExpr, row []byte, alias string) (bool, error) {
getMyVal := func() (myVal string) {
switch myIs := myFunc.Expr.(type) {
// case for literal val
case *sqlparser.SQLVal:
myVal = string(myIs.Val)
// case for nested func val
case *sqlparser.FuncExpr:
myVal = evaluateFuncExpr(myIs, "", row)
// case for col val
case *sqlparser.ColName:
myVal = gjson.GetBytes(row, myIs.Name.CompliantName()).String()
}
return myVal
}
// case to evaluate is null
if strings.ToLower(operator) == "is null" {
return myVal == "", nil
operator := strings.ToLower(myFunc.Operator)
switch operator {
case "is null":
return getMyVal() == "", nil
case "is not null":
return getMyVal() != "", nil
default:
return false, ErrUnsupportedSQLOperation
}
// case to evaluate is not null
if strings.ToLower(operator) == "is not null" {
return myVal != "", nil
}
return false, ErrUnsupportedSQLOperation
}
// supportedString is a function that checks whether the function is a supported