mirror of
https://github.com/minio/minio.git
synced 2025-11-22 02:35:30 -05:00
Performance improvements to SELECT API on certain query operations (#6752)
This improves the performance of certain queries dramatically, such as 'count(*)' etc. Without this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m42.464s user 0m0.071s sys 0m0.010s ``` With this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m17.603s user 0m0.093s sys 0m0.008s ``` Almost a 250% improvement in performance. This PR avoids a lot of type conversions and instead relies on raw sequences of data and interprets them lazily. ``` benchcmp old new benchmark old ns/op new ns/op delta BenchmarkSQLAggregate_100K-4 551213 259782 -52.87% BenchmarkSQLAggregate_1M-4 6981901985 2432413729 -65.16% BenchmarkSQLAggregate_2M-4 13511978488 4536903552 -66.42% BenchmarkSQLAggregate_10M-4 68427084908 23266283336 -66.00% benchmark old allocs new allocs delta BenchmarkSQLAggregate_100K-4 2366 485 -79.50% BenchmarkSQLAggregate_1M-4 47455492 21462860 -54.77% BenchmarkSQLAggregate_2M-4 95163637 43110771 -54.70% BenchmarkSQLAggregate_10M-4 476959550 216906510 -54.52% benchmark old bytes new bytes delta BenchmarkSQLAggregate_100K-4 1233079 1086024 -11.93% BenchmarkSQLAggregate_1M-4 2607984120 557038536 -78.64% BenchmarkSQLAggregate_2M-4 5254103616 1128149168 -78.53% BenchmarkSQLAggregate_10M-4 26443524872 5722715992 -78.36% ```
This commit is contained in:
committed by
kannappanr
parent
f9779b24ad
commit
7e1661f4fa
@@ -17,10 +17,8 @@
|
||||
package s3select
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -32,64 +30,50 @@ import (
|
||||
// MaxExpressionLength - 256KiB
|
||||
const MaxExpressionLength = 256 * 1024
|
||||
|
||||
// matchesMyWhereClause takes map[string]interfaces{} , process the where clause and returns true if the row suffices
|
||||
func matchesMyWhereClause(record map[string]interface{}, alias string, whereClause interface{}) (bool, error) {
|
||||
// matchesMyWhereClause takes []byte, process the where clause and returns true if the row suffices
|
||||
func matchesMyWhereClause(record []byte, alias string, whereClause sqlparser.Expr) (bool, error) {
|
||||
var conversionColumn string
|
||||
var operator string
|
||||
var operand interface{}
|
||||
var operand gjson.Result
|
||||
if fmt.Sprintf("%v", whereClause) == "false" {
|
||||
return false, nil
|
||||
}
|
||||
out, err := json.Marshal(record)
|
||||
if err != nil {
|
||||
return false, ErrExternalEvalException
|
||||
}
|
||||
switch expr := whereClause.(type) {
|
||||
case *sqlparser.IsExpr:
|
||||
return evaluateIsExpr(expr, string(out), alias)
|
||||
return evaluateIsExpr(expr, record, alias)
|
||||
case *sqlparser.RangeCond:
|
||||
operator = expr.Operator
|
||||
if operator != "between" && operator != "not between" {
|
||||
return false, ErrUnsupportedSQLOperation
|
||||
}
|
||||
if operator == "not between" {
|
||||
result, err := evaluateBetween(expr, alias, string(out))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return !result, nil
|
||||
}
|
||||
result, err := evaluateBetween(expr, alias, string(out))
|
||||
result, err := evaluateBetween(expr, alias, record)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if operator == "not between" {
|
||||
return !result, nil
|
||||
}
|
||||
return result, nil
|
||||
case *sqlparser.ComparisonExpr:
|
||||
operator = expr.Operator
|
||||
switch right := expr.Right.(type) {
|
||||
case *sqlparser.FuncExpr:
|
||||
operand = evaluateFuncExpr(right, "", string(out))
|
||||
operand = gjson.Parse(evaluateFuncExpr(right, "", record))
|
||||
case *sqlparser.SQLVal:
|
||||
var err error
|
||||
operand, err = evaluateParserType(right)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
operand = gjson.ParseBytes(right.Val)
|
||||
}
|
||||
var myVal string
|
||||
myVal = ""
|
||||
switch left := expr.Left.(type) {
|
||||
case *sqlparser.FuncExpr:
|
||||
myVal = evaluateFuncExpr(left, "", string(out))
|
||||
myVal = evaluateFuncExpr(left, "", record)
|
||||
conversionColumn = ""
|
||||
case *sqlparser.ColName:
|
||||
conversionColumn = left.Name.CompliantName()
|
||||
}
|
||||
|
||||
if myVal != "" {
|
||||
return evaluateOperator(myVal, operator, operand)
|
||||
return evaluateOperator(gjson.Parse(myVal), operator, operand)
|
||||
}
|
||||
return evaluateOperator(jsonValue(conversionColumn, string(out)), operator, operand)
|
||||
return evaluateOperator(gjson.GetBytes(record, conversionColumn), operator, operand)
|
||||
case *sqlparser.AndExpr:
|
||||
var leftVal bool
|
||||
var rightVal bool
|
||||
@@ -127,58 +111,50 @@ func matchesMyWhereClause(record map[string]interface{}, alias string, whereClau
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func applyStrFunc(rawArg string, funcName string) string {
|
||||
func applyStrFunc(rawArg gjson.Result, funcName string) string {
|
||||
switch strings.ToUpper(funcName) {
|
||||
case "TRIM":
|
||||
// parser has an issue which does not allow it to support Trim with other
|
||||
// arguments
|
||||
return strings.Trim(rawArg, " ")
|
||||
// parser has an issue which does not allow it to support
|
||||
// Trim with other arguments
|
||||
return strings.Trim(rawArg.String(), " ")
|
||||
case "SUBSTRING":
|
||||
// TODO parser has an issue which does not support substring
|
||||
return rawArg
|
||||
// TODO: parser has an issue which does not support substring
|
||||
return rawArg.String()
|
||||
case "CHAR_LENGTH":
|
||||
return strconv.Itoa(len(rawArg))
|
||||
return strconv.Itoa(len(rawArg.String()))
|
||||
case "CHARACTER_LENGTH":
|
||||
return strconv.Itoa(len(rawArg))
|
||||
return strconv.Itoa(len(rawArg.String()))
|
||||
case "LOWER":
|
||||
return strings.ToLower(rawArg)
|
||||
return strings.ToLower(rawArg.String())
|
||||
case "UPPER":
|
||||
return strings.ToUpper(rawArg)
|
||||
return strings.ToUpper(rawArg.String())
|
||||
}
|
||||
return rawArg
|
||||
return rawArg.String()
|
||||
|
||||
}
|
||||
|
||||
// evaluateBetween is a function which evaluates a Between Clause.
|
||||
func evaluateBetween(betweenExpr *sqlparser.RangeCond, alias string, record string) (bool, error) {
|
||||
var colToVal interface{}
|
||||
var colFromVal interface{}
|
||||
func evaluateBetween(betweenExpr *sqlparser.RangeCond, alias string, record []byte) (bool, error) {
|
||||
var colToVal gjson.Result
|
||||
var colFromVal gjson.Result
|
||||
var conversionColumn string
|
||||
var funcName string
|
||||
switch colTo := betweenExpr.To.(type) {
|
||||
case sqlparser.Expr:
|
||||
switch colToMyVal := colTo.(type) {
|
||||
case *sqlparser.FuncExpr:
|
||||
colToVal = stringOps(colToMyVal, record, "")
|
||||
colToVal = gjson.Parse(stringOps(colToMyVal, record, ""))
|
||||
case *sqlparser.SQLVal:
|
||||
var err error
|
||||
colToVal, err = evaluateParserType(colToMyVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
colToVal = gjson.ParseBytes(colToMyVal.Val)
|
||||
}
|
||||
}
|
||||
switch colFrom := betweenExpr.From.(type) {
|
||||
case sqlparser.Expr:
|
||||
switch colFromMyVal := colFrom.(type) {
|
||||
case *sqlparser.FuncExpr:
|
||||
colFromVal = stringOps(colFromMyVal, record, "")
|
||||
colFromVal = gjson.Parse(stringOps(colFromMyVal, record, ""))
|
||||
case *sqlparser.SQLVal:
|
||||
var err error
|
||||
colFromVal, err = evaluateParserType(colFromMyVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
colFromVal = gjson.ParseBytes(colFromMyVal.Val)
|
||||
}
|
||||
}
|
||||
var myFuncVal string
|
||||
@@ -189,7 +165,7 @@ func evaluateBetween(betweenExpr *sqlparser.RangeCond, alias string, record stri
|
||||
case *sqlparser.ColName:
|
||||
conversionColumn = cleanCol(left.Name.CompliantName(), alias)
|
||||
}
|
||||
toGreater, err := evaluateOperator(fmt.Sprintf("%v", colToVal), ">", colFromVal)
|
||||
toGreater, err := evaluateOperator(colToVal, ">", colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
@@ -199,113 +175,87 @@ func evaluateBetween(betweenExpr *sqlparser.RangeCond, alias string, record stri
|
||||
return evalBetweenLess(conversionColumn, record, funcName, colFromVal, colToVal, myFuncVal)
|
||||
}
|
||||
|
||||
// evalBetweenGreater is a function which evaluates the between given that the
|
||||
// TO is > than the FROM.
|
||||
func evalBetweenGreater(conversionColumn string, record string, funcName string, colFromVal interface{}, colToVal interface{}, myColVal string) (bool, error) {
|
||||
func evalBetween(conversionColumn string, record []byte, funcName string, colFromVal gjson.Result, colToVal gjson.Result, myColVal string, operator string) (bool, error) {
|
||||
if format.IsInt(conversionColumn) {
|
||||
myVal, err := evaluateOperator(jsonValue("_"+conversionColumn, record), ">=", colFromVal)
|
||||
myVal, err := evaluateOperator(gjson.GetBytes(record, "_"+conversionColumn), operator, colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), ">=", checkStringType(jsonValue("_"+conversionColumn, record)))
|
||||
myOtherVal, err = evaluateOperator(colToVal, operator, gjson.GetBytes(record, "_"+conversionColumn))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
}
|
||||
if myColVal != "" {
|
||||
myVal, err := evaluateOperator(myColVal, ">=", colFromVal)
|
||||
myVal, err := evaluateOperator(gjson.Parse(myColVal), operator, colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), ">=", checkStringType(myColVal))
|
||||
myOtherVal, err = evaluateOperator(colToVal, operator, gjson.Parse(myColVal))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
}
|
||||
myVal, err := evaluateOperator(jsonValue(conversionColumn, record), ">=", colFromVal)
|
||||
myVal, err := evaluateOperator(gjson.GetBytes(record, conversionColumn), operator, colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), ">=", checkStringType(jsonValue(conversionColumn, record)))
|
||||
myOtherVal, err = evaluateOperator(colToVal, operator, gjson.GetBytes(record, conversionColumn))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
}
|
||||
|
||||
// evalBetweenGreater is a function which evaluates the between given that the
|
||||
// TO is > than the FROM.
|
||||
func evalBetweenGreater(conversionColumn string, record []byte, funcName string, colFromVal gjson.Result, colToVal gjson.Result, myColVal string) (bool, error) {
|
||||
return evalBetween(conversionColumn, record, funcName, colFromVal, colToVal, myColVal, ">=")
|
||||
}
|
||||
|
||||
// evalBetweenLess is a function which evaluates the between given that the
|
||||
// FROM is > than the TO.
|
||||
func evalBetweenLess(conversionColumn string, record string, funcName string, colFromVal interface{}, colToVal interface{}, myColVal string) (bool, error) {
|
||||
if format.IsInt(conversionColumn) {
|
||||
// Subtract 1 out because the index starts at 1 for Amazon instead of 0.
|
||||
myVal, err := evaluateOperator(jsonValue("_"+conversionColumn, record), "<=", colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), "<=", checkStringType(jsonValue("_"+conversionColumn, record)))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
}
|
||||
if myColVal != "" {
|
||||
myVal, err := evaluateOperator(myColVal, "<=", colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), "<=", checkStringType(myColVal))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
}
|
||||
myVal, err := evaluateOperator(jsonValue(conversionColumn, record), "<=", colFromVal)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
var myOtherVal bool
|
||||
myOtherVal, err = evaluateOperator(fmt.Sprintf("%v", colToVal), "<=", checkStringType(jsonValue(conversionColumn, record)))
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return (myVal && myOtherVal), nil
|
||||
func evalBetweenLess(conversionColumn string, record []byte, funcName string, colFromVal gjson.Result, colToVal gjson.Result, myColVal string) (bool, error) {
|
||||
return evalBetween(conversionColumn, record, funcName, colFromVal, colToVal, myColVal, "<=")
|
||||
}
|
||||
|
||||
// This is a really important function it actually evaluates the boolean
|
||||
// statement and therefore actually returns a bool, it functions as the lowest
|
||||
// level of the state machine.
|
||||
func evaluateOperator(myTblVal string, operator string, operand interface{}) (bool, error) {
|
||||
func evaluateOperator(myTblVal gjson.Result, operator string, operand gjson.Result) (bool, error) {
|
||||
if err := checkValidOperator(operator); err != nil {
|
||||
return false, err
|
||||
}
|
||||
myRecordVal := checkStringType(myTblVal)
|
||||
myVal := reflect.ValueOf(myRecordVal)
|
||||
myOp := reflect.ValueOf(operand)
|
||||
|
||||
switch {
|
||||
case myVal.Kind() == reflect.String && myOp.Kind() == reflect.String:
|
||||
return stringEval(myVal.String(), operator, myOp.String())
|
||||
case myVal.Kind() == reflect.Float64 && myOp.Kind() == reflect.Float64:
|
||||
return floatEval(myVal.Float(), operator, myOp.Float())
|
||||
case myVal.Kind() == reflect.Int && myOp.Kind() == reflect.Int:
|
||||
return intEval(myVal.Int(), operator, myOp.Int())
|
||||
case myVal.Kind() == reflect.Int && myOp.Kind() == reflect.String:
|
||||
stringVs := strconv.Itoa(int(myVal.Int()))
|
||||
return stringEval(stringVs, operator, myOp.String())
|
||||
case myVal.Kind() == reflect.Float64 && myOp.Kind() == reflect.String:
|
||||
stringVs := strconv.FormatFloat(myVal.Float(), 'f', 6, 64)
|
||||
return stringEval(stringVs, operator, myOp.String())
|
||||
case myVal.Kind() != myOp.Kind():
|
||||
if !myTblVal.Exists() {
|
||||
return false, nil
|
||||
}
|
||||
return false, ErrUnsupportedSyntax
|
||||
switch {
|
||||
case operand.Type == gjson.String || operand.Type == gjson.Null:
|
||||
return stringEval(myTblVal.String(), operator, operand.String())
|
||||
case operand.Type == gjson.Number:
|
||||
opInt := format.IsInt(operand.Raw)
|
||||
tblValInt := format.IsInt(strings.Trim(myTblVal.Raw, "\""))
|
||||
if opInt && tblValInt {
|
||||
return intEval(int64(myTblVal.Float()), operator, operand.Int())
|
||||
}
|
||||
if !opInt && !tblValInt {
|
||||
return floatEval(myTblVal.Float(), operator, operand.Float())
|
||||
}
|
||||
switch operator {
|
||||
case "!=":
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
case myTblVal.Type != operand.Type:
|
||||
return false, nil
|
||||
default:
|
||||
return false, ErrUnsupportedSyntax
|
||||
}
|
||||
}
|
||||
|
||||
// checkValidOperator ensures that the current operator is supported
|
||||
@@ -319,19 +269,6 @@ func checkValidOperator(operator string) error {
|
||||
return ErrParseUnknownOperator
|
||||
}
|
||||
|
||||
// checkStringType converts the value from the csv to the appropriate one.
|
||||
func checkStringType(tblVal string) interface{} {
|
||||
intVal, err := strconv.Atoi(tblVal)
|
||||
if err == nil {
|
||||
return intVal
|
||||
}
|
||||
floatVal, err := strconv.ParseFloat(tblVal, 64)
|
||||
if err == nil {
|
||||
return floatVal
|
||||
}
|
||||
return tblVal
|
||||
}
|
||||
|
||||
// stringEval is for evaluating the state of string comparison.
|
||||
func stringEval(myRecordVal string, operator string, myOperand string) (bool, error) {
|
||||
switch operator {
|
||||
@@ -586,48 +523,17 @@ func aggFuncToStr(aggVals []float64, f format.Select) string {
|
||||
}
|
||||
|
||||
// checkForDuplicates ensures we do not have an ambigious column name.
|
||||
func checkForDuplicates(columns []string, columnsMap map[string]int, hasDuplicates map[string]bool, lowercaseColumnsMap map[string]int) error {
|
||||
for i := 0; i < len(columns); i++ {
|
||||
columns[i] = strings.Replace(columns[i], " ", "_", len(columns[i]))
|
||||
func checkForDuplicates(columns []string, columnsMap map[string]int) error {
|
||||
for i, column := range columns {
|
||||
columns[i] = strings.Replace(column, " ", "_", len(column))
|
||||
if _, exist := columnsMap[columns[i]]; exist {
|
||||
return ErrAmbiguousFieldName
|
||||
}
|
||||
columnsMap[columns[i]] = i
|
||||
// This checks that if a key has already been put into the map, that we're
|
||||
// setting its appropriate value in has duplicates to be true.
|
||||
if _, exist := lowercaseColumnsMap[strings.ToLower(columns[i])]; exist {
|
||||
hasDuplicates[strings.ToLower(columns[i])] = true
|
||||
} else {
|
||||
lowercaseColumnsMap[strings.ToLower(columns[i])] = i
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// evaluateParserType is a function that takes a SQL value and returns it as an
|
||||
// interface converted into the appropriate value.
|
||||
func evaluateParserType(col *sqlparser.SQLVal) (interface{}, error) {
|
||||
colDataType := col.Type
|
||||
var val interface{}
|
||||
switch colDataType {
|
||||
case 0:
|
||||
val = string(col.Val)
|
||||
case 1:
|
||||
intVersion, isInt := strconv.Atoi(string(col.Val))
|
||||
if isInt != nil {
|
||||
return nil, ErrIntegerOverflow
|
||||
}
|
||||
val = intVersion
|
||||
case 2:
|
||||
floatVersion, isFloat := strconv.ParseFloat(string(col.Val), 64)
|
||||
if isFloat != nil {
|
||||
return nil, ErrIntegerOverflow
|
||||
}
|
||||
val = floatVersion
|
||||
}
|
||||
return val, nil
|
||||
}
|
||||
|
||||
// parseErrs is the function which handles all the errors that could occur
|
||||
// through use of function arguments such as column names in NULLIF
|
||||
func parseErrs(columnNames []string, whereClause interface{}, alias string, myFuncs SelectFuncs, f format.Select) error {
|
||||
@@ -655,10 +561,3 @@ func parseErrs(columnNames []string, whereClause interface{}, alias string, myFu
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// It return the value corresponding to the tag in Json .
|
||||
// Input is the Key and row is the JSON string
|
||||
func jsonValue(input string, row string) string {
|
||||
value := gjson.Get(row, input)
|
||||
return value.String()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user