Add new SQL parser to support S3 Select syntax (#7102)

- New parser written from scratch, allowing easier and more complete parsing of the full S3 Select SQL syntax. The parser definition is derived directly from the AST defined for the SQL grammar. - Adds support for parsing and interpreting SQL involving JSON path expressions; evaluation of JSON path expressions will be added subsequently. - Adds automatic type inference and conversion for untyped values (e.g. CSV data).
2025-11-20 18:06:10 -05:00 · 2019-01-28 17:59:48 -08:00
parent 0a28c28a8c
commit 2786055df4
65 changed files with 6405 additions and 18231 deletions
--- a/pkg/s3select/sql/value.go
+++ b/pkg/s3select/sql/value.go
@@ -17,220 +17,710 @@
 package sql

 import (
-	"encoding/json"
+	"errors"
 	"fmt"
+	"math"
 	"strconv"
 	"strings"
-	"time"
-
-	"github.com/xwb1989/sqlparser"
 )

-// Value - represents any primitive value of bool, int, float, string and time.
+// Sentinel errors returned by the arithmetic and comparison helpers
+// defined in this file.
+var (
+	// Arithmetic failures.
+	errArithMismatchedTypes = errors.New("cannot perform arithmetic on mismatched types")
+	errArithInvalidOperator = errors.New("invalid arithmetic operator")
+	errArithDivideByZero    = errors.New("cannot divide by 0")
+
+	// Comparison failures.
+	errCmpMismatchedTypes     = errors.New("cannot compare values of different types")
+	errCmpInvalidBoolOperator = errors.New("invalid comparison operator for boolean arguments")
+)
+
+// vType represents the concrete type of a `Value`.
+type vType int
+
+// Valid values for vType. Numbering starts at 1, so the zero value of
+// vType matches none of these constants.
+const (
+	typeNull vType = iota + 1
+	typeBool
+	typeString
+
+	// 64-bit signed integer
+	typeInt
+
+	// 64-bit floating point
+	typeFloat
+
+	// This type refers to untyped values, e.g. as read from CSV
+	typeBytes
+)
+
+// Value represents a value of restricted type reduced from an
+// expression represented by an ASTNode. The vType field is a tag
+// recording which kind of data the value field holds; the From*
+// constructors in this file store a bool, string, int64, float64 or
+// []byte, and leave value nil for NULL.
+//
+// In cases where we are fetching data from a data source (like csv),
+// the type may not be determined yet. In these cases, a byte-slice is
+// used.
 type Value struct {
-	value     interface{}
-	valueType Type
+	value interface{}
+	vType vType
 }

-// String - represents value as string.
-func (value *Value) String() string {
-	if value.value == nil {
-		if value.valueType == Null {
-			return "NULL"
-		}
-
-		return "<nil>"
+// GetTypeString returns the upper-case name of v's type tag: "NULL",
+// "BOOL", "STRING", "INT", "FLOAT" or "BYTES". A tag that is not one
+// of the declared vType constants is reported as "--".
+func (v *Value) GetTypeString() string {
+	switch v.vType {
+	case typeNull:
+		return "NULL"
+	case typeBool:
+		return "BOOL"
+	case typeString:
+		return "STRING"
+	case typeInt:
+		return "INT"
+	case typeFloat:
+		return "FLOAT"
+	case typeBytes:
+		return "BYTES"
 	}
-
-	switch value.valueType {
-	case String:
-		return fmt.Sprintf("'%v'", value.value)
-	case Array:
-		var valueStrings []string
-		for _, v := range value.value.([]*Value) {
-			valueStrings = append(valueStrings, fmt.Sprintf("%v", v))
-		}
-		return fmt.Sprintf("(%v)", strings.Join(valueStrings, ","))
-	}
-
-	return fmt.Sprintf("%v", value.value)
+	// Unrecognised type tag.
+	return "--"
 }

-// CSVString - encodes to CSV string.
-func (value *Value) CSVString() string {
-	if value.valueType == Null {
+// Repr renders v as "<value>:<TYPE>" for debugging. String and
+// byte-slice payloads are wrapped in double quotes, NULL is rendered
+// as ":NULL", and an unrecognised type tag is labelled "INVALID".
+func (v *Value) Repr() string {
+	switch v.vType {
+	case typeBytes:
+		return fmt.Sprintf("\"%s\":BYTES", string(v.value.([]byte)))
+	case typeString:
+		return fmt.Sprintf("\"%s\":%s", v.value.(string), v.GetTypeString())
+	case typeBool, typeInt, typeFloat:
+		return fmt.Sprintf("%v:%s", v.value, v.GetTypeString())
+	case typeNull:
+		return ":NULL"
+	}
+	return fmt.Sprintf("%v:INVALID", v.value)
+}
+
+// FromFloat wraps a float64 in a new Value tagged typeFloat.
+func FromFloat(f float64) *Value {
+	v := &Value{}
+	v.setFloat(f)
+	return v
+}
+
+// FromInt wraps an int64 in a new Value tagged typeInt.
+func FromInt(f int64) *Value {
+	v := &Value{}
+	v.setInt(f)
+	return v
+}
+
+// FromString wraps a string in a new Value tagged typeString.
+func FromString(str string) *Value {
+	v := &Value{}
+	v.setString(str)
+	return v
+}
+
+// FromBool wraps a bool in a new Value tagged typeBool.
+func FromBool(b bool) *Value {
+	v := &Value{}
+	v.setBool(b)
+	return v
+}
+
+// FromNull returns a new Value tagged typeNull with no payload.
+func FromNull() *Value {
+	v := new(Value)
+	v.vType = typeNull
+	return v
+}
+
+// FromBytes wraps a byte-slice in a new Value tagged typeBytes, i.e.
+// an untyped value. The slice is stored as given, not copied.
+func FromBytes(b []byte) *Value {
+	v := new(Value)
+	v.value, v.vType = b, typeBytes
+	return v
+}
+
+// ToFloat returns v as a float64. It succeeds for float values and
+// for int values (which are converted); for every other type it
+// returns (0, false).
+func (v *Value) ToFloat() (val float64, ok bool) {
+	if v.vType == typeFloat {
+		val, ok = v.value.(float64)
+		return val, ok
+	}
+	if v.vType == typeInt {
+		asInt, isInt := v.value.(int64)
+		return float64(asInt), isInt
+	}
+	return 0, false
+}
+
+// ToInt returns v's payload as an int64. ok is true only when v is
+// tagged typeInt and holds an int64; floats are not converted.
+func (v *Value) ToInt() (val int64, ok bool) {
+	if v.vType == typeInt {
+		val, ok = v.value.(int64)
+	}
+	return val, ok
+}
+
+// ToString returns v's payload as a string. ok is true only when v
+// is tagged typeString and holds a string; no other type is
+// converted.
+func (v *Value) ToString() (val string, ok bool) {
+	if v.vType == typeString {
+		val, ok = v.value.(string)
+	}
+	return val, ok
+}
+
+// ToBool returns the bool value; the second return value reports
+// whether v is tagged typeBool and actually holds a bool.
+//
+// The payload is read with a checked type assertion, like ToInt,
+// ToFloat and ToString, so a Value whose tag and payload disagree
+// yields (false, false) instead of panicking.
+func (v *Value) ToBool() (val bool, ok bool) {
+	if v.vType != typeBool {
+		return false, false
+	}
+	val, ok = v.value.(bool)
+	return val, ok
+}
+
+// ToBytes returns the raw byte-slice of an untyped Value. The second
+// return value reports whether v is tagged typeBytes and actually
+// holds a []byte.
+//
+// The payload is read with a checked type assertion, like ToInt,
+// ToFloat and ToString, so a Value whose tag and payload disagree
+// yields (nil, false) instead of panicking.
+func (v *Value) ToBytes() ([]byte, bool) {
+	if v.vType != typeBytes {
+		return nil, false
+	}
+	b, ok := v.value.([]byte)
+	return b, ok
+}
+
+// IsNull reports whether v is tagged typeNull.
+func (v *Value) IsNull() bool {
+	if v.vType != typeNull {
+		return false
+	}
+	return true
+}
+
+// isNumeric reports whether v is tagged as an int or a float.
+func (v *Value) isNumeric() bool {
+	switch v.vType {
+	case typeInt, typeFloat:
+		return true
+	}
+	return false
+}
+
+// setters used internally to mutate values
+
+// setInt overwrites v in place with the int64 i.
+func (v *Value) setInt(i int64) {
+	v.value, v.vType = i, typeInt
+}
+
+// setFloat overwrites v in place with the float64 f.
+func (v *Value) setFloat(f float64) {
+	v.value, v.vType = f, typeFloat
+}
+
+// setString overwrites v in place with the string s.
+func (v *Value) setString(s string) {
+	v.value, v.vType = s, typeString
+}
+
+// setBool overwrites v in place with the bool b.
+func (v *Value) setBool(b bool) {
+	v.value, v.vType = b, typeBool
+}
+
+// CSVString renders v as the text of a single CSV field.
+//
+// NULL becomes the empty string; strings and untyped byte-slices are
+// returned verbatim (no quoting or escaping is applied here); bools,
+// ints and floats use their default Go textual form. A Value with an
+// unrecognised type tag yields a fixed diagnostic message rather
+// than an error.
+func (v *Value) CSVString() string {
+	switch v.vType {
+	case typeNull:
 		return ""
+	case typeBool:
+		return strconv.FormatBool(v.value.(bool))
+	case typeString:
+		return v.value.(string)
+	case typeInt:
+		return strconv.FormatInt(v.value.(int64), 10)
+	case typeFloat:
+		// 'g' with precision -1 is the shortest round-trip form,
+		// which is what fmt's %v verb produces for a float64.
+		return strconv.FormatFloat(v.value.(float64), 'g', -1, 64)
+	case typeBytes:
+		return string(v.value.([]byte))
+	default:
+		return "CSV serialization not implemented for this type"
+	}
+}
+
+// floatToValue converts a float into an int Value when it is a whole
+// number that fits in an int64; otherwise it returns a float Value.
+//
+// The range check matters: converting a float64 outside the int64
+// range to int64 gives an implementation-dependent result in Go, so
+// whole numbers such as 1e30 stay floats.
+func floatToValue(f float64) *Value {
+	const (
+		minInt64 = -(1 << 63) // -2^63, exactly representable as float64
+		limInt64 = 1 << 63    // 2^63, the first whole number above the int64 range
+	)
+
+	intPart, fracPart := math.Modf(f)
+	if fracPart == 0 && intPart >= minInt64 && intPart < limInt64 {
+		return FromInt(int64(intPart))
+	}
+	return FromFloat(f)
+}
+
+// Value comparison functions: we do not expose them outside the
+// package. Relational operators "<", ">", ">=", "<=" work on strings
+// and numbers. Equality operators "=", "!=" work on strings,
+// numbers and booleans.
+
+// Supported comparison operators, spelled as they appear in SQL text.
+const (
+	opLt   = "<"
+	opLte  = "<="
+	opGt   = ">"
+	opGte  = ">="
+	opEq   = "="
+	opIneq = "!="
+)
+
+// compareOp evaluates `v op a` and returns the boolean result.
+//
+// Untyped (byte-slice) operands are first given a type by
+// inferTypesForCmp, which mutates v and/or a in place (numbers are
+// tried first, then booleans, then strings). After that:
+//   - two ints are compared as ints;
+//   - any other pair of numeric values is compared as float64;
+//   - two strings are compared as strings;
+//   - two booleans support only "=" and "!=".
+//
+// Any other combination of types yields errCmpMismatchedTypes. An
+// operator outside the supported comparison set yields
+// errArithInvalidOperator.
+func (v *Value) compareOp(op string, a *Value) (res bool, err error) {
+	if !isValidComparisonOperator(op) {
+		return false, errArithInvalidOperator
 	}

-	return fmt.Sprintf("%v", value.value)
+	// Check if type conversion/inference is needed - it is needed
+	// if the Value is a byte-slice.
+	err = inferTypesForCmp(v, a)
+	if err != nil {
+		return false, err
+	}
+
+	isNumeric := v.isNumeric() && a.isNumeric()
+	if isNumeric {
+		intV, ok1i := v.ToInt()
+		intA, ok2i := a.ToInt()
+		if ok1i && ok2i {
+			return intCompare(op, intV, intA), nil
+		}
+
+		// If both values are numeric, then at least one is
+		// float since we got here, so we convert.
+		flV, _ := v.ToFloat()
+		flA, _ := a.ToFloat()
+		return floatCompare(op, flV, flA), nil
+	}
+
+	strV, ok1s := v.ToString()
+	strA, ok2s := a.ToString()
+	if ok1s && ok2s {
+		return stringCompare(op, strV, strA), nil
+	}
+
+	// Both operands must be booleans. The right-hand value has to be
+	// read from `a`: reading `v` twice would compare v with itself and
+	// accept a bool on the left with any non-bool on the right.
+	boolV, ok1b := v.ToBool()
+	boolA, ok2b := a.ToBool()
+	if ok1b && ok2b {
+		return boolCompare(op, boolV, boolA)
+	}
+
+	return false, errCmpMismatchedTypes
 }

-// MarshalJSON - encodes to JSON data.
-func (value *Value) MarshalJSON() ([]byte, error) {
-	return json.Marshal(value.value)
+// inferTypesForCmp assigns concrete types to untyped (byte-slice)
+// operands ahead of a comparison, mutating a and/or b in place.
+//
+//   - Neither is untyped: nothing changes.
+//   - Both are untyped: the first of int, float, bool that both
+//     values parse as is used; otherwise both become strings.
+//   - Exactly one is untyped: it is converted to match the typed
+//     operand (string, number or bool). A failed conversion, or a
+//     typed operand of any other type, is reported as an error.
+func inferTypesForCmp(a *Value, b *Value) error {
+	_, okA := a.ToBytes()
+	_, okB := b.ToBytes()
+	switch {
+	case !okA && !okB:
+		// Both Values already have types
+		return nil
+
+	case okA && okB:
+		// Both Values are untyped so try the types in order:
+		// int, float, bool, string
+
+		// Check for numeric inference
+		iA, okAi := a.bytesToInt()
+		iB, okBi := b.bytesToInt()
+		if okAi && okBi {
+			a.setInt(iA)
+			b.setInt(iB)
+			return nil
+		}
+
+		fA, okAf := a.bytesToFloat()
+		fB, okBf := b.bytesToFloat()
+		if okAf && okBf {
+			a.setFloat(fA)
+			b.setFloat(fB)
+			return nil
+		}
+
+		// Check if they are an int and float combination. Each
+		// operand must receive its own parsed value.
+		// NOTE(review): text accepted by bytesToInt presumably also
+		// parses via bytesToFloat, in which case the float branch
+		// above has already returned and these two branches are
+		// only a safety net - confirm.
+		if okAi && okBf {
+			a.setInt(iA)
+			b.setFloat(fB)
+			return nil
+		}
+		if okBi && okAf {
+			a.setFloat(fA)
+			b.setInt(iB)
+			return nil
+		}
+
+		// Not numeric types at this point.
+
+		// Check for bool inference
+		bA, okAb := a.bytesToBool()
+		bB, okBb := b.bytesToBool()
+		if okAb && okBb {
+			a.setBool(bA)
+			b.setBool(bB)
+			return nil
+		}
+
+		// Fallback to string
+		sA := a.bytesToString()
+		sB := b.bytesToString()
+		a.setString(sA)
+		b.setString(sB)
+		return nil
+
+	case okA && !okB:
+		// Here `a` is untyped, but `b` has a fixed type.
+		switch b.vType {
+		case typeString:
+			s := a.bytesToString()
+			a.setString(s)
+
+		case typeInt, typeFloat:
+			if iA, ok := a.bytesToInt(); ok {
+				a.setInt(iA)
+			} else if fA, ok := a.bytesToFloat(); ok {
+				a.setFloat(fA)
+			} else {
+				return fmt.Errorf("Could not convert %s to a number", string(a.value.([]byte)))
+			}
+
+		case typeBool:
+			if bA, ok := a.bytesToBool(); ok {
+				a.setBool(bA)
+			} else {
+				return fmt.Errorf("Could not convert %s to a boolean", string(a.value.([]byte)))
+			}
+
+		default:
+			return errCmpMismatchedTypes
+		}
+		return nil
+
+	case !okA && okB:
+		// swap arguments to avoid repeating code
+		return inferTypesForCmp(b, a)
+
+	default:
+		// Does not happen
+		return nil
+	}
 }

-// NullValue - returns underlying null value. It panics if value is not null type.
-func (value *Value) NullValue() *struct{} {
-	if value.valueType == Null {
+// Value arithmetic functions: we do not expose them outside the
+// package. All arithmetic works only on numeric values with automatic
+// promotion to the "larger" type that can represent the value. TODO:
+// Add support for large number arithmetic.
+
+// Supported arithmetic operators, spelled as they appear in SQL text.
+const (
+	opPlus     = "+"
+	opMinus    = "-"
+	opDivide   = "/"
+	opMultiply = "*"
+	opModulo   = "%"
+)
+
+// arithOp computes `v op a` and stores the result in v.
+//
+// Untyped (byte-slice) operands are first converted to numbers in
+// place via inferTypeForArithOp. If both operands are then ints the
+// operation is carried out on int64 and v becomes an int; otherwise
+// both are promoted to float64 and v becomes a float.
+//
+// Errors: a conversion failure from inferTypeForArithOp; a wrapped
+// errArithMismatchedTypes if either operand is not numeric; a wrapped
+// errArithInvalidOperator if op is not a supported operator; or the
+// error from the underlying operation (e.g. errArithDivideByZero).
+// When the operation itself fails, v keeps the (possibly
+// type-inferred) value it had before the operation.
+func (v *Value) arithOp(op string, a *Value) error {
+	err := inferTypeForArithOp(v)
+	if err != nil {
+		return err
+	}
+
+	err = inferTypeForArithOp(a)
+	if err != nil {
+		return err
+	}
+
+	if !v.isNumeric() || !a.isNumeric() {
+		return errInvalidDataType(errArithMismatchedTypes)
+	}
+
+	if !isValidArithOperator(op) {
+		return errInvalidDataType(errArithInvalidOperator)
+	}
+
+	intV, ok1i := v.ToInt()
+	intA, ok2i := a.ToInt()
+	if ok1i && ok2i {
+		res, err := intArithOp(op, intV, intA)
+		if err != nil {
+			return err
+		}
+		v.setInt(res)
+		return nil
+	}
+
+	// At least one operand is a float: convert both to float.
+	flV, _ := v.ToFloat()
+	flA, _ := a.ToFloat()
+	res, err := floatArithOp(op, flV, flA)
+	if err != nil {
+		return err
+	}
+	v.setFloat(res)
+	return nil
+}
+
+// inferTypeForArithOp converts an untyped (byte-slice) Value into a
+// number in place, trying int first and then float. Values that
+// already have a type are left untouched. If the bytes parse as
+// neither, the failure is returned wrapped by errInvalidDataType.
+func inferTypeForArithOp(a *Value) error {
+	if _, ok := a.ToBytes(); !ok {
 		return nil
 	}

-	panic(fmt.Sprintf("requested bool value but found %T type", value.value))
-}
-
-// BoolValue - returns underlying bool value. It panics if value is not Bool type.
-func (value *Value) BoolValue() bool {
-	if value.valueType == Bool {
-		return value.value.(bool)
+	if i, ok := a.bytesToInt(); ok {
+		a.setInt(i)
+		return nil
 	}

-	panic(fmt.Sprintf("requested bool value but found %T type", value.value))
-}
-
-// IntValue - returns underlying int value. It panics if value is not Int type.
-func (value *Value) IntValue() int64 {
-	if value.valueType == Int {
-		return value.value.(int64)
+	if f, ok := a.bytesToFloat(); ok {
+		a.setFloat(f)
+		return nil
 	}

-	panic(fmt.Sprintf("requested int value but found %T type", value.value))
+	err := fmt.Errorf("Could not convert %s to a number", string(a.value.([]byte)))
+	return errInvalidDataType(err)
 }

-// FloatValue - returns underlying int/float value as float64. It panics if value is not Int/Float type.
-func (value *Value) FloatValue() float64 {
-	switch value.valueType {
-	case Int:
-		return float64(value.value.(int64))
-	case Float:
-		return value.value.(float64)
+// All the bytesTo* functions defined below assume the value is a byte-slice.
+
+// bytesToInt parses the untyped payload as a base-10 signed 64-bit
+// integer. The bool result is false only when parsing fails.
+func (v *Value) bytesToInt() (int64, bool) {
+	raw, _ := v.ToBytes()
+	parsed, err := strconv.ParseInt(string(raw), 10, 64)
+	if err != nil {
+		return parsed, false
+	}
+	return parsed, true
+}
+
+// bytesToFloat parses the untyped payload as a 64-bit float. The
+// bool result is false only when parsing fails.
+func (v *Value) bytesToFloat() (float64, bool) {
+	raw, _ := v.ToBytes()
+	parsed, err := strconv.ParseFloat(string(raw), 64)
+	if err != nil {
+		return parsed, false
+	}
+	return parsed, true
+}
+
+// bytesToBool parses the untyped payload as a boolean. Matching is
+// case-insensitive: "t"/"true" give true and "f"/"false" give false.
+// Any other text is a conversion failure, reported as (false, false).
+func (v *Value) bytesToBool() (val bool, ok bool) {
+	raw, _ := v.ToBytes()
+	lowered := strings.ToLower(string(raw))
+	if lowered == "t" || lowered == "true" {
+		return true, true
+	}
+	if lowered == "f" || lowered == "false" {
+		return false, true
+	}
+	return false, false
+}
+
+// bytesToString returns the untyped payload as a string; it cannot
+// fail.
+func (v *Value) bytesToString() string {
+	raw, _ := v.ToBytes()
+	text := string(raw)
+	return text
+}
+
+// minmax folds a into the running minimum (isMax false) or maximum
+// (isMax true) held in v, storing the result in v.
+//
+// If a is untyped it is first converted to a number in place (int,
+// then float). When isFirstRow is true, v is simply overwritten with
+// a's numeric value. Otherwise v is assumed to already be numeric:
+// two ints produce an int result, and any other combination is
+// computed on float64 and leaves v as a float.
+//
+// It returns the error from inferTypeForArithOp when a cannot be
+// converted, or errArithMismatchedTypes when a is not numeric.
+func (v *Value) minmax(a *Value, isMax, isFirstRow bool) error {
+	err := inferTypeForArithOp(a)
+	if err != nil {
+		return err
 	}

-	panic(fmt.Sprintf("requested float value but found %T type", value.value))
-}
-
-// StringValue - returns underlying string value. It panics if value is not String type.
-func (value *Value) StringValue() string {
-	if value.valueType == String {
-		return value.value.(string)
+	if !a.isNumeric() {
+		return errArithMismatchedTypes
 	}

-	panic(fmt.Sprintf("requested string value but found %T type", value.value))
-}
-
-// TimeValue - returns underlying time value. It panics if value is not Timestamp type.
-func (value *Value) TimeValue() time.Time {
-	if value.valueType == Timestamp {
-		return value.value.(time.Time)
-	}
-
-	panic(fmt.Sprintf("requested time value but found %T type", value.value))
-}
-
-// ArrayValue - returns underlying value array. It panics if value is not Array type.
-func (value *Value) ArrayValue() []*Value {
-	if value.valueType == Array {
-		return value.value.([]*Value)
-	}
-
-	panic(fmt.Sprintf("requested array value but found %T type", value.value))
-}
-
-func (value *Value) recordValue() Record {
-	if value.valueType == record {
-		return value.value.(Record)
-	}
-
-	panic(fmt.Sprintf("requested record value but found %T type", value.value))
-}
-
-// Type - returns value type.
-func (value *Value) Type() Type {
-	return value.valueType
-}
-
-// Value - returns underneath value interface.
-func (value *Value) Value() interface{} {
-	return value.value
-}
-
-// NewNull - creates new null value.
-func NewNull() *Value {
-	return &Value{nil, Null}
-}
-
-// NewBool - creates new Bool value of b.
-func NewBool(b bool) *Value {
-	return &Value{b, Bool}
-}
-
-// NewInt - creates new Int value of i.
-func NewInt(i int64) *Value {
-	return &Value{i, Int}
-}
-
-// NewFloat - creates new Float value of f.
-func NewFloat(f float64) *Value {
-	return &Value{f, Float}
-}
-
-// NewString - creates new Sring value of s.
-func NewString(s string) *Value {
-	return &Value{s, String}
-}
-
-// NewTime - creates new Time value of t.
-func NewTime(t time.Time) *Value {
-	return &Value{t, Timestamp}
-}
-
-// NewArray - creates new Array value of values.
-func NewArray(values []*Value) *Value {
-	return &Value{values, Array}
-}
-
-func newRecordValue(r Record) *Value {
-	return &Value{r, record}
-}
-
-// NewValue - creates new Value from SQLVal v.
-func NewValue(v *sqlparser.SQLVal) (*Value, error) {
-	switch v.Type {
-	case sqlparser.StrVal:
-		return NewString(string(v.Val)), nil
-	case sqlparser.IntVal:
-		i64, err := strconv.ParseInt(string(v.Val), 10, 64)
-		if err != nil {
-			return nil, err
+	// In case of first row, set v to a.
+	if isFirstRow {
+		intA, okI := a.ToInt()
+		if okI {
+			v.setInt(intA)
+			return nil
 		}
-		return NewInt(i64), nil
-	case sqlparser.FloatVal:
-		f64, err := strconv.ParseFloat(string(v.Val), 64)
-		if err != nil {
-			return nil, err
-		}
-		return NewFloat(f64), nil
-	case sqlparser.HexNum: // represented as 0xDD
-		i64, err := strconv.ParseInt(string(v.Val), 16, 64)
-		if err != nil {
-			return nil, err
-		}
-		return NewInt(i64), nil
-	case sqlparser.HexVal: // represented as X'0DD'
-		i64, err := strconv.ParseInt(string(v.Val), 16, 64)
-		if err != nil {
-			return nil, err
-		}
-		return NewInt(i64), nil
-	case sqlparser.BitVal: // represented as B'00'
-		i64, err := strconv.ParseInt(string(v.Val), 2, 64)
-		if err != nil {
-			return nil, err
-		}
-		return NewInt(i64), nil
-	case sqlparser.ValArg:
-		// FIXME: the format is unknown and not sure how to handle it.
+		floatA, _ := a.ToFloat()
+		v.setFloat(floatA)
+		return nil
 	}

-	return nil, fmt.Errorf("unknown SQL value %v; %v ", v, v.Type)
+	// Both ints: stay in integer arithmetic so the result remains an int.
+	intV, ok1i := v.ToInt()
+	intA, ok2i := a.ToInt()
+	if ok1i && ok2i {
+		result := intV
+		if !isMax {
+			if intA < result {
+				result = intA
+			}
+		} else {
+			if intA > result {
+				result = intA
+			}
+		}
+		v.setInt(result)
+		return nil
+	}
+
+	// Otherwise compare as floats; v becomes a float.
+	floatV, _ := v.ToFloat()
+	floatA, _ := a.ToFloat()
+	var result float64
+	if !isMax {
+		result = math.Min(floatV, floatA)
+	} else {
+		result = math.Max(floatV, floatA)
+	}
+	v.setFloat(result)
+	return nil
+}
+
+// inferTypeAsString turns an untyped (byte-slice) value into a string
+// in place, for callers that need a string context. Values that
+// already have a type are left unchanged.
+func inferTypeAsString(v *Value) {
+	if raw, untyped := v.ToBytes(); untyped {
+		v.setString(string(raw))
+	}
+}
+
+// isValidComparisonOperator reports whether op is one of the
+// supported comparison operators.
+func isValidComparisonOperator(op string) bool {
+	switch op {
+	case opLt, opLte, opGt, opGte, opEq, opIneq:
+		return true
+	}
+	return false
+}
+
+// intCompare applies the comparison operator op to two ints. An
+// unrecognised operator yields false; callers validate op first.
+func intCompare(op string, left, right int64) bool {
+	var res bool
+	switch op {
+	case opEq:
+		res = left == right
+	case opIneq:
+		res = left != right
+	case opLt:
+		res = left < right
+	case opLte:
+		res = left <= right
+	case opGt:
+		res = left > right
+	case opGte:
+		res = left >= right
+	}
+	return res
+}
+
+// floatCompare applies the comparison operator op to two floats. An
+// unrecognised operator yields false; callers validate op first.
+func floatCompare(op string, left, right float64) bool {
+	var res bool
+	switch op {
+	case opEq:
+		res = left == right
+	case opIneq:
+		res = left != right
+	case opLt:
+		res = left < right
+	case opLte:
+		res = left <= right
+	case opGt:
+		res = left > right
+	case opGte:
+		res = left >= right
+	}
+	return res
+}
+
+// stringCompare applies the comparison operator op to two strings
+// using Go's built-in string ordering. An unrecognised operator
+// yields false; callers validate op first.
+func stringCompare(op string, left, right string) bool {
+	var res bool
+	switch op {
+	case opEq:
+		res = left == right
+	case opIneq:
+		res = left != right
+	case opLt:
+		res = left < right
+	case opLte:
+		res = left <= right
+	case opGt:
+		res = left > right
+	case opGte:
+		res = left >= right
+	}
+	return res
+}
+
+// boolCompare applies op to two booleans. Only "=" and "!=" are
+// defined; any other operator returns errCmpInvalidBoolOperator.
+func boolCompare(op string, left, right bool) (bool, error) {
+	if op == opEq {
+		return left == right, nil
+	}
+	if op == opIneq {
+		return left != right, nil
+	}
+	return false, errCmpInvalidBoolOperator
+}
+
+// isValidArithOperator reports whether op is one of the supported
+// arithmetic operators.
+func isValidArithOperator(op string) bool {
+	switch op {
+	case opPlus, opMinus, opDivide, opMultiply, opModulo:
+		return true
+	}
+	return false
+}
+
+// intArithOp applies the arithmetic operator op to two ints.
+// Division and modulo by zero return errArithDivideByZero. Overflow
+// is not detected. An unrecognised operator yields (0, nil); callers
+// validate op first.
+func intArithOp(op string, left, right int64) (int64, error) {
+	if (op == opDivide || op == opModulo) && right == 0 {
+		return 0, errArithDivideByZero
+	}
+
+	switch op {
+	case opPlus:
+		return left + right, nil
+	case opMinus:
+		return left - right, nil
+	case opMultiply:
+		return left * right, nil
+	case opDivide:
+		return left / right, nil
+	case opModulo:
+		return left % right, nil
+	}
+	// This does not happen
+	return 0, nil
+}
+
+// floatArithOp applies the arithmetic operator op to two floats.
+// Division and modulo by zero return errArithDivideByZero; modulo is
+// computed with math.Mod. Overflow is not detected. An unrecognised
+// operator yields (0, nil); callers validate op first.
+func floatArithOp(op string, left, right float64) (float64, error) {
+	if (op == opDivide || op == opModulo) && right == 0 {
+		return 0, errArithDivideByZero
+	}
+
+	switch op {
+	case opPlus:
+		return left + right, nil
+	case opMinus:
+		return left - right, nil
+	case opMultiply:
+		return left * right, nil
+	case opDivide:
+		return left / right, nil
+	case opModulo:
+		return math.Mod(left, right), nil
+	}
+	// This does not happen
+	return 0, nil
 }