minio/pkg/s3select/sql/parser.go
Klaus Post c9b8bd8de2 S3 Select: optimize output (#8238)
Queue output items and reuse them.
Remove the unneeded type system in sql and just use the Go type system.

In best case this is more than an order of magnitude speedup:

```
BenchmarkSelectAll_1M-12    	       1	1841049400 ns/op	274299728 B/op	 4198522 allocs/op
BenchmarkSelectAll_1M-12    	      14	  84833400 ns/op	169228346 B/op	 3146541 allocs/op
```
2019-09-17 05:56:27 +05:30

348 lines
12 KiB
Go

/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package sql
import (
"strings"
"github.com/alecthomas/participle"
"github.com/alecthomas/participle/lexer"
)
// Types with custom Capture interface for parsing
// Boolean is a type for a parsed Boolean literal
type Boolean bool
// Capture interface used by participle
func (b *Boolean) Capture(values []string) error {
*b = strings.ToLower(values[0]) == "true"
return nil
}
// LiteralString is a type for parsed SQL string literals
type LiteralString string
// Capture interface used by participle
func (ls *LiteralString) Capture(values []string) error {
// Remove enclosing single quote
n := len(values[0])
r := values[0][1 : n-1]
// Translate doubled quotes
*ls = LiteralString(strings.Replace(r, "''", "'", -1))
return nil
}
// ObjectKey is a type for parsed strings occurring in key paths
type ObjectKey struct {
Lit *LiteralString `parser:" \"[\" @LitString \"]\""`
ID *Identifier `parser:"| \".\" @@"`
}
// QuotedIdentifier is a type for parsed strings that are double
// quoted.
type QuotedIdentifier string
// Capture interface used by participle
func (qi *QuotedIdentifier) Capture(values []string) error {
// Remove enclosing quotes
n := len(values[0])
r := values[0][1 : n-1]
// Translate doubled quotes
*qi = QuotedIdentifier(strings.Replace(r, `""`, `"`, -1))
return nil
}
// Types representing AST of SQL statement. Only SELECT is supported.
// Select is the top level AST node type
type Select struct {
Expression *SelectExpression `parser:"\"SELECT\" @@"`
From *TableExpression `parser:"\"FROM\" @@"`
Where *Expression `parser:"( \"WHERE\" @@ )?"`
Limit *LitValue `parser:"( \"LIMIT\" @@ )?"`
}
// SelectExpression represents the items requested in the select
// statement
type SelectExpression struct {
All bool `parser:" @\"*\""`
Expressions []*AliasedExpression `parser:"| @@ { \",\" @@ }"`
}
// TableExpression represents the FROM clause
type TableExpression struct {
Table *JSONPath `parser:"@@"`
As string `parser:"( \"AS\"? @Ident )?"`
}
// JSONPathElement represents a keypath component
type JSONPathElement struct {
Key *ObjectKey `parser:" @@"` // ['name'] and .name forms
Index *int `parser:"| \"[\" @Number \"]\""` // [3] form
ObjectWildcard bool `parser:"| @\".*\""` // .* form
ArrayWildcard bool `parser:"| @\"[*]\""` // [*] form
}
// JSONPath represents a keypath.
// Instances should be treated idempotent and not change once created.
type JSONPath struct {
BaseKey *Identifier `parser:" @@"`
PathExpr []*JSONPathElement `parser:"(@@)*"`
// Cached values:
pathString string
}
// AliasedExpression is an expression that can be optionally named
type AliasedExpression struct {
Expression *Expression `parser:"@@"`
As string `parser:"[ \"AS\" @Ident ]"`
}
// Grammar for Expression
//
// Expression → AndCondition ("OR" AndCondition)*
// AndCondition → Condition ("AND" Condition)*
// Condition → "NOT" Condition | ConditionExpression
// ConditionExpression → ValueExpression ("=" | "<>" | "<=" | ">=" | "<" | ">") ValueExpression
// | ValueExpression "LIKE" ValueExpression ("ESCAPE" LitString)?
// | ValueExpression ("NOT"? "BETWEEN" ValueExpression "AND" ValueExpression)
// | ValueExpression "IN" "(" Expression ("," Expression)* ")"
// | ValueExpression
// ValueExpression → Operand
//
// Operand grammar follows below
// Expression represents a logical disjunction of clauses
type Expression struct {
And []*AndCondition `parser:"@@ ( \"OR\" @@ )*"`
}
// AndCondition represents logical conjunction of clauses
type AndCondition struct {
Condition []*Condition `parser:"@@ ( \"AND\" @@ )*"`
}
// Condition represents a negation or a condition operand
type Condition struct {
Operand *ConditionOperand `parser:" @@"`
Not *Condition `parser:"| \"NOT\" @@"`
}
// ConditionOperand is a operand followed by an an optional operation
// expression
type ConditionOperand struct {
Operand *Operand `parser:"@@"`
ConditionRHS *ConditionRHS `parser:"@@?"`
}
// ConditionRHS represents the right-hand-side of Compare, Between, In
// or Like expressions.
type ConditionRHS struct {
Compare *Compare `parser:" @@"`
Between *Between `parser:"| @@"`
In *In `parser:"| \"IN\" \"(\" @@ \")\""`
Like *Like `parser:"| @@"`
}
// Compare represents the RHS of a comparison expression
type Compare struct {
Operator string `parser:"@( \"<>\" | \"<=\" | \">=\" | \"=\" | \"<\" | \">\" | \"!=\" )"`
Operand *Operand `parser:" @@"`
}
// Like represents the RHS of a LIKE expression
type Like struct {
Not bool `parser:" @\"NOT\"? "`
Pattern *Operand `parser:" \"LIKE\" @@ "`
EscapeChar *Operand `parser:" (\"ESCAPE\" @@)? "`
}
// Between represents the RHS of a BETWEEN expression
type Between struct {
Not bool `parser:" @\"NOT\"? "`
Start *Operand `parser:" \"BETWEEN\" @@ "`
End *Operand `parser:" \"AND\" @@ "`
}
// In represents the RHS of an IN expression
type In struct {
Expressions []*Expression `parser:"@@ ( \",\" @@ )*"`
}
// Grammar for Operand:
//
// operand → multOp ( ("-" | "+") multOp )*
// multOp → unary ( ("/" | "*" | "%") unary )*
// unary → "-" unary | primary
// primary → Value | Variable | "(" expression ")"
//
// An Operand is a single term followed by an optional sequence of
// terms separated by +/-
type Operand struct {
Left *MultOp `parser:"@@"`
Right []*OpFactor `parser:"(@@)*"`
}
// OpFactor represents the right-side of a +/- operation.
type OpFactor struct {
Op string `parser:"@(\"+\" | \"-\")"`
Right *MultOp `parser:"@@"`
}
// MultOp represents a single term followed by an optional sequence of
// terms separated by *, / or % operators.
type MultOp struct {
Left *UnaryTerm `parser:"@@"`
Right []*OpUnaryTerm `parser:"(@@)*"`
}
// OpUnaryTerm represents the right side of *, / or % binary operations.
type OpUnaryTerm struct {
Op string `parser:"@(\"*\" | \"/\" | \"%\")"`
Right *UnaryTerm `parser:"@@"`
}
// UnaryTerm represents a single negated term or a primary term
type UnaryTerm struct {
Negated *NegatedTerm `parser:" @@"`
Primary *PrimaryTerm `parser:"| @@"`
}
// NegatedTerm has a leading minus sign.
type NegatedTerm struct {
Term *PrimaryTerm `parser:"\"-\" @@"`
}
// PrimaryTerm represents a Value, Path expression, a Sub-expression
// or a function call.
type PrimaryTerm struct {
Value *LitValue `parser:" @@"`
JPathExpr *JSONPath `parser:"| @@"`
SubExpression *Expression `parser:"| \"(\" @@ \")\""`
// Include function expressions here.
FuncCall *FuncExpr `parser:"| @@"`
}
// FuncExpr represents a function call
type FuncExpr struct {
SFunc *SimpleArgFunc `parser:" @@"`
Count *CountFunc `parser:"| @@"`
Cast *CastFunc `parser:"| @@"`
Substring *SubstringFunc `parser:"| @@"`
Extract *ExtractFunc `parser:"| @@"`
Trim *TrimFunc `parser:"| @@"`
DateAdd *DateAddFunc `parser:"| @@"`
DateDiff *DateDiffFunc `parser:"| @@"`
// Used during evaluation for aggregation funcs
aggregate *aggVal
}
// SimpleArgFunc represents functions with simple expression
// arguments.
type SimpleArgFunc struct {
FunctionName string `parser:" @(\"AVG\" | \"MAX\" | \"MIN\" | \"SUM\" | \"COALESCE\" | \"NULLIF\" | \"TO_STRING\" | \"TO_TIMESTAMP\" | \"UTCNOW\" | \"CHAR_LENGTH\" | \"CHARACTER_LENGTH\" | \"LOWER\" | \"UPPER\") "`
ArgsList []*Expression `parser:"\"(\" (@@ (\",\" @@)*)?\")\""`
}
// CountFunc represents the COUNT sql function
type CountFunc struct {
StarArg bool `parser:" \"COUNT\" \"(\" ( @\"*\"?"`
ExprArg *Expression `parser:" @@? )! \")\""`
}
// CastFunc represents CAST sql function
type CastFunc struct {
Expr *Expression `parser:" \"CAST\" \"(\" @@ "`
CastType string `parser:" \"AS\" @(\"BOOL\" | \"INT\" | \"INTEGER\" | \"STRING\" | \"FLOAT\" | \"DECIMAL\" | \"NUMERIC\" | \"TIMESTAMP\") \")\" "`
}
// SubstringFunc represents SUBSTRING sql function
type SubstringFunc struct {
Expr *PrimaryTerm `parser:" \"SUBSTRING\" \"(\" @@ "`
From *Operand `parser:" ( \"FROM\" @@ "`
For *Operand `parser:" (\"FOR\" @@)? \")\" "`
Arg2 *Operand `parser:" | \",\" @@ "`
Arg3 *Operand `parser:" (\",\" @@)? \")\" )"`
}
// ExtractFunc represents EXTRACT sql function
type ExtractFunc struct {
Timeword string `parser:" \"EXTRACT\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword | \"TIMEZONE_HOUR\":Timeword | \"TIMEZONE_MINUTE\":Timeword ) "`
From *PrimaryTerm `parser:" \"FROM\" @@ \")\" "`
}
// TrimFunc represents TRIM sql function
type TrimFunc struct {
TrimWhere *string `parser:" \"TRIM\" \"(\" ( @( \"LEADING\" | \"TRAILING\" | \"BOTH\" ) "`
TrimChars *PrimaryTerm `parser:" @@? "`
TrimFrom *PrimaryTerm `parser:" \"FROM\" )? @@ \")\" "`
}
// DateAddFunc represents the DATE_ADD function
type DateAddFunc struct {
DatePart string `parser:" \"DATE_ADD\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword ) \",\""`
Quantity *Operand `parser:" @@ \",\""`
Timestamp *PrimaryTerm `parser:" @@ \")\""`
}
// DateDiffFunc represents the DATE_DIFF function
type DateDiffFunc struct {
DatePart string `parser:" \"DATE_DIFF\" \"(\" @( \"YEAR\":Timeword | \"MONTH\":Timeword | \"DAY\":Timeword | \"HOUR\":Timeword | \"MINUTE\":Timeword | \"SECOND\":Timeword ) \",\" "`
Timestamp1 *PrimaryTerm `parser:" @@ \",\" "`
Timestamp2 *PrimaryTerm `parser:" @@ \")\" "`
}
// LitValue represents a literal value parsed from the sql
type LitValue struct {
Number *float64 `parser:"( @Number"`
String *LiteralString `parser:" | @LitString"`
Boolean *Boolean `parser:" | @(\"TRUE\" | \"FALSE\")"`
Null bool `parser:" | @\"NULL\")"`
}
// Identifier represents a parsed identifier
type Identifier struct {
Unquoted *string `parser:" @Ident"`
Quoted *QuotedIdentifier `parser:"| @QuotIdent"`
}
var (
sqlLexer = lexer.Must(lexer.Regexp(`(\s+)` +
`|(?P<Timeword>(?i)\b(?:YEAR|MONTH|DAY|HOUR|MINUTE|SECOND|TIMEZONE_HOUR|TIMEZONE_MINUTE)\b)` +
`|(?P<Keyword>(?i)\b(?:SELECT|FROM|TOP|DISTINCT|ALL|WHERE|GROUP|BY|HAVING|UNION|MINUS|EXCEPT|INTERSECT|ORDER|LIMIT|OFFSET|TRUE|FALSE|NULL|IS|NOT|ANY|SOME|BETWEEN|AND|OR|LIKE|ESCAPE|AS|IN|BOOL|INT|INTEGER|STRING|FLOAT|DECIMAL|NUMERIC|TIMESTAMP|AVG|COUNT|MAX|MIN|SUM|COALESCE|NULLIF|CAST|DATE_ADD|DATE_DIFF|EXTRACT|TO_STRING|TO_TIMESTAMP|UTCNOW|CHAR_LENGTH|CHARACTER_LENGTH|LOWER|SUBSTRING|TRIM|UPPER|LEADING|TRAILING|BOTH|FOR)\b)` +
`|(?P<Ident>[a-zA-Z_][a-zA-Z0-9_]*)` +
`|(?P<QuotIdent>"([^"]*("")?)*")` +
`|(?P<Number>\d*\.?\d+([eE][-+]?\d+)?)` +
`|(?P<LitString>'([^']*('')?)*')` +
`|(?P<Operators><>|!=|<=|>=|\.\*|\[\*\]|[-+*/%,.()=<>\[\]])`,
))
// SQLParser is used to parse SQL statements
SQLParser = participle.MustBuild(
&Select{},
participle.Lexer(sqlLexer),
participle.CaseInsensitive("Keyword"),
participle.CaseInsensitive("Timeword"),
)
)