mirror of
https://github.com/minio/minio.git
synced 2025-01-15 16:53:16 -05:00
2786055df4
- New parser written from scratch, allowing easier and more complete parsing of the full S3 Select SQL syntax. The parser definition is provided directly by the AST defined for the SQL grammar. - Add support for parsing and interpreting SQL involving JSON path expressions; evaluation of JSON path expressions will be added subsequently. - Add automatic type inference and conversion for untyped values (e.g. CSV data).
113 lines
2.2 KiB
Go
113 lines
2.2 KiB
Go
package lexer
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
"io/ioutil"
|
|
"regexp"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// eolBytes is the single-byte newline separator used when counting lines in a match.
var eolBytes = []byte{'\n'}
|
|
|
|
// regexpDefinition is a lexer definition backed by a single compiled regular
// expression.
type regexpDefinition struct {
	// re is the compiled pattern used to match tokens.
	re *regexp.Regexp
	// symbols maps token names (including "EOF") to their rune types.
	symbols map[string]rune
}
|
|
|
|
// Regexp creates a lexer definition from a regular expression.
|
|
//
|
|
// Each named sub-expression in the regular expression matches a token. Anonymous sub-expressions
|
|
// will be matched and discarded.
|
|
//
|
|
// eg.
|
|
//
|
|
// def, err := Regexp(`(?P<Ident>[a-z]+)|(\s+)|(?P<Number>\d+)`)
|
|
func Regexp(pattern string) (Definition, error) {
|
|
re, err := regexp.Compile(pattern)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
symbols := map[string]rune{
|
|
"EOF": EOF,
|
|
}
|
|
for i, sym := range re.SubexpNames()[1:] {
|
|
if sym != "" {
|
|
symbols[sym] = EOF - 1 - rune(i)
|
|
}
|
|
}
|
|
return ®expDefinition{re: re, symbols: symbols}, nil
|
|
}
|
|
|
|
func (d *regexpDefinition) Lex(r io.Reader) (Lexer, error) {
|
|
b, err := ioutil.ReadAll(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ®expLexer{
|
|
pos: Position{
|
|
Filename: NameOfReader(r),
|
|
Line: 1,
|
|
Column: 1,
|
|
},
|
|
b: b,
|
|
re: d.re,
|
|
names: d.re.SubexpNames(),
|
|
}, nil
|
|
}
|
|
|
|
func (d *regexpDefinition) Symbols() map[string]rune {
|
|
return d.symbols
|
|
}
|
|
|
|
type regexpLexer struct {
|
|
pos Position
|
|
b []byte
|
|
re *regexp.Regexp
|
|
names []string
|
|
}
|
|
|
|
func (r *regexpLexer) Next() (Token, error) {
|
|
nextToken:
|
|
for len(r.b) != 0 {
|
|
matches := r.re.FindSubmatchIndex(r.b)
|
|
if matches == nil || matches[0] != 0 {
|
|
rn, _ := utf8.DecodeRune(r.b)
|
|
return Token{}, Errorf(r.pos, "invalid token %q", rn)
|
|
}
|
|
match := r.b[:matches[1]]
|
|
token := Token{
|
|
Pos: r.pos,
|
|
Value: string(match),
|
|
}
|
|
|
|
// Update lexer state.
|
|
r.pos.Offset += matches[1]
|
|
lines := bytes.Count(match, eolBytes)
|
|
r.pos.Line += lines
|
|
// Update column.
|
|
if lines == 0 {
|
|
r.pos.Column += utf8.RuneCount(match)
|
|
} else {
|
|
r.pos.Column = utf8.RuneCount(match[bytes.LastIndex(match, eolBytes):])
|
|
}
|
|
// Move slice along.
|
|
r.b = r.b[matches[1]:]
|
|
|
|
// Finally, assign token type. If it is not a named group, we continue to the next token.
|
|
for i := 2; i < len(matches); i += 2 {
|
|
if matches[i] != -1 {
|
|
if r.names[i/2] == "" {
|
|
continue nextToken
|
|
}
|
|
token.Type = EOF - rune(i/2)
|
|
break
|
|
}
|
|
}
|
|
|
|
return token, nil
|
|
}
|
|
|
|
return EOFToken(r.pos), nil
|
|
}
|