/* Copyright 2017 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package sqlparser import ( "bytes" "errors" "fmt" "io" "github.com/xwb1989/sqlparser/dependency/bytes2" "github.com/xwb1989/sqlparser/dependency/sqltypes" ) const ( defaultBufSize = 4096 eofChar = 0x100 ) // Tokenizer is the struct used to generate SQL // tokens for the parser. type Tokenizer struct { InStream io.Reader AllowComments bool ForceEOF bool lastChar uint16 Position int lastToken []byte LastError error posVarIndex int ParseTree Statement partialDDL *DDL nesting int multi bool specialComment *Tokenizer buf []byte bufPos int bufSize int } // NewStringTokenizer creates a new Tokenizer for the // sql string. func NewStringTokenizer(sql string) *Tokenizer { buf := []byte(sql) return &Tokenizer{ buf: buf, bufSize: len(buf), } } // NewTokenizer creates a new Tokenizer reading a sql // string from the io.Reader. func NewTokenizer(r io.Reader) *Tokenizer { return &Tokenizer{ InStream: r, buf: make([]byte, defaultBufSize), } } // keywords is a map of mysql keywords that fall into two categories: // 1) keywords considered reserved by MySQL // 2) keywords for us to handle specially in sql.y // // Those marked as UNUSED are likely reserved keywords. We add them here so that // when rewriting queries we can properly backtick quote them so they don't cause issues // // NOTE: If you add new keywords, add them also to the reserved_keywords or // non_reserved_keywords grammar in sql.y -- this will allow the keyword to be used // in identifiers. See the docs for each grammar to determine which one to put it into. var keywords = map[string]int{ "accessible": UNUSED, "add": ADD, "against": AGAINST, "all": ALL, "alter": ALTER, "analyze": ANALYZE, "and": AND, "as": AS, "asc": ASC, "asensitive": UNUSED, "auto_increment": AUTO_INCREMENT, "before": UNUSED, "begin": BEGIN, "between": BETWEEN, "bigint": BIGINT, "binary": BINARY, "_binary": UNDERSCORE_BINARY, "bit": BIT, "blob": BLOB, "bool": BOOL, "boolean": BOOLEAN, "both": UNUSED, "by": BY, "call": UNUSED, "cascade": UNUSED, "case": CASE, "cast": CAST, "change": UNUSED, "char": CHAR, "character": CHARACTER, "charset": CHARSET, "check": UNUSED, "collate": COLLATE, "column": COLUMN, "comment": COMMENT_KEYWORD, "committed": COMMITTED, "commit": COMMIT, "condition": UNUSED, "constraint": CONSTRAINT, "continue": UNUSED, "convert": CONVERT, "substr": SUBSTR, "substring": SUBSTRING, "create": CREATE, "cross": CROSS, "current_date": CURRENT_DATE, "current_time": CURRENT_TIME, "current_timestamp": CURRENT_TIMESTAMP, "current_user": UNUSED, "cursor": UNUSED, "database": DATABASE, "databases": DATABASES, "day_hour": UNUSED, "day_microsecond": UNUSED, "day_minute": UNUSED, "day_second": UNUSED, "date": DATE, "datetime": DATETIME, "dec": UNUSED, "decimal": DECIMAL, "declare": UNUSED, "default": DEFAULT, "delayed": UNUSED, "delete": DELETE, "desc": DESC, "describe": DESCRIBE, "deterministic": UNUSED, "distinct": DISTINCT, "distinctrow": UNUSED, "div": DIV, "double": DOUBLE, "drop": DROP, "duplicate": DUPLICATE, "each": UNUSED, "else": ELSE, "elseif": UNUSED, "enclosed": UNUSED, "end": END, "enum": ENUM, "escape": ESCAPE, "escaped": UNUSED, "exists": EXISTS, "exit": UNUSED, "explain": EXPLAIN, "expansion": EXPANSION, "extended": EXTENDED, "false": FALSE, "fetch": UNUSED, "float": FLOAT_TYPE, "float4": UNUSED, "float8": UNUSED, "for": FOR, "force": FORCE, "foreign": FOREIGN, "from": FROM, "full": FULL, "fulltext": FULLTEXT, "generated": UNUSED, "geometry": GEOMETRY, "geometrycollection": GEOMETRYCOLLECTION, "get": UNUSED, "global": GLOBAL, "grant": UNUSED, "group": GROUP, "group_concat": GROUP_CONCAT, "having": HAVING, "high_priority": UNUSED, "hour_microsecond": UNUSED, "hour_minute": UNUSED, "hour_second": UNUSED, "if": IF, "ignore": IGNORE, "in": IN, "index": INDEX, "infile": UNUSED, "inout": UNUSED, "inner": INNER, "insensitive": UNUSED, "insert": INSERT, "int": INT, "int1": UNUSED, "int2": UNUSED, "int3": UNUSED, "int4": UNUSED, "int8": UNUSED, "integer": INTEGER, "interval": INTERVAL, "into": INTO, "io_after_gtids": UNUSED, "is": IS, "isolation": ISOLATION, "iterate": UNUSED, "join": JOIN, "json": JSON, "key": KEY, "keys": KEYS, "key_block_size": KEY_BLOCK_SIZE, "kill": UNUSED, "language": LANGUAGE, "last_insert_id": LAST_INSERT_ID, "leading": UNUSED, "leave": UNUSED, "left": LEFT, "less": LESS, "level": LEVEL, "like": LIKE, "limit": LIMIT, "linear": UNUSED, "lines": UNUSED, "linestring": LINESTRING, "load": UNUSED, "localtime": LOCALTIME, "localtimestamp": LOCALTIMESTAMP, "lock": LOCK, "long": UNUSED, "longblob": LONGBLOB, "longtext": LONGTEXT, "loop": UNUSED, "low_priority": UNUSED, "master_bind": UNUSED, "match": MATCH, "maxvalue": MAXVALUE, "mediumblob": MEDIUMBLOB, "mediumint": MEDIUMINT, "mediumtext": MEDIUMTEXT, "middleint": UNUSED, "minute_microsecond": UNUSED, "minute_second": UNUSED, "mod": MOD, "mode": MODE, "modifies": UNUSED, "multilinestring": MULTILINESTRING, "multipoint": MULTIPOINT, "multipolygon": MULTIPOLYGON, "names": NAMES, "natural": NATURAL, "nchar": NCHAR, "next": NEXT, "not": NOT, "no_write_to_binlog": UNUSED, "null": NULL, "numeric": NUMERIC, "offset": OFFSET, "on": ON, "only": ONLY, "optimize": OPTIMIZE, "optimizer_costs": UNUSED, "option": UNUSED, "optionally": UNUSED, "or": OR, "order": ORDER, "out": UNUSED, "outer": OUTER, "outfile": UNUSED, "partition": PARTITION, "point": POINT, "polygon": POLYGON, "precision": UNUSED, "primary": PRIMARY, "processlist": PROCESSLIST, "procedure": PROCEDURE, "query": QUERY, "range": UNUSED, "read": READ, "reads": UNUSED, "read_write": UNUSED, "real": REAL, "references": UNUSED, "regexp": REGEXP, "release": UNUSED, "rename": RENAME, "reorganize": REORGANIZE, "repair": REPAIR, "repeat": UNUSED, "repeatable": REPEATABLE, "replace": REPLACE, "require": UNUSED, "resignal": UNUSED, "restrict": UNUSED, "return": UNUSED, "revoke": UNUSED, "right": RIGHT, "rlike": REGEXP, "rollback": ROLLBACK, "schema": SCHEMA, "schemas": UNUSED, "second_microsecond": UNUSED, "select": SELECT, "sensitive": UNUSED, "separator": SEPARATOR, "serializable": SERIALIZABLE, "session": SESSION, "set": SET, "share": SHARE, "show": SHOW, "signal": UNUSED, "signed": SIGNED, "smallint": SMALLINT, "spatial": SPATIAL, "specific": UNUSED, "sql": UNUSED, "sqlexception": UNUSED, "sqlstate": UNUSED, "sqlwarning": UNUSED, "sql_big_result": UNUSED, "sql_cache": SQL_CACHE, "sql_calc_found_rows": UNUSED, "sql_no_cache": SQL_NO_CACHE, "sql_small_result": UNUSED, "ssl": UNUSED, "start": START, "starting": UNUSED, "status": STATUS, "stored": UNUSED, "straight_join": STRAIGHT_JOIN, "stream": STREAM, "table": TABLE, "tables": TABLES, "terminated": UNUSED, "text": TEXT, "than": THAN, "then": THEN, "time": TIME, "timestamp": TIMESTAMP, "tinyblob": TINYBLOB, "tinyint": TINYINT, "tinytext": TINYTEXT, "to": TO, "trailing": UNUSED, "transaction": TRANSACTION, "trigger": TRIGGER, "true": TRUE, "truncate": TRUNCATE, "uncommitted": UNCOMMITTED, "undo": UNUSED, "union": UNION, "unique": UNIQUE, "unlock": UNUSED, "unsigned": UNSIGNED, "update": UPDATE, "usage": UNUSED, "use": USE, "using": USING, "utc_date": UTC_DATE, "utc_time": UTC_TIME, "utc_timestamp": UTC_TIMESTAMP, "values": VALUES, "variables": VARIABLES, "varbinary": VARBINARY, "varchar": VARCHAR, "varcharacter": UNUSED, "varying": UNUSED, "virtual": UNUSED, "vindex": VINDEX, "vindexes": VINDEXES, "view": VIEW, "vitess_keyspaces": VITESS_KEYSPACES, "vitess_shards": VITESS_SHARDS, "vitess_tablets": VITESS_TABLETS, "vschema_tables": VSCHEMA_TABLES, "when": WHEN, "where": WHERE, "while": UNUSED, "with": WITH, "write": WRITE, "xor": UNUSED, "year": YEAR, "year_month": UNUSED, "zerofill": ZEROFILL, } // keywordStrings contains the reverse mapping of token to keyword strings var keywordStrings = map[int]string{} func init() { for str, id := range keywords { if id == UNUSED { continue } keywordStrings[id] = str } } // KeywordString returns the string corresponding to the given keyword func KeywordString(id int) string { str, ok := keywordStrings[id] if !ok { return "" } return str } // Lex returns the next token form the Tokenizer. // This function is used by go yacc. func (tkn *Tokenizer) Lex(lval *yySymType) int { typ, val := tkn.Scan() for typ == COMMENT { if tkn.AllowComments { break } typ, val = tkn.Scan() } lval.bytes = val tkn.lastToken = val return typ } // Error is called by go yacc if there's a parsing error. func (tkn *Tokenizer) Error(err string) { buf := &bytes2.Buffer{} if tkn.lastToken != nil { fmt.Fprintf(buf, "%s at position %v near '%s'", err, tkn.Position, tkn.lastToken) } else { fmt.Fprintf(buf, "%s at position %v", err, tkn.Position) } tkn.LastError = errors.New(buf.String()) // Try and re-sync to the next statement if tkn.lastChar != ';' { tkn.skipStatement() } } // Scan scans the tokenizer for the next token and returns // the token type and an optional value. func (tkn *Tokenizer) Scan() (int, []byte) { if tkn.specialComment != nil { // Enter specialComment scan mode. // for scanning such kind of comment: /*! MySQL-specific code */ specialComment := tkn.specialComment tok, val := specialComment.Scan() if tok != 0 { // return the specialComment scan result as the result return tok, val } // leave specialComment scan mode after all stream consumed. tkn.specialComment = nil } if tkn.lastChar == 0 { tkn.next() } if tkn.ForceEOF { tkn.skipStatement() return 0, nil } tkn.skipBlank() switch ch := tkn.lastChar; { case isLetter(ch): tkn.next() if ch == 'X' || ch == 'x' { if tkn.lastChar == '\'' { tkn.next() return tkn.scanHex() } } if ch == 'B' || ch == 'b' { if tkn.lastChar == '\'' { tkn.next() return tkn.scanBitLiteral() } } isDbSystemVariable := false if ch == '@' && tkn.lastChar == '@' { isDbSystemVariable = true } return tkn.scanIdentifier(byte(ch), isDbSystemVariable) case isDigit(ch): return tkn.scanNumber(false) case ch == ':': return tkn.scanBindVar() case ch == ';' && tkn.multi: return 0, nil default: tkn.next() switch ch { case eofChar: return 0, nil case '=', ',', ';', '(', ')', '+', '*', '%', '^', '~': return int(ch), nil case '&': if tkn.lastChar == '&' { tkn.next() return AND, nil } return int(ch), nil case '|': if tkn.lastChar == '|' { tkn.next() return OR, nil } return int(ch), nil case '?': tkn.posVarIndex++ buf := new(bytes2.Buffer) fmt.Fprintf(buf, ":v%d", tkn.posVarIndex) return VALUE_ARG, buf.Bytes() case '.': if isDigit(tkn.lastChar) { return tkn.scanNumber(true) } return int(ch), nil case '/': switch tkn.lastChar { case '/': tkn.next() return tkn.scanCommentType1("//") case '*': tkn.next() switch tkn.lastChar { case '!': return tkn.scanMySQLSpecificComment() default: return tkn.scanCommentType2() } default: return int(ch), nil } case '#': return tkn.scanCommentType1("#") case '-': switch tkn.lastChar { case '-': tkn.next() return tkn.scanCommentType1("--") case '>': tkn.next() if tkn.lastChar == '>' { tkn.next() return JSON_UNQUOTE_EXTRACT_OP, nil } return JSON_EXTRACT_OP, nil } return int(ch), nil case '<': switch tkn.lastChar { case '>': tkn.next() return NE, nil case '<': tkn.next() return SHIFT_LEFT, nil case '=': tkn.next() switch tkn.lastChar { case '>': tkn.next() return NULL_SAFE_EQUAL, nil default: return LE, nil } default: return int(ch), nil } case '>': switch tkn.lastChar { case '=': tkn.next() return GE, nil case '>': tkn.next() return SHIFT_RIGHT, nil default: return int(ch), nil } case '!': if tkn.lastChar == '=' { tkn.next() return NE, nil } return int(ch), nil case '\'', '"': return tkn.scanString(ch, STRING) case '`': return tkn.scanLiteralIdentifier() default: return LEX_ERROR, []byte{byte(ch)} } } } // skipStatement scans until the EOF, or end of statement is encountered. func (tkn *Tokenizer) skipStatement() { ch := tkn.lastChar for ch != ';' && ch != eofChar { tkn.next() ch = tkn.lastChar } } func (tkn *Tokenizer) skipBlank() { ch := tkn.lastChar for ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' { tkn.next() ch = tkn.lastChar } } func (tkn *Tokenizer) scanIdentifier(firstByte byte, isDbSystemVariable bool) (int, []byte) { buffer := &bytes2.Buffer{} buffer.WriteByte(firstByte) for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || (isDbSystemVariable && isCarat(tkn.lastChar)) { buffer.WriteByte(byte(tkn.lastChar)) tkn.next() } lowered := bytes.ToLower(buffer.Bytes()) loweredStr := string(lowered) if keywordID, found := keywords[loweredStr]; found { return keywordID, lowered } // dual must always be case-insensitive if loweredStr == "dual" { return ID, lowered } return ID, buffer.Bytes() } func (tkn *Tokenizer) scanHex() (int, []byte) { buffer := &bytes2.Buffer{} tkn.scanMantissa(16, buffer) if tkn.lastChar != '\'' { return LEX_ERROR, buffer.Bytes() } tkn.next() if buffer.Len()%2 != 0 { return LEX_ERROR, buffer.Bytes() } return HEX, buffer.Bytes() } func (tkn *Tokenizer) scanBitLiteral() (int, []byte) { buffer := &bytes2.Buffer{} tkn.scanMantissa(2, buffer) if tkn.lastChar != '\'' { return LEX_ERROR, buffer.Bytes() } tkn.next() return BIT_LITERAL, buffer.Bytes() } func (tkn *Tokenizer) scanLiteralIdentifier() (int, []byte) { buffer := &bytes2.Buffer{} backTickSeen := false for { if backTickSeen { if tkn.lastChar != '`' { break } backTickSeen = false buffer.WriteByte('`') tkn.next() continue } // The previous char was not a backtick. switch tkn.lastChar { case '`': backTickSeen = true case eofChar: // Premature EOF. return LEX_ERROR, buffer.Bytes() default: buffer.WriteByte(byte(tkn.lastChar)) } tkn.next() } if buffer.Len() == 0 { return LEX_ERROR, buffer.Bytes() } return ID, buffer.Bytes() } func (tkn *Tokenizer) scanBindVar() (int, []byte) { buffer := &bytes2.Buffer{} buffer.WriteByte(byte(tkn.lastChar)) token := VALUE_ARG tkn.next() if tkn.lastChar == ':' { token = LIST_ARG buffer.WriteByte(byte(tkn.lastChar)) tkn.next() } if !isLetter(tkn.lastChar) { return LEX_ERROR, buffer.Bytes() } for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' { buffer.WriteByte(byte(tkn.lastChar)) tkn.next() } return token, buffer.Bytes() } func (tkn *Tokenizer) scanMantissa(base int, buffer *bytes2.Buffer) { for digitVal(tkn.lastChar) < base { tkn.consumeNext(buffer) } } func (tkn *Tokenizer) scanNumber(seenDecimalPoint bool) (int, []byte) { token := INTEGRAL buffer := &bytes2.Buffer{} if seenDecimalPoint { token = FLOAT buffer.WriteByte('.') tkn.scanMantissa(10, buffer) goto exponent } // 0x construct. if tkn.lastChar == '0' { tkn.consumeNext(buffer) if tkn.lastChar == 'x' || tkn.lastChar == 'X' { token = HEXNUM tkn.consumeNext(buffer) tkn.scanMantissa(16, buffer) goto exit } } tkn.scanMantissa(10, buffer) if tkn.lastChar == '.' { token = FLOAT tkn.consumeNext(buffer) tkn.scanMantissa(10, buffer) } exponent: if tkn.lastChar == 'e' || tkn.lastChar == 'E' { token = FLOAT tkn.consumeNext(buffer) if tkn.lastChar == '+' || tkn.lastChar == '-' { tkn.consumeNext(buffer) } tkn.scanMantissa(10, buffer) } exit: // A letter cannot immediately follow a number. if isLetter(tkn.lastChar) { return LEX_ERROR, buffer.Bytes() } return token, buffer.Bytes() } func (tkn *Tokenizer) scanString(delim uint16, typ int) (int, []byte) { var buffer bytes2.Buffer for { ch := tkn.lastChar if ch == eofChar { // Unterminated string. return LEX_ERROR, buffer.Bytes() } if ch != delim && ch != '\\' { buffer.WriteByte(byte(ch)) // Scan ahead to the next interesting character. start := tkn.bufPos for ; tkn.bufPos < tkn.bufSize; tkn.bufPos++ { ch = uint16(tkn.buf[tkn.bufPos]) if ch == delim || ch == '\\' { break } } buffer.Write(tkn.buf[start:tkn.bufPos]) tkn.Position += (tkn.bufPos - start) if tkn.bufPos >= tkn.bufSize { // Reached the end of the buffer without finding a delim or // escape character. tkn.next() continue } tkn.bufPos++ tkn.Position++ } tkn.next() // Read one past the delim or escape character. if ch == '\\' { if tkn.lastChar == eofChar { // String terminates mid escape character. return LEX_ERROR, buffer.Bytes() } if decodedChar := sqltypes.SQLDecodeMap[byte(tkn.lastChar)]; decodedChar == sqltypes.DontEscape { ch = tkn.lastChar } else { ch = uint16(decodedChar) } } else if ch == delim && tkn.lastChar != delim { // Correctly terminated string, which is not a double delim. break } buffer.WriteByte(byte(ch)) tkn.next() } return typ, buffer.Bytes() } func (tkn *Tokenizer) scanCommentType1(prefix string) (int, []byte) { buffer := &bytes2.Buffer{} buffer.WriteString(prefix) for tkn.lastChar != eofChar { if tkn.lastChar == '\n' { tkn.consumeNext(buffer) break } tkn.consumeNext(buffer) } return COMMENT, buffer.Bytes() } func (tkn *Tokenizer) scanCommentType2() (int, []byte) { buffer := &bytes2.Buffer{} buffer.WriteString("/*") for { if tkn.lastChar == '*' { tkn.consumeNext(buffer) if tkn.lastChar == '/' { tkn.consumeNext(buffer) break } continue } if tkn.lastChar == eofChar { return LEX_ERROR, buffer.Bytes() } tkn.consumeNext(buffer) } return COMMENT, buffer.Bytes() } func (tkn *Tokenizer) scanMySQLSpecificComment() (int, []byte) { buffer := &bytes2.Buffer{} buffer.WriteString("/*!") tkn.next() for { if tkn.lastChar == '*' { tkn.consumeNext(buffer) if tkn.lastChar == '/' { tkn.consumeNext(buffer) break } continue } if tkn.lastChar == eofChar { return LEX_ERROR, buffer.Bytes() } tkn.consumeNext(buffer) } _, sql := ExtractMysqlComment(buffer.String()) tkn.specialComment = NewStringTokenizer(sql) return tkn.Scan() } func (tkn *Tokenizer) consumeNext(buffer *bytes2.Buffer) { if tkn.lastChar == eofChar { // This should never happen. panic("unexpected EOF") } buffer.WriteByte(byte(tkn.lastChar)) tkn.next() } func (tkn *Tokenizer) next() { if tkn.bufPos >= tkn.bufSize && tkn.InStream != nil { // Try and refill the buffer var err error tkn.bufPos = 0 if tkn.bufSize, err = tkn.InStream.Read(tkn.buf); err != io.EOF && err != nil { tkn.LastError = err } } if tkn.bufPos >= tkn.bufSize { if tkn.lastChar != eofChar { tkn.Position++ tkn.lastChar = eofChar } } else { tkn.Position++ tkn.lastChar = uint16(tkn.buf[tkn.bufPos]) tkn.bufPos++ } } // reset clears any internal state. func (tkn *Tokenizer) reset() { tkn.ParseTree = nil tkn.partialDDL = nil tkn.specialComment = nil tkn.posVarIndex = 0 tkn.nesting = 0 tkn.ForceEOF = false } func isLetter(ch uint16) bool { return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch == '@' } func isCarat(ch uint16) bool { return ch == '.' || ch == '\'' || ch == '"' || ch == '`' } func digitVal(ch uint16) int { switch { case '0' <= ch && ch <= '9': return int(ch) - '0' case 'a' <= ch && ch <= 'f': return int(ch) - 'a' + 10 case 'A' <= ch && ch <= 'F': return int(ch) - 'A' + 10 } return 16 // larger than any legal digit val } func isDigit(ch uint16) bool { return '0' <= ch && ch <= '9' }