Skip to content

Commit 3d9671b

Browse files
committed
Fix identifier matching, improve lexer performance by ~25%
1 parent 01241b8 commit 3d9671b

File tree

3 files changed

+14
-651
lines changed

3 files changed

+14
-651
lines changed

custom-parser/parser/MySQLLexer.php

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ class MySQLLexer {
4444
const SQL_MODE_IGNORE_SPACE = 1 << 3;
4545
const SQL_MODE_NO_BACKSLASH_ESCAPES = 1 << 4;
4646

47+
/**
48+
* MySQL unquoted identifiers: https://dev.mysql.com/doc/refman/8.4/en/identifiers.html
49+
* 1. Allowed characters are ASCII a-z, A-Z, 0-9, $, _ and Unicode \x{0080}-\x{ffff}.
50+
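* 2. Unquoted identifiers may begin with a digit but may not consist solely of digits.
*    Note: the pattern below opens with the lookahead (?=\D), so it only matches identifiers whose first character is not a digit.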
51+
*/
52+
const PATTERN_UNQUOTED_IDENTIFIER = '(?=\D)[\w_$\x{80}-\x{ffff}]+';
53+
4754
// Constants for token types.
4855
// Operators
4956
public const EQUAL_OPERATOR = 1;
@@ -1129,7 +1136,11 @@ private function nextToken()
11291136
$this->NUMBER();
11301137
} elseif (($la === 'x' || $la === 'X' || $la === 'b' || $la === 'B') && $this->LA(2) === "'") {
11311138
$this->NUMBER();
1132-
} elseif (safe_ctype_alpha($la)) {
1139+
} elseif (preg_match('/\G' . self::PATTERN_UNQUOTED_IDENTIFIER . '/u', $this->input, $matches, 0, $this->position)) {
1140+
$this->text = $matches[0];
1141+
$this->position += strlen($this->text);
1142+
$this->c = $this->input[$this->position] ?? null;
1143+
$this->n = $this->input[$this->position + 1] ?? null;
11331144
$this->IDENTIFIER_OR_KEYWORD();
11341145
} elseif ($la === null) {
11351146
$this->matchEOF();
@@ -3090,11 +3101,7 @@ protected function emitDot(): void
30903101

30913102
protected function IDENTIFIER_OR_KEYWORD()
30923103
{
3093-
// Match the longest possible keyword.
3094-
while (safe_ctype_alnum($this->LA(1)) || $this->LA(1) === '_' || $this->LA(1) === '$') {
3095-
$this->consume();
3096-
}
3097-
$text = strtoupper($this->getText());
3104+
$text = strtoupper($this->getText());
30983105

30993106
// Lookup the string in the token table.
31003107
$this->type = self::TOKENS[$text] ?? self::IDENTIFIER;

0 commit comments

Comments
 (0)