-
Notifications
You must be signed in to change notification settings - Fork 761
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat: parse analyze compute statistics #4547
Changes from all commits
0f00d29
a2e9c01
b00da28
1ca874d
33af89f
5357b4c
b2ee218
28bb75a
280dd54
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -788,6 +788,7 @@ class Parser(metaclass=_Parser): | |
|
||
STATEMENT_PARSERS = { | ||
TokenType.ALTER: lambda self: self._parse_alter(), | ||
TokenType.ANALYZE: lambda self: self._parse_analyze(), | ||
TokenType.BEGIN: lambda self: self._parse_transaction(), | ||
TokenType.CACHE: lambda self: self._parse_cache(), | ||
TokenType.COMMENT: lambda self: self._parse_comment(), | ||
|
@@ -7079,6 +7080,34 @@ def _parse_alter(self) -> exp.Alter | exp.Command: | |
|
||
return self._parse_as_command(start) | ||
|
||
def _parse_analyze(self) -> exp.Analyze | exp.Command: | ||
start = self._prev | ||
kind = None | ||
this: t.Optional[exp.Expression] = None | ||
partition = None | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit, could we do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. re-wrote this part in #4591 |
||
if self._match(TokenType.TABLE): | ||
kind = "TABLE" | ||
this = self._parse_table_parts() | ||
partition = self._parse_partition() | ||
elif self._match_texts("TABLES"): | ||
kind = "TABLES" | ||
this = ( | ||
self._parse_table(is_db_reference=True) | ||
if self._match_set((TokenType.FROM, TokenType.IN)) | ||
else None | ||
) | ||
else: | ||
return self._parse_as_command(start) | ||
|
||
if self._match(TokenType.COMPUTE_STATISTICS): | ||
compute_stats = self._parse_compute_statistics() | ||
return self.expression( | ||
exp.Analyze, kind=kind, this=this, partition=partition, expression=compute_stats | ||
) | ||
|
||
return self._parse_as_command(start) | ||
|
||
def _parse_merge(self) -> exp.Merge: | ||
self._match(TokenType.INTO) | ||
target = self._parse_table() | ||
|
@@ -7300,6 +7329,21 @@ def _parse_comprehension( | |
condition=condition, | ||
) | ||
|
||
# https://spark.apache.org/docs/3.5.1/sql-ref-syntax-aux-analyze-table.html | ||
def _parse_compute_statistics(self) -> exp.ComputeStatistics: | ||
this = None | ||
expressions = None | ||
if self._match_text_seq("NOSCAN"): | ||
this = "NOSCAN" | ||
elif self._match(TokenType.FOR): | ||
if self._match_text_seq("ALL", "COLUMNS"): | ||
this = "FOR ALL COLUMNS" | ||
if self._match_texts("COLUMNS"): | ||
this = "FOR COLUMNS" | ||
expressions = self._parse_csv(self._parse_column_reference) | ||
|
||
return self.expression(exp.ComputeStatistics, this=this, expressions=expressions) | ||
|
||
def _parse_heredoc(self) -> t.Optional[exp.Heredoc]: | ||
if self._match(TokenType.HEREDOC_STRING): | ||
return self.expression(exp.Heredoc, this=self._prev.text) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -410,6 +410,8 @@ class TokenType(AutoName): | |
OPTION = auto() | ||
SINK = auto() | ||
SOURCE = auto() | ||
ANALYZE = auto() | ||
COMPUTE_STATISTICS = auto() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can now remove the It can be consumed by the parser through There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
NAMESPACE = auto() | ||
|
||
|
||
|
@@ -704,6 +706,7 @@ class Tokenizer(metaclass=_Tokenizer): | |
"COLLATE": TokenType.COLLATE, | ||
"COLUMN": TokenType.COLUMN, | ||
"COMMIT": TokenType.COMMIT, | ||
"COMPUTE STATISTICS": TokenType.COMPUTE_STATISTICS, | ||
"CONNECT BY": TokenType.CONNECT_BY, | ||
"CONSTRAINT": TokenType.CONSTRAINT, | ||
"COPY": TokenType.COPY, | ||
|
@@ -938,7 +941,7 @@ class Tokenizer(metaclass=_Tokenizer): | |
"SEQUENCE": TokenType.SEQUENCE, | ||
"VARIANT": TokenType.VARIANT, | ||
"ALTER": TokenType.ALTER, | ||
"ANALYZE": TokenType.COMMAND, | ||
"ANALYZE": TokenType.ANALYZE, | ||
"CALL": TokenType.COMMAND, | ||
"COMMENT": TokenType.COMMENT, | ||
"EXPLAIN": TokenType.COMMAND, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -918,3 +918,14 @@ def test_string(self): | |
with self.subTest(f"Testing STRING() for {dialect}"): | ||
query = parse_one("STRING(a)", dialect=dialect) | ||
self.assertEqual(query.sql(dialect), "CAST(a AS STRING)") | ||
|
||
def test_analyze_compute_statistics(self): | ||
self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS NOSCAN") | ||
self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS") | ||
self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS FOR COLUMNS foo, bar") | ||
self.validate_identity("ANALYZE TABLE ctlg.db.tbl COMPUTE STATISTICS NOSCAN") | ||
self.validate_identity( | ||
"ANALYZE TABLE ctlg.db.tbl PARTITION(foo = 'foo', bar = 'bar') COMPUTE STATISTICS NOSCAN" | ||
) | ||
Comment on lines
+927
to
+929
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Styling nit, can we move this at the end of this identity chain since it breaks into multiple lines There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
self.validate_identity("ANALYZE TABLES COMPUTE STATISTICS NOSCAN") | ||
self.validate_identity("ANALYZE TABLES FROM db COMPUTE STATISTICS") |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -900,3 +900,16 @@ def test_udf_meta(self): | |
# Incomplete or incorrect anonymous meta comments are not registered | ||
ast = parse_one("YEAR(a) /* sqlglot.anon */") | ||
self.assertIsInstance(ast, exp.Year) | ||
|
||
def test_analyze(self): | ||
# Valid spark analyze statement. | ||
ast = parse_one("ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS") | ||
self.assertIsInstance(ast, exp.Analyze) | ||
Comment on lines +906 to +907
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're not passing in a specific dialect to self.validate_identity(...).assert_is(exp.Command) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a great idea, I am removing this test here and adding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I then removed it when parse_analyze only returns exp.Analyze. Changes in #4591 |
||
|
||
# Fallback to command - valid postgres. | ||
ast = parse_one("ANALYZE VERBOSE tbl") | ||
self.assertIsInstance(ast, exp.Command) | ||
|
||
# Fallback to command - valid SQLite. | ||
ast = parse_one("ANALYZE TABLE tbl") | ||
self.assertIsInstance(ast, exp.Command) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We'll always have `this` here according to `_parse_compute_statistics`, so we can make this `True`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in #4591