Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: parse analyze compute statistics #4547

Merged
merged 9 commits into from
Jan 9, 2025
16 changes: 16 additions & 0 deletions sqlglot/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4713,6 +4713,22 @@ def actions(self) -> t.List[Expression]:
return self.args.get("actions") or []


class Analyze(Expression):
    """AST node for an ANALYZE statement (e.g. Spark's ANALYZE TABLE ... COMPUTE STATISTICS).

    Args:
        kind: required — "TABLE" or "TABLES" as set by the parser.
        this: optional analysis target (table parts, or a db reference for TABLES).
        partition: optional PARTITION specification (TABLE kind only).
        expression: required action, e.g. a ComputeStatistics node.
    """

    arg_types = {
        "kind": True,
        "this": False,
        "partition": False,
        "expression": True,
    }


class ComputeStatistics(Expression):
    """COMPUTE STATISTICS clause carried as the `expression` of an Analyze node.

    Args:
        this: optional option text — "NOSCAN", "FOR ALL COLUMNS", or
            "FOR COLUMNS" (None for a bare COMPUTE STATISTICS).
        expressions: optional column references, populated only for
            the FOR COLUMNS variant.
    """

    arg_types = {
        "this": False,
        "expressions": False,
    }


class AddConstraint(Expression):
    """ADD CONSTRAINT action; `expressions` (required) holds the constraint definitions."""

    arg_types = {"expressions": True}

Expand Down
17 changes: 17 additions & 0 deletions sqlglot/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4646,3 +4646,20 @@ def unpivotcolumns_sql(self, expression: exp.UnpivotColumns) -> str:
values = self.expressions(expression, flat=True)

return f"NAME {name} VALUE {values}"

def computestatistics_sql(self, expression: exp.ComputeStatistics) -> str:
    """Render a ComputeStatistics node as "COMPUTE STATISTICS[ <option>][ <columns>]".

    Each space is added only when the corresponding part is present. The
    previous version always emitted a space after STATISTICS, leaving a
    trailing space ("COMPUTE STATISTICS ") when `this` is absent — e.g. for
    "ANALYZE TABLES FROM db COMPUTE STATISTICS", where the parser sets no option.
    """
    this = self.sql(expression, "this")
    this = f" {this}" if this else ""

    columns = self.expressions(expression)
    columns = f" {columns}" if columns else ""

    return f"COMPUTE STATISTICS{this}{columns}"

def analyze_sql(self, expression: exp.Analyze) -> str:
    """Render an Analyze node: "ANALYZE <kind>[ [FROM ]<target>][ <partition>][ <expression>]".

    For kind "TABLES" the target is a database reference and is introduced
    with FROM (mirroring _parse_analyze, which accepts FROM or IN on parse).
    """
    kind = self.sql(expression, "kind")

    this = self.sql(expression, "this")
    this = f" {this}" if this else ""
    if this and kind == "TABLES":
        this = f" FROM{this}"

    partition = self.sql(expression, "partition")
    partition = f" {partition}" if partition else ""

    # "expression" is required by exp.Analyze.arg_types, but guard the space
    # anyway so a missing value cannot produce trailing whitespace.
    inner_expression = self.sql(expression, "expression")
    inner_expression = f" {inner_expression}" if inner_expression else ""

    return f"ANALYZE {kind}{this}{partition}{inner_expression}"
44 changes: 44 additions & 0 deletions sqlglot/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ class Parser(metaclass=_Parser):

STATEMENT_PARSERS = {
TokenType.ALTER: lambda self: self._parse_alter(),
TokenType.ANALYZE: lambda self: self._parse_analyze(),
TokenType.BEGIN: lambda self: self._parse_transaction(),
TokenType.CACHE: lambda self: self._parse_cache(),
TokenType.COMMENT: lambda self: self._parse_comment(),
Expand Down Expand Up @@ -7058,6 +7059,34 @@ def _parse_alter(self) -> exp.Alter | exp.Command:

return self._parse_as_command(start)

def _parse_analyze(self) -> exp.Analyze | exp.Command:
    """Parse ANALYZE {TABLE <table> [PARTITION ...] | TABLES [FROM|IN <db>]} COMPUTE STATISTICS ...

    Any other ANALYZE variant (e.g. Postgres "ANALYZE VERBOSE tbl", SQLite
    "ANALYZE TABLE tbl" without COMPUTE STATISTICS) falls back to exp.Command.
    """
    start = self._prev
    kind = None
    this: t.Optional[exp.Expression] = None
    partition = None

    if self._match(TokenType.TABLE):
        kind = "TABLE"
        this = self._parse_table_parts()
        partition = self._parse_partition()
    # _match_text_seq gives an exact-text match; the previous
    # _match_texts("TABLES") tested membership against a plain string, which
    # is a substring check (e.g. an "ABLE" token would wrongly match).
    elif self._match_text_seq("TABLES"):
        kind = "TABLES"
        if self._match_set((TokenType.FROM, TokenType.IN)):
            this = self._parse_table(is_db_reference=True)
    else:
        return self._parse_as_command(start)

    if self._match(TokenType.COMPUTE_STATISTICS):
        return self.expression(
            exp.Analyze,
            kind=kind,
            this=this,
            partition=partition,
            expression=self._parse_compute_statistics(),
        )

    return self._parse_as_command(start)

def _parse_merge(self) -> exp.Merge:
self._match(TokenType.INTO)
target = self._parse_table()
Expand Down Expand Up @@ -7279,6 +7308,21 @@ def _parse_comprehension(
condition=condition,
)

# https://spark.apache.org/docs/3.5.1/sql-ref-syntax-aux-analyze-table.html
def _parse_compute_statistics(self) -> exp.ComputeStatistics:
    """Parse the tail of COMPUTE STATISTICS: [NOSCAN | FOR ALL COLUMNS | FOR COLUMNS col, ...].

    A bare COMPUTE STATISTICS yields this=None, expressions=None.
    """
    this = None
    expressions = None

    if self._match_text_seq("NOSCAN"):
        this = "NOSCAN"
    elif self._match(TokenType.FOR):
        if self._match_text_seq("ALL", "COLUMNS"):
            this = "FOR ALL COLUMNS"
        # elif (not a second if): once ALL COLUMNS matched there is no COLUMNS
        # token left. Also use _match_text_seq for an exact match — the previous
        # _match_texts("COLUMNS") did a substring test against a plain string.
        elif self._match_text_seq("COLUMNS"):
            this = "FOR COLUMNS"
            expressions = self._parse_csv(self._parse_column_reference)

    return self.expression(exp.ComputeStatistics, this=this, expressions=expressions)

def _parse_heredoc(self) -> t.Optional[exp.Heredoc]:
if self._match(TokenType.HEREDOC_STRING):
return self.expression(exp.Heredoc, this=self._prev.text)
Expand Down
5 changes: 4 additions & 1 deletion sqlglot/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,8 @@ class TokenType(AutoName):
OPTION = auto()
SINK = auto()
SOURCE = auto()
ANALYZE = auto()
COMPUTE_STATISTICS = auto()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can now remove the COMPUTE STATISTICS token since it was removed from STATEMENT_PARSERS, right?

It can be consumed by the parser through self._match_text_seq("COMPUTE", "STATISTICS")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

NAMESPACE = auto()


Expand Down Expand Up @@ -704,6 +706,7 @@ class Tokenizer(metaclass=_Tokenizer):
"COLLATE": TokenType.COLLATE,
"COLUMN": TokenType.COLUMN,
"COMMIT": TokenType.COMMIT,
"COMPUTE STATISTICS": TokenType.COMPUTE_STATISTICS,
"CONNECT BY": TokenType.CONNECT_BY,
"CONSTRAINT": TokenType.CONSTRAINT,
"COPY": TokenType.COPY,
Expand Down Expand Up @@ -938,7 +941,7 @@ class Tokenizer(metaclass=_Tokenizer):
"SEQUENCE": TokenType.SEQUENCE,
"VARIANT": TokenType.VARIANT,
"ALTER": TokenType.ALTER,
"ANALYZE": TokenType.COMMAND,
"ANALYZE": TokenType.ANALYZE,
"CALL": TokenType.COMMAND,
"COMMENT": TokenType.COMMENT,
"EXPLAIN": TokenType.COMMAND,
Expand Down
11 changes: 11 additions & 0 deletions tests/dialects/test_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,3 +918,14 @@ def test_string(self):
with self.subTest(f"Testing STRING() for {dialect}"):
query = parse_one("STRING(a)", dialect=dialect)
self.assertEqual(query.sql(dialect), "CAST(a AS STRING)")

def test_analyze_compute_statistics(self):
    # Single-line identities first; the multi-line call sits at the end of
    # the chain so it does not break up the single-line block.
    self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS NOSCAN")
    self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS")
    self.validate_identity("ANALYZE TABLE tbl COMPUTE STATISTICS FOR COLUMNS foo, bar")
    self.validate_identity("ANALYZE TABLE ctlg.db.tbl COMPUTE STATISTICS NOSCAN")
    self.validate_identity("ANALYZE TABLES COMPUTE STATISTICS NOSCAN")
    self.validate_identity("ANALYZE TABLES FROM db COMPUTE STATISTICS")
    self.validate_identity(
        "ANALYZE TABLE ctlg.db.tbl PARTITION(foo = 'foo', bar = 'bar') COMPUTE STATISTICS NOSCAN"
    )
13 changes: 13 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,3 +900,16 @@ def test_udf_meta(self):
# Incomplete or incorrect anonymous meta comments are not registered
ast = parse_one("YEAR(a) /* sqlglot.anon */")
self.assertIsInstance(ast, exp.Year)

def test_analyze(self):
    # Valid Spark ANALYZE statement parses into a structured node.
    ast = parse_one("ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS")
    self.assertIsInstance(ast, exp.Analyze)

    # Fallback to command - valid Postgres.
    ast = parse_one("ANALYZE VERBOSE tbl")
    self.assertIsInstance(ast, exp.Command)

    # Fallback to command - valid SQLite.
    ast = parse_one("ANALYZE TABLE tbl")
    self.assertIsInstance(ast, exp.Command)
1 change: 0 additions & 1 deletion tests/test_transpile.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,6 @@ def test_command_identity(self):
"ALTER TABLE table1 RENAME COLUMN c1 c2",
"ALTER TYPE electronic_mail RENAME TO email",
"ALTER schema doo",
"ANALYZE a.y",
"CALL catalog.system.iceberg_procedure_name(named_arg_1 => 'arg_1', named_arg_2 => 'arg_2')",
"COMMENT ON ACCESS METHOD gin IS 'GIN index access method'",
"CREATE OR REPLACE STAGE",
Expand Down
Loading