Skip to content

Commit

Permalink
Feat: parse analyze compute statistics (#4547)
Browse files Browse the repository at this point in the history
* feat: define analyze as standalone statement rather than command

* Python style review comments

* Move spark syntax to test_spark.py

* nit: auto formatting

* Fall back to parsing as Command when parsing unsupported analyze syntax

* Style review comments

* Remove COMPUTE STATISTICS from statement parsers

---------

Co-authored-by: Jo <[email protected]>
  • Loading branch information
zashroof and georgesittas authored Jan 9, 2025
1 parent 1ad8532 commit c75016a
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 2 deletions.
16 changes: 16 additions & 0 deletions sqlglot/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4713,6 +4713,22 @@ def actions(self) -> t.List[Expression]:
return self.args.get("actions") or []


class Analyze(Expression):
    # AST node for a structured ANALYZE statement
    # (e.g. Spark: ANALYZE TABLE tbl COMPUTE STATISTICS ...).
    # kind:       required; "TABLE" or "TABLES".
    # this:       optional; the table parts (TABLE) or db reference (TABLES FROM/IN).
    # partition:  optional; PARTITION(...) clause, TABLE form only.
    # expression: required; the parsed COMPUTE STATISTICS clause.
    arg_types = {
        "kind": True,
        "this": False,
        "partition": False,
        "expression": True,
    }


class ComputeStatistics(Expression):
    # COMPUTE STATISTICS clause of an ANALYZE statement.
    # this:        optional modifier string: "NOSCAN", "FOR ALL COLUMNS",
    #              or "FOR COLUMNS" (absent for a bare COMPUTE STATISTICS).
    # expressions: optional column list, only used with "FOR COLUMNS".
    arg_types = {
        "this": False,
        "expressions": False,
    }


class AddConstraint(Expression):
arg_types = {"expressions": True}

Expand Down
17 changes: 17 additions & 0 deletions sqlglot/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4647,6 +4647,23 @@ def unpivotcolumns_sql(self, expression: exp.UnpivotColumns) -> str:

return f"NAME {name} VALUE {values}"

def computestatistics_sql(self, expression: exp.ComputeStatistics) -> str:
    """Render an exp.ComputeStatistics node, e.g. COMPUTE STATISTICS NOSCAN.

    Both the modifier (`this`) and the column list are optional; each is
    prefixed with a space only when present.
    """
    this = self.sql(expression, "this")
    # BUGFIX: the space used to be baked into the f-string, so a bare
    # COMPUTE STATISTICS (this empty, e.g. "ANALYZE TABLES FROM db COMPUTE
    # STATISTICS") rendered with a trailing space and broke round-tripping.
    this = f" {this}" if this else ""
    columns = self.expressions(expression)
    columns = f" {columns}" if columns else ""
    return f"COMPUTE STATISTICS{this}{columns}"

def analyze_sql(self, expression: exp.Analyze) -> str:
    """Render an exp.Analyze node as an ANALYZE statement."""
    kind = self.sql(expression, "kind")
    target = self.sql(expression, "this")
    if target:
        # The TABLES form names a database via FROM; the TABLE form
        # names the table directly.
        target = f" FROM {target}" if kind == "TABLES" else f" {target}"
    partition = self.sql(expression, "partition")
    partition = f" {partition}" if partition else ""
    # `expression` is required on exp.Analyze, so the clause is always emitted.
    stats = f" {self.sql(expression, 'expression')}"
    return f"ANALYZE {kind}{target}{partition}{stats}"

def xmltable_sql(self, expression: exp.XMLTable) -> str:
this = self.sql(expression, "this")
passing = self.expressions(expression, key="passing")
Expand Down
44 changes: 44 additions & 0 deletions sqlglot/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ class Parser(metaclass=_Parser):

STATEMENT_PARSERS = {
TokenType.ALTER: lambda self: self._parse_alter(),
TokenType.ANALYZE: lambda self: self._parse_analyze(),
TokenType.BEGIN: lambda self: self._parse_transaction(),
TokenType.CACHE: lambda self: self._parse_cache(),
TokenType.COMMENT: lambda self: self._parse_comment(),
Expand Down Expand Up @@ -7079,6 +7080,34 @@ def _parse_alter(self) -> exp.Alter | exp.Command:

return self._parse_as_command(start)

def _parse_analyze(self) -> exp.Analyze | exp.Command:
    """Parse ANALYZE TABLE/TABLES ... COMPUTE STATISTICS into exp.Analyze.

    Any ANALYZE variant that is not recognized (e.g. Postgres'
    ANALYZE VERBOSE, SQLite's bare ANALYZE TABLE) falls back to a
    generic exp.Command so it still round-trips as raw SQL.
    """
    start = self._prev  # the ANALYZE token, kept for the Command fallback
    kind = None
    this: t.Optional[exp.Expression] = None
    partition = None

    if self._match(TokenType.TABLE):
        kind = "TABLE"
        this = self._parse_table_parts()
        partition = self._parse_partition()
    # BUGFIX: _match_texts("TABLES") passed a *string*, so the membership
    # test matched any token whose text is a substring of "TABLES"
    # ("ABLE", "LES", ...). _match_text_seq compares the full word.
    elif self._match_text_seq("TABLES"):
        kind = "TABLES"
        this = (
            self._parse_table(is_db_reference=True)
            if self._match_set((TokenType.FROM, TokenType.IN))
            else None
        )
    else:
        return self._parse_as_command(start)

    if self._match(TokenType.COMPUTE_STATISTICS):
        return self.expression(
            exp.Analyze,
            kind=kind,
            this=this,
            partition=partition,
            expression=self._parse_compute_statistics(),
        )

    return self._parse_as_command(start)

def _parse_merge(self) -> exp.Merge:
self._match(TokenType.INTO)
target = self._parse_table()
Expand Down Expand Up @@ -7300,6 +7329,21 @@ def _parse_comprehension(
condition=condition,
)

# https://spark.apache.org/docs/3.5.1/sql-ref-syntax-aux-analyze-table.html
def _parse_compute_statistics(self) -> exp.ComputeStatistics:
    """Parse what follows COMPUTE STATISTICS:

    [NOSCAN | FOR ALL COLUMNS | FOR COLUMNS col1 [, ...]]

    All parts are optional; a bare COMPUTE STATISTICS yields an empty node.
    """
    this = None
    expressions = None
    if self._match_text_seq("NOSCAN"):
        this = "NOSCAN"
    elif self._match(TokenType.FOR):
        if self._match_text_seq("ALL", "COLUMNS"):
            this = "FOR ALL COLUMNS"
        # BUGFIX: was `if self._match_texts("COLUMNS")` — passing a string
        # makes the membership test a substring check, so e.g. the COLUMN
        # keyword token ("COLUMN" in "COLUMNS") was wrongly accepted; also
        # made it an elif so the two FOR branches are explicitly exclusive.
        elif self._match_text_seq("COLUMNS"):
            this = "FOR COLUMNS"
            expressions = self._parse_csv(self._parse_column_reference)

    return self.expression(exp.ComputeStatistics, this=this, expressions=expressions)

def _parse_heredoc(self) -> t.Optional[exp.Heredoc]:
if self._match(TokenType.HEREDOC_STRING):
return self.expression(exp.Heredoc, this=self._prev.text)
Expand Down
5 changes: 4 additions & 1 deletion sqlglot/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,8 @@ class TokenType(AutoName):
OPTION = auto()
SINK = auto()
SOURCE = auto()
ANALYZE = auto()
COMPUTE_STATISTICS = auto()
NAMESPACE = auto()


Expand Down Expand Up @@ -704,6 +706,7 @@ class Tokenizer(metaclass=_Tokenizer):
"COLLATE": TokenType.COLLATE,
"COLUMN": TokenType.COLUMN,
"COMMIT": TokenType.COMMIT,
"COMPUTE STATISTICS": TokenType.COMPUTE_STATISTICS,
"CONNECT BY": TokenType.CONNECT_BY,
"CONSTRAINT": TokenType.CONSTRAINT,
"COPY": TokenType.COPY,
Expand Down Expand Up @@ -938,7 +941,7 @@ class Tokenizer(metaclass=_Tokenizer):
"SEQUENCE": TokenType.SEQUENCE,
"VARIANT": TokenType.VARIANT,
"ALTER": TokenType.ALTER,
"ANALYZE": TokenType.COMMAND,
"ANALYZE": TokenType.ANALYZE,
"CALL": TokenType.COMMAND,
"COMMENT": TokenType.COMMENT,
"EXPLAIN": TokenType.COMMAND,
Expand Down
11 changes: 11 additions & 0 deletions tests/dialects/test_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,3 +918,14 @@ def test_string(self):
with self.subTest(f"Testing STRING() for {dialect}"):
query = parse_one("STRING(a)", dialect=dialect)
self.assertEqual(query.sql(dialect), "CAST(a AS STRING)")

def test_analyze_compute_statistics(self):
    # Each statement must parse and regenerate byte-for-byte (identity).
    for sql in (
        "ANALYZE TABLE tbl COMPUTE STATISTICS NOSCAN",
        "ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS",
        "ANALYZE TABLE tbl COMPUTE STATISTICS FOR COLUMNS foo, bar",
        "ANALYZE TABLE ctlg.db.tbl COMPUTE STATISTICS NOSCAN",
        "ANALYZE TABLE ctlg.db.tbl PARTITION(foo = 'foo', bar = 'bar') COMPUTE STATISTICS NOSCAN",
        "ANALYZE TABLES COMPUTE STATISTICS NOSCAN",
        "ANALYZE TABLES FROM db COMPUTE STATISTICS",
    ):
        self.validate_identity(sql)
13 changes: 13 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,3 +900,16 @@ def test_udf_meta(self):
# Incomplete or incorrect anonymous meta comments are not registered
ast = parse_one("YEAR(a) /* sqlglot.anon */")
self.assertIsInstance(ast, exp.Year)

def test_analyze(self):
    # Recognized Spark-style ANALYZE parses into a structured exp.Analyze node.
    ast = parse_one("ANALYZE TABLE tbl COMPUTE STATISTICS FOR ALL COLUMNS")
    self.assertIsInstance(ast, exp.Analyze)

    # Unrecognized syntax falls back to exp.Command — valid Postgres.
    ast = parse_one("ANALYZE VERBOSE tbl")
    self.assertIsInstance(ast, exp.Command)

    # Fallback to exp.Command — valid SQLite (no COMPUTE STATISTICS clause).
    ast = parse_one("ANALYZE TABLE tbl")
    self.assertIsInstance(ast, exp.Command)
1 change: 0 additions & 1 deletion tests/test_transpile.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,6 @@ def test_command_identity(self):
"ALTER TABLE table1 RENAME COLUMN c1 c2",
"ALTER TYPE electronic_mail RENAME TO email",
"ALTER schema doo",
"ANALYZE a.y",
"CALL catalog.system.iceberg_procedure_name(named_arg_1 => 'arg_1', named_arg_2 => 'arg_2')",
"COMMENT ON ACCESS METHOD gin IS 'GIN index access method'",
"CREATE OR REPLACE STAGE",
Expand Down

0 comments on commit c75016a

Please sign in to comment.