Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix:(transpile) use static alias for unnesting a struct #4313

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sqlglot/dialects/hive.py
Original file line number Diff line number Diff line change
@@ -555,7 +555,7 @@ class Generator(generator.Generator):
[
transforms.eliminate_qualify,
transforms.eliminate_distinct_on,
partial(transforms.unnest_to_explode, unnest_using_arrays_zip=False),
partial(transforms.unnest_to_explode),
transforms.any_to_exists,
]
),
29 changes: 21 additions & 8 deletions sqlglot/transforms.py
Original file line number Diff line number Diff line change
@@ -302,19 +302,13 @@ def unqualify_unnest(expression: exp.Expression) -> exp.Expression:
return expression


def unnest_to_explode(
expression: exp.Expression,
unnest_using_arrays_zip: bool = True,
) -> exp.Expression:
def unnest_to_explode(expression: exp.Expression) -> exp.Expression:
"""Convert cross join unnest into lateral view explode."""

def _unnest_zip_exprs(
u: exp.Unnest, unnest_exprs: t.List[exp.Expression], has_multi_expr: bool
) -> t.List[exp.Expression]:
if has_multi_expr:
if not unnest_using_arrays_zip:
raise UnsupportedError("Cannot transpile UNNEST with multiple input arrays")

# Use INLINE(ARRAYS_ZIP(...)) for multiple expressions
zip_exprs: t.List[exp.Expression] = [
exp.Anonymous(this="ARRAYS_ZIP", expressions=unnest_exprs)
@@ -368,6 +362,25 @@ def _udtf_type(u: exp.Unnest, has_multi_expr: bool) -> t.Type[exp.Func]:
expression.args["joins"].remove(join)

alias_cols = alias.columns if alias else []

"""
Handle Presto CROSS JOIN UNNEST to LATERAL VIEW EXPLODE for Multiple or No Exploded table column alias.

Spark/Hive LATERAL VIEW EXPLODE accepts only a single column alias for the exploded array/struct, unlike Trino/Presto UNNEST, which can take multiple.
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LateralView
https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-lateral-view.html

"""
# Replace multiple alias for single EXPLODE column with single static alias name: `t_struct`
if not has_multi_expr and (len(alias_cols) != 1):
alias_cols = ["t_struct"]

# [Optional] Do Update the Column reference in AST for table with current alias
Comment on lines +374 to +378
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there's a single exploded column why would there be multiple aliases coming from Presto/Trino in the first place?

Copy link
Contributor Author

@gauravsagar483 gauravsagar483 Oct 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll share some examples for this shortly. In short, this is the array-of-structs scenario.
Thanks for the review.

if alias:
for column in expression.find_all(exp.Column):
if column.table == alias.name:
column.set("table", "t_struct")

for e, column in zip(exprs, alias_cols):
expression.append(
"laterals",
@@ -376,7 +389,7 @@ def _udtf_type(u: exp.Unnest, has_multi_expr: bool) -> t.Type[exp.Func]:
view=True,
alias=exp.TableAlias(
this=alias.this, # type: ignore
columns=alias_cols if unnest_using_arrays_zip else [column], # type: ignore
columns=alias_cols,
),
),
)
28 changes: 28 additions & 0 deletions tests/dialects/test_dialect.py
Original file line number Diff line number Diff line change
@@ -1444,6 +1444,34 @@ def test_cross_join(self):
},
)

# UNNEST without column alias
self.validate_all(
"SELECT * FROM x CROSS JOIN UNNEST(y) AS t",
write={
"presto": "SELECT * FROM x CROSS JOIN UNNEST(y) AS t",
"spark": "SELECT * FROM x LATERAL VIEW EXPLODE(y) t AS t_struct",
"databricks": "SELECT * FROM x LATERAL VIEW EXPLODE(y) t AS t_struct",
},
)

# UNNEST STRUCT Object into multiple columns, using single alias
self.validate_all(
"SELECT a, b FROM x CROSS JOIN UNNEST(y) AS t (a, b)",
write={
"presto": "SELECT a, b FROM x CROSS JOIN UNNEST(y) AS t(a, b)",
"spark": "SELECT a, b FROM x LATERAL VIEW EXPLODE(y) t AS t_struct",
},
)

# Unnest multiple Expression into respective mapped alias
self.validate_all(
"SELECT numbers, animals, n, a FROM (SELECT ARRAY(2, 5) AS numbers, ARRAY('dog', 'cat', 'bird') AS animals UNION ALL SELECT ARRAY(7, 8, 9), ARRAY('cow', 'pig')) AS x CROSS JOIN UNNEST(numbers, animals) AS t(n, a)",
write={
"presto": "SELECT numbers, animals, n, a FROM (SELECT ARRAY[2, 5] AS numbers, ARRAY['dog', 'cat', 'bird'] AS animals UNION ALL SELECT ARRAY[7, 8, 9], ARRAY['cow', 'pig']) AS x CROSS JOIN UNNEST(numbers, animals) AS t(n, a)",
"spark": "SELECT numbers, animals, n, a FROM (SELECT ARRAY(2, 5) AS numbers, ARRAY('dog', 'cat', 'bird') AS animals UNION ALL SELECT ARRAY(7, 8, 9), ARRAY('cow', 'pig')) AS x LATERAL VIEW INLINE(ARRAYS_ZIP(numbers, animals)) t AS n, a",
},
)

def test_lateral_subquery(self):
self.validate_identity(
"SELECT art FROM tbl1 INNER JOIN LATERAL (SELECT art FROM tbl2) AS tbl2 ON tbl1.art = tbl2.art"
5 changes: 2 additions & 3 deletions tests/dialects/test_starrocks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from sqlglot.errors import UnsupportedError
from tests.dialects.test_dialect import Validator


@@ -91,7 +90,7 @@ def test_unnest(self):
"spark": r"""SELECT id, t.type, t.scores FROM example_table LATERAL VIEW INLINE(ARRAYS_ZIP(SPLIT(type, CONCAT('\\Q', ';', '\\E')), scores)) t AS type, scores""",
"databricks": r"""SELECT id, t.type, t.scores FROM example_table LATERAL VIEW INLINE(ARRAYS_ZIP(SPLIT(type, CONCAT('\\Q', ';', '\\E')), scores)) t AS type, scores""",
"starrocks": r"""SELECT id, t.type, t.scores FROM example_table, UNNEST(SPLIT(type, ';'), scores) AS t(type, scores)""",
"hive": UnsupportedError,
"hive": r"""SELECT id, t.type, t.scores FROM example_table LATERAL VIEW INLINE(ARRAYS_ZIP(SPLIT(type, CONCAT('\\Q', ';', '\\E')), scores)) t AS type, scores""",
},
Comment on lines -94 to +93
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did these hive tests change? Did you test these queries?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. The Hive tests changed because Hive does support arrays of structs using INLINE.

ref doc: LanguageManualUDF-inline- arrayofstructs

I'll check for some more scenarios regarding this, thanks for pointing it out.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The issue is not INLINE in this case, though. It's ARRAYS_ZIP; IIRC, Hive doesn't support it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, let me run a few more test cases around Hive.
If this has the issue you mentioned above, I'll readjust the code. Thank you.

)

@@ -100,7 +99,7 @@ def test_unnest(self):
write={
"spark": r"""SELECT id, t.type, t.scores FROM example_table_2 LATERAL VIEW INLINE(ARRAYS_ZIP(SPLIT(type, CONCAT('\\Q', ';', '\\E')), scores)) t AS type, scores""",
"starrocks": r"""SELECT id, t.type, t.scores FROM example_table_2 CROSS JOIN LATERAL UNNEST(SPLIT(type, ';'), scores) AS t(type, scores)""",
"hive": UnsupportedError,
"hive": r"""SELECT id, t.type, t.scores FROM example_table_2 LATERAL VIEW INLINE(ARRAYS_ZIP(SPLIT(type, CONCAT('\\Q', ';', '\\E')), scores)) t AS type, scores""",
},
)