Skip to content

Commit 482fe22

Browse files
authored
[Coral-Trino] Migrate SUBSTR() operator from RexShuttle to SqlShuttle (#432)
* [Coral-Trino] Initial commit for migrating SUBSTR() operator
* rebase
* register Coral IR functions
* add unit tests (UTs)
1 parent 8c4b054 commit 482fe22

File tree

5 files changed

+118
-28
lines changed

5 files changed

+118
-28
lines changed

coral-hive/src/main/java/com/linkedin/coral/hive/hive2rel/functions/StaticHiveFunctionRegistry.java

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -298,6 +298,13 @@ public boolean isOptional(int i) {
298298
or(family(SqlTypeFamily.STRING), family(SqlTypeFamily.BINARY)));
299299
createAddUserDefinedFunction("crc32", BIGINT, or(family(SqlTypeFamily.STRING), family(SqlTypeFamily.BINARY)));
300300
createAddUserDefinedFunction("from_utf8", explicit(SqlTypeName.VARCHAR), or(CHARACTER, BINARY));
301+
createAddUserDefinedFunction("at_timezone", explicit(SqlTypeName.TIMESTAMP),
302+
family(SqlTypeFamily.TIMESTAMP, SqlTypeFamily.STRING));
303+
createAddUserDefinedFunction("with_timezone", explicit(SqlTypeName.TIMESTAMP),
304+
family(SqlTypeFamily.TIMESTAMP, SqlTypeFamily.STRING));
305+
createAddUserDefinedFunction("to_unixtime", explicit(SqlTypeName.DOUBLE), family(SqlTypeFamily.TIMESTAMP));
306+
createAddUserDefinedFunction("from_unixtime_nanos", explicit(SqlTypeName.TIMESTAMP), NUMERIC);
307+
createAddUserDefinedFunction("$canonicalize_hive_timezone_id", explicit(SqlTypeName.VARCHAR), STRING);
301308

302309
// xpath functions
303310
createAddUserDefinedFunction("xpath", FunctionReturnTypes.arrayOfType(SqlTypeName.VARCHAR), STRING_STRING);

coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/Calcite2TrinoUDFConverter.java

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,6 @@
3838
import org.apache.calcite.sql.type.SqlReturnTypeInference;
3939
import org.apache.calcite.sql.validate.SqlUserDefinedFunction;
4040

41-
import com.linkedin.coral.com.google.common.collect.ImmutableList;
42-
4341
import static com.linkedin.coral.trino.rel2trino.CoralTrinoConfigKeys.*;
4442
import static org.apache.calcite.sql.type.ReturnTypes.explicit;
4543
import static org.apache.calcite.sql.type.SqlTypeName.*;
@@ -167,34 +165,9 @@ public RexNode visitCall(RexCall call) {
167165
}
168166
}
169167

170-
if (operatorName.equalsIgnoreCase("substr")) {
171-
Optional<RexNode> modifiedCall = visitSubstring(call);
172-
if (modifiedCall.isPresent()) {
173-
return modifiedCall.get();
174-
}
175-
}
176-
177168
return super.visitCall(call);
178169
}
179170

180-
// Hive allows passing in a byte array or String to substr/substring, so we can make an effort to emulate the
181-
// behavior by casting non-String input to String
182-
// https://cwiki.apache.org/confluence/display/hive/languagemanual+udf
183-
private Optional<RexNode> visitSubstring(RexCall call) {
184-
final SqlOperator op = call.getOperator();
185-
List<RexNode> convertedOperands = visitList(call.getOperands(), (boolean[]) null);
186-
RexNode inputOperand = convertedOperands.get(0);
187-
188-
if (inputOperand.getType().getSqlTypeName() != VARCHAR && inputOperand.getType().getSqlTypeName() != CHAR) {
189-
List<RexNode> operands = new ImmutableList.Builder<RexNode>()
190-
.add(rexBuilder.makeCast(typeFactory.createSqlType(VARCHAR), inputOperand))
191-
.addAll(convertedOperands.subList(1, convertedOperands.size())).build();
192-
return Optional.of(rexBuilder.makeCall(op, operands));
193-
}
194-
195-
return Optional.empty();
196-
}
197-
198171
private Optional<RexNode> visitCast(RexCall call) {
199172
final SqlOperator op = call.getOperator();
200173
if (op.getKind() != SqlKind.CAST) {

coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/DataTypeDerivedSqlCallConverter.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import com.linkedin.coral.trino.rel2trino.transformers.FromUtcTimestampOperatorTransformer;
2121
import com.linkedin.coral.trino.rel2trino.transformers.GenericProjectTransformer;
2222
import com.linkedin.coral.trino.rel2trino.transformers.NamedStructToCastTransformer;
23+
import com.linkedin.coral.trino.rel2trino.transformers.SubstrOperatorTransformer;
2324

2425

2526
/**
@@ -41,7 +42,7 @@ public DataTypeDerivedSqlCallConverter(HiveMetastoreClient mscClient, SqlNode to
4142
TypeDerivationUtil typeDerivationUtil = new TypeDerivationUtil(toRelConverter.getSqlValidator(), topSqlNode);
4243
operatorTransformerList = SqlCallTransformers.of(new FromUtcTimestampOperatorTransformer(typeDerivationUtil),
4344
new GenericProjectTransformer(typeDerivationUtil), new NamedStructToCastTransformer(typeDerivationUtil),
44-
new ConcatOperatorTransformer(typeDerivationUtil));
45+
new ConcatOperatorTransformer(typeDerivationUtil), new SubstrOperatorTransformer(typeDerivationUtil));
4546
}
4647

4748
@Override
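
coral-trino/src/main/java/com/linkedin/coral/trino/rel2trino/transformers/SubstrOperatorTransformer.java

Lines changed: 80 additions & 0 deletions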
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/**
2+
* Copyright 2023 LinkedIn Corporation. All rights reserved.
3+
* Licensed under the BSD-2 Clause license.
4+
* See LICENSE in the project root for license information.
5+
*/
6+
package com.linkedin.coral.trino.rel2trino.transformers;
7+
8+
import java.util.ArrayList;
9+
import java.util.Arrays;
10+
import java.util.HashSet;
11+
import java.util.List;
12+
import java.util.Set;
13+
14+
import org.apache.calcite.rel.type.RelDataType;
15+
import org.apache.calcite.sql.SqlBasicTypeNameSpec;
16+
import org.apache.calcite.sql.SqlCall;
17+
import org.apache.calcite.sql.SqlDataTypeSpec;
18+
import org.apache.calcite.sql.SqlNode;
19+
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
20+
import org.apache.calcite.sql.parser.SqlParserPos;
21+
import org.apache.calcite.sql.type.SqlTypeName;
22+
23+
import com.linkedin.coral.common.HiveTypeSystem;
24+
import com.linkedin.coral.common.transformers.SqlCallTransformer;
25+
import com.linkedin.coral.common.utils.TypeDerivationUtil;
26+
27+
import static org.apache.calcite.sql.parser.SqlParserPos.*;
28+
import static org.apache.calcite.sql.type.SqlTypeName.*;
29+
30+
31+
/**
32+
* This class implements the transformation of SqlCalls with Coral IR function `SUBSTR`
33+
* to their corresponding Trino-compatible versions.
34+
*
35+
* For example:
36+
* Given table:
37+
* t1(int_col INTEGER, time_col timestamp)
38+
* and a Coral IR SqlCall:
39+
* `SUBSTR(time_col, 12, 8)`
40+
*
41+
* The transformed SqlCall would be:
42+
* `SUBSTR(CAST(time_col AS VARCHAR(65535)), 12, 8)`
43+
*/
44+
public class SubstrOperatorTransformer extends SqlCallTransformer {
45+
46+
private static final int DEFAULT_VARCHAR_PRECISION = new HiveTypeSystem().getDefaultPrecision(SqlTypeName.VARCHAR);
47+
private static final String SUBSTR_OPERATOR_NAME = "substr";
48+
private static final Set<SqlTypeName> OPERAND_SQL_TYPE_NAMES =
49+
new HashSet<>(Arrays.asList(SqlTypeName.VARCHAR, SqlTypeName.CHAR));
50+
private static final SqlDataTypeSpec VARCHAR_SQL_DATA_TYPE_SPEC =
51+
new SqlDataTypeSpec(new SqlBasicTypeNameSpec(SqlTypeName.VARCHAR, DEFAULT_VARCHAR_PRECISION, ZERO), ZERO);
52+
53+
public SubstrOperatorTransformer(TypeDerivationUtil typeDerivationUtil) {
54+
super(typeDerivationUtil);
55+
}
56+
57+
@Override
58+
protected boolean condition(SqlCall sqlCall) {
59+
return sqlCall.getOperator().getName().equalsIgnoreCase(SUBSTR_OPERATOR_NAME);
60+
}
61+
62+
@Override
63+
protected SqlCall transform(SqlCall sqlCall) {
64+
List<SqlNode> operands = sqlCall.getOperandList();
65+
RelDataType relDataTypeOfOperand = deriveRelDatatype(operands.get(0));
66+
67+
// Coral IR accepts a byte array or String as an input for the `substr` operator.
68+
// This behavior is emulated by casting non-String input to String in this transformer
69+
// https://cwiki.apache.org/confluence/display/hive/languagemanual+udf
70+
if (!OPERAND_SQL_TYPE_NAMES.contains(relDataTypeOfOperand.getSqlTypeName())) {
71+
List<SqlNode> modifiedOperands = new ArrayList<>();
72+
73+
modifiedOperands.add(SqlStdOperatorTable.CAST.createCall(ZERO, operands.get(0), VARCHAR_SQL_DATA_TYPE_SPEC));
74+
modifiedOperands.addAll(operands.subList(1, operands.size()));
75+
76+
return sqlCall.getOperator().createCall(SqlParserPos.ZERO, modifiedOperands);
77+
}
78+
return sqlCall;
79+
}
80+
}

coral-trino/src/test/java/com/linkedin/coral/trino/rel2trino/HiveToTrinoConverterTest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,35 @@ public void testCastNestedTimestampToDecimal() {
486486
assertEquals(expandedSql, targetSql);
487487
}
488488

489+
@Test
490+
public void testSubstrWithTimestampOperator() {
491+
RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter();
492+
493+
RelNode relNode = TestUtils.getHiveToRelConverter().convertSql(
494+
"SELECT substring(from_utc_timestamp(a_bigint,'PST'),1,10) AS d\nFROM test.table_from_utc_timestamp");
495+
String targetSql =
496+
"SELECT \"substr\"(CAST(CAST(\"at_timezone\"(\"from_unixtime_nanos\"(CAST(\"table_from_utc_timestamp\".\"a_bigint\" AS BIGINT) * 1000000), \"$canonicalize_hive_timezone_id\"('PST')) AS TIMESTAMP(3)) AS VARCHAR(65535)), 1, 10) AS \"d\"\n"
497+
+ "FROM \"test\".\"table_from_utc_timestamp\" AS \"table_from_utc_timestamp\"";
498+
String expandedSql = relToTrinoConverter.convert(relNode);
499+
assertEquals(expandedSql, targetSql);
500+
501+
relNode = TestUtils.getHiveToRelConverter().convertSql(
502+
"SELECT substring(from_utc_timestamp(a_decimal_three,'PST'),1,10) AS d\nFROM test.table_from_utc_timestamp");
503+
targetSql =
504+
"SELECT \"substr\"(CAST(CAST(\"at_timezone\"(CAST(\"format_datetime\"(\"from_unixtime\"(CAST(\"table_from_utc_timestamp0\".\"a_decimal_three\" AS DOUBLE)), 'yyyy-MM-dd HH:mm:ss') AS TIMESTAMP), \"$canonicalize_hive_timezone_id\"('PST')) AS TIMESTAMP(3)) AS VARCHAR(65535)), 1, 10) AS \"d\"\n"
505+
+ "FROM \"test\".\"table_from_utc_timestamp\" AS \"table_from_utc_timestamp0\"";
506+
expandedSql = relToTrinoConverter.convert(relNode);
507+
assertEquals(expandedSql, targetSql);
508+
509+
relNode = TestUtils.getHiveToRelConverter().convertSql(
510+
"SELECT substring(from_utc_timestamp(a_timestamp,'PST'),1,10) AS d\nFROM test.table_from_utc_timestamp");
511+
targetSql =
512+
"SELECT \"substr\"(CAST(CAST(\"at_timezone\"(CAST(\"format_datetime\"(\"from_unixtime\"(\"to_unixtime\"(\"with_timezone\"(\"table_from_utc_timestamp1\".\"a_timestamp\", 'UTC'))), 'yyyy-MM-dd HH:mm:ss') AS TIMESTAMP), \"$canonicalize_hive_timezone_id\"('PST')) AS TIMESTAMP(3)) AS VARCHAR(65535)), 1, 10) AS \"d\"\n"
513+
+ "FROM \"test\".\"table_from_utc_timestamp\" AS \"table_from_utc_timestamp1\"";
514+
expandedSql = relToTrinoConverter.convert(relNode);
515+
assertEquals(expandedSql, targetSql);
516+
}
517+
489518
@Test
490519
public void testTranslateFunction() {
491520
RelToTrinoConverter relToTrinoConverter = TestUtils.getRelToTrinoConverter();

0 commit comments

Comments
 (0)