From 28e5bc28dc0b2a913b1cf1a3b841032e6ff83a3a Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 29 Jan 2025 18:16:44 -0500 Subject: [PATCH 1/4] fix nested nullability inference for nested udf calls --- .../linkedin/coral/schema/avro/RelToAvroSchemaConverter.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index a7165a779..dca7c7ce5 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -448,8 +448,11 @@ public RexNode visitCall(RexCall rexCall) { if (operand instanceof RexInputRef) { appendRexInputRefField((RexInputRef) operand); - return rexCall; + } else if (operand instanceof RexCall) { + // If the operand is a call, we need to visit the call to get the field schema + visitCall((RexCall) operand); } + return rexCall; } RelDataType fieldType = rexCall.getType(); From fc0165a2458a0576048beb12a1637a7280a4a430 Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Wed, 29 Jan 2025 20:06:11 -0500 Subject: [PATCH 2/4] spotless --- .../linkedin/coral/schema/avro/RelToAvroSchemaConverter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index dca7c7ce5..9fb62fb9d 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2025 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ From a2e9e8ea2253aeec414013d89be9c4ff531f4263 Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Thu, 30 Jan 2025 12:28:50 -0500 Subject: [PATCH 3/4] generalize operand schema inference for ordinal return type UDF calls --- .../schema/avro/RelToAvroSchemaConverter.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index 9fb62fb9d..78d2959e7 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -433,7 +433,7 @@ public RexNode visitLiteral(RexLiteral rexLiteral) { @Override public RexNode visitCall(RexCall rexCall) { /** - * For SqlUserDefinedFunction and SqlOperator RexCall, no need to handle it recursively + * For SqlUserDefinedFunction and SqlOperator RexCall, no need to handle it recursively (in most cases) * and only return type of udf or sql operator is relevant */ @@ -445,13 +445,13 @@ public RexNode visitCall(RexCall rexCall) { if (rexCall.getOperator().getReturnTypeInference() instanceof OrdinalReturnTypeInferenceV2) { int index = ((OrdinalReturnTypeInferenceV2) rexCall.getOperator().getReturnTypeInference()).getOrdinal(); RexNode operand = rexCall.operands.get(index); - - if (operand instanceof RexInputRef) { - appendRexInputRefField((RexInputRef) operand); - } else if (operand instanceof RexCall) { - // If the operand is a call, we need to visit the call to get the field schema - visitCall((RexCall) operand); - } + operand.accept(this); + // if (operand instanceof RexInputRef) { + // appendRexInputRefField((RexInputRef) operand); + // } else if (operand instanceof RexCall) { + // // If the operand is another call, we need to visit the call to get the field schema + // visitCall((RexCall) operand); + // } return rexCall; } From c3c152165965d15fda31f8e33b59670d85b25cf2 Mon Sep 17 00:00:00 2001 From: Kevin Ge Date: Thu, 30 Jan 2025 16:39:50 -0500 Subject: [PATCH 4/4] add unit test + documentation --- .../schema/avro/RelToAvroSchemaConverter.java | 20 ++++--------- .../linkedin/coral/schema/avro/TestUtils.java | 4 +-- .../avro/ViewToAvroSchemaConverterTests.java | 28 +++++++++++++++++-- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java index 78d2959e7..1452d86f7 100644 --- a/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java +++ b/coral-schema/src/main/java/com/linkedin/coral/schema/avro/RelToAvroSchemaConverter.java @@ -433,28 +433,20 @@ public RexNode visitLiteral(RexLiteral rexLiteral) { @Override public RexNode visitCall(RexCall rexCall) { /** - * For SqlUserDefinedFunction and SqlOperator RexCall, no need to handle it recursively (in most cases) - * and only return type of udf or sql operator is relevant - */ - - /** - * If the return type of RexCall is based on the ordinal of its input argument - * and the corresponding input argument refers to a field from the input schema, - * use the field's schema as is. + * If the return type of RexCall is based on an ordinal of its input arguments, then leverage SchemaRexShuttle + * to visit the input argument and use the argument's schema as is to infer the return type of the call */ if (rexCall.getOperator().getReturnTypeInference() instanceof OrdinalReturnTypeInferenceV2) { int index = ((OrdinalReturnTypeInferenceV2) rexCall.getOperator().getReturnTypeInference()).getOrdinal(); RexNode operand = rexCall.operands.get(index); operand.accept(this); - // if (operand instanceof RexInputRef) { - // appendRexInputRefField((RexInputRef) operand); - // } else if (operand instanceof RexCall) { - // // If the operand is another call, we need to visit the call to get the field schema - // visitCall((RexCall) operand); - // } return rexCall; } + /** + * For SqlUserDefinedFunction and SqlOperator RexCall, no need to handle it recursively + * and just directly use the return type of udf or sql operator as the field's schema + */ RelDataType fieldType = rexCall.getType(); boolean isNullable = SchemaUtilities.isFieldNullable(rexCall, inputSchema); diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java index 2d503ddcd..88fcb0d34 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/TestUtils.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2025 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -177,7 +177,7 @@ private static void initializeUdfs() { executeCreateFunctionQuery("default", Collections.singletonList("foo_udf_return_struct"), "FuncIsEven", "com.linkedin.coral.hive.hive2rel.CoralTestUDFReturnStruct"); - executeCreateFunctionQuery("default", Collections.singletonList("innerfield_with_udf"), "ReturnInnerStuct", + executeCreateFunctionQuery("default", Collections.singletonList("innerfield_with_udf"), "ReturnInnerStruct", "com.linkedin.coral.hive.hive2rel.CoralTestUDFReturnSecondArg"); } diff --git a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java index d014be0ee..9d44369f6 100644 --- a/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java +++ b/coral-schema/src/test/java/com/linkedin/coral/schema/avro/ViewToAvroSchemaConverterTests.java @@ -1,5 +1,5 @@ /** - * Copyright 2019-2024 LinkedIn Corporation. All rights reserved. + * Copyright 2019-2025 LinkedIn Corporation. All rights reserved. * Licensed under the BSD-2 Clause license. * See LICENSE in the project root for license information. */ @@ -236,9 +236,9 @@ public void testUdfLessThanHundred() { @Test public void testPreserveNullabilitiesAfterApplyingOrdinalReturnTypeUDF() { String viewSql = "CREATE VIEW innerfield_with_udf " - + "tblproperties('functions' = 'ReturnInnerStuct:com.linkedin.coral.hive.hive2rel.CoralTestUDFReturnSecondArg', " + + "tblproperties('functions' = 'ReturnInnerStruct:com.linkedin.coral.hive.hive2rel.CoralTestUDFReturnSecondArg', " + " 'dependencies' = 'ivy://com.linkedin:udf:1.0') " + "AS " - + "SELECT default_innerfield_with_udf_ReturnInnerStuct('foo', innerRecord) AS innerRecord " + + "SELECT default_innerfield_with_udf_ReturnInnerStruct('foo', innerRecord) AS innerRecord " + "FROM basecomplexmixednullabilities"; TestUtils.executeCreateViewQuery("default", "innerfield_with_udf", viewSql); @@ -252,6 +252,28 @@ public void testPreserveNullabilitiesAfterApplyingOrdinalReturnTypeUDF() { TestUtils.loadSchema("testPreserveNullabilitiesAfterApplyingOrdinalReturnTypeUDF-expected.avsc")); } + @Test + public void testPreserveNullabilitiesAfterApplyingOrdinalReturnTypeUDFForNestedCalls() { + String viewSql = "CREATE VIEW innerfield_with_udf " + + "tblproperties('functions' = 'ReturnInnerStruct:com.linkedin.coral.hive.hive2rel.CoralTestUDFReturnSecondArg', " + + " 'dependencies' = 'ivy://com.linkedin:udf:1.0') " + "AS " + + "SELECT default_innerfield_with_udf_ReturnInnerStruct('foo', default_innerfield_with_udf_ReturnInnerStruct('foo', innerRecord)) AS innerRecord " + + "FROM basecomplexmixednullabilities"; + + TestUtils.executeCreateViewQuery("default", "innerfield_with_udf", viewSql); + + ViewToAvroSchemaConverter viewToAvroSchemaConverter = ViewToAvroSchemaConverter.create(hiveMetastoreClient); + Schema actualSchema = viewToAvroSchemaConverter.toAvroSchema("default", "innerfield_with_udf"); + + // Inner ReturnInnerStruct call return type == Return type of it's second argument, innerRecord + // Outer ReturnInnerStruct call return type == Return type of it's second argument, Inner ReturnInnerStruct call return type + // Therefore, Outer ReturnInnerStruct call return type == Return type of innerRecord + // + // We also expect all fields to retain their nullability after applying the UDF calls + Assert.assertEquals(actualSchema.toString(true), + TestUtils.loadSchema("testPreserveNullabilitiesAfterApplyingOrdinalReturnTypeUDF-expected.avsc")); + } + @Test public void testUdfGreaterThanHundred() { String viewSql = "CREATE VIEW foo_dali_udf2 "