diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java index 3ad11ab6e523..b6569a6eb6f9 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ApplyNameMapping.java @@ -103,6 +103,12 @@ public Type primitive(PrimitiveType primitive) { return field == null ? primitive : primitive.withId(field.id()); } + @Override + public Type variant(GroupType variant) { + MappedField field = nameMapping.find(currentPath()); + return field == null ? variant : variant.withId(field.id()); + } + @Override public void beforeField(Type type) { fieldNames.push(type.getName()); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java index 26ef6e468ede..841777152ee8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/MessageTypeToType.java @@ -179,6 +179,11 @@ public Type primitive(PrimitiveType primitive) { throw new UnsupportedOperationException("Cannot convert unknown primitive type: " + primitive); } + @Override + public Type variant(GroupType variant) { + return Types.VariantType.get(); + } + private static class ParquetLogicalTypeVisitor implements LogicalTypeAnnotation.LogicalTypeAnnotationVisitor { private static final ParquetLogicalTypeVisitor INSTANCE = new ParquetLogicalTypeVisitor(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java index f4760738df68..9a81626827c6 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java @@ -217,6 +217,11 @@ public Boolean map(GroupType map, Boolean keyHasId, Boolean valueHasId) { public Boolean primitive(PrimitiveType primitive) { return primitive.getId() != null; } + + @Override + public Boolean variant(GroupType variant) { + return variant.getId() != null; + } } public static Type determineListElementType(GroupType array) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java index 43c62a8c77a3..5ec319fe3674 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetTypeVisitor.java @@ -46,6 +46,8 @@ public static T visit(Type type, ParquetTypeVisitor visitor) { return visitList(group, visitor); } else if (LogicalTypeAnnotation.mapType().equals(annotation)) { return visitMap(group, visitor); + } else if (LogicalTypeAnnotation.variantType((byte) 1).equals(annotation)) { + return visitVariant(group, visitor); } return visitor.struct(group, visitFields(group, visitor)); @@ -168,6 +170,10 @@ private static T visitMap(GroupType map, ParquetTypeVisitor visitor) { } } + private static T visitVariant(GroupType variant, ParquetTypeVisitor visitor) { + return visitor.variant(variant); + } + private static List visitFields(GroupType group, ParquetTypeVisitor visitor) { List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); for (Type field : group.getFields()) { @@ -202,6 +208,10 @@ public T primitive(PrimitiveType primitive) { return null; } + public T variant(GroupType variant) { + return null; + } + public void beforeField(Type type) { fieldNames.push(type.getName()); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java index 37ead23a6793..b1d1d91bd5d6 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/RemoveIds.java @@ -77,6 +77,16 @@ public Type primitive(PrimitiveType primitive) { .named(primitive.getName()); } + @Override + public Type variant(GroupType variant) { + Types.GroupBuilder builder = + Types.buildGroup(variant.getRepetition()).as(variant.getLogicalTypeAnnotation()); + for (Type field : variant.getFields()) { + builder.addField(field); + } + return builder.named(variant.getName()); + } + public static MessageType removeIds(MessageType type) { return (MessageType) ParquetTypeVisitor.visit(type, new RemoveIds()); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java index 84fbf2a7d989..018a550413c4 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetSchemaUtil.java @@ -112,7 +112,20 @@ public void testAssignIdsByNameMapping() { 27, "m2", Types.MapType.ofOptional( - 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES)))))); + 28, 29, Types.StringType.get(), SUPPORTED_PRIMITIVES))))), + optional(30, "variant_col", Types.VariantType.get()), + required( + 31, "list_of_variants", Types.ListType.ofOptional(32, Types.VariantType.get())), + optional( + 33, + "struct_with_variant", + Types.StructType.of( + Types.NestedField.required(34, "id", Types.IntegerType.get()), + Types.NestedField.optional(35, "data", Types.VariantType.get()))), + required( + 36, + "map_with_variant_value", + Types.MapType.ofOptional(37, 38, Types.StringType.get(), Types.VariantType.get()))); Schema schema = new Schema( @@ -224,7 +237,16 @@ public void testSchemaConversionWithoutAssigningIds() { "map_col_5", Repetition.REQUIRED, primitive(28, "k", PrimitiveTypeName.INT32, Repetition.REQUIRED), - primitive(29, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED))); + primitive(29, "v", PrimitiveTypeName.INT32, Repetition.REQUIRED)), + variant(30, "variant_col_1", Repetition.OPTIONAL), + variant(null, "variant_col_2", Repetition.REQUIRED), + list(31, "list_col_6", Repetition.OPTIONAL, variant(32, "v", Repetition.OPTIONAL)), + struct( + 33, + "struct_col_3", + Repetition.REQUIRED, + primitive(34, "n1", PrimitiveTypeName.INT32, Repetition.REQUIRED), + variant(null, "variant_field", Repetition.OPTIONAL))); Schema expectedSchema = new Schema( @@ -255,8 +277,13 @@ public void testSchemaConversionWithoutAssigningIds() { required( 27, "map_col_5", - Types.MapType.ofRequired( - 28, 29, Types.IntegerType.get(), Types.IntegerType.get()))); + Types.MapType.ofRequired(28, 29, Types.IntegerType.get(), Types.IntegerType.get())), + optional(30, "variant_col_1", Types.VariantType.get()), + optional(31, "list_col_6", Types.ListType.ofOptional(32, Types.VariantType.get())), + required( + 33, + "struct_col_3", + Types.StructType.of(required(34, "n1", Types.IntegerType.get())))); Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageType); assertThat(actualSchema.asStruct()) @@ -427,6 +454,17 @@ public void testLegacyTwoLevelListGenByParquetThrift1() { .isEqualTo(expectedSchema.asStruct()); } + @Test + public void testVariantTypeConversion() { + MessageType messageType = + new MessageType("test", variant(1, "variant_field", Repetition.OPTIONAL)); + Schema expectedSchema = new Schema(optional(1, "variant_field", Types.VariantType.get())); + Schema actualSchema = ParquetSchemaUtil.convertAndPrune(messageType); + assertThat(actualSchema.asStruct()) + .as("Schema must match") + .isEqualTo(expectedSchema.asStruct()); + } + private Type primitive( Integer id, String name, PrimitiveTypeName typeName, Repetition repetition) { PrimitiveBuilder builder = @@ -464,4 +502,18 @@ private Type map(Integer id, String name, Repetition repetition, Type keyType, T } return builder.named(name); } + + private Type variant(Integer id, String name, Repetition repetition) { + GroupBuilder builder = + org.apache.parquet.schema.Types.buildGroup(repetition) + .as(org.apache.parquet.schema.LogicalTypeAnnotation.variantType((byte) 1)) + .primitive(PrimitiveTypeName.BINARY, Repetition.REQUIRED) + .named("metadata") + .primitive(PrimitiveTypeName.BINARY, Repetition.REQUIRED) + .named("value"); + if (id != null) { + builder.id(id); + } + return builder.named(name); + } }