diff --git a/derive-macros/src/lib.rs b/derive-macros/src/lib.rs index be3bfe32f..bc58c5b58 100644 --- a/derive-macros/src/lib.rs +++ b/derive-macros/src/lib.rs @@ -10,7 +10,12 @@ use syn::{parse_macro_input, Data, DataStruct, DeriveInput, Fields, Meta, PathAr /// Change Metadata](https://github.com/delta-io/delta/blob/master/PROTOCOL.md#change-metadata) /// action (this macro allows the use of standard rust snake_case, and will convert to the correct /// delta schema camelCase version). -#[proc_macro_derive(Schema, attributes(schema_container_values_null))] +/// +/// If a field sets `drop_null_container_values`, it means the underlying data can contain null in +/// the values of the container (i.e. a `key` -> `null` in a `HashMap`). Therefore the schema should +/// mark the value field as nullable, but those mappings will be dropped when converting to an +/// actual rust `HashMap`. Currently this can _only_ be set on `HashMap` fields. +#[proc_macro_derive(Schema, attributes(drop_null_container_values))] pub fn derive_schema(input: proc_macro::TokenStream) -> proc_macro::TokenStream { let input = parse_macro_input!(input as DeriveInput); let struct_ident = input.ident; @@ -67,7 +72,7 @@ fn gen_schema_fields(data: &Data) -> TokenStream { let have_schema_null = field.attrs.iter().any(|attr| { // check if we have schema_map_values_null attr match &attr.meta { - Meta::Path(path) => path.get_ident().map(|ident| ident == "schema_container_values_null").unwrap_or(false), + Meta::Path(path) => path.get_ident().is_some_and(|ident| ident == "drop_null_container_values"), _ => false, } }); @@ -84,8 +89,8 @@ fn gen_schema_fields(data: &Data) -> TokenStream { }); if have_schema_null { if let Some(first_ident) = type_path.path.segments.first().map(|seg| &seg.ident) { - if first_ident != "HashMap" && first_ident != "Vec" { - panic!("Can only use schema_container_values_null on HashMap or Vec fields, not {first_ident:?}"); + if first_ident != "HashMap" { + panic!("Can only use drop_null_container_values on HashMap fields, not {first_ident:?}"); } } quote_spanned! { field.span() => #(#type_path_quoted),* get_nullable_container_struct_field(stringify!(#name))} diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 0d24b3ee4..c5e1d0d69 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -140,8 +140,11 @@ pub struct Add { /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt pub path: String, - /// A map from partition column to value for this logical file. - #[schema_container_values_null] + /// A map from partition column to value for this logical file. This map can contain null in the + /// values meaning a partition is null. We drop those values from this map, due to the + /// `drop_null_container_values` annotation. This means an engine can assume that if a partition + /// is found in [`Metadata`] `partition_columns`, but not in this map, it's value is null. + #[drop_null_container_values] pub partition_values: HashMap, /// The size of this data file in bytes @@ -297,6 +300,40 @@ mod tests { assert_eq!(schema, expected); } + #[test] + fn test_add_schema() { + let schema = get_log_schema() + .project(&["add"]) + .expect("Couldn't get add field"); + + let expected = Arc::new(StructType::new(vec![StructField::new( + "add", + StructType::new(vec![ + StructField::new("path", DataType::STRING, false), + StructField::new( + "partitionValues", + MapType::new(DataType::STRING, DataType::STRING, true), + false, + ), + StructField::new("size", DataType::LONG, false), + StructField::new("modificationTime", DataType::LONG, false), + StructField::new("dataChange", DataType::BOOLEAN, false), + StructField::new("stats", DataType::STRING, true), + StructField::new( + "tags", + MapType::new(DataType::STRING, DataType::STRING, false), + true, + ), + deletion_vector_field(), + StructField::new("baseRowId", DataType::LONG, true), + StructField::new("defaultRowCommitVersion", DataType::LONG, true), + StructField::new("clusteringProvider", DataType::STRING, true), + ]), + true, + )])); + assert_eq!(schema, expected); + } + fn tags_field() -> StructField { StructField::new( "tags", diff --git a/kernel/src/actions/schemas.rs b/kernel/src/actions/schemas.rs index d5f7393c8..89fb5e32c 100644 --- a/kernel/src/actions/schemas.rs +++ b/kernel/src/actions/schemas.rs @@ -42,13 +42,6 @@ impl ToDataType for Vec { } } -// ToDataType impl for nullable array types -impl ToNullableContainerType for Vec { - fn to_nullable_container_type() -> DataType { - ArrayType::new(T::to_data_type(), true).into() - } -} - impl ToDataType for HashSet { fn to_data_type() -> DataType { ArrayType::new(T::to_data_type(), false).into()