Skip to content

Commit

Permalink
Merge branch 'main' into map_access
Browse files Browse the repository at this point in the history
  • Loading branch information
hntd187 authored Sep 26, 2024
2 parents d2907d9 + e88ae2d commit fe57b9b
Show file tree
Hide file tree
Showing 9 changed files with 598 additions and 241 deletions.
2 changes: 1 addition & 1 deletion acceptance/src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::{TestCaseInfo, TestResult};
pub async fn read_golden(path: &Path, _version: Option<&str>) -> DeltaResult<RecordBatch> {
let expected_root = path.join("expected").join("latest").join("table_content");
let store = Arc::new(LocalFileSystem::new_with_prefix(&expected_root)?);
let files = store.list(None).try_collect::<Vec<_>>().await?;
let files: Vec<_> = store.list(None).try_collect().await?;
let mut batches = vec![];
let mut schema = None;
for meta in files.into_iter() {
Expand Down
30 changes: 26 additions & 4 deletions derive-macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use proc_macro2::{Ident, TokenStream};
use quote::{quote, quote_spanned};
use syn::spanned::Spanned;
use syn::{parse_macro_input, Data, DataStruct, DeriveInput, Fields, PathArguments, Type};
use syn::{parse_macro_input, Data, DataStruct, DeriveInput, Fields, Meta, PathArguments, Type};

/// Derive a `delta_kernel::schemas::ToDataType` implementation for the annotated struct. The actual
/// field names in the schema (and therefore of the struct members) are all mandated by the Delta
Expand All @@ -10,7 +10,12 @@ use syn::{parse_macro_input, Data, DataStruct, DeriveInput, Fields, PathArgument
/// Change Metadata](https://github.com/delta-io/delta/blob/master/PROTOCOL.md#change-metadata)
/// action (this macro allows the use of standard rust snake_case, and will convert to the correct
/// delta schema camelCase version).
#[proc_macro_derive(Schema)]
///
/// If a field sets `drop_null_container_values`, it means the underlying data can contain null in
/// the values of the container (i.e. a `key` -> `null` in a `HashMap`). Therefore the schema should
/// mark the value field as nullable, but those mappings will be dropped when converting to an
/// actual rust `HashMap`. Currently this can _only_ be set on `HashMap` fields.
#[proc_macro_derive(Schema, attributes(drop_null_container_values))]
pub fn derive_schema(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
let input = parse_macro_input!(input as DeriveInput);
let struct_ident = input.ident;
Expand All @@ -20,7 +25,7 @@ pub fn derive_schema(input: proc_macro::TokenStream) -> proc_macro::TokenStream
#[automatically_derived]
impl crate::actions::schemas::ToDataType for #struct_ident {
fn to_data_type() -> crate::schema::DataType {
use crate::actions::schemas::{ToDataType, GetStructField};
use crate::actions::schemas::{ToDataType, GetStructField, GetNullableContainerStructField};
crate::schema::StructType::new(vec![
#schema_fields
]).into()
Expand Down Expand Up @@ -67,6 +72,14 @@ fn gen_schema_fields(data: &Data) -> TokenStream {
let schema_fields = fields.iter().map(|field| {
let name = field.ident.as_ref().unwrap(); // we know these are named fields
let name = get_schema_name(name);
let have_schema_null = field.attrs.iter().any(|attr| {
// check if we have drop_null_container_values attr
match &attr.meta {
Meta::Path(path) => path.get_ident().is_some_and(|ident| ident == "drop_null_container_values"),
_ => false,
}
});

match field.ty {
Type::Path(ref type_path) => {
let type_path_quoted = type_path.path.segments.iter().map(|segment| {
Expand All @@ -77,7 +90,16 @@ fn gen_schema_fields(data: &Data) -> TokenStream {
_ => panic!("Can only handle <> type path args"),
}
});
quote_spanned! { field.span() => #(#type_path_quoted),* get_struct_field(stringify!(#name))}
if have_schema_null {
if let Some(first_ident) = type_path.path.segments.first().map(|seg| &seg.ident) {
if first_ident != "HashMap" {
panic!("Can only use drop_null_container_values on HashMap fields, not {first_ident:?}");
}
}
quote_spanned! { field.span() => #(#type_path_quoted),* get_nullable_container_struct_field(stringify!(#name))}
} else {
quote_spanned! { field.span() => #(#type_path_quoted),* get_struct_field(stringify!(#name))}
}
}
_ => {
panic!("Can't handle type: {:?}", field.ty);
Expand Down
2 changes: 2 additions & 0 deletions ffi/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ pub enum KernelError {
InvalidStructDataError,
InternalError,
InvalidExpression,
InvalidLogPath,
}

impl From<Error> for KernelError {
Expand Down Expand Up @@ -374,6 +375,7 @@ impl From<Error> for KernelError {
backtrace: _,
} => Self::from(*source),
Error::InvalidExpressionEvaluation(_) => KernelError::InvalidExpression,
Error::InvalidLogPath(_) => KernelError::InvalidLogPath,
}
}
}
Expand Down
42 changes: 40 additions & 2 deletions kernel/src/actions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ pub struct Add {
/// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt
pub path: String,

/// A map from partition column to value for this logical file.
pub partition_values: HashMap<String, Option<String>>,
/// A map from partition column to value for this logical file. This map can contain null in the
/// values meaning a partition is null. We drop those values from this map, due to the
/// `drop_null_container_values` annotation. This means an engine can assume that if a partition
/// is found in [`Metadata`] `partition_columns`, but not in this map, its value is null.
#[drop_null_container_values]
pub partition_values: HashMap<String, String>,

/// The size of this data file in bytes
pub size: i64,
Expand Down Expand Up @@ -296,6 +300,40 @@ mod tests {
assert_eq!(schema, expected);
}

#[test]
fn test_add_schema() {
let schema = get_log_schema()
.project(&["add"])
.expect("Couldn't get add field");

let expected = Arc::new(StructType::new(vec![StructField::new(
"add",
StructType::new(vec![
StructField::new("path", DataType::STRING, false),
StructField::new(
"partitionValues",
MapType::new(DataType::STRING, DataType::STRING, true),
false,
),
StructField::new("size", DataType::LONG, false),
StructField::new("modificationTime", DataType::LONG, false),
StructField::new("dataChange", DataType::BOOLEAN, false),
StructField::new("stats", DataType::STRING, true),
StructField::new(
"tags",
MapType::new(DataType::STRING, DataType::STRING, false),
true,
),
deletion_vector_field(),
StructField::new("baseRowId", DataType::LONG, true),
StructField::new("defaultRowCommitVersion", DataType::LONG, true),
StructField::new("clusteringProvider", DataType::STRING, true),
]),
true,
)]));
assert_eq!(schema, expected);
}

fn tags_field() -> StructField {
StructField::new(
"tags",
Expand Down
23 changes: 23 additions & 0 deletions kernel/src/actions/schemas.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ pub(crate) trait ToDataType {
fn nullable() -> bool;
}

pub(crate) trait ToNullableContainerType {
fn to_nullable_container_type() -> DataType;
}

macro_rules! impl_to_data_type {
( $(($rust_type: ty, $kernel_type: expr, $nullable: expr)), * ) => {
$(
Expand Down Expand Up @@ -78,10 +82,29 @@ impl<T: ToDataType> ToDataType for Option<T> {
}
}

// ToDataType impl for nullable map types
impl<K: ToDataType, V: ToDataType> ToNullableContainerType for HashMap<K, V> {
fn to_nullable_container_type() -> DataType {
MapType::new(K::to_data_type(), V::to_data_type(), true).into()
}
}

pub(crate) trait GetStructField {
fn get_struct_field(name: impl Into<String>) -> StructField;
}

pub(crate) trait GetNullableContainerStructField {
fn get_nullable_container_struct_field(name: impl Into<String>) -> StructField;
}

// Normal types produce non-nullable fields, but in this case the container they reference has
// nullable values
impl<T: ToNullableContainerType> GetNullableContainerStructField for T {
fn get_nullable_container_struct_field(name: impl Into<String>) -> StructField {
StructField::new(name, T::to_nullable_container_type(), false)
}
}

// Normal types produce non-nullable fields
impl<T: ToDataType> GetStructField for T {
fn get_struct_field(name: impl Into<String>) -> StructField {
Expand Down
7 changes: 7 additions & 0 deletions kernel/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ pub enum Error {
/// Expressions did not parse or evaluate correctly
#[error("Invalid expression evaluation: {0}")]
InvalidExpressionEvaluation(String),

/// Unable to parse the name of a log path
#[error("Invalid log path: {0}")]
InvalidLogPath(String),
}

// Convenience constructors for Error types that take a String argument
Expand Down Expand Up @@ -203,6 +207,9 @@ impl Error {
pub fn invalid_expression(msg: impl ToString) -> Self {
Self::InvalidExpressionEvaluation(msg.to_string())
}
pub(crate) fn invalid_log_path(msg: impl ToString) -> Self {
Self::InvalidLogPath(msg.to_string())
}

pub fn internal_error(msg: impl ToString) -> Self {
Self::InternalError(msg.to_string()).with_backtrace()
Expand Down
5 changes: 5 additions & 0 deletions kernel/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,12 @@ pub mod engine_data;
pub mod error;
pub mod expressions;
pub mod features;

#[cfg(feature = "developer-visibility")]
pub mod path;
#[cfg(not(feature = "developer-visibility"))]
pub(crate) mod path;

pub mod scan;
pub mod schema;
pub mod snapshot;
Expand Down
Loading

0 comments on commit fe57b9b

Please sign in to comment.