@@ -15,24 +15,23 @@ use crate::engine::ensure_data_types::ensure_data_types;
1515use crate :: error:: { DeltaResult , Error } ;
1616use crate :: schema:: { ArrayType , DataType , MapType , Schema , StructField } ;
1717
18- // Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch where the
19- // names of fields, nullable, and metadata in the struct have been transformed to match those in
20- // schema specified by `schema`
18+ // Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch` where
19+ // the names of fields, nullable, and metadata in the struct have been transformed to match those
20+ // in the schema specified by `schema`.
21+ //
22+ // Note: If the struct array has top-level nulls, the child columns are expected to already have
23+ // those nulls propagated. Arrow's JSON reader does this automatically, and parquet data goes
24+ // through `fix_nested_null_masks` which handles it. We decompose the struct and discard its null
25+ // buffer since RecordBatch cannot have top-level nulls.
// NOTE(review): this span is diff residue — it interleaves the removed (-) and the
// added (+) versions of apply_schema's top-level-null handling, with fused old/new
// line numbers, and is truncated by the following hunk header.
2126pub ( crate ) fn apply_schema ( array : & dyn Array , schema : & DataType ) -> DeltaResult < RecordBatch > {
// Only a struct schema can become a RecordBatch; any other DataType is rejected.
2227 let DataType :: Struct ( struct_schema) = schema else {
2328 return Err ( Error :: generic (
2429 "apply_schema at top-level must be passed a struct schema" ,
2530 ) ) ;
2631 } ;
2732 let applied = apply_schema_to_struct ( array, struct_schema) ?;
// Removed behavior: any non-zero null count on the struct's own null buffer was an error.
28- let ( fields, columns, nulls) = applied. into_parts ( ) ;
29- if let Some ( nulls) = nulls {
30- if nulls. null_count ( ) != 0 {
31- return Err ( Error :: invalid_struct_data (
32- "Top-level nulls in struct are not supported" ,
33- ) ) ;
34- }
35- }
// Added behavior: the struct's null buffer is discarded (a RecordBatch cannot carry
// top-level nulls); per the function's doc comment, child columns are expected to
// already have those nulls propagated by the reader.
33+ let ( fields, columns, _nulls) = applied. into_parts ( ) ;
34+
3635 Ok ( RecordBatch :: try_new (
3736 Arc :: new ( ArrowSchema :: new ( fields) ) ,
3837 columns,
@@ -191,6 +190,7 @@ mod apply_schema_validation_tests {
191190 use std:: sync:: Arc ;
192191
193192 use crate :: arrow:: array:: { Int32Array , StructArray } ;
193+ use crate :: arrow:: buffer:: { BooleanBuffer , NullBuffer } ;
194194 use crate :: arrow:: datatypes:: {
195195 DataType as ArrowDataType , Field as ArrowField , Schema as ArrowSchema ,
196196 } ;
@@ -238,4 +238,75 @@ mod apply_schema_validation_tests {
238238 StructField :: new ( "b" , DataType :: INTEGER , false ) ,
239239 ] )
240240 }
241+
242+ /// Test that apply_schema handles structs with top-level nulls correctly.
243+ ///
244+ /// This simulates a Delta log scenario where each row is one action type (add, remove, etc.).
245+ /// When extracting `add.stats_parsed`, rows where `add` is null (e.g., remove actions) should
246+ /// have null child columns. The child columns are expected to already have nulls propagated
247+ /// (Arrow's JSON reader does this, and parquet data goes through `fix_nested_null_masks`).
248+ #[ test]
249+ fn test_apply_schema_handles_top_level_nulls ( ) {
250+ // Create a struct array with 4 rows where rows 1 and 3 have top-level nulls.
251+ // This simulates: [add_action, remove_action, add_action, remove_action]
252+ // where remove_action rows have null for the entire struct.
253+ let field_a = ArrowField :: new ( "a" , ArrowDataType :: Int32 , true ) ;
254+ let field_b = ArrowField :: new ( "b" , ArrowDataType :: Int32 , true ) ;
255+ let schema = ArrowSchema :: new ( vec ! [ field_a, field_b] ) ;
256+
257+ // Child columns with nulls already propagated (simulating what Arrow readers do).
258+ // Rows 1 and 3 are null because the parent struct is null at those positions.
259+ let a_data = Int32Array :: from ( vec ! [ Some ( 1 ) , None , Some ( 3 ) , None ] ) ;
260+ let b_data = Int32Array :: from ( vec ! [ Some ( 10 ) , None , Some ( 30 ) , None ] ) ;
261+
262+ // Top-level struct nulls: rows 0 and 2 are valid, rows 1 and 3 are null
263+ let null_buffer = NullBuffer :: new ( BooleanBuffer :: from ( vec ! [ true , false , true , false ] ) ) ;
264+
265+ let struct_array = StructArray :: try_new (
266+ schema. fields . clone ( ) ,
267+ vec ! [ Arc :: new( a_data) , Arc :: new( b_data) ] ,
268+ Some ( null_buffer) ,
269+ )
270+ . unwrap ( ) ;
271+
272+ // Target schema with nullable fields
273+ let target_schema = DataType :: Struct ( Box :: new ( StructType :: new_unchecked ( [
274+ StructField :: new ( "a" , DataType :: INTEGER , true ) ,
275+ StructField :: new ( "b" , DataType :: INTEGER , true ) ,
276+ ] ) ) ) ;
277+
278+ // Apply schema - should successfully convert to RecordBatch
279+ let result = apply_schema ( & struct_array, & target_schema) . unwrap ( ) ;
280+
281+ assert_eq ! ( result. num_rows( ) , 4 ) ;
282+ assert_eq ! ( result. num_columns( ) , 2 ) ;
283+
284+ // Verify columns preserve nulls from child arrays
285+ let col_a = result. column ( 0 ) ;
286+ assert ! ( col_a. is_valid( 0 ) , "Row 0 should be valid" ) ;
287+ assert ! ( col_a. is_null( 1 ) , "Row 1 should be null" ) ;
288+ assert ! ( col_a. is_valid( 2 ) , "Row 2 should be valid" ) ;
289+ assert ! ( col_a. is_null( 3 ) , "Row 3 should be null" ) ;
290+
291+ let col_b = result. column ( 1 ) ;
292+ assert ! ( col_b. is_valid( 0 ) , "Row 0 should be valid" ) ;
293+ assert ! ( col_b. is_null( 1 ) , "Row 1 should be null" ) ;
294+ assert ! ( col_b. is_valid( 2 ) , "Row 2 should be valid" ) ;
295+ assert ! ( col_b. is_null( 3 ) , "Row 3 should be null" ) ;
296+
297+ // Verify the actual values for valid rows
298+ let col_a = col_a
299+ . as_any ( )
300+ . downcast_ref :: < Int32Array > ( )
301+ . expect ( "column a should be Int32Array" ) ;
302+ let col_b = col_b
303+ . as_any ( )
304+ . downcast_ref :: < Int32Array > ( )
305+ . expect ( "column b should be Int32Array" ) ;
306+
307+ assert_eq ! ( col_a. value( 0 ) , 1 ) ;
308+ assert_eq ! ( col_a. value( 2 ) , 3 ) ;
309+ assert_eq ! ( col_b. value( 0 ) , 10 ) ;
310+ assert_eq ! ( col_b. value( 2 ) , 30 ) ;
311+ }
241312}
0 commit comments