@@ -7,7 +7,7 @@ use std::sync::Arc;
 use delta_kernel_derive::internal_api;

 use crate::arrow::array::builder::{MapBuilder, MapFieldNames, StringBuilder};
-use crate::arrow::array::{Int64Array, RecordBatch, StringArray, StructArray};
+use crate::arrow::array::{Array, Int64Array, RecordBatch, StringArray, StructArray};
 use crate::arrow::datatypes::{DataType, Field};
 use crate::parquet::arrow::arrow_reader::{
     ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
@@ -23,9 +23,10 @@ use object_store::{DynObjectStore, ObjectStore};
 use uuid::Uuid;

 use super::file_stream::{FileOpenFuture, FileOpener, FileStream};
+use super::stats::collect_stats;
 use super::UrlExt;
 use crate::engine::arrow_conversion::{TryFromArrow as _, TryIntoArrow as _};
-use crate::engine::arrow_data::ArrowEngineData;
+use crate::engine::arrow_data::{extract_record_batch, ArrowEngineData};
 use crate::engine::arrow_utils::{
     fixup_parquet_read, generate_mask, get_requested_indices, ordering_needs_row_indexes,
     RowIndexBuilder,
@@ -54,16 +55,25 @@ pub struct DataFileMetadata {
     file_meta: FileMeta,
     // NB: We use usize instead of u64 since arrow uses usize for record batch sizes
     num_records: usize,
+    /// Collected statistics for this file (optional).
+    stats: Option<StructArray>,
 }

 impl DataFileMetadata {
     pub fn new(file_meta: FileMeta, num_records: usize) -> Self {
         Self {
             file_meta,
             num_records,
+            stats: None,
         }
     }

+    /// Set the collected statistics for this file.
+    pub fn with_stats(mut self, stats: StructArray) -> Self {
+        self.stats = Some(stats);
+        self
+    }
+
     /// Convert DataFileMetadata into a record batch which matches the schema returned by
     /// [`add_files_schema`].
     ///
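The new `with_stats` setter chains onto `DataFileMetadata::new` in builder style. A minimal usage sketch, not part of this diff: the `file_meta`, `num_records`, and `stats` bindings are assumed to be in scope, with `stats` a `StructArray` such as `collect_stats` produces.

```rust
// Hypothetical call site: attach collected stats after constructing the
// metadata. `file_meta`, `num_records`, and `stats` are assumed bindings.
let metadata = DataFileMetadata::new(file_meta, num_records).with_stats(stats);
```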
@@ -81,6 +91,7 @@ impl DataFileMetadata {
                     size,
                 },
             num_records,
+            stats,
         } = self;
         // create the record batch of the write metadata
         let path = Arc::new(StringArray::from(vec![location.to_string()]));
@@ -104,20 +115,53 @@ impl DataFileMetadata {
             .map_err(|_| Error::generic("Failed to convert parquet metadata 'size' to i64"))?;
         let size = Arc::new(Int64Array::from(vec![size]));
         let modification_time = Arc::new(Int64Array::from(vec![*last_modified]));
-        let stats = Arc::new(StructArray::try_new_with_length(
-            vec![Field::new("numRecords", DataType::Int64, true)].into(),
-            vec![Arc::new(Int64Array::from(vec![*num_records as i64]))],
-            None,
-            1,
-        )?);

-        Ok(Box::new(ArrowEngineData::new(RecordBatch::try_new(
-            Arc::new(
-                crate::transaction::BASE_ADD_FILES_SCHEMA
-                    .as_ref()
-                    .try_into_arrow()?,
+        // Use full stats if available, otherwise just numRecords
+        let stats_array: Arc<StructArray> = if let Some(full_stats) = stats {
+            Arc::new(full_stats.clone())
+        } else {
+            Arc::new(StructArray::try_new_with_length(
+                vec![Field::new("numRecords", DataType::Int64, true)].into(),
+                vec![Arc::new(Int64Array::from(vec![*num_records as i64]))],
+                None,
+                1,
+            )?)
+        };
+
+        // Build schema dynamically based on stats
+        let stats_field = Field::new("stats", stats_array.data_type().clone(), true);
+        let schema = crate::arrow::datatypes::Schema::new(vec![
+            Field::new("path", crate::arrow::datatypes::DataType::Utf8, false),
+            Field::new(
+                "partitionValues",
+                crate::arrow::datatypes::DataType::Map(
+                    Arc::new(Field::new(
+                        "key_value",
+                        crate::arrow::datatypes::DataType::Struct(
+                            vec![
+                                Field::new("key", crate::arrow::datatypes::DataType::Utf8, false),
+                                Field::new("value", crate::arrow::datatypes::DataType::Utf8, true),
+                            ]
+                            .into(),
+                        ),
+                        false,
+                    )),
+                    false,
+                ),
+                false,
             ),
-            vec![path, partitions, size, modification_time, stats],
+            Field::new("size", crate::arrow::datatypes::DataType::Int64, false),
+            Field::new(
+                "modificationTime",
+                crate::arrow::datatypes::DataType::Int64,
+                false,
+            ),
+            stats_field,
+        ]);
+
+        Ok(Box::new(ArrowEngineData::new(RecordBatch::try_new(
+            Arc::new(schema),
+            vec![path, partitions, size, modification_time, stats_array],
         )?)))
     }
 }
@@ -201,9 +245,24 @@ impl<E: TaskExecutor> DefaultParquetHandler<E> {
         path: &url::Url,
         data: Box<dyn EngineData>,
         partition_values: HashMap<String, String>,
-        _stats_columns: Option<&[String]>,
+        stats_columns: Option<&[String]>,
     ) -> DeltaResult<Box<dyn EngineData>> {
-        let parquet_metadata = self.write_parquet(path, data).await?;
+        // Collect statistics from the data before writing if stats_columns provided
+        let stats = if let Some(cols) = stats_columns {
+            let record_batch = extract_record_batch(data.as_ref())?;
+            Some(collect_stats(record_batch, cols)?)
+        } else {
+            None
+        };
+
+        // Write the parquet file
+        let mut parquet_metadata = self.write_parquet(path, data).await?;
+
+        // Attach the collected statistics if present
+        if let Some(s) = stats {
+            parquet_metadata = parquet_metadata.with_stats(s);
+        }
+
         parquet_metadata.as_record_batch(&partition_values)
     }
 }
@@ -685,6 +744,7 @@ mod tests {
                     size,
                 },
             num_records,
+            ..
         } = write_metadata;
         let expected_location = Url::parse("memory:///data/").unwrap();
