@@ -133,6 +133,120 @@ pub(crate) fn expected_stats_schema(
133133 StructType :: try_new ( fields)
134134}
135135
136+ /// Returns the list of column names that should have statistics collected.
137+ ///
138+ /// This extracts just the column names without building the full stats schema,
139+ /// making it more efficient when only the column list is needed.
140+ #[ allow( unused) ]
141+ pub ( crate ) fn stats_column_names (
142+ physical_file_schema : & Schema ,
143+ table_properties : & TableProperties ,
144+ ) -> Vec < ColumnName > {
145+ let mut filter = StatsColumnFilter :: new ( table_properties) ;
146+ let mut columns = Vec :: new ( ) ;
147+ filter. collect_columns ( physical_file_schema, & mut columns) ;
148+ columns
149+ }
150+
151+ /// Handles column filtering logic for statistics based on table properties.
152+ ///
153+ /// Filters columns according to:
154+ /// * `dataSkippingStatsColumns` - explicit list of columns to include (takes precedence)
155+ /// * `dataSkippingNumIndexedCols` - number of leaf columns to include (default 32)
156+ struct StatsColumnFilter {
157+ n_columns : Option < DataSkippingNumIndexedCols > ,
158+ added_columns : u64 ,
159+ column_names : Option < Vec < ColumnName > > ,
160+ path : Vec < String > ,
161+ }
162+
163+ impl StatsColumnFilter {
164+ fn new ( props : & TableProperties ) -> Self {
165+ // If data_skipping_stats_columns is specified, it takes precedence
166+ // over data_skipping_num_indexed_cols, even if that is also specified.
167+ if let Some ( column_names) = & props. data_skipping_stats_columns {
168+ Self {
169+ n_columns : None ,
170+ added_columns : 0 ,
171+ column_names : Some ( column_names. clone ( ) ) ,
172+ path : Vec :: new ( ) ,
173+ }
174+ } else {
175+ let n_cols = props
176+ . data_skipping_num_indexed_cols
177+ . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ;
178+ Self {
179+ n_columns : Some ( n_cols) ,
180+ added_columns : 0 ,
181+ column_names : None ,
182+ path : Vec :: new ( ) ,
183+ }
184+ }
185+ }
186+
187+ /// Collects column names that should have statistics.
188+ fn collect_columns ( & mut self , schema : & Schema , result : & mut Vec < ColumnName > ) {
189+ for field in schema. fields ( ) {
190+ self . collect_field ( field, result) ;
191+ }
192+ }
193+
194+ fn collect_field ( & mut self , field : & StructField , result : & mut Vec < ColumnName > ) {
195+ if self . at_column_limit ( ) {
196+ return ;
197+ }
198+
199+ self . path . push ( field. name . clone ( ) ) ;
200+
201+ match field. data_type ( ) {
202+ DataType :: Struct ( struct_type) => {
203+ for child in struct_type. fields ( ) {
204+ self . collect_field ( child, result) ;
205+ }
206+ }
207+ _ => {
208+ if self . should_include_current ( ) {
209+ result. push ( ColumnName :: new ( & self . path ) ) ;
210+ self . added_columns += 1 ;
211+ }
212+ }
213+ }
214+
215+ self . path . pop ( ) ;
216+ }
217+
218+ /// Returns true if the column limit has been reached.
219+ fn at_column_limit ( & self ) -> bool {
220+ matches ! (
221+ self . n_columns,
222+ Some ( DataSkippingNumIndexedCols :: NumColumns ( n) ) if self . added_columns >= n
223+ )
224+ }
225+
226+ /// Returns true if the current path should be included based on column_names config.
227+ fn should_include_current ( & self ) -> bool {
228+ self . column_names
229+ . as_ref ( )
230+ . map ( |ns| should_include_column ( & ColumnName :: new ( & self . path ) , ns) )
231+ . unwrap_or ( true )
232+ }
233+
234+ /// Enters a field path for filtering decisions.
235+ fn enter_field ( & mut self , name : & str ) {
236+ self . path . push ( name. to_string ( ) ) ;
237+ }
238+
239+ /// Exits the current field path.
240+ fn exit_field ( & mut self ) {
241+ self . path . pop ( ) ;
242+ }
243+
244+ /// Records that a leaf column was included.
245+ fn record_included ( & mut self ) {
246+ self . added_columns += 1 ;
247+ }
248+ }
249+
136250/// Transforms a schema to make all fields nullable.
137251/// Used for stats schemas where stats may not be available for all columns.
138252pub ( crate ) struct NullableStatsTransform ;
@@ -182,40 +296,17 @@ impl<'a> SchemaTransform<'a> for NullCountStatsTransform {
182296/// The concrete shape of the schema depends on the table configuration.
183297/// * `dataSkippingStatsColumns` - used to explicitly specify the columns
184298/// to be used for data skipping statistics. (takes precedence)
185- /// * `dataSkippingNumIndexedCols` - used to specify the number of columns
186- /// to be used for data skipping statistics. Defaults to 32.
187- ///
188299/// All fields are nullable.
189300#[ allow( unused) ]
190301struct BaseStatsTransform {
191- n_columns : Option < DataSkippingNumIndexedCols > ,
192- added_columns : u64 ,
193- column_names : Option < Vec < ColumnName > > ,
194- path : Vec < String > ,
302+ filter : StatsColumnFilter ,
195303}
196304
197305impl BaseStatsTransform {
198306 #[ allow( unused) ]
199307 fn new ( props : & TableProperties ) -> Self {
200- // If data_skipping_stats_columns is specified, it takes precedence
201- // over data_skipping_num_indexed_cols, even if that is also specified.
202- if let Some ( column_names) = & props. data_skipping_stats_columns {
203- Self {
204- n_columns : None ,
205- added_columns : 0 ,
206- column_names : Some ( column_names. clone ( ) ) ,
207- path : Vec :: new ( ) ,
208- }
209- } else {
210- let n_cols = props
211- . data_skipping_num_indexed_cols
212- . unwrap_or ( DataSkippingNumIndexedCols :: NumColumns ( 32 ) ) ;
213- Self {
214- n_columns : Some ( n_cols) ,
215- added_columns : 0 ,
216- column_names : None ,
217- path : Vec :: new ( ) ,
218- }
308+ Self {
309+ filter : StatsColumnFilter :: new ( props) ,
219310 }
220311 }
221312}
@@ -224,34 +315,22 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
224315 fn transform_struct_field ( & mut self , field : & ' a StructField ) -> Option < Cow < ' a , StructField > > {
225316 use Cow :: * ;
226317
227- // Check if the number of columns is set and if the added columns exceed the limit
228- // In the constructor we assert this will always be None if column_names are specified
229- if let Some ( DataSkippingNumIndexedCols :: NumColumns ( n_cols) ) = self . n_columns {
230- if self . added_columns >= n_cols {
231- return None ;
232- }
318+ if self . filter . at_column_limit ( ) {
319+ return None ;
233320 }
234321
235- self . path . push ( field. name . clone ( ) ) ;
322+ self . filter . enter_field ( field. name ( ) ) ;
236323 let data_type = field. data_type ( ) ;
237324
238325 // We always traverse struct fields (they don't count against the column limit),
239326 // but we only include leaf fields if they qualify based on column_names config.
240327 // When column_names is None, all leaf fields are included (up to n_columns limit).
241328 if !matches ! ( data_type, DataType :: Struct ( _) ) {
242- let should_include = self
243- . column_names
244- . as_ref ( )
245- . map ( |ns| should_include_column ( & ColumnName :: new ( & self . path ) , ns) )
246- . unwrap_or ( true ) ;
247-
248- if !should_include {
249- self . path . pop ( ) ;
329+ if !self . filter . should_include_current ( ) {
330+ self . filter . exit_field ( ) ;
250331 return None ;
251332 }
252-
253- // Increment count only for leaf columns
254- self . added_columns += 1 ;
333+ self . filter . record_included ( ) ;
255334 }
256335
257336 let field = match self . transform ( & field. data_type ) ? {
@@ -264,7 +343,7 @@ impl<'a> SchemaTransform<'a> for BaseStatsTransform {
264343 } ) ,
265344 } ;
266345
267- self . path . pop ( ) ;
346+ self . filter . exit_field ( ) ;
268347
269348 // exclude struct fields with no children
270349 if matches ! ( field. data_type( ) , DataType :: Struct ( dt) if dt. fields( ) . len( ) == 0 ) {
0 commit comments