2222from sklearn .ensemble import RandomForestClassifier , RandomForestRegressor
2323from sklearn .ensemble import ExtraTreesClassifier , ExtraTreesRegressor
2424from sklearn .ensemble import GradientBoostingClassifier , GradientBoostingRegressor
25+ from sklearn .model_selection import KFold
2526from xgboost import XGBClassifier , XGBRegressor
2627
2728from feature .base import _BaseDispatcher , _BaseSupervisedSelector , _BaseUnsupervisedSelector
def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                         SelectionMethod.Linear,
                                         SelectionMethod.TreeBased,
                                         SelectionMethod.Statistical,
                                         SelectionMethod.Variance]],
              data: pd.DataFrame,
              labels: Optional[pd.Series] = None,
              cv: Optional[int] = None,
              output_filename: Optional[str] = None,
              drop_zero_variance_features: Optional[bool] = True,
              verbose: bool = False,
              seed: int = Constants.default_seed) \
        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Benchmark with a given set of feature selectors.

    Parameters
    ----------
    selectors: Dict[str, SelectionMethod]
        Feature selection methods to benchmark, keyed by method name.
    data: pd.DataFrame
        Data of shape (n_samples, n_features) used for feature selection.
    labels: pd.Series, optional (default=None)
        The target values (class labels in classification, real numbers in regression).
    cv: int, optional (default=None)
        Number of folds to use for cross-validation.
        If None, the benchmark runs once on the whole data.
    output_filename: str, optional (default=None)
        If not None, benchmarking output is saved.
        If file exists, results are appended, otherwise file is created.
    drop_zero_variance_features: bool, optional (default=True)
        Whether to drop features with zero variance before running feature selector methods or not.
    verbose: bool, optional (default=False)
        Whether to print progress messages or not.
    seed: int, optional (default=Constants.default_seed)
        The random seed to initialize the random number generator.

    Returns
    -------
    Tuple of data frames with scores, selected features and runtime for each method.
    If cv is not None, the data frames will contain the concatenated results from each fold.
    """

    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
    check_true(data is not None, ValueError("Benchmark data cannot be none."))
    # Fail fast with a clear message instead of deferring to KFold's late error.
    check_true(cv is None or cv >= 2, ValueError("cv must be None or an integer >= 2."))

    # Single run on the whole data when no cross-validation is requested
    if cv is None:
        return _bench(selectors=selectors,
                      data=data,
                      labels=labels,
                      output_filename=output_filename,
                      drop_zero_variance_features=drop_zero_variance_features,
                      verbose=verbose)

    # Create K-Fold object
    kf = KFold(n_splits=cv, shuffle=True, random_state=seed)

    # Initialize variables
    t0 = time()
    train_labels = None
    score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Split data into cv-folds and run _bench for each fold
    if verbose:
        print("\n>>> Running")
    for fold, (train_index, _) in enumerate(kf.split(data)):

        if verbose:
            print("\tFold", fold, "...")

        # Split data, labels into folds.
        # Only the train split is used: the benchmark measures selection
        # behavior across folds, so the held-out split is not needed here.
        train_data = data.iloc[train_index]
        if labels is not None:
            train_labels = labels.iloc[train_index]

        # Run benchmark on this fold (per-fold verbosity is suppressed)
        score_cv_df, selected_cv_df, runtime_cv_df = _bench(
            selectors=selectors,
            data=train_data,
            labels=train_labels,
            output_filename=output_filename,
            drop_zero_variance_features=drop_zero_variance_features,
            verbose=False)

        # Concatenate data frames; features repeat in the index, one row per fold
        score_df = pd.concat((score_df, score_cv_df))
        selected_df = pd.concat((selected_df, selected_cv_df))
        runtime_df = pd.concat((runtime_df, runtime_cv_df))

    if verbose:
        print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

    return score_df, selected_df, runtime_df
569+
570+
571+ def _bench (selectors : Dict [str , Union [SelectionMethod .Correlation ,
572+ SelectionMethod .Linear ,
573+ SelectionMethod .TreeBased ,
574+ SelectionMethod .Statistical ,
575+ SelectionMethod .Variance ]],
576+ data : pd .DataFrame ,
577+ labels : Optional [pd .Series ] = None ,
578+ output_filename : Optional [str ] = None ,
579+ drop_zero_variance_features : Optional [bool ] = True ,
580+ verbose : bool = False ) \
581+ -> Tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
582+ """
583+ Benchmark with a given set of feature selectors.
584+ Return a tuple of data frames with scores, runtime and selected features for each method.
505585
506586 Returns
507587 -------
@@ -552,7 +632,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
552632 if verbose :
553633 print (f"<<< Done! Time taken: { (time () - t0 ) / 60 :.2f} minutes" )
554634
555- # Convert to series
635+ # Format
556636 runtime_df = pd .Series (method_to_runtime ).to_frame ("runtime" ).rename_axis ("method" ).reset_index ()
557637
558638 return score_df , selected_df , runtime_df
@@ -561,15 +641,19 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
def calculate_statistics(scores: pd.DataFrame,
                         selected: pd.DataFrame,
                         columns: Optional[list] = None,
                         ignore_constant: Optional[bool] = True) -> pd.DataFrame:
    """
    Calculate statistics for each feature using scores/selections from list of methods.
    Returns data frame with calculated statistics for each feature.

    Parameters
    ----------
    scores: pd.DataFrame
        Data frame with scores for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
    selected: pd.DataFrame
        Data frame with selection flag for each feature (index) and selector (columns).
        Each feature could have multiple rows from different cross-validation folds.
    columns: list (default=None)
        List of methods (columns) to include in statistics.
        If None, all methods (columns) will be used.
    ignore_constant: bool (default=True)
        Whether to drop methods whose score is constant across all features.

    Returns
    -------
    Data frame with statistics for each feature, sorted by normalized mean score
    in descending order.
    """

    check_true(isinstance(scores, pd.DataFrame), ValueError("scores must be a data frame."))
    check_true(isinstance(selected, pd.DataFrame), ValueError("selection must be a data frame."))
    check_true(scores.shape == selected.shape, ValueError("Shapes of scores and selected data frames must match."))
    check_true(np.all(scores.index == selected.index),
               ValueError("Index of score and selection data frames must match."))
    check_true(np.all(scores.columns == selected.columns),
               ValueError("Columns of score and selection data frames must match."))

    # Get columns to use
    # NOTE(review): reconstructed from the documented contract ("If None, all
    # methods (columns) will be used") — confirm against the original source.
    if columns is None:
        columns = scores.columns

    scores_df = scores[columns].copy()
    selected_df = selected[columns].copy()

    # Group by feature for CV results: average the per-fold scores/selections
    # so each feature contributes a single row to the statistics
    scores_df = scores_df.groupby(scores_df.index).mean()
    selected_df = selected_df.groupby(selected_df.index).mean()

    # Drop methods with constant scores (they carry no ranking information)
    if ignore_constant:
        mask = ~np.isclose(np.var(scores_df, axis=0), 0)
        scores_df = scores_df.loc[:, mask]
        selected_df = selected_df.loc[:, mask]

    # Calculate statistics
    stats_df = pd.DataFrame(index=scores_df.index)
    stats_df["score_mean"] = scores_df.mean(axis=1)
    stats_df["score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
    stats_df["selection_freq"] = selected_df.sum(axis=1)
    stats_df["selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)

    # Sort by normalized mean score; return the sorted frame instead of
    # mutating in place before returning
    return stats_df.sort_values(by="score_mean_norm", ascending=False)
621705
@@ -632,6 +716,7 @@ def plot_importance(scores: pd.DataFrame,
632716 ----------
633717 scores: pd.DataFrame
634718 Data frame with scores for each feature (index) and method (columns).
719+ Each feature could have multiple rows from different cross-validation folds.
635720 columns: list (default=None)
636721 List of methods (columns) to include in statistics.
637722 If None, all methods (columns) will be used.
@@ -663,6 +748,9 @@ def plot_importance(scores: pd.DataFrame,
663748 df = scores [columns ].copy ()
664749 df .fillna (0 , inplace = True )
665750
751+ # Group by feature for CV results
752+ df = df .groupby (df .index ).mean ()
753+
666754 # Get normalized scores such that scores for each method sums to 1
667755 if normalize :
668756 df = normalize_columns (df )