From b7a29e265c54dc65eda44cd43c6c356fd1968f12 Mon Sep 17 00:00:00 2001
From: lshpaner
Date: Mon, 26 Aug 2024 15:45:06 -0700
Subject: [PATCH 1/3] created todo comment for stratification fix

---
 src/model_tuner/model_tuner_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index e78cd11..4ef673d 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -941,6 +941,10 @@ def train_val_test_split(
         # if calibrate:
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
+
+        ## TODO: need to either consolidate stratification into one input or
+        ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
+
         if stratify_cols:
             # Creating stratification columns out of stratify_cols list
             stratify_key = X[stratify_cols]
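A note on the TODO introduced above: in the branch it points at, stratify_cols and stratify_y are effectively mutually exclusive, because the key is picked with an if/elif chain and stratify_cols always wins. A minimal sketch of that pre-patch behaviour, assuming hypothetical toy data (the "site" column, the sizes, and random_state are illustrative, not part of the package):

    import pandas as pd
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame({"site": ["a", "a", "b", "b"] * 25, "feat": range(100)})
    y = pd.Series([0, 1] * 50)

    stratify_cols, stratify_y = ["site"], True
    if stratify_cols:          # always taken when both inputs are supplied
        stratify_key = X[stratify_cols]
    elif stratify_y:           # unreachable here, so y is never stratified on
        stratify_key = y

    # sklearn stratifies the split on whichever single key was chosen above
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.4, stratify=stratify_key, random_state=0
    )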
From 1bcd21f0123498eb48c14e02120b6e3f37922a0b Mon Sep 17 00:00:00 2001
From: Arthur Funnell
Date: Mon, 26 Aug 2024 16:31:29 -0700
Subject: [PATCH 2/3] Allowing stratification by both y and cols

---
 notebooks/binary_test.ipynb          | 10 +++++++
 src/model_tuner/model_tuner_utils.py | 41 ++++++++++++++++++----------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
index 2a5b6b0..3e7bb30 100644
--- a/notebooks/binary_test.ipynb
+++ b/notebooks/binary_test.ipynb
@@ -49,6 +49,15 @@
     "X = bc[bc_cols]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X.columns"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,6 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
+    "    stratify_cols=['mean radius', 'mean texture', 'mean perimeter', 'mean area']\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",
diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index 4ef673d..23b266a 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -306,7 +306,7 @@ def calibrateModel(
 
                 if self.imbalance_sampler:
                     self.process_imbalance_sampler(X_train, y_train)
-
+
                 self.fit(X_train, y_train)
                 # calibrate model, and save output
                 self.estimator = CalibratedClassifierCV(
@@ -355,7 +355,7 @@ def calibrateModel(
                 # fit estimator
                 if self.imbalance_sampler:
                     self.process_imbalance_sampler(X_train, y_train)
-
+
                 # fit model
                 self.fit(
                     X_train,
@@ -542,7 +542,14 @@ def fit(self, X, y, validation_data=None, score=None):
         return
 
     def return_bootstrap_metrics(
-        self, X_test, y_test, metrics, threshold=0.5, num_resamples=500, n_samples=500, balance=False
+        self,
+        X_test,
+        y_test,
+        metrics,
+        threshold=0.5,
+        num_resamples=500,
+        n_samples=500,
+        balance=False,
     ):
         if self.model_type != "regression":
             y_pred_prob = pd.Series(self.predict_proba(X_test)[:, 1])
@@ -554,7 +561,7 @@
                 threshold=threshold,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         else:
             y_pred = pd.Series(self.predict(X_test))
@@ -566,7 +573,7 @@
                 metrics=metrics,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
 
         return bootstrap_metrics
@@ -744,14 +751,13 @@ def grid_search_param_tuning(
 
             if self.imbalance_sampler:
                 self.process_imbalance_sampler(X_train, y_train)
-
-
+
         ## casting the ParameterGrid Object to a list so that we can update
         ## update the hyperparameters in both random grid and non random grid
         ## scenarios
         if not self.randomized_grid:
             self.grid = list(self.grid)
-
+
         for score in self.scoring:
             scores = []
             for index, params in enumerate(tqdm(self.grid)):
@@ -819,18 +825,17 @@
 
                     ) in best_early_stopping_params.items():
                         if param_name in params:
                             params[param_name] = param_value
-
+
                     params[f"{self.estimator_name}__n_estimators"] = clf[
                         len(clf) - 1
                     ].best_iteration
 
                     # Update the parameters in the grid
                     self.grid[index] = params
-
                 else:
                     clf = self.estimator.set_params(**params).fit(X_train, y_train)
-
+
                 if score in self.custom_scorer:
                     scorer_func = self.custom_scorer[score]
                 else:
@@ -942,11 +947,13 @@
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
 
-        ## TODO: need to either consolidate stratification into one input or
+        ## TODO: need to either consolidate stratification into one input or
         ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
-
-        if stratify_cols:
+
+        if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
+            stratify_key = pd.concat([X[stratify_cols], y], axis=1)
+        elif stratify_cols:
             stratify_key = X[stratify_cols]
         elif stratify_y:
             stratify_key = y
@@ -968,7 +975,11 @@
         # Determine the proportion of validation to test size in the remaining dataset
         proportion = test_size / (validation_size + test_size)
 
-        if stratify_cols:
+        if stratify_cols and stratify_y:
+            strat_key_val_test = pd.concat(
+                [X_valid_test[stratify_cols], y_valid_test], axis=1
+            )
+        elif stratify_cols:
             strat_key_val_test = X_valid_test[stratify_cols]
         elif stratify_y:
             strat_key_val_test = y_valid_test
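For context on the pd.concat change above: concatenating the stratify_cols frame with y along axis=1 yields one combined key, and scikit-learn's train_test_split then stratifies on each distinct row of that key. A minimal sketch with hypothetical data (column names, sizes, and random_state are made up for illustration):

    import pandas as pd
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame({"site": ["a", "a", "b", "b"] * 25, "feat": range(100)})
    y = pd.Series([0, 1] * 50, name="target")

    # One stratum per (site, target) combination
    stratify_key = pd.concat([X[["site"]], y], axis=1)

    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.4, stratify=stratify_key, random_state=0
    )

One caveat: continuous stratify_cols such as the notebook's 'mean radius' make nearly every row its own stratum, and train_test_split raises a ValueError when a stratum has only one member, so such columns generally need to be binned before they are useful as stratification keys.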
From 6d899ef2ec8739aeef7c68f61114554aa493be94 Mon Sep 17 00:00:00 2001
From: Arthur Funnell
Date: Tue, 27 Aug 2024 11:39:27 -0700
Subject: [PATCH 3/3] Removing the stratify parameter where it isn't used

---
 notebooks/binary_test.ipynb          |  4 ++--
 notebooks/regression_test.ipynb      | 19 +++++++++++++++++++
 src/model_tuner/model_tuner_utils.py |  5 +----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
index 3e7bb30..ca82f4c 100644
--- a/notebooks/binary_test.ipynb
+++ b/notebooks/binary_test.ipynb
@@ -55,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X.columns"
+    "X"
    ]
   },
   {
@@ -106,7 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
-    "    stratify_cols=['mean radius', 'mean texture', 'mean perimeter', 'mean area']\n",
+    "    stratify_cols=[\"mean radius\"],\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",
diff --git a/notebooks/regression_test.ipynb b/notebooks/regression_test.ipynb
index 47958ff..f2a2d70 100644
--- a/notebooks/regression_test.ipynb
+++ b/notebooks/regression_test.ipynb
@@ -20,6 +20,25 @@
     "from xgboost import XGBRegressor"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame({\"A\": [1, 2], \"B\": [3, 4]})\n",
+    "df2 = pd.DataFrame({\"Y\": [5, 6]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([df1, df2], axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index 23b266a..8c83c43 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -947,9 +947,6 @@ def train_val_test_split(
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
-        ## TODO: need to either consolidate stratification into one input or
-        ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
-
         if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
             stratify_key = pd.concat([X[stratify_cols], y], axis=1)
         elif stratify_cols:
@@ -1204,7 +1201,7 @@ def kfold_split(
     return kf
 
 
-def get_cross_validate(classifier, X, y, kf, stratify=False, scoring=["roc_auc"]):
+def get_cross_validate(classifier, X, y, kf, scoring=["roc_auc"]):
     return cross_validate(
         classifier,
         X,
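Taken together, the series leaves train_val_test_split doing a two-stage stratified split: hold out the training portion first, then carve the remainder into validation and test with a key recomputed on the held-out rows. The helper below is only a rough sketch of that logic as it appears in the hunks above; the function name, signature, and fixed random_state are illustrative and not the package's actual API:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    def two_stage_split(X, y, stratify_cols, validation_size=0.2, test_size=0.2):
        # First split: train vs. (validation + test), stratified on cols + y
        stratify_key = pd.concat([X[stratify_cols], y], axis=1)
        X_train, X_valid_test, y_train, y_valid_test = train_test_split(
            X, y, test_size=validation_size + test_size,
            stratify=stratify_key, random_state=0,
        )

        # Second split: share of the held-out block that becomes the test set
        proportion = test_size / (validation_size + test_size)
        strat_key_val_test = pd.concat(
            [X_valid_test[stratify_cols], y_valid_test], axis=1
        )
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_valid_test, y_valid_test, test_size=proportion,
            stratify=strat_key_val_test, random_state=0,
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test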