diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
index 2a5b6b0..ca82f4c 100644
--- a/notebooks/binary_test.ipynb
+++ b/notebooks/binary_test.ipynb
@@ -49,6 +49,15 @@
     "X = bc[bc_cols]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,6 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
+    "    stratify_cols=[\"mean radius\"],\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",
diff --git a/notebooks/regression_test.ipynb b/notebooks/regression_test.ipynb
index 47958ff..f2a2d70 100644
--- a/notebooks/regression_test.ipynb
+++ b/notebooks/regression_test.ipynb
@@ -20,6 +20,25 @@
     "from xgboost import XGBRegressor"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame({\"A\": [1, 2], \"B\": [3, 4]})\n",
+    "df2 = pd.DataFrame({\"Y\": [5, 6]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([df1, df2], axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index e78cd11..8c83c43 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -306,7 +306,7 @@ def calibrateModel(
 
             if self.imbalance_sampler:
                 self.process_imbalance_sampler(X_train, y_train)
-            
+
             self.fit(X_train, y_train)
             # calibrate model, and save output
             self.estimator = CalibratedClassifierCV(
@@ -355,7 +355,7 @@ def calibrateModel(
                 # fit estimator
                 if self.imbalance_sampler:
                     self.process_imbalance_sampler(X_train, y_train)
-                
+
                 # fit model
                 self.fit(
                     X_train,
@@ -542,7 +542,14 @@ def fit(self, X, y, validation_data=None, score=None):
         return
 
     def return_bootstrap_metrics(
-        self, X_test, y_test, metrics, threshold=0.5, num_resamples=500, n_samples=500, balance=False
+        self,
+        X_test,
+        y_test,
+        metrics,
+        threshold=0.5,
+        num_resamples=500,
+        n_samples=500,
+        balance=False,
     ):
         if self.model_type != "regression":
             y_pred_prob = pd.Series(self.predict_proba(X_test)[:, 1])
@@ -554,7 +561,7 @@ def return_bootstrap_metrics(
                 threshold=threshold,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         else:
             y_pred = pd.Series(self.predict(X_test))
@@ -566,7 +573,7 @@ def return_bootstrap_metrics(
                 metrics=metrics,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         return bootstrap_metrics
 
@@ -744,14 +751,13 @@ def grid_search_param_tuning(
 
         if self.imbalance_sampler:
             self.process_imbalance_sampler(X_train, y_train)
-            
-            
+
         ## casting the ParameterGrid Object to a list so that we can update
         ## update the hyperparameters in both random grid and non random grid
         ## scenarios
         if not self.randomized_grid:
             self.grid = list(self.grid)
-            
+
         for score in self.scoring:
             scores = []
             for index, params in enumerate(tqdm(self.grid)):
@@ -819,18 +825,17 @@ def grid_search_param_tuning(
                     ) in best_early_stopping_params.items():
                         if param_name in params:
                             params[param_name] = param_value
-                    
+
                     params[f"{self.estimator_name}__n_estimators"] = clf[
                         len(clf) - 1
                     ].best_iteration
 
                     # Update the parameters in the grid
                     self.grid[index] = params
 
-
                 else:
                     clf = self.estimator.set_params(**params).fit(X_train, y_train)
-                
+
                 if score in self.custom_scorer:
                     scorer_func = self.custom_scorer[score]
                 else:
@@ -941,8 +946,11 @@ def train_val_test_split(
         # if calibrate:
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
-        if stratify_cols:
+
+        if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
+            stratify_key = pd.concat([X[stratify_cols], y], axis=1)
+        elif stratify_cols:
             stratify_key = X[stratify_cols]
         elif stratify_y:
             stratify_key = y
@@ -964,7 +972,11 @@ def train_val_test_split(
 
         # Determine the proportion of validation to test size in the remaining dataset
         proportion = test_size / (validation_size + test_size)
-        if stratify_cols:
+        if stratify_cols and stratify_y:
+            strat_key_val_test = pd.concat(
+                [X_valid_test[stratify_cols], y_valid_test], axis=1
+            )
+        elif stratify_cols:
             strat_key_val_test = X_valid_test[stratify_cols]
         elif stratify_y:
             strat_key_val_test = y_valid_test
@@ -1189,7 +1201,7 @@ def kfold_split(
     return kf
 
 
-def get_cross_validate(classifier, X, y, kf, stratify=False, scoring=["roc_auc"]):
+def get_cross_validate(classifier, X, y, kf, scoring=["roc_auc"]):
     return cross_validate(
         classifier,
         X,
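Note on the stratification change: when both stratify_y and stratify_cols are supplied, train_val_test_split now builds a single stratification key by concatenating the selected feature columns with the target (pd.concat([X[stratify_cols], y], axis=1)), so the joint distribution of those columns and the label is preserved across the splits. Below is a minimal sketch of the same idea outside model_tuner, using scikit-learn's train_test_split on the breast-cancer frame from the notebook; the median binning of "mean radius" and all variable names are illustrative assumptions (a raw continuous column cannot serve directly as strata), not part of the library's API.

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Illustrative data: the same dataset the binary_test notebook uses.
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# A continuous feature must be discretized before it can act as a stratum;
# binning "mean radius" at the median is an assumption made for this sketch.
mean_radius_bin = pd.qcut(X["mean radius"], q=2, labels=False)

# Combined key: feature-based strata plus the label, mirroring the
# pd.concat([X[stratify_cols], y], axis=1) construction in the patch.
stratify_key = pd.concat([mean_radius_bin, y], axis=1)

# scikit-learn treats a multi-column stratify key as one label per row,
# so each (radius bin, target) combination is balanced across the splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=stratify_key, random_state=42
)

# The positive rate stays roughly equal in train and test.
print(round(y_train.mean(), 3), round(y_test.mean(), 3))

Concatenating the label into the key is what distinguishes the new stratify_cols-and-stratify_y branch from the existing stratify_cols-only branch, which keys on the feature columns alone.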