Merge pull request #29 from uclamii/stratify_fix

created todo comment for stratification fix
uclamii · Aug 27, 2024 · f9f7cff · f9f7cff
2 parents c61ce50 + 6d899ef
commit f9f7cff
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 14 deletions.
diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
@@ -49,6 +49,15 @@
     "X = bc[bc_cols]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,6 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
+    "    stratify_cols=[\"mean radius\"],\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",

diff --git a/notebooks/regression_test.ipynb b/notebooks/regression_test.ipynb
@@ -20,6 +20,25 @@
     "from xgboost import XGBRegressor"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame({\"A\": [1, 2], \"B\": [3, 4]})\n",
+    "df2 = pd.DataFrame({\"Y\": [5, 6]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([df1, df2], axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
@@ -306,7 +306,7 @@ def calibrateModel(
 
                     if self.imbalance_sampler:
                         self.process_imbalance_sampler(X_train, y_train)
-                    
+
                     self.fit(X_train, y_train)
                     #  calibrate model, and save output
                     self.estimator = CalibratedClassifierCV(
@@ -355,7 +355,7 @@ def calibrateModel(
                     # fit estimator
                     if self.imbalance_sampler:
                         self.process_imbalance_sampler(X_train, y_train)
-                    
+
                     # fit model
                     self.fit(
                         X_train,
@@ -542,7 +542,14 @@ def fit(self, X, y, validation_data=None, score=None):
         return
 
     def return_bootstrap_metrics(
-        self, X_test, y_test, metrics, threshold=0.5, num_resamples=500, n_samples=500, balance=False
+        self,
+        X_test,
+        y_test,
+        metrics,
+        threshold=0.5,
+        num_resamples=500,
+        n_samples=500,
+        balance=False,
     ):
         if self.model_type != "regression":
             y_pred_prob = pd.Series(self.predict_proba(X_test)[:, 1])
@@ -554,7 +561,7 @@ def return_bootstrap_metrics(
                 threshold=threshold,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         else:
             y_pred = pd.Series(self.predict(X_test))
@@ -566,7 +573,7 @@ def return_bootstrap_metrics(
                 metrics=metrics,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         return bootstrap_metrics
 
@@ -744,14 +751,13 @@ def grid_search_param_tuning(
 
             if self.imbalance_sampler:
                 self.process_imbalance_sampler(X_train, y_train)
-
-
+
             ## casting the ParameterGrid object to a list so that we can
             ## update the hyperparameters in both the randomized-grid and
             ## non-randomized-grid scenarios
             if not self.randomized_grid:
                 self.grid = list(self.grid)
-                
+
             for score in self.scoring:
                 scores = []
                 for index, params in enumerate(tqdm(self.grid)):
@@ -819,18 +825,17 @@ def grid_search_param_tuning(
                         ) in best_early_stopping_params.items():
                             if param_name in params:
                                 params[param_name] = param_value
-                            
+
                         params[f"{self.estimator_name}__n_estimators"] = clf[
                             len(clf) - 1
                         ].best_iteration
 
                         # Update the parameters in the grid
                         self.grid[index] = params
-
 
                     else:
                         clf = self.estimator.set_params(**params).fit(X_train, y_train)
-                        
+
                     if score in self.custom_scorer:
                         scorer_func = self.custom_scorer[score]
                     else:
@@ -941,8 +946,11 @@ def train_val_test_split(
         # if calibrate:
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify_y and stratify_cols
-        if stratify_cols:
+
+        if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
+            stratify_key = pd.concat([X[stratify_cols], y], axis=1)
+        elif stratify_cols:
             stratify_key = X[stratify_cols]
         elif stratify_y:
             stratify_key = y
@@ -964,7 +972,11 @@ def train_val_test_split(
         # Determine the proportion of validation to test size in the remaining dataset
         proportion = test_size / (validation_size + test_size)
 
-        if stratify_cols:
+        if stratify_cols and stratify_y:
+            strat_key_val_test = pd.concat(
+                [X_valid_test[stratify_cols], y_valid_test], axis=1
+            )
+        elif stratify_cols:
             strat_key_val_test = X_valid_test[stratify_cols]
         elif stratify_y:
             strat_key_val_test = y_valid_test
@@ -1189,7 +1201,7 @@ def kfold_split(
         return kf
 
 
-def get_cross_validate(classifier, X, y, kf, stratify=False, scoring=["roc_auc"]):
+def get_cross_validate(classifier, X, y, kf, scoring=["roc_auc"]):
     return cross_validate(
         classifier,
         X,