From b7a29e265c54dc65eda44cd43c6c356fd1968f12 Mon Sep 17 00:00:00 2001
From: lshpaner
Date: Mon, 26 Aug 2024 15:45:06 -0700
Subject: [PATCH 1/3] created todo comment for stratification fix

---
 src/model_tuner/model_tuner_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index e78cd11..4ef673d 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -941,6 +941,10 @@ def train_val_test_split(
         # if calibrate:
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
+
+        ## TODO: need to either consolidate stratification into one input or
+        ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
+
         if stratify_cols:
             # Creating stratification columns out of stratify_cols list
             stratify_key = X[stratify_cols]
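A note on the TODO introduced above: in the branch it points at, stratify_cols and stratify_y are effectively mutually exclusive, because the key is picked with an if/elif chain and stratify_cols always wins. A minimal sketch of that pre-patch behaviour, assuming hypothetical toy data (the "site" column, the sizes, and random_state are illustrative, not part of the package):

    import pandas as pd
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame({"site": ["a", "a", "b", "b"] * 25, "feat": range(100)})
    y = pd.Series([0, 1] * 50)

    stratify_cols, stratify_y = ["site"], True
    if stratify_cols:          # always taken when both inputs are supplied
        stratify_key = X[stratify_cols]
    elif stratify_y:           # unreachable here, so y is never stratified on
        stratify_key = y

    # sklearn stratifies the split on whichever single key was chosen above
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.4, stratify=stratify_key, random_state=0
    )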
From 1bcd21f0123498eb48c14e02120b6e3f37922a0b Mon Sep 17 00:00:00 2001
From: Arthur Funnell
Date: Mon, 26 Aug 2024 16:31:29 -0700
Subject: [PATCH 2/3] Allowing stratification by both y and cols

---
 notebooks/binary_test.ipynb          | 10 +++++++
 src/model_tuner/model_tuner_utils.py | 41 ++++++++++++++++++----------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
index 2a5b6b0..3e7bb30 100644
--- a/notebooks/binary_test.ipynb
+++ b/notebooks/binary_test.ipynb
@@ -49,6 +49,15 @@
     "X = bc[bc_cols]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X.columns"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -97,6 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
+    "    stratify_cols=['mean radius', 'mean texture', 'mean perimeter', 'mean area']\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",
diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index 4ef673d..23b266a 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -306,7 +306,7 @@ def calibrateModel(
 
                 if self.imbalance_sampler:
                     self.process_imbalance_sampler(X_train, y_train)
-
+
                 self.fit(X_train, y_train)
                 # calibrate model, and save output
                 self.estimator = CalibratedClassifierCV(
@@ -355,7 +355,7 @@ def calibrateModel(
                 # fit estimator
                 if self.imbalance_sampler:
                     self.process_imbalance_sampler(X_train, y_train)
-
+
                 # fit model
                 self.fit(
                     X_train,
@@ -542,7 +542,14 @@ def fit(self, X, y, validation_data=None, score=None):
         return
 
     def return_bootstrap_metrics(
-        self, X_test, y_test, metrics, threshold=0.5, num_resamples=500, n_samples=500, balance=False
+        self,
+        X_test,
+        y_test,
+        metrics,
+        threshold=0.5,
+        num_resamples=500,
+        n_samples=500,
+        balance=False,
     ):
         if self.model_type != "regression":
             y_pred_prob = pd.Series(self.predict_proba(X_test)[:, 1])
@@ -554,7 +561,7 @@
                 threshold=threshold,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
         else:
             y_pred = pd.Series(self.predict(X_test))
@@ -566,7 +573,7 @@
                 metrics=metrics,
                 num_resamples=num_resamples,
                 n_samples=n_samples,
-                balance=balance
+                balance=balance,
             )
 
         return bootstrap_metrics
@@ -744,14 +751,13 @@ def grid_search_param_tuning(
 
             if self.imbalance_sampler:
                 self.process_imbalance_sampler(X_train, y_train)
-
-
+
         ## casting the ParameterGrid Object to a list so that we can update
         ## update the hyperparameters in both random grid and non random grid
         ## scenarios
         if not self.randomized_grid:
             self.grid = list(self.grid)
-
+
         for score in self.scoring:
             scores = []
             for index, params in enumerate(tqdm(self.grid)):
@@ -819,18 +825,17 @@
 
                     ) in best_early_stopping_params.items():
                         if param_name in params:
                             params[param_name] = param_value
-
+
                     params[f"{self.estimator_name}__n_estimators"] = clf[
                         len(clf) - 1
                     ].best_iteration
 
                     # Update the parameters in the grid
                     self.grid[index] = params
-
                 else:
                     clf = self.estimator.set_params(**params).fit(X_train, y_train)
-
+
                 if score in self.custom_scorer:
                     scorer_func = self.custom_scorer[score]
                 else:
@@ -942,11 +947,13 @@
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
 
-        ## TODO: need to either consolidate stratification into one input or
+        ## TODO: need to either consolidate stratification into one input or
         ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
-
-        if stratify_cols:
+
+        if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
+            stratify_key = pd.concat([X[stratify_cols], y], axis=1)
+        elif stratify_cols:
             stratify_key = X[stratify_cols]
         elif stratify_y:
             stratify_key = y
@@ -968,7 +975,11 @@
         # Determine the proportion of validation to test size in the remaining dataset
         proportion = test_size / (validation_size + test_size)
 
-        if stratify_cols:
+        if stratify_cols and stratify_y:
+            strat_key_val_test = pd.concat(
+                [X_valid_test[stratify_cols], y_valid_test], axis=1
+            )
+        elif stratify_cols:
             strat_key_val_test = X_valid_test[stratify_cols]
         elif stratify_y:
             strat_key_val_test = y_valid_test
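For context on the pd.concat change above: concatenating the stratify_cols frame with y along axis=1 yields one combined key, and scikit-learn's train_test_split then stratifies on each distinct row of that key. A minimal sketch with hypothetical data (column names, sizes, and random_state are made up for illustration):

    import pandas as pd
    from sklearn.model_selection import train_test_split

    X = pd.DataFrame({"site": ["a", "a", "b", "b"] * 25, "feat": range(100)})
    y = pd.Series([0, 1] * 50, name="target")

    # One stratum per (site, target) combination
    stratify_key = pd.concat([X[["site"]], y], axis=1)

    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=0.4, stratify=stratify_key, random_state=0
    )

One caveat: continuous stratify_cols such as the notebook's 'mean radius' make nearly every row its own stratum, and train_test_split raises a ValueError when a stratum has only one member, so such columns generally need to be binned before they are useful as stratification keys.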
From 6d899ef2ec8739aeef7c68f61114554aa493be94 Mon Sep 17 00:00:00 2001
From: Arthur Funnell
Date: Tue, 27 Aug 2024 11:39:27 -0700
Subject: [PATCH 3/3] Removing the stratify parameter where it isn't used

---
 notebooks/binary_test.ipynb          |  4 ++--
 notebooks/regression_test.ipynb      | 19 +++++++++++++++++++
 src/model_tuner/model_tuner_utils.py |  5 +----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/notebooks/binary_test.ipynb b/notebooks/binary_test.ipynb
index 3e7bb30..ca82f4c 100644
--- a/notebooks/binary_test.ipynb
+++ b/notebooks/binary_test.ipynb
@@ -55,7 +55,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "X.columns"
+    "X"
    ]
   },
   {
@@ -106,7 +106,7 @@
     "    estimator=lr,\n",
     "    kfold=kfold,\n",
     "    stratify_y=True,\n",
-    "    stratify_cols=['mean radius', 'mean texture', 'mean perimeter', 'mean area']\n",
+    "    stratify_cols=[\"mean radius\"],\n",
     "    grid=tuned_parameters,\n",
     "    randomized_grid=True,\n",
     "    n_iter=40,\n",
diff --git a/notebooks/regression_test.ipynb b/notebooks/regression_test.ipynb
index 47958ff..f2a2d70 100644
--- a/notebooks/regression_test.ipynb
+++ b/notebooks/regression_test.ipynb
@@ -20,6 +20,25 @@
     "from xgboost import XGBRegressor"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame({\"A\": [1, 2], \"B\": [3, 4]})\n",
+    "df2 = pd.DataFrame({\"Y\": [5, 6]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([df1, df2], axis=1)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/src/model_tuner/model_tuner_utils.py b/src/model_tuner/model_tuner_utils.py
index 23b266a..8c83c43 100644
--- a/src/model_tuner/model_tuner_utils.py
+++ b/src/model_tuner/model_tuner_utils.py
@@ -947,9 +947,6 @@ def train_val_test_split(
         #     X = X.join(self.dropped_strat_cols)
         # Determine the stratify parameter based on stratify and stratify_cols
-        ## TODO: need to either consolidate stratification into one input or
-        ## alow for simultaneous usage of stratify_cols and stratify_y inputs.
-
         if stratify_cols and stratify_y:
             # Creating stratification columns out of stratify_cols list
             stratify_key = pd.concat([X[stratify_cols], y], axis=1)
         elif stratify_cols:
@@ -1204,7 +1201,7 @@ def kfold_split(
     return kf
 
 
-def get_cross_validate(classifier, X, y, kf, stratify=False, scoring=["roc_auc"]):
+def get_cross_validate(classifier, X, y, kf, scoring=["roc_auc"]):
     return cross_validate(
         classifier,
         X,
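Taken together, the series leaves train_val_test_split doing a two-stage stratified split: hold out the training portion first, then carve the remainder into validation and test with a key recomputed on the held-out rows. The helper below is only a rough sketch of that logic as it appears in the hunks above; the function name, signature, and fixed random_state are illustrative and not the package's actual API:

    import pandas as pd
    from sklearn.model_selection import train_test_split

    def two_stage_split(X, y, stratify_cols, validation_size=0.2, test_size=0.2):
        # First split: train vs. (validation + test), stratified on cols + y
        stratify_key = pd.concat([X[stratify_cols], y], axis=1)
        X_train, X_valid_test, y_train, y_valid_test = train_test_split(
            X, y, test_size=validation_size + test_size,
            stratify=stratify_key, random_state=0,
        )

        # Second split: share of the held-out block that becomes the test set
        proportion = test_size / (validation_size + test_size)
        strat_key_val_test = pd.concat(
            [X_valid_test[stratify_cols], y_valid_test], axis=1
        )
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_valid_test, y_valid_test, test_size=proportion,
            stratify=strat_key_val_test, random_state=0,
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test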