Skip to content

Commit

Permalink
Merge pull request #29 from uclamii/stratify_fix
Browse files Browse the repository at this point in the history
created todo comment for stratification fix
  • Loading branch information
lshpaner authored Aug 27, 2024
2 parents c61ce50 + 6d899ef commit f9f7cff
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 14 deletions.
10 changes: 10 additions & 0 deletions notebooks/binary_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@
"X = bc[bc_cols]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -97,6 +106,7 @@
" estimator=lr,\n",
" kfold=kfold,\n",
" stratify_y=True,\n",
" stratify_cols=[\"mean radius\"],\n",
" grid=tuned_parameters,\n",
" randomized_grid=True,\n",
" n_iter=40,\n",
Expand Down
19 changes: 19 additions & 0 deletions notebooks/regression_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,25 @@
"from xgboost import XGBRegressor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df1 = pd.DataFrame({\"A\": [1, 2], \"B\": [3, 4]})\n",
"df2 = pd.DataFrame({\"Y\": [5, 6]})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.concat([df1, df2], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
40 changes: 26 additions & 14 deletions src/model_tuner/model_tuner_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def calibrateModel(

if self.imbalance_sampler:
self.process_imbalance_sampler(X_train, y_train)

self.fit(X_train, y_train)
# calibrate model, and save output
self.estimator = CalibratedClassifierCV(
Expand Down Expand Up @@ -355,7 +355,7 @@ def calibrateModel(
# fit estimator
if self.imbalance_sampler:
self.process_imbalance_sampler(X_train, y_train)

# fit model
self.fit(
X_train,
Expand Down Expand Up @@ -542,7 +542,14 @@ def fit(self, X, y, validation_data=None, score=None):
return

def return_bootstrap_metrics(
self, X_test, y_test, metrics, threshold=0.5, num_resamples=500, n_samples=500, balance=False
self,
X_test,
y_test,
metrics,
threshold=0.5,
num_resamples=500,
n_samples=500,
balance=False,
):
if self.model_type != "regression":
y_pred_prob = pd.Series(self.predict_proba(X_test)[:, 1])
Expand All @@ -554,7 +561,7 @@ def return_bootstrap_metrics(
threshold=threshold,
num_resamples=num_resamples,
n_samples=n_samples,
balance=balance
balance=balance,
)
else:
y_pred = pd.Series(self.predict(X_test))
Expand All @@ -566,7 +573,7 @@ def return_bootstrap_metrics(
metrics=metrics,
num_resamples=num_resamples,
n_samples=n_samples,
balance=balance
balance=balance,
)
return bootstrap_metrics

Expand Down Expand Up @@ -744,14 +751,13 @@ def grid_search_param_tuning(

if self.imbalance_sampler:
self.process_imbalance_sampler(X_train, y_train)



## Cast the ParameterGrid object to a list so that we can update
## the hyperparameters in both the randomized-grid and
## non-randomized-grid scenarios
if not self.randomized_grid:
self.grid = list(self.grid)

for score in self.scoring:
scores = []
for index, params in enumerate(tqdm(self.grid)):
Expand Down Expand Up @@ -819,18 +825,17 @@ def grid_search_param_tuning(
) in best_early_stopping_params.items():
if param_name in params:
params[param_name] = param_value

params[f"{self.estimator_name}__n_estimators"] = clf[
len(clf) - 1
].best_iteration

# Update the parameters in the grid
self.grid[index] = params


else:
clf = self.estimator.set_params(**params).fit(X_train, y_train)

if score in self.custom_scorer:
scorer_func = self.custom_scorer[score]
else:
Expand Down Expand Up @@ -941,8 +946,11 @@ def train_val_test_split(
# if calibrate:
# X = X.join(self.dropped_strat_cols)
# Determine the stratify parameter based on stratify_y and stratify_cols
if stratify_cols:

if stratify_cols and stratify_y:
# Build the stratification key from the stratify_cols columns together with y
stratify_key = pd.concat([X[stratify_cols], y], axis=1)
elif stratify_cols:
stratify_key = X[stratify_cols]
elif stratify_y:
stratify_key = y
Expand All @@ -964,7 +972,11 @@ def train_val_test_split(
# Determine the proportion of validation to test size in the remaining dataset
proportion = test_size / (validation_size + test_size)

if stratify_cols:
if stratify_cols and stratify_y:
strat_key_val_test = pd.concat(
[X_valid_test[stratify_cols], y_valid_test], axis=1
)
elif stratify_cols:
strat_key_val_test = X_valid_test[stratify_cols]
elif stratify_y:
strat_key_val_test = y_valid_test
Expand Down Expand Up @@ -1189,7 +1201,7 @@ def kfold_split(
return kf


def get_cross_validate(classifier, X, y, kf, stratify=False, scoring=["roc_auc"]):
def get_cross_validate(classifier, X, y, kf, scoring=["roc_auc"]):
return cross_validate(
classifier,
X,
Expand Down

0 comments on commit f9f7cff

Please sign in to comment.