Skip to content

Commit

Permalink
Fixed PR AUC
Browse files Browse the repository at this point in the history
  • Loading branch information
leschultz committed Jul 27, 2023
1 parent fdb9fd9 commit ec5ae9b
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 13 deletions.
27 changes: 27 additions & 0 deletions examples/single_runs/bw/make_runs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#
# Generate one run directory per (data set, bandwidth) pair by copying
# ./template to runs/<set>/<bandwidth> and filling in the placeholders of
# the copied fit.py. Must be run from the directory containing ./template.

# Stop at the first failing command instead of carrying on in a bad state.
set -euo pipefail

# Data sets to generate runs for.
sets=(
    "diffusion"
)

# Bandwidth values to scan.
grid=(0.001 0.01 0.1 1.0 10.0 100.0 1000.0)

for i in "${sets[@]}"
do
    mkdir -p "runs/${i}"

    for j in "${grid[@]}"
    do
        run_dir="runs/${i}/${j}"
        echo "${run_dir}"

        # cp -r into an existing directory would nest a stray template/
        # copy inside it rather than refresh the run, so existing runs are
        # left untouched.
        if [ -e "${run_dir}" ]
        then
            echo "Skipping existing ${run_dir}" >&2
            continue
        fi

        cp -r template "${run_dir}"

        # Edit the copy by path instead of cd-ing into it, so a failed
        # copy can never lead to editing a fit.py somewhere else.
        sed -i \
            -e "s/replace_data/'${i}'/g" \
            -e "s/replace_bw/bandwidth=${j}/g" \
            "${run_dir}/fit.py"
    done
done
9 changes: 9 additions & 0 deletions examples/single_runs/bw/submit_jobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Submit every job script found under a directory tree.
#
# Usage: ./submit_jobs.sh [search_root]
#
# Each file named submit.sh below search_root (default: the current
# directory) is submitted with qsub from the directory that contains it.

submit=submit.sh

# Paths are read NUL-delimited on fd 3 so that directories containing
# spaces or glob characters survive intact and stdin is left alone.
while IFS= read -r -d '' script <&3
do
    # The subshell keeps the directory change local to one submission.
    # qsub is skipped when cd fails rather than run from the wrong place.
    (
        cd -- "$(dirname -- "${script}")" && qsub "${submit}"
    ) 3<&- || echo "Failed to submit ${script}" >&2
done 3< <(find "${1:-.}" -type f -name "${submit}" -print0)
78 changes: 78 additions & 0 deletions examples/single_runs/bw/template/fit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from madml.ml.splitters import BootstrappedLeaveClusterOut
from madml.models.space import distance_model
from madml.models.combine import domain_model
from madml.models.uq import calibration_model
from madml.ml.assessment import nested_cv
from madml import datasets


def main():

run_name = 'run'
data_name = replace_data

# Load data
data = datasets.load(data_name)
X = data['data']
y = data['target']
g = data['class_name']
n_repeats = 5

# ML Distance model
ds_model = distance_model(dist='kde', replace_bw)

# ML UQ function
uq_model = calibration_model(params=[0.0, 1.0])

# ML
scale = StandardScaler()
model = RandomForestRegressor()

# The grid for grid search
grid = {}
grid['model__n_estimators'] = [100]

# The machine learning pipeline
pipe = Pipeline(steps=[
('scaler', scale),
('model', model),
])

# The gridsearch model
gs_model = GridSearchCV(
pipe,
grid,
cv=((slice(None), slice(None)),), # No splits
)

# Types of sampling to test
splits = [('calibration', RepeatedKFold(n_repeats=n_repeats))]

# Boostrap, cluster data, and generate splits
for i in [2, 3]:

# Cluster Splits
top_split = BootstrappedLeaveClusterOut(
AgglomerativeClustering,
n_repeats=n_repeats,
n_clusters=i
)

splits.append(('agglo_{}'.format(i), top_split))

# Fit models
model = domain_model(gs_model, ds_model, uq_model, splits)
cv = nested_cv(X, y, g, model, splits, save=run_name)
cv.assess()
cv.push('leschultz/cmg:{}'.format(data_name))


if __name__ == '__main__':
main()
6 changes: 6 additions & 0 deletions examples/single_runs/bw/template/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Run fit.py in the current run directory.

# Prepend the src directory six levels above the current directory to the
# Python module search path, ahead of any existing entries.
src_dir="$(pwd)/../../../../../../src"
export PYTHONPATH="${src_dir}:${PYTHONPATH}"

# Remove any existing "run" directory (the save name used by fit.py)
# before fitting.
rm -rf run
python3 fit.py
10 changes: 10 additions & 0 deletions examples/single_runs/bw/template/submit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#PBS -S /bin/bash
#PBS -m be
#PBS -q morgan
#PBS -l select=1:ncpus=12:mpiprocs=12
#PBS -l walltime=72:00:00
#PBS -N job

# PBS job script: move to the directory the job was submitted from and
# start the run there.

# Quote the path and abort if the directory is unavailable, so run.sh is
# never executed from some other working directory.
cd "${PBS_O_WORKDIR}" || exit 1

./run.sh
2 changes: 1 addition & 1 deletion examples/single_runs/nn/template/fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def main():
X = data['data']
y = data['target']
g = data['class_name']
n_repeats = 1
n_repeats = 5

# ML Distance model
ds_model = distance_model(dist='kde')
Expand Down
2 changes: 1 addition & 1 deletion examples/single_runs/ols/template/fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def main():
X = data['data']
y = data['target']
g = data['class_name']
n_repeats = 1
n_repeats = 5

# ML Distance model
ds_model = distance_model(dist='kde')
Expand Down
2 changes: 1 addition & 1 deletion examples/single_runs/rf/template/fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def main():
X = data['data']
y = data['target']
g = data['class_name']
n_repeats = 1
n_repeats = 5

# ML Distance model
ds_model = distance_model(dist='kde')
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Package information
name = 'madml'
version = '0.3.7' # Need to increment every time to push to PyPI
version = '0.3.9' # Need to increment every time to push to PyPI
description = 'Application domain of machine learning in materials science.'
url = 'https://github.com/leschultz/'\
'materials_application_domain_machine_learning.git'
Expand Down
15 changes: 6 additions & 9 deletions src/madml/plots.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from sklearn.metrics import (
precision_recall_curve,
PrecisionRecallDisplay,
auc,
average_precision_score,
)

from sklearn import metrics
Expand Down Expand Up @@ -1212,6 +1212,10 @@ def pr(score, in_domain, pos_label, save=False):

precision, recall, thresholds = prc_scores

with warnings.catch_warnings():
warnings.simplefilter('ignore')
auc_score = average_precision_score(in_domain, score)

num = 2*recall*precision
den = recall+precision
f1_scores = np.divide(
Expand Down Expand Up @@ -1273,16 +1277,9 @@ def pr(score, in_domain, pos_label, save=False):
baseline = sum(baseline)/len(in_domain)
relative_base = 1-baseline # The amount of area to gain in PR

# AUC score (add zero for positive format in labels)
if recall.shape[0] > 2:
auc_score = auc(recall[:-1], precision[:-1])+0
else:
# Correction for sklearn adding a 1 and a 0 to PR values
auc_score = baseline+0

# AUC relative to the baseline
if relative_base == 0.0:
auc_relative = np.nan
auc_relative = 0.0
else:
auc_relative = (auc_score-baseline)/relative_base

Expand Down

0 comments on commit ec5ae9b

Please sign in to comment.