# Patch summary. These lines sit before the first "diff --git" header and
# belong to no hunk.
#
# What the patch does:
#   - examples/{bandwidth_grid,gpr,kernel_grid,single_runs}/template/run.sh:
#     run fit.py under `time`.
#   - examples/gpr/make_runs.sh: drop "friedman1" from the data list.
#   - examples/minmax_scaler/: new example made of make_runs.sh,
#     submit_jobs.sh and template/{fit.py,mods.py,run.sh,submit.sh}.
#     make_runs.sh copies template/ into runs/data_<data>/model_<model>
#     and substitutes replace_data / replace_model / replace_repeats in
#     fit.py with sed; fit.py puts a MinMaxScaler in front of the model.
#   - setup.py: version 2.6.4 -> 2.6.5.
#   - src/madml/plots.py: sub() gains an `auc` parameter, an 'auc' entry is
#     added to the dict literal that precedes the plot_dump( call, and the
#     two visible call sites pass auc_rmse / auc_area.
#
# Review fixes folded into the added files (hunk line counts unchanged):
#   - mods.py: return_model() used `raise '<str>'`; Python 3 only allows
#     raising BaseException subclasses, so an unknown model name surfaced
#     as a TypeError. It now raises ValueError with the same message.
#   - make_runs.sh: ${job_dir} is quoted and the script stops if a `cd`
#     fails, so the sed -i calls cannot run in the wrong directory.
#   - submit_jobs.sh: find results are read NUL-delimited on fd 3 instead
#     of word-splitting $(find ...); qsub keeps the caller's stdin; the
#     search root defaults to "." when no argument is given, which is what
#     the unquoted ${1} produced with GNU find.
#   - submit.sh: $PBS_O_WORKDIR is quoted and a failed cd aborts the job.
#   - fit.py: comment typo "Boostrap" -> "Bootstrap".
#
# NOTE(review): the "index" lines of the five reviewed files still carry
# the blob ids from before review. Whitespace of context lines in hunks
# against existing files was reconstructed and should be checked against
# the tree before applying.
diff --git a/examples/bandwidth_grid/template/run.sh b/examples/bandwidth_grid/template/run.sh
index 8be3c16..82c2915 100755
--- a/examples/bandwidth_grid/template/run.sh
+++ b/examples/bandwidth_grid/template/run.sh
@@ -3,4 +3,4 @@
 export PYTHONPATH=$(pwd)/../../../../../../src:$PYTHONPATH
 
 rm -rf run
-python3 fit.py
+time python3 fit.py
diff --git a/examples/gpr/make_runs.sh b/examples/gpr/make_runs.sh
index a1a92b3..7efb4eb 100755
--- a/examples/gpr/make_runs.sh
+++ b/examples/gpr/make_runs.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 data=(
-      "friedman1"
       "fluence"
       "diffusion"
       "strength"
diff --git a/examples/gpr/template/run.sh b/examples/gpr/template/run.sh
index 2541e06..8d01fef 100755
--- a/examples/gpr/template/run.sh
+++ b/examples/gpr/template/run.sh
@@ -3,4 +3,4 @@
 export PYTHONPATH=$(pwd)/../../../../../src:$PYTHONPATH
 
 rm -rf run
-python3 fit.py
+time python3 fit.py
diff --git a/examples/kernel_grid/template/run.sh b/examples/kernel_grid/template/run.sh
index 8be3c16..82c2915 100755
--- a/examples/kernel_grid/template/run.sh
+++ b/examples/kernel_grid/template/run.sh
@@ -3,4 +3,4 @@
 export PYTHONPATH=$(pwd)/../../../../../../src:$PYTHONPATH
 
 rm -rf run
-python3 fit.py
+time python3 fit.py
diff --git a/examples/minmax_scaler/make_runs.sh b/examples/minmax_scaler/make_runs.sh
new file mode 100755
index 0000000..0835791
--- /dev/null
+++ b/examples/minmax_scaler/make_runs.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+data=(
+      "friedman1"
+      "fluence"
+      "diffusion"
+      "strength"
+      "supercond"
+      )
+
+models=(
+        "rf"
+        )
+
+for i in "${data[@]}"
+do
+
+for j in "${models[@]}"
+do
+
+    echo "Making (data, model)=(${i}, ${j})"
+    job_dir="runs/data_${i}/model_${j}"
+
+    mkdir -p "${job_dir}"
+    cp -r template/* "${job_dir}"
+    cd "${job_dir}" || exit 1
+
+    # Define the repeats
+    if [ "${i}" == "fluence" ] && [ "${j}" == "bnn" ]; then
+        r=3
+    elif [ "${i}" == "friedman1" ] && [ "${j}" == "bnn" ]; then
+        r=3
+    elif [ "${i}" == "supercond" ] && [ "${j}" == "bnn" ]; then
+        r=2
+    else
+        r=5
+    fi
+
+    sed -i "s/replace_data/'${i}'/g" fit.py
+    sed -i "s/replace_model/'${j}'/g" fit.py
+    sed -i "s/replace_repeats/${r}/g" fit.py
+
+    cd - > /dev/null || exit 1
+
+done
+done
diff --git a/examples/minmax_scaler/submit_jobs.sh b/examples/minmax_scaler/submit_jobs.sh
new file mode 100755
index 0000000..74db82c
--- /dev/null
+++ b/examples/minmax_scaler/submit_jobs.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+submit=submit.sh
+while IFS= read -r -d '' -u 3 i
+do
+    cd "$(dirname "${i}")" || exit 1
+    qsub "${submit}"
+    cd - > /dev/null || exit 1
+done 3< <(find "${1:-.}" -type f -name "${submit}" -print0)
diff --git a/examples/minmax_scaler/template/fit.py b/examples/minmax_scaler/template/fit.py
new file mode 100644
index 0000000..fbadc2e
--- /dev/null
+++ b/examples/minmax_scaler/template/fit.py
@@ -0,0 +1,81 @@
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.model_selection import RepeatedKFold
+from sklearn.model_selection import GridSearchCV
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.pipeline import Pipeline
+
+from madml.models import dissimilarity, calibration, combine
+from madml.splitters import BootstrappedLeaveClusterOut
+from madml.assess import nested_cv
+from madml import datasets
+
+from mods import return_model
+
+
+def main():
+
+    run_name = 'output'
+    data_name = replace_data
+    model = replace_model
+
+    # Load data
+    data = datasets.load(data_name)
+    X = data['data']
+    y = data['target']
+
+    # MADML parameters
+    bins = 10
+    n_repeats = replace_repeats
+
+    # ML Distance model
+    ds_model = dissimilarity(dis='kde')
+
+    # ML UQ function
+    uq_model = calibration(params=[0.0, 1.0])
+
+    # ML
+    scale = MinMaxScaler()
+    model = return_model(model, X)
+
+    # The grid for grid search
+    grid = {}
+
+    # The machine learning pipeline
+    pipe = Pipeline(steps=[
+        ('scaler', scale),
+        ('model', model),
+    ])
+
+    # The gridsearch model
+    gs_model = GridSearchCV(
+        pipe,
+        grid,
+        cv=((slice(None), slice(None)),),  # No splits
+        scoring='neg_mean_squared_error',
+    )
+
+    # Types of sampling to test
+    splits = [('fit', RepeatedKFold(n_repeats=n_repeats, n_splits=5))]
+
+    # Bootstrap, cluster data, and generate splits
+    for clusters in [2, 3]:
+
+        # Cluster Splits
+        top_split = BootstrappedLeaveClusterOut(
+            AgglomerativeClustering,
+            n_repeats=n_repeats,
+            n_clusters=clusters,
+        )
+
+        splits.append(('agglo_{}'.format(clusters), top_split))
+
+    # Assess models
+    model = combine(gs_model, ds_model, uq_model, splits, bins=bins)
+    cv = nested_cv(model, X, y, splitters=splits)
+    df, df_bin, fit_model = cv.test(
+        save_outer_folds=run_name,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/minmax_scaler/template/mods.py b/examples/minmax_scaler/template/mods.py
new file mode 100644
index 0000000..7d508e2
--- /dev/null
+++ b/examples/minmax_scaler/template/mods.py
@@ -0,0 +1,62 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import BaggingRegressor
+from scikeras.wrappers import KerasRegressor
+from keras.layers import Dense, Dropout
+from keras.models import Sequential
+from sklearn.svm import SVR
+
+
+def return_model(name, X):
+
+    if name == 'rf':
+        return RandomForestRegressor(n_estimators=100)
+
+    elif name == 'bols':
+        return BaggingRegressor(LinearRegression(), n_estimators=100)
+
+    elif name == 'bsvr':
+        return BaggingRegressor(SVR(), n_estimators=100)
+
+    elif name == 'bnn':
+        model = KerasRegressor(
+            build_fn=keras_model,
+            shape=X.shape[1],
+            epochs=500,
+            batch_size=100,
+            verbose=0,
+        )
+
+        return BaggingRegressor(model, n_estimators=10)
+
+    else:
+        raise ValueError('No model matching name.')
+
+
+def keras_model(shape):
+
+    n = 100
+    model = Sequential()
+    model.add(Dense(
+        n,
+        input_dim=shape,
+        kernel_initializer='normal',
+        activation='relu'
+    ))
+    model.add(Dropout(0.3))
+    model.add(Dense(
+        n,
+        kernel_initializer='normal',
+        activation='relu'
+    ))
+    model.add(Dropout(0.3))
+    model.add(Dense(
+        1,
+        kernel_initializer='normal'
+    ))
+    model.compile(
+        loss='mean_squared_error',
+        optimizer='adam'
+    )
+
+    return model
diff --git a/examples/minmax_scaler/template/run.sh b/examples/minmax_scaler/template/run.sh
new file mode 100755
index 0000000..8d01fef
--- /dev/null
+++ b/examples/minmax_scaler/template/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+export PYTHONPATH=$(pwd)/../../../../../src:$PYTHONPATH
+
+rm -rf run
+time python3 fit.py
diff --git a/examples/minmax_scaler/template/submit.sh b/examples/minmax_scaler/template/submit.sh
new file mode 100644
index 0000000..24266cc
--- /dev/null
+++ b/examples/minmax_scaler/template/submit.sh
@@ -0,0 +1,9 @@
+#PBS -S /bin/bash
+#PBS -q bardeen
+#PBS -l select=1:ncpus=16:mpiprocs=16
+#PBS -l walltime=72:00:00
+#PBS -N job
+
+cd "$PBS_O_WORKDIR" || exit 1
+
+./run.sh
diff --git a/examples/single_runs/template/run.sh b/examples/single_runs/template/run.sh
index 2541e06..8d01fef 100755
--- a/examples/single_runs/template/run.sh
+++ b/examples/single_runs/template/run.sh
@@ -3,4 +3,4 @@
 export PYTHONPATH=$(pwd)/../../../../../src:$PYTHONPATH
 
 rm -rf run
-python3 fit.py
+time python3 fit.py
diff --git a/setup.py b/setup.py
index c27d841..8eb68c0 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 # Package information
 name = 'madml'
-version = '2.6.4'  # Need to increment every time to push to PyPI
+version = '2.6.5'  # Need to increment every time to push to PyPI
 description = 'Application domain of machine learning in materials science.'
 url = 'https://github.com/leschultz/'\
       'materials_application_domain_machine_learning.git'
diff --git a/src/madml/plots.py b/src/madml/plots.py
index 9083149..5f9f9d9 100644
--- a/src/madml/plots.py
+++ b/src/madml/plots.py
@@ -93,7 +93,7 @@ def func(i, d, y, z):
 
         return removed, rmse, area
 
-    def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
+    def sub(x, y, ylabel, key, gt, gtlabel, auc, metric, color):
 
         if key == r'$|y-\hat{y}|$':
             name = 'residual'
@@ -122,6 +122,7 @@ def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
                 'x': list(map(float, x)),
                 'y': list(map(float, y)),
                 'gt': float(gt),
+                'auc': auc,
                 }
 
         plot_dump(
@@ -194,6 +195,7 @@ def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
                 key,
                 gt_rmse,
                 r'$E^{RMSE/\sigma_{y}}_{c}$',
+                auc_rmse,
                 'rmse',
                 color,
                 )
@@ -204,6 +206,7 @@ def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
                 key,
                 gt_area,
                 r'$E^{area}_{c}$',
+                auc_area,
                 'area',
                 color,
                 )