diff --git a/setup.py b/setup.py index 7f6878f..cad132a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # Package information name = 'madml' -version = '2.1.2' # Need to increment every time to push to PyPI +version = '2.1.3' # Need to increment every time to push to PyPI description = 'Application domain of machine learning in materials science.' url = 'https://github.com/leschultz/'\ 'materials_application_domain_machine_learning.git' diff --git a/src/madml/assess.py b/src/madml/assess.py index 6c9b64b..13bc253 100644 --- a/src/madml/assess.py +++ b/src/madml/assess.py @@ -64,7 +64,7 @@ def __init__( train, test = split self.splits.append((train, test, count, splitter[0])) - def cv(self, split): + def cv(self, split, save_inner_folds=None): ''' Fit models and get predictions from one split. ''' @@ -78,6 +78,13 @@ def cv(self, split): model = copy.deepcopy(self.model) model.fit(self.X[train], self.y[train], self.g[train]) + # Save fold fit + if save_inner_folds is not None: + save = os.path.join(save_inner_folds, 'train_folds') + save = os.path.join(save, 'fold_{}'.format(count)) + os.makedirs(save, exist_ok=True) + model.plot(save) + # Gather predictions on test set data = model.predict(self.X[test]) @@ -104,15 +111,28 @@ def cv(self, split): return data - def test(self): + def test( + self, + save_inner_folds=None, + save_outer_folds=None, + name=None, + push_container=False, + ): ''' Gather assessment data and plot results. + + inputs: + save_inner_folds = Top directory to save training folds. + save_outer_folds = The top directory to save assessment. + name = The name of the container /: + push_container = Whether to build and push a container with model. + ''' # Assess model df = [] for i in tqdm(self.splits): - df.append(self.cv(i)) + df.append(self.cv(i, save_inner_folds)) df = pd.concat(df) # Combine data df, df_bin = bin_data(df, self.model.bins) @@ -156,123 +176,115 @@ def test(self): # Full fit self.model.fit(self.X, self.y, self.g) - return df, df_bin, self.model - - def write_results(self, parent, name=None, push_container=False): - ''' - Push docker container with full fit model. - - inputs: - parent = The top directory to save things. - name = The name of the container /: - push_container = Whether to build and push a container with model. - ''' - - # Save locations - ass_save = os.path.join(parent, 'assessment') - model_save = os.path.join(parent, 'model') - model_ass = os.path.join(model_save, 'train') - - # Create locations - for d in [ass_save, model_save, model_ass]: - os.makedirs(d, exist_ok=True) - - # Write test data - self.df.to_csv(os.path.join(ass_save, 'pred.csv'), index=False) - self.df_bin.to_csv(os.path.join( - ass_save, - 'pred_bins.csv', - ), index=False) - - # Save model - np.savetxt( - os.path.join(model_save, 'X.csv'), - self.X, - delimiter=',', - ) - - np.savetxt( - os.path.join(model_save, 'y.csv'), - self.y, - delimiter=',', - ) - - np.savetxt( - os.path.join(model_save, 'g.csv'), - self.g, - delimiter=',', - fmt='%s', - ) - - dill.dump( - self.model, - open(os.path.join(model_save, 'model.dill'), 'wb') - ) - - # Test data - plot = plotter( - self.df, - self.df_bin, - self.gt_rmse, - self.gt_area, - self.model.precs, - ass_save, - ) - plot.generate() - - # Train data - plot = plotter( - self.model.data_cv, - self.model.bin_cv, - self.model.gt_rmse, - self.model.gt_area, - self.model.precs, - model_ass, + if save_outer_folds is not None: + + # Save locations + ass_save = os.path.join(save_outer_folds, 'assessment') + model_save = os.path.join(save_outer_folds, 'model') + model_ass = os.path.join(model_save, 'train') + + # Create locations + for d in [ass_save, model_save, model_ass]: + os.makedirs(d, exist_ok=True) + + # Write test data + self.df.to_csv(os.path.join(ass_save, 'pred.csv'), index=False) + self.df_bin.to_csv(os.path.join( + ass_save, + 'pred_bins.csv', + ), index=False) + + # Save model + np.savetxt( + os.path.join(model_save, 'X.csv'), + self.X, + delimiter=',', ) - plot.generate() - - if name is not None: - - # Create a container - data_path = pkg_resources.resource_filename( - 'madml', - 'templates/docker', - ) - save = os.path.join(parent, 'hosting') - shutil.copytree(data_path, save) - - old = os.getcwd() - os.chdir(save) - shutil.copy('../model/model.dill', '.') - shutil.copy('../model/X.csv', '.') - - # Capture current environment - env = subprocess.run( - ['pip', 'freeze'], - capture_output=True, - text=True - ) - - with open('requirements.txt', 'w') as handle: - handle.write(env.stdout) - if push_container: - docker.build_and_push_container(name) - - with open('user_predict.py', 'r') as handle: - data = handle.read() - - data = data.replace('replace', name) - - with open('user_predict.py', 'w') as handle: - handle.write(data) - - with open('user_predict.ipynb', 'r') as handle: - data = handle.read() + np.savetxt( + os.path.join(model_save, 'y.csv'), + self.y, + delimiter=',', + ) - data = data.replace('replace', name) + np.savetxt( + os.path.join(model_save, 'g.csv'), + self.g, + delimiter=',', + fmt='%s', + ) - with open('user_predict.ipynb', 'w') as handle: - handle.write(data) + dill.dump( + self.model, + open(os.path.join(model_save, 'model.dill'), 'wb') + ) + + # Test data + plot = plotter( + self.df, + self.df_bin, + self.gt_rmse, + self.gt_area, + self.model.precs, + ass_save, + ) + plot.generate() + + # Train data + plot = plotter( + self.model.data_cv, + self.model.bin_cv, + self.model.gt_rmse, + self.model.gt_area, + self.model.precs, + model_ass, + ) + plot.generate() + + if name is not None: + + # Create a container + data_path = pkg_resources.resource_filename( + 'madml', + 'templates/docker', + ) + save = os.path.join(save_outer_folds, 'hosting') + shutil.copytree(data_path, save) + + old = os.getcwd() + os.chdir(save) + shutil.copy('../model/model.dill', '.') + shutil.copy('../model/X.csv', '.') + + # Capture current environment + env = subprocess.run( + ['pip', 'freeze'], + capture_output=True, + text=True + ) + + with open('requirements.txt', 'w') as handle: + handle.write(env.stdout) + + if push_container: + docker.build_and_push_container(name) + + with open('user_predict.py', 'r') as handle: + data = handle.read() + + data = data.replace('replace', name) + + with open('user_predict.py', 'w') as handle: + handle.write(data) + + with open('user_predict.ipynb', 'r') as handle: + data = handle.read() + + data = data.replace('replace', name) + + with open('user_predict.ipynb', 'w') as handle: + handle.write(data) + + os.chdir(old) - os.chdir(old) + return df, df_bin, self.model diff --git a/src/madml/models.py b/src/madml/models.py index 1bc99eb..e4d9658 100644 --- a/src/madml/models.py +++ b/src/madml/models.py @@ -11,6 +11,7 @@ from sklearn.neighbors import KernelDensity from scipy.spatial.distance import cdist from madml.utils import parallel +from madml.plots import plotter from sklearn.base import clone from sklearn import metrics @@ -556,3 +557,18 @@ def predict(self, X, d_input=None): ], axis=1) return pred + + def plot(self, save): + ''' + Plot model fit data. + ''' + + plot = plotter( + self.data_cv, + self.bin_cv, + self.gt_rmse, + self.gt_area, + self.precs, + save, + ) + plot.generate()