From 78e928743f7bf2c5f77d880a225498e071d02db7 Mon Sep 17 00:00:00 2001 From: leschultz Date: Wed, 31 Jan 2024 16:09:32 -0600 Subject: [PATCH] The ground truth is now defined by each model's ground truth from training data --- setup.py | 2 +- src/madml/assess.py | 19 ++++--------------- src/madml/models.py | 45 ++++++++++++++++++++++----------------------- src/madml/plots.py | 28 ++++++++-------------------- 4 files changed, 35 insertions(+), 59 deletions(-) diff --git a/setup.py b/setup.py index b35af43..3100c89 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # Package information name = 'madml' -version = '2.1.7' # Need to increment every time to push to PyPI +version = '2.1.8' # Need to increment every time to push to PyPI description = 'Application domain of machine learning in materials science.' url = 'https://github.com/leschultz/'\ 'materials_application_domain_machine_learning.git' diff --git a/src/madml/assess.py b/src/madml/assess.py index b80db0a..429a689 100644 --- a/src/madml/assess.py +++ b/src/madml/assess.py @@ -106,6 +106,10 @@ def cv(self, split, save_inner_folds=None): data['r/std_y'] = data['r']/data['std_y'] data['y_stdc_pred/std_y'] = data['y_stdc_pred']/data['std_y'] + # Ground truths + data['gt_rmse'] = model.gt_rmse + data['gt_area'] = model.gt_area + return data def test( @@ -135,23 +139,12 @@ def test( # Full fit self.model.fit(self.X, self.y, self.g) - self.gt_rmse = self.model.gt_rmse - self.gt_area = self.model.gt_area - - pred = self.model.combine_domains_preds(df['d_pred']) - df.drop(pred.columns, axis=1, inplace=True) - df = pd.concat([ - df.reset_index(drop=True), - pred.reset_index(drop=True) - ], axis=1) # Ground truths df, df_bin = bin_data(df, self.model.bins) df, df_bin = assign_ground_truth( df, df_bin, - self.gt_rmse, - self.gt_area, ) if save_outer_folds is not None: @@ -201,8 +194,6 @@ def test( plot = plotter( df, df_bin, - self.gt_rmse, - self.gt_area, self.model.precs, ass_save, ) @@ -212,8 +203,6 @@ def test( plot 
= plotter( self.model.data_cv, self.model.bin_cv, - self.model.gt_rmse, - self.model.gt_area, self.model.precs, model_ass, ) diff --git a/src/madml/models.py b/src/madml/models.py index 8f0b285..8b1c22b 100644 --- a/src/madml/models.py +++ b/src/madml/models.py @@ -266,37 +266,36 @@ def predict_std(model, X): return std -def assign_ground_truth(data_cv, bin_cv, gt_rmse, gt_area): +def assign_ground_truth(data_cv, bin_cv): data_cv = copy.deepcopy(data_cv) bin_cv = copy.deepcopy(bin_cv) - rmse = bin_cv['rmse/std_y'] <= gt_rmse - area = bin_cv['cdf_area'] <= gt_area + data_cv = data_cv.merge(bin_cv, on=['bin']) - bin_cv['domain_rmse/sigma_y'] = np.where(rmse, 'ID', 'OD') - bin_cv['domain_cdf_area'] = np.where(area, 'ID', 'OD') + # Initialize arrays + cols = ['gt_rmse', 'gt_area'] + for c in cols: + bin_cv[c] = None - cols = [ - 'domain_rmse/sigma_y', - 'domain_cdf_area', - 'rmse/std_y', - 'cdf_area', - ] + # Propagate ground truths + for group, value in data_cv.groupby(['bin', *cols]): + row = bin_cv['bin'] == group[0] + bin_cv.loc[row, 'gt_rmse'] = group[1] + bin_cv.loc[row, 'gt_area'] = group[2] - # Allocate data - for col in cols: - data_cv[col] = None + # Make labels + rmse = data_cv['rmse/std_y'] <= data_cv['gt_rmse'] + area = data_cv['cdf_area'] <= data_cv['gt_area'] - # Assign bin data to individual points - for i in bin_cv.bin: + data_cv['domain_rmse/sigma_y'] = np.where(rmse, 'ID', 'OD') + data_cv['domain_cdf_area'] = np.where(area, 'ID', 'OD') - # Ground labels based on rmse - row = data_cv['bin'] == i - gt = bin_cv.loc[bin_cv['bin'] == i][cols] + rmse = bin_cv['rmse/std_y'] <= bin_cv['gt_rmse'] + area = bin_cv['cdf_area'] <= bin_cv['gt_area'] - for col in cols: - data_cv.loc[row, col] = gt[col].values[0] + bin_cv['domain_rmse/sigma_y'] = np.where(rmse, 'ID', 'OD') + bin_cv['domain_cdf_area'] = np.where(area, 'ID', 'OD') return data_cv, bin_cv @@ -472,13 +471,13 @@ def fit(self, X, y, g=None, d_input=None): # Acquire ground truths self = ground_truth(self, y) 
+ data_cv['gt_rmse'] = self.gt_rmse + data_cv['gt_area'] = self.gt_area # Classify ground truth labels data_cv, bin_cv = assign_ground_truth( data_cv, bin_cv, - self.gt_rmse, - self.gt_area, ) # Fit domain classifiers diff --git a/src/madml/plots.py b/src/madml/plots.py index bafdf66..8449cc5 100644 --- a/src/madml/plots.py +++ b/src/madml/plots.py @@ -251,7 +251,7 @@ def cdf(df, gt, save, suffix): plot_dump(data, fig, ax, 'cdf', save, suffix) -def bins(df, d, e, elabel, gt, ylabel, save, suffix): +def bins(df, d, e, elabel, ylabel, save, suffix): ''' Plot statistical errors with respect to dissimilarity. @@ -259,13 +259,12 @@ def bins(df, d, e, elabel, gt, ylabel, save, suffix): d = The dissimilarity. e = The error statistic. elabel = The domain labels. - gt = The domain ground truth. ylabel = The y-axis label. save = The directory to save plot. suffix = Append a suffix to the save name. ''' - data = {'gt': gt} + data = {} fig, ax = pl.subplots() for group, values in df.groupby([elabel, 'bin']): @@ -291,12 +290,6 @@ def bins(df, d, e, elabel, gt, ylabel, save, suffix): data[dom]['x'] = x.tolist() data[dom]['y'] = y.tolist() - ax.axhline( - gt, - color='g', - label='GT = {:.2f}'.format(gt), - ) - ax.set_ylabel(ylabel) ax.set_xlabel('D') @@ -495,8 +488,6 @@ def __init__( self, df, df_bin, - gt_rmse, - gt_area, precs, save, ): @@ -505,10 +496,9 @@ def __init__( self.domains = ['domain_rmse/sigma_y', 'domain_cdf_area'] self.errors = ['rmse/std_y', 'cdf_area'] self.assessments = ['rmse', 'area'] - self.gts = [gt_rmse, gt_area] # Ground truths self.precs = precs # Precisions used - # For plotting purposes + # For plotting purposes on the histogram of E^* vs. 
D cols = self.errors+self.domains self.df = df.sort_values(by=['d_pred']+cols) self.df_bin = df_bin.sort_values(by=['d_pred_max']+cols) @@ -548,12 +538,11 @@ def generate(self): area_vs_rmse(self.df_bin, self.save) # Loop over domains - for i, j, k, f, in zip( - self.domains, - self.errors, - self.assessments, - self.gts, - ): + for i, j, k, in zip( + self.domains, + self.errors, + self.assessments, + ): # Separate domains and classes for group, df in self.df.groupby(i): @@ -588,7 +577,6 @@ def generate(self): 'd_pred', j, i, - f, r'$E^{{{}}}$'.format(k), self.save, k,