From b788bc92c3cb618b2f73f5100cf084973fb11e03 Mon Sep 17 00:00:00 2001
From: leschultz <laneenriqueschultz@gmail.com>
Date: Wed, 1 May 2024 10:18:34 -0500
Subject: [PATCH] Added residual domain

---
 setup.py                 |  2 +-
 src/madml/assess.py      | 21 ++++++++----
 src/madml/calculators.py |  4 +--
 src/madml/models.py      |  8 ++---
 src/madml/plots.py       | 74 ++++++++++++++--------------------------
 5 files changed, 48 insertions(+), 61 deletions(-)

diff --git a/setup.py b/setup.py
index d4ad236..45031c3 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 # Package information
 name = 'madml'
-version = '2.6.1'  # Need to increment every time to push to PyPI
+version = '2.6.2'  # Need to increment every time to push to PyPI
 description = 'Application domain of machine learning in materials science.'
 url = 'https://github.com/leschultz/'\
       'materials_application_domain_machine_learning.git'
diff --git a/src/madml/assess.py b/src/madml/assess.py
index 7b2d8ed..dd08813 100644
--- a/src/madml/assess.py
+++ b/src/madml/assess.py
@@ -125,9 +125,9 @@ def cv(self, split, save_inner_folds=None):
         # Predictions
         data['r'] = self.y[test]-data['y_pred']
         data['z'] = data['r']/data['y_stdc_pred']
-        data['|r|'] = data['r'].abs()
-        data['|r|/std_y'] = data['|r|']/data['std_y']
-        data['|r|/mad_y'] = data['|r|']/data['mad_y']
+        data['absres'] = data['r'].abs()
+        data['absres/std_y'] = data['absres']/data['std_y']
+        data['absres/mad_y'] = data['absres']/data['mad_y']
         data['y_stdc_pred/std_y'] = data['y_stdc_pred']/data['std_y']
 
         # Ground truths
@@ -177,6 +177,7 @@ def test(
 
         # Acquire ground truths
         self = ground_truth(self, self.y)
+        df['gt_absres'] = self.gt_absres
         df['gt_rmse'] = self.gt_rmse
         df['gt_area'] = self.gt_area
 
@@ -198,7 +199,11 @@ def test(
             cols = p.columns
             cols = [i.split(' (')[0] for i in cols]
             p.columns = cols
-            d = df[['domain_rmse/std_y', 'domain_cdf_area']]
+            d = df[[
+                    'domain_absres/mad_y',
+                    'domain_rmse/std_y',
+                    'domain_cdf_area',
+                    ]]
 
             d = pd.concat([
                            d.reset_index(drop=True),
@@ -215,12 +220,16 @@ def test(
         self.model.fit(self.X, self.y, self.g, n_jobs=self.n_jobs)
 
         # Refit on out-of-bag data for final classification models
+        self.model.domain_absres.fit(
+                                     df['d_pred'].values,
+                                     df['domain_absres/mad_y'].values,
+                                     )
         self.model.domain_rmse.fit(
-                                   df['d_pred'].values,
+                                   df['d_pred_max'].values,
                                    df['domain_rmse/std_y'].values,
                                    )
         self.model.domain_area.fit(
-                                   df['d_pred'].values,
+                                   df['d_pred_max'].values,
                                    df['domain_cdf_area'].values,
                                    )
 
diff --git a/src/madml/calculators.py b/src/madml/calculators.py
index d8b27d7..0284ba3 100644
--- a/src/madml/calculators.py
+++ b/src/madml/calculators.py
@@ -248,7 +248,7 @@ def bin_data(data_cv, bins, by='d_pred'):
     binmax = bin_groups['d_pred'].max()
     counts = bin_groups['z'].count()
     stdc = bin_groups['y_stdc_pred/std_y'].mean()
-    rmse = bin_groups['|r|/std_y'].apply(lambda x: (sum(x**2)/len(x))**0.5)
+    rmse = bin_groups['absres/std_y'].apply(lambda x: (sum(x**2)/len(x))**0.5)
 
     area = bin_groups.apply(lambda x: cdf(
                                           x['z'],
@@ -259,7 +259,7 @@ def bin_data(data_cv, bins, by='d_pred'):
     distmean = distmean.to_frame().add_suffix('_mean')
     binmax = binmax.to_frame().add_suffix('_max')
     stdc = stdc.to_frame().add_suffix('_mean')
-    rmse = rmse.to_frame().rename({'|r|/std_y': 'rmse/std_y'}, axis=1)
+    rmse = rmse.to_frame().rename({'absres/std_y': 'rmse/std_y'}, axis=1)
     counts = counts.to_frame().rename({'z': 'count'}, axis=1)
 
     # Combine data for each bin
diff --git a/src/madml/models.py b/src/madml/models.py
index 6c44c19..358af73 100644
--- a/src/madml/models.py
+++ b/src/madml/models.py
@@ -349,7 +349,7 @@ def assign_ground_truth(data_cv, bin_cv):
         bin_cv.loc[row, 'gt_area'] = group[3]
 
     # Make labels
-    absres = data_cv['|r|/mad_y'] <= data_cv['gt_absres']
+    absres = data_cv['absres/mad_y'] <= data_cv['gt_absres']
     rmse = data_cv['rmse/std_y'] <= data_cv['gt_rmse']
     area = data_cv['cdf_area'] <= data_cv['gt_area']
 
@@ -476,9 +476,9 @@ def cv(self, split, gs_model, ds_model, X, y, g=None):
         data['y_stdu_pred'] = predict_std(gs_model_cv, X_trans_te)
         data['d_pred'] = ds_model_cv.predict(X_trans_te)
         data['r'] = y[te]-data['y_pred']
-        data['|r|'] = data['r'].abs()
-        data['|r|/std_y'] = data['|r|']/data['std_y']
-        data['|r|/mad_y'] = data['|r|']/data['mad_y']
+        data['absres'] = data['r'].abs()
+        data['absres/std_y'] = data['absres']/data['std_y']
+        data['absres/mad_y'] = data['absres']/data['mad_y']
 
         return data
 
diff --git a/src/madml/plots.py b/src/madml/plots.py
index 68538d5..51d2d0f 100644
--- a/src/madml/plots.py
+++ b/src/madml/plots.py
@@ -72,33 +72,6 @@ def plot_dump(data, fig, ax, name, save, suffix, legend=True):
         json.dump(data, handle)
 
 
-def residuals(df, save='.', suffix='d'):
-    '''
-    A plot of absolute residuals vs. dissimilarity.
-
-    inputs:
-        df = Data.
-        save = The directory to save plot.
-        suffix = Append a suffix to the save name.
-    '''
-
-    data = {}
-    fig, ax = pl.subplots()
-
-    x = df['d_pred'].values
-    y = df['|r|/mad_y'].values
-
-    ax.scatter(x, y, marker='.', color='r')
-
-    data['x'] = x.tolist()
-    data['y'] = y.tolist()
-
-    ax.set_xlabel(r'$d$')
-    ax.set_ylabel(r'$|y-\hat{y}|/MAD_{y}$')
-
-    plot_dump(data, fig, ax, 'residuals', save, suffix, False)
-
-
 def confidence(df, save='.', suffix='all'):
     '''
     A plot of absolute residuals vs. dissimilarity.
@@ -159,7 +132,7 @@ def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
     gt_area = float(df['gt_area'].min())  # Should all be same
 
     sorters = {
-               r'$|y-\hat{y}|$': df['|r|'].values,
+               r'$|y-\hat{y}|$': df['absres'].values,
                r'$d$': df['d_pred'].values,
                r'$\sigma_{c}$': df['y_stdc_pred'].values,
                'Random': np.random.uniform(size=df.shape[0]),
@@ -176,7 +149,7 @@ def sub(x, y, ylabel, key, gt, gtlabel, metric, color):
 
         indx = np.argsort(value)
         d = value[indx]
-        y = df['|r|/std_y'].values[indx]
+        y = df['absres/std_y'].values[indx]
         z = df['z'].values[indx]
 
         out = parallel(
@@ -277,7 +250,7 @@ def parity(
     y = df.y
     y_pred = df.y_pred
     y_stdc_pred = df.y_stdc_pred
-    r_std_y = df['|r|/std_y']
+    r_std_y = df['absres/std_y']
     d = df.d_pred
 
     rmse = metrics.mean_squared_error(y, y_pred)**0.5
@@ -468,16 +441,17 @@ def bins(df, d, e, elabel, gt, ylabel, gtlabel, save, suffix):
         ax.scatter(
                    x,
                    y,
-                   alpha=0.5,
+                   alpha=0.4,
                    **p[dom],
                    )
 
-        ax.fill_between(
-                        x,
-                        y,
-                        color=p[dom]['color'],
-                        alpha=0.5,
-                        )
+        if suffix != 'absres':
+            ax.fill_between(
+                            x,
+                            y,
+                            color=p[dom]['color'],
+                            alpha=0.5,
+                            )
 
         data[dom]['x'].append(x.tolist())
         data[dom]['y'].append(y.tolist())
@@ -715,16 +689,17 @@ def __init__(
                  ):
 
         self.save = save
-        self.domains = ['domain_rmse/std_y', 'domain_cdf_area']
-        self.errors = ['rmse/std_y', 'cdf_area']
-        self.gts = ['gt_rmse', 'gt_area']
-        self.assessments = ['rmse', 'area']
+        self.errors = ['absres/mad_y', 'rmse/std_y', 'cdf_area']
+        self.domains = ['domain_'+i for i in self.errors]
+        self.assessments = ['absres', 'rmse', 'area']
+        self.gts = ['gt_'+i for i in self.assessments]
         self.precs = precs  # Precisions used
 
         # For plotting purposes on the histogram of E^* vs. D
-        cols = self.errors+self.domains
-        self.df = df.sort_values(by=['d_pred']+cols)
-        self.df_bin = df_bin.sort_values(by=['d_pred_max']+cols)
+        df_cols = self.errors+self.domains
+        bin_cols = [i for i in df_cols if 'absres' not in i]
+        self.df = df.sort_values(by=['d_pred']+df_cols)
+        self.df_bin = df_bin.sort_values(by=['d_pred_max']+bin_cols)
 
         self.bins = self.df_bin.shape[0]
 
@@ -738,6 +713,8 @@ def __init__(
             self.df_confusion = df_confusion
             self.pred_cols = [i.split(' (')[0] for i in pred_cols]
 
+        self.pred_cols = set(self.pred_cols)
+
     def generate(self):
 
         # Write test data
@@ -755,9 +732,6 @@ def generate(self):
                'fit_splitter',
                )
 
-        # Residuals
-        residuals(self.df, self.save)
-
         # Confidence
         confidence(self.df, self.save)
         confidence(df, self.save, 'fit_splitter')
@@ -799,6 +773,9 @@ def generate(self):
             elif k == 'area':
                 ename = r'$E^{area}$'
                 cname = r'$E^{area}_{c}$'
+            elif k == 'absres':
+                ename = r'$E^{|y-\hat{y}|/MAD_{y}}$'
+                cname = r'$E^{|y-\hat{y}|/MAD_{y}}_{c}$'
             else:
                 raise 'Unsupported error metric'
 
@@ -852,7 +829,8 @@ def generate(self):
 
             # Confusion matrices
             for pred in self.pred_cols:
-                if i.replace('domain_', '') in pred:
+
+                if j in pred:
 
                     # Confusion matrix for all splitters
                     y = self.df_confusion.loc[:, i].values