Attempting to scale bandwidths by feature importance
leschultz committed Sep 12, 2023
1 parent e66604d commit 90a41b4
Showing 4 changed files with 136 additions and 7 deletions.
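The idea being tried here: when the fitted regressor exposes per-feature importances, scale the kernel density estimator's bandwidth feature by feature so that dissimilarity in important features dominates the score. A minimal sketch of that idea, assuming scikit-learn's KernelDensity; the function name and the importances argument are illustrative, not the repository's API:

import numpy as np
from sklearn.neighbors import KernelDensity

def importance_scaled_log_density(X_train, X_test, bandwidth, importances):
    """Fit one 1-D KDE per feature, scaling the shared bandwidth by that
    feature's importance, and sum the per-feature log-densities."""
    bandwidths = bandwidth*importances
    log_density = np.zeros(X_test.shape[0])
    for j, b in enumerate(bandwidths):
        kde = KernelDensity(kernel='gaussian', bandwidth=b)
        kde.fit(X_train[:, j:j+1])
        log_density += kde.score_samples(X_test[:, j:j+1])

    return log_density

Because KernelDensity requires a strictly positive bandwidth, any zero importance (or a zero estimated bandwidth) would have to be clipped before scaling.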
3 changes: 2 additions & 1 deletion setup.py
@@ -2,7 +2,7 @@

# Package information
name = 'madml'
version = '0.7.5' # Need to increment every time to push to PyPI
version = '0.7.6' # Need to increment every time to push to PyPI
description = 'Application domain of machine learning in materials science.'
url = 'https://github.com/leschultz/'\
'materials_application_domain_machine_learning.git'
@@ -28,6 +28,7 @@
'tensorflow',
'udocker',
'scikeras',
'seaborn',
]

long_description = open('README.md').read()
23 changes: 22 additions & 1 deletion src/madml/models/combine.py
Expand Up @@ -97,7 +97,8 @@ def __init__(
bins=10,
save=False,
gts=1.0,
gtb=0.25
gtb=0.25,
weigh=None,
):

'''
@@ -110,6 +111,7 @@ def __init__(
save = The location to save figures and data.
gts = The ground truth cutoff for residual magnitude test.
gtb = The ground truth cutoff for statistical test.
weigh = Whether to weight distance features.
'''

self.gs_model = gs_model
@@ -120,6 +122,7 @@
self.splits = copy.deepcopy(splits)
self.gts = gts
self.gtb = gtb
self.weigh = weigh

self.dists = []
self.methods = ['']
@@ -230,7 +233,17 @@ def cv(self, split, gs_model, ds_model, X, y, g):
data['y_stdu'] = self.std_pred(gs_model_cv, X_trans_te)

if self.ds_model:

ds_model_cv = copy.deepcopy(ds_model)

mod_attr = gs_model_cv.best_estimator_.named_steps['model']
attr = dir(mod_attr)

condition = (any([i in attr for i in ['feature_importances_']]))
condition = condition and (self.weigh is True)
if condition:
ds_model_cv.weights = mod_attr.feature_importances_

ds_model_cv.fit(X_trans_tr)

data['dist'] = ds_model_cv.predict(X_trans_te)
@@ -307,6 +320,14 @@ def fit(self, X, y, g):
)

# Fit distance model
mod_attr = self.gs_model.best_estimator_.named_steps['model']
attr = dir(mod_attr)

condition = (any([i in attr for i in ['feature_importances_']]))
condition = condition and (self.weigh is True)
if condition:
self.ds_model.weights = mod_attr.feature_importances_

self.ds_model.fit(X_trans)

out = plots.generate_plots(
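Both hunks above repeat the same gate: the distance model only receives weights when the fitted regressor actually exposes feature_importances_ and the new weigh flag is True. A sketch of that gate in isolation, using hasattr instead of scanning dir(); the pipeline step name 'model' comes from the diff, while the helper function itself is illustrative:

def maybe_weight_distance_model(gs_model, ds_model, weigh):
    """Copy feature importances from the fitted pipeline into the distance
    model, but only when the regressor provides them."""
    regressor = gs_model.best_estimator_.named_steps['model']
    if (weigh is True) and hasattr(regressor, 'feature_importances_'):
        ds_model.weights = regressor.feature_importances_

    return ds_model

Tree ensembles such as RandomForestRegressor expose feature_importances_; estimators that do not (for example a plain linear model) leave the distance model in its unweighted default.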
49 changes: 44 additions & 5 deletions src/madml/models/space.py
@@ -6,11 +6,40 @@
import numpy as np


class weighted_model:

def __init__(self, bandwidth, weights, kernel):
self.bandwidths = bandwidth*weights
self.kernel = kernel

def fit(self, X_train):
self.models = []
for b in range(self.bandwidths.shape[0]):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidths[b],
).fit(X_train[:, b:b+1])

self.models.append(self.model)

def score_samples(self, X):
scores = []
for b in range(self.bandwidths.shape[0]):
score = self.models[b].score_samples(X[:, b:b+1])
scores.append(score)

return np.sum(scores, axis=0)

def return_bandwidths(self):
return self.bandwidths


class distance_model:

def __init__(self, dist='kde', *args, **kwargs):
def __init__(self, dist='kde', weights=None, *args, **kwargs):

self.dist = dist
self.weights = weights
self.args = args
self.kwargs = kwargs

@@ -42,16 +71,26 @@ def fit(
self.bandwidth = estimate_bandwidth(X_train)

# If the estimated bandwidth is zero
if self.bandwidth > 0.0:
if (self.weights is None) and (self.bandwidth == 0.0):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidth,
).fit(X_train)
else:
self.bandwidth = self.model.bandwidth # Update

elif (self.weights is None) and (self.bandwidth > 0.0):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidth,
).fit(X_train)
self.bandwidth = self.model.bandwidth # Update
else:

self.model = weighted_model(
self.bandwidth,
self.weights,
self.kernel
)
self.model.fit(X_train)
self.bandwidth = self.model.bandwidths

dist = self.model.score_samples(X_train)
m = max(dist)
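The weighted_model class above fits one univariate KernelDensity per column with bandwidth*weights[j] and sums the per-column log-densities, which treats the features as independent: the summed log-scores correspond to the product of the 1-D densities. A hedged usage sketch of the new weights argument, assuming distance_model is importable from madml.models.space as the file path suggests and that every importance is strictly positive:

import numpy as np

from madml.models.space import distance_model

X_train = np.random.rand(200, 3)          # placeholder training features
X_test = np.random.rand(50, 3)            # placeholder test features
importances = np.array([0.6, 0.3, 0.1])   # e.g. a model's feature_importances_

ds = distance_model(dist='kde', weights=importances)
ds.fit(X_train)            # estimates a global bandwidth, then scales it per feature
dist = ds.predict(X_test)  # dissimilarity scores from the weighted KDE

Note that a zero entry in weights (or a zero estimated bandwidth) yields a zero per-feature bandwidth, which KernelDensity rejects.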
68 changes: 68 additions & 0 deletions src/madml/plots.py
@@ -11,6 +11,7 @@
from functools import reduce
from sklearn import metrics

import seaborn as sns
import pandas as pd
import numpy as np

@@ -323,10 +324,12 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'):

cdf_name = 'cdf'
parity_name = 'cdf_parity'
dist_name = 'distribution'
if binsave is not None:
save = os.path.join(save, 'each_bin')
cdf_name = '{}_{}'.format(cdf_name, binsave)
parity_name = '{}_{}'.format(parity_name, binsave)
dist_name = '{}_{}'.format(dist_name, binsave)

os.makedirs(save, exist_ok=True)

@@ -472,6 +475,71 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'):
), 'w') as handle:
json.dump(data, handle)

fig, ax = pl.subplots()

sns.histplot(
z,
kde=True,
stat='density',
color='g',
ax=ax,
label='Standard Normal Distribution',
)

sns.histplot(
x,
kde=True,
stat='density',
color='r',
ax=ax,
label='Observed Distribution',
)

ax.set_xlabel('z')
ax.set_ylabel('Fraction')

fig.tight_layout()

fig_legend, ax_legend = pl.subplots()
ax_legend.axis(False)
legend = ax_legend.legend(
*ax.get_legend_handles_labels(),
frameon=False,
loc='center',
bbox_to_anchor=(0.5, 0.5)
)
ax_legend.spines['top'].set_visible(False)
ax_legend.spines['bottom'].set_visible(False)
ax_legend.spines['left'].set_visible(False)
ax_legend.spines['right'].set_visible(False)

fig.savefig(os.path.join(
save,
'{}{}.png'.format(dist_name, subsave),
), bbox_inches='tight')

fig_legend.savefig(os.path.join(
save,
'{}{}_legend.png'.format(
dist_name,
subsave
),
), bbox_inches='tight')

pl.close(fig)
pl.close(fig_legend)

data = {}
data['x'] = list(eval_points)
data['y'] = list(y)
data['y_pred'] = list(y_pred)
data['Area'] = areacdf
with open(os.path.join(
save,
'{}{}.json'.format(cdf_name, subsave),
), 'w') as handle:
json.dump(data, handle)

return y, y_pred, areaparity, areacdf


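The new block in cdf overlays the observed z-score distribution on a standard-normal reference with seaborn.histplot (kde=True) and writes the legend to its own figure. A minimal standalone sketch of the same pattern; the file names and sample data are placeholders, not the package's:

import numpy as np
import seaborn as sns
from matplotlib import pyplot as pl

z = np.random.normal(0.0, 1.0, size=10000)   # standard-normal reference
x = np.random.normal(0.2, 1.3, size=1000)    # observed z-scores (placeholder)

fig, ax = pl.subplots()
sns.histplot(z, kde=True, stat='density', color='g', ax=ax,
             label='Standard Normal Distribution')
sns.histplot(x, kde=True, stat='density', color='r', ax=ax,
             label='Observed Distribution')
ax.set_xlabel('z')
ax.set_ylabel('Fraction')
fig.savefig('distribution.png', bbox_inches='tight')

# Save the legend separately so the main panel stays uncluttered.
fig_legend, ax_legend = pl.subplots()
ax_legend.axis(False)
ax_legend.legend(*ax.get_legend_handles_labels(), frameon=False, loc='center')
fig_legend.savefig('distribution_legend.png', bbox_inches='tight')

pl.close(fig)
pl.close(fig_legend)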