diff --git a/setup.py b/setup.py index 79c4c65..f4fcd69 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # Package information name = 'madml' -version = '0.7.5' # Need to increment every time to push to PyPI +version = '0.7.6' # Need to increment every time to push to PyPI description = 'Application domain of machine learning in materials science.' url = 'https://github.com/leschultz/'\ 'materials_application_domain_machine_learning.git' @@ -28,6 +28,7 @@ 'tensorflow', 'udocker', 'scikeras', + 'seaborn', ] long_description = open('README.md').read() diff --git a/src/madml/models/combine.py b/src/madml/models/combine.py index eddb4b6..89a1a02 100644 --- a/src/madml/models/combine.py +++ b/src/madml/models/combine.py @@ -97,7 +97,8 @@ def __init__( bins=10, save=False, gts=1.0, - gtb=0.25 + gtb=0.25, + weigh=None, ): ''' @@ -110,6 +111,7 @@ def __init__( save = The location to save figures and data. gts = The ground truth cutoff for residual magnitude test. gtb = The ground truth cutoff for statistical test. + weigh = Whether to weight distance features. ''' self.gs_model = gs_model @@ -120,6 +122,7 @@ def __init__( self.splits = copy.deepcopy(splits) self.gts = gts self.gtb = gtb + self.weigh = weigh self.dists = [] self.methods = [''] @@ -230,7 +233,17 @@ def cv(self, split, gs_model, ds_model, X, y, g): data['y_stdu'] = self.std_pred(gs_model_cv, X_trans_te) if self.ds_model: + ds_model_cv = copy.deepcopy(ds_model) + + mod_attr = gs_model_cv.best_estimator_.named_steps['model'] + attr = dir(mod_attr) + + condition = (any([i in attr for i in ['feature_importances_']])) + condition = condition and (self.weigh is True) + if condition: + ds_model_cv.weights = mod_attr.feature_importances_ + ds_model_cv.fit(X_trans_tr) data['dist'] = ds_model_cv.predict(X_trans_te) @@ -307,6 +320,14 @@ def fit(self, X, y, g): ) # Fit distance model + mod_attr = self.gs_model.best_estimator_.named_steps['model'] + attr = dir(mod_attr) + + condition = (any([i in attr for i in ['feature_importances_']])) + condition = condition and (self.weigh is True) + if condition: + self.ds_model.weights = mod_attr.feature_importances_ + self.ds_model.fit(X_trans) out = plots.generate_plots( diff --git a/src/madml/models/space.py b/src/madml/models/space.py index 08943c1..09a804c 100644 --- a/src/madml/models/space.py +++ b/src/madml/models/space.py @@ -6,11 +6,40 @@ import numpy as np +class weighted_model: + + def __init__(self, bandwidth, weights, kernel): + self.bandwidths = bandwidth*weights + self.kernel = kernel + + def fit(self, X_train): + self.models = [] + for b in range(self.bandwidths.shape[0]): + self.model = KernelDensity( + kernel=self.kernel, + bandwidth=self.bandwidths[b], + ).fit(X_train[:, b:b+1]) + + self.models.append(self.model) + + def score_samples(self, X): + scores = [] + for b in range(self.bandwidths.shape[0]): + score = self.models[b].score_samples(X[:, b:b+1]) + scores.append(score) + + return np.sum(scores, axis=0) + + def return_bandwidths(self): + return self.bandwidths + + class distance_model: - def __init__(self, dist='kde', *args, **kwargs): + def __init__(self, dist='kde', weights=None, *args, **kwargs): self.dist = dist + self.weights = weights self.args = args self.kwargs = kwargs @@ -42,16 +71,26 @@ def fit( self.bandwidth = estimate_bandwidth(X_train) # If the estimated bandwidth is zero - if self.bandwidth > 0.0: + if (self.weights is None) and (self.bandwidth == 0.0): self.model = KernelDensity( kernel=self.kernel, - bandwidth=self.bandwidth, ).fit(X_train) - else: + self.bandwidth = self.model.bandwidth # Update + + elif (self.weights is None) and (self.bandwidth > 0.0): self.model = KernelDensity( kernel=self.kernel, + bandwidth=self.bandwidth, ).fit(X_train) - self.bandwidth = self.model.bandwidth # Update + else: + + self.model = weighted_model( + self.bandwidth, + self.weights, + self.kernel + ) + self.model.fit(X_train) + self.bandwidth = self.model.bandwidths dist = self.model.score_samples(X_train) m = max(dist) diff --git a/src/madml/plots.py b/src/madml/plots.py index 1cb676e..fa1960b 100644 --- a/src/madml/plots.py +++ b/src/madml/plots.py @@ -11,6 +11,7 @@ from functools import reduce from sklearn import metrics +import seaborn as sns import pandas as pd import numpy as np @@ -323,10 +324,12 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'): cdf_name = 'cdf' parity_name = 'cdf_parity' + dist_name = 'distribution' if binsave is not None: save = os.path.join(save, 'each_bin') cdf_name = '{}_{}'.format(cdf_name, binsave) parity_name = '{}_{}'.format(parity_name, binsave) + dist_name = '{}_{}'.format(dist_name, binsave) os.makedirs(save, exist_ok=True) @@ -472,6 +475,71 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'): ), 'w') as handle: json.dump(data, handle) + fig, ax = pl.subplots() + + sns.histplot( + z, + kde=True, + stat='density', + color='g', + ax=ax, + label='Standard Normal Distribution', + ) + + sns.histplot( + x, + kde=True, + stat='density', + color='r', + ax=ax, + label='Observed Distribution', + ) + + ax.set_xlabel('z') + ax.set_ylabel('Fraction') + + fig.tight_layout() + + fig_legend, ax_legend = pl.subplots() + ax_legend.axis(False) + legend = ax_legend.legend( + *ax.get_legend_handles_labels(), + frameon=False, + loc='center', + bbox_to_anchor=(0.5, 0.5) + ) + ax_legend.spines['top'].set_visible(False) + ax_legend.spines['bottom'].set_visible(False) + ax_legend.spines['left'].set_visible(False) + ax_legend.spines['right'].set_visible(False) + + fig.savefig(os.path.join( + save, + '{}{}.png'.format(dist_name, subsave), + ), bbox_inches='tight') + + fig_legend.savefig(os.path.join( + save, + '{}{}_legend.png'.format( + dist_name, + subsave + ), + ), bbox_inches='tight') + + pl.close(fig) + pl.close(fig_legend) + + data = {} + data['x'] = list(eval_points) + data['y'] = list(y) + data['y_pred'] = list(y_pred) + data['Area'] = areacdf + with open(os.path.join( + save, + '{}{}.json'.format(cdf_name, subsave), + ), 'w') as handle: + json.dump(data, handle) + return y, y_pred, areaparity, areacdf