Attempting to scale bandwidths by feature importance
leschultz committed Sep 12, 2023
1 parent e66604d commit 90a41b4
Showing 4 changed files with 136 additions and 7 deletions.
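The idea being tried here: when the fitted regressor exposes per-feature importances, scale the kernel density estimator's bandwidth feature by feature so that dissimilarity in important features dominates the score. A minimal sketch of that idea, assuming scikit-learn's KernelDensity; the function name and the importances argument are illustrative, not the repository's API:

import numpy as np
from sklearn.neighbors import KernelDensity

def importance_scaled_log_density(X_train, X_test, bandwidth, importances):
    """Fit one 1-D KDE per feature, scaling the shared bandwidth by that
    feature's importance, and sum the per-feature log-densities."""
    bandwidths = bandwidth*importances
    log_density = np.zeros(X_test.shape[0])
    for j, b in enumerate(bandwidths):
        kde = KernelDensity(kernel='gaussian', bandwidth=b)
        kde.fit(X_train[:, j:j+1])
        log_density += kde.score_samples(X_test[:, j:j+1])

    return log_density

Because KernelDensity requires a strictly positive bandwidth, any zero importance (or a zero estimated bandwidth) would have to be clipped before scaling.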
3 changes: 2 additions & 1 deletion setup.py
@@ -2,7 +2,7 @@

# Package information
name = 'madml'
version = '0.7.5' # Need to increment every time to push to PyPI
version = '0.7.6' # Need to increment every time to push to PyPI
description = 'Application domain of machine learning in materials science.'
url = 'https://github.com/leschultz/'\
'materials_application_domain_machine_learning.git'
@@ -28,6 +28,7 @@
'tensorflow',
'udocker',
'scikeras',
'seaborn',
]

long_description = open('README.md').read()
23 changes: 22 additions & 1 deletion src/madml/models/combine.py
Expand Up @@ -97,7 +97,8 @@ def __init__(
bins=10,
save=False,
gts=1.0,
gtb=0.25
gtb=0.25,
weigh=None,
):

'''
@@ -110,6 +111,7 @@ def __init__(
save = The location to save figures and data.
gts = The ground truth cutoff for residual magnitude test.
gtb = The ground truth cutoff for statistical test.
weigh = Whether to weight distance features.
'''

self.gs_model = gs_model
@@ -120,6 +122,7 @@
self.splits = copy.deepcopy(splits)
self.gts = gts
self.gtb = gtb
self.weigh = weigh

self.dists = []
self.methods = ['']
@@ -230,7 +233,17 @@ def cv(self, split, gs_model, ds_model, X, y, g):
data['y_stdu'] = self.std_pred(gs_model_cv, X_trans_te)

if self.ds_model:

ds_model_cv = copy.deepcopy(ds_model)

mod_attr = gs_model_cv.best_estimator_.named_steps['model']
attr = dir(mod_attr)

condition = (any([i in attr for i in ['feature_importances_']]))
condition = condition and (self.weigh is True)
if condition:
ds_model_cv.weights = mod_attr.feature_importances_

ds_model_cv.fit(X_trans_tr)

data['dist'] = ds_model_cv.predict(X_trans_te)
@@ -307,6 +320,14 @@ def fit(self, X, y, g):
)

# Fit distance model
mod_attr = self.gs_model.best_estimator_.named_steps['model']
attr = dir(mod_attr)

condition = (any([i in attr for i in ['feature_importances_']]))
condition = condition and (self.weigh is True)
if condition:
self.ds_model.weights = mod_attr.feature_importances_

self.ds_model.fit(X_trans)

out = plots.generate_plots(
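Both hunks above repeat the same gate: the distance model only receives weights when the fitted regressor actually exposes feature_importances_ and the new weigh flag is True. A sketch of that gate in isolation, using hasattr instead of scanning dir(); the pipeline step name 'model' comes from the diff, while the helper function itself is illustrative:

def maybe_weight_distance_model(gs_model, ds_model, weigh):
    """Copy feature importances from the fitted pipeline into the distance
    model, but only when the regressor provides them."""
    regressor = gs_model.best_estimator_.named_steps['model']
    if (weigh is True) and hasattr(regressor, 'feature_importances_'):
        ds_model.weights = regressor.feature_importances_

    return ds_model

Tree ensembles such as RandomForestRegressor expose feature_importances_; estimators that do not (for example a plain linear model) leave the distance model in its unweighted default.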
49 changes: 44 additions & 5 deletions src/madml/models/space.py
@@ -6,11 +6,40 @@
import numpy as np


class weighted_model:

def __init__(self, bandwidth, weights, kernel):
self.bandwidths = bandwidth*weights
self.kernel = kernel

def fit(self, X_train):
self.models = []
for b in range(self.bandwidths.shape[0]):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidths[b],
).fit(X_train[:, b:b+1])

self.models.append(self.model)

def score_samples(self, X):
scores = []
for b in range(self.bandwidths.shape[0]):
score = self.models[b].score_samples(X[:, b:b+1])
scores.append(score)

return np.sum(scores, axis=0)

def return_bandwidths(self):
return self.bandwidths


class distance_model:

def __init__(self, dist='kde', *args, **kwargs):
def __init__(self, dist='kde', weights=None, *args, **kwargs):

self.dist = dist
self.weights = weights
self.args = args
self.kwargs = kwargs

@@ -42,16 +71,26 @@ def fit(
self.bandwidth = estimate_bandwidth(X_train)

# If the estimated bandwidth is zero
if self.bandwidth > 0.0:
if (self.weights is None) and (self.bandwidth == 0.0):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidth,
).fit(X_train)
else:
self.bandwidth = self.model.bandwidth # Update

elif (self.weights is None) and (self.bandwidth > 0.0):
self.model = KernelDensity(
kernel=self.kernel,
bandwidth=self.bandwidth,
).fit(X_train)
self.bandwidth = self.model.bandwidth # Update
else:

self.model = weighted_model(
self.bandwidth,
self.weights,
self.kernel
)
self.model.fit(X_train)
self.bandwidth = self.model.bandwidths

dist = self.model.score_samples(X_train)
m = max(dist)
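The weighted_model class above fits one univariate KernelDensity per column with bandwidth*weights[j] and sums the per-column log-densities, which treats the features as independent: the summed log-scores correspond to the product of the 1-D densities. A hedged usage sketch of the new weights argument, assuming distance_model is importable from madml.models.space as the file path suggests and that every importance is strictly positive:

import numpy as np

from madml.models.space import distance_model

X_train = np.random.rand(200, 3)          # placeholder training features
X_test = np.random.rand(50, 3)            # placeholder test features
importances = np.array([0.6, 0.3, 0.1])   # e.g. a model's feature_importances_

ds = distance_model(dist='kde', weights=importances)
ds.fit(X_train)            # estimates a global bandwidth, then scales it per feature
dist = ds.predict(X_test)  # dissimilarity scores from the weighted KDE

Note that a zero entry in weights (or a zero estimated bandwidth) yields a zero per-feature bandwidth, which KernelDensity rejects.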
68 changes: 68 additions & 0 deletions src/madml/plots.py
@@ -11,6 +11,7 @@
from functools import reduce
from sklearn import metrics

import seaborn as sns
import pandas as pd
import numpy as np

@@ -323,10 +324,12 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'):

cdf_name = 'cdf'
parity_name = 'cdf_parity'
dist_name = 'distribution'
if binsave is not None:
save = os.path.join(save, 'each_bin')
cdf_name = '{}_{}'.format(cdf_name, binsave)
parity_name = '{}_{}'.format(parity_name, binsave)
dist_name = '{}_{}'.format(dist_name, binsave)

os.makedirs(save, exist_ok=True)

@@ -472,6 +475,71 @@ def cdf(x, save=None, binsave=None, subsave='', choice='standard_normal'):
), 'w') as handle:
json.dump(data, handle)

fig, ax = pl.subplots()

sns.histplot(
z,
kde=True,
stat='density',
color='g',
ax=ax,
label='Standard Normal Distribution',
)

sns.histplot(
x,
kde=True,
stat='density',
color='r',
ax=ax,
label='Observed Distribution',
)

ax.set_xlabel('z')
ax.set_ylabel('Fraction')

fig.tight_layout()

fig_legend, ax_legend = pl.subplots()
ax_legend.axis(False)
legend = ax_legend.legend(
*ax.get_legend_handles_labels(),
frameon=False,
loc='center',
bbox_to_anchor=(0.5, 0.5)
)
ax_legend.spines['top'].set_visible(False)
ax_legend.spines['bottom'].set_visible(False)
ax_legend.spines['left'].set_visible(False)
ax_legend.spines['right'].set_visible(False)

fig.savefig(os.path.join(
save,
'{}{}.png'.format(dist_name, subsave),
), bbox_inches='tight')

fig_legend.savefig(os.path.join(
save,
'{}{}_legend.png'.format(
dist_name,
subsave
),
), bbox_inches='tight')

pl.close(fig)
pl.close(fig_legend)

data = {}
data['x'] = list(eval_points)
data['y'] = list(y)
data['y_pred'] = list(y_pred)
data['Area'] = areacdf
with open(os.path.join(
save,
'{}{}.json'.format(cdf_name, subsave),
), 'w') as handle:
json.dump(data, handle)

return y, y_pred, areaparity, areacdf


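The new block in cdf overlays the observed z-score distribution on a standard-normal reference with seaborn.histplot (kde=True) and writes the legend to its own figure. A minimal standalone sketch of the same pattern; the file names and sample data are placeholders, not the package's:

import numpy as np
import seaborn as sns
from matplotlib import pyplot as pl

z = np.random.normal(0.0, 1.0, size=10000)   # standard-normal reference
x = np.random.normal(0.2, 1.3, size=1000)    # observed z-scores (placeholder)

fig, ax = pl.subplots()
sns.histplot(z, kde=True, stat='density', color='g', ax=ax,
             label='Standard Normal Distribution')
sns.histplot(x, kde=True, stat='density', color='r', ax=ax,
             label='Observed Distribution')
ax.set_xlabel('z')
ax.set_ylabel('Fraction')
fig.savefig('distribution.png', bbox_inches='tight')

# Save the legend separately so the main panel stays uncluttered.
fig_legend, ax_legend = pl.subplots()
ax_legend.axis(False)
ax_legend.legend(*ax.get_legend_handles_labels(), frameon=False, loc='center')
fig_legend.savefig('distribution_legend.png', bbox_inches='tight')

pl.close(fig)
pl.close(fig_legend)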