Skip to content

Commit

Permalink
Now save innner folds
Browse files Browse the repository at this point in the history
  • Loading branch information
leschultz committed Jan 30, 2024
1 parent 6c103f8 commit c809615
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 119 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Package information
name = 'madml'
version = '2.1.2' # Need to increment every time to push to PyPI
version = '2.1.3' # Need to increment every time to push to PyPI
description = 'Application domain of machine learning in materials science.'
url = 'https://github.com/leschultz/'\
'materials_application_domain_machine_learning.git'
Expand Down
248 changes: 130 additions & 118 deletions src/madml/assess.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
train, test = split
self.splits.append((train, test, count, splitter[0]))

def cv(self, split):
def cv(self, split, save_inner_folds=None):
'''
Fit models and get predictions from one split.
'''
Expand All @@ -78,6 +78,13 @@ def cv(self, split):
model = copy.deepcopy(self.model)
model.fit(self.X[train], self.y[train], self.g[train])

# Save fold fit
if save_inner_folds is not None:
save = os.path.join(save_inner_folds, 'train_folds')
save = os.path.join(save, 'fold_{}'.format(count))
os.makedirs(save, exist_ok=True)
model.plot(save)

# Gather predictions on test set
data = model.predict(self.X[test])

Expand All @@ -104,15 +111,28 @@ def cv(self, split):

return data

def test(self):
def test(
self,
save_inner_folds=None,
save_outer_folds=None,
name=None,
push_container=False,
):
'''
Gather assessment data and plot results.
inputs:
save_inner_folds = Top directory to save training folds.
save_outer_folds = The top directory to save assessment.
name = The name of the container <account>/<repository_name>:<tag>
push_container = Whether to build and push a container with model.
'''

# Assess model
df = []
for i in tqdm(self.splits):
df.append(self.cv(i))
df.append(self.cv(i, save_inner_folds))

df = pd.concat(df) # Combine data
df, df_bin = bin_data(df, self.model.bins)
Expand Down Expand Up @@ -156,123 +176,115 @@ def test(self):
# Full fit
self.model.fit(self.X, self.y, self.g)

return df, df_bin, self.model

def write_results(self, parent, name=None, push_container=False):
'''
Push docker container with full fit model.
inputs:
parent = The top directory to save things.
name = The name of the container <account>/<repository_name>:<tag>
push_container = Whether to build and push a container with model.
'''

# Save locations
ass_save = os.path.join(parent, 'assessment')
model_save = os.path.join(parent, 'model')
model_ass = os.path.join(model_save, 'train')

# Create locations
for d in [ass_save, model_save, model_ass]:
os.makedirs(d, exist_ok=True)

# Write test data
self.df.to_csv(os.path.join(ass_save, 'pred.csv'), index=False)
self.df_bin.to_csv(os.path.join(
ass_save,
'pred_bins.csv',
), index=False)

# Save model
np.savetxt(
os.path.join(model_save, 'X.csv'),
self.X,
delimiter=',',
)

np.savetxt(
os.path.join(model_save, 'y.csv'),
self.y,
delimiter=',',
)

np.savetxt(
os.path.join(model_save, 'g.csv'),
self.g,
delimiter=',',
fmt='%s',
)

dill.dump(
self.model,
open(os.path.join(model_save, 'model.dill'), 'wb')
)

# Test data
plot = plotter(
self.df,
self.df_bin,
self.gt_rmse,
self.gt_area,
self.model.precs,
ass_save,
)
plot.generate()

# Train data
plot = plotter(
self.model.data_cv,
self.model.bin_cv,
self.model.gt_rmse,
self.model.gt_area,
self.model.precs,
model_ass,
if save_outer_folds is not None:

# Save locations
ass_save = os.path.join(save_outer_folds, 'assessment')
model_save = os.path.join(save_outer_folds, 'model')
model_ass = os.path.join(model_save, 'train')

# Create locations
for d in [ass_save, model_save, model_ass]:
os.makedirs(d, exist_ok=True)

# Write test data
self.df.to_csv(os.path.join(ass_save, 'pred.csv'), index=False)
self.df_bin.to_csv(os.path.join(
ass_save,
'pred_bins.csv',
), index=False)

# Save model
np.savetxt(
os.path.join(model_save, 'X.csv'),
self.X,
delimiter=',',
)
plot.generate()

if name is not None:

# Create a container
data_path = pkg_resources.resource_filename(
'madml',
'templates/docker',
)
save = os.path.join(parent, 'hosting')
shutil.copytree(data_path, save)

old = os.getcwd()
os.chdir(save)
shutil.copy('../model/model.dill', '.')
shutil.copy('../model/X.csv', '.')

# Capture current environment
env = subprocess.run(
['pip', 'freeze'],
capture_output=True,
text=True
)

with open('requirements.txt', 'w') as handle:
handle.write(env.stdout)

if push_container:
docker.build_and_push_container(name)

with open('user_predict.py', 'r') as handle:
data = handle.read()

data = data.replace('replace', name)

with open('user_predict.py', 'w') as handle:
handle.write(data)

with open('user_predict.ipynb', 'r') as handle:
data = handle.read()
np.savetxt(
os.path.join(model_save, 'y.csv'),
self.y,
delimiter=',',
)

data = data.replace('replace', name)
np.savetxt(
os.path.join(model_save, 'g.csv'),
self.g,
delimiter=',',
fmt='%s',
)

with open('user_predict.ipynb', 'w') as handle:
handle.write(data)
dill.dump(
self.model,
open(os.path.join(model_save, 'model.dill'), 'wb')
)

# Test data
plot = plotter(
self.df,
self.df_bin,
self.gt_rmse,
self.gt_area,
self.model.precs,
ass_save,
)
plot.generate()

# Train data
plot = plotter(
self.model.data_cv,
self.model.bin_cv,
self.model.gt_rmse,
self.model.gt_area,
self.model.precs,
model_ass,
)
plot.generate()

if name is not None:

# Create a container
data_path = pkg_resources.resource_filename(
'madml',
'templates/docker',
)
save = os.path.join(save_outer_folds, 'hosting')
shutil.copytree(data_path, save)

old = os.getcwd()
os.chdir(save)
shutil.copy('../model/model.dill', '.')
shutil.copy('../model/X.csv', '.')

# Capture current environment
env = subprocess.run(
['pip', 'freeze'],
capture_output=True,
text=True
)

with open('requirements.txt', 'w') as handle:
handle.write(env.stdout)

if push_container:
docker.build_and_push_container(name)

with open('user_predict.py', 'r') as handle:
data = handle.read()

data = data.replace('replace', name)

with open('user_predict.py', 'w') as handle:
handle.write(data)

with open('user_predict.ipynb', 'r') as handle:
data = handle.read()

data = data.replace('replace', name)

with open('user_predict.ipynb', 'w') as handle:
handle.write(data)

os.chdir(old)

os.chdir(old)
return df, df_bin, self.model
16 changes: 16 additions & 0 deletions src/madml/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.neighbors import KernelDensity
from scipy.spatial.distance import cdist
from madml.utils import parallel
from madml.plots import plotter
from sklearn.base import clone
from sklearn import metrics

Expand Down Expand Up @@ -556,3 +557,18 @@ def predict(self, X, d_input=None):
], axis=1)

return pred

def plot(self, save):
'''
Plot model fit data.
'''

plot = plotter(
self.data_cv,
self.bin_cv,
self.gt_rmse,
self.gt_area,
self.precs,
save,
)
plot.generate()

0 comments on commit c809615

Please sign in to comment.