"""
SPProC: Sequential learning with Physical Probabilistic Constraints
@authors: Armi Tiihonen, Felipe Oviedo, Shreyaa Raghavan, Zhe Liu
MIT Photovoltaics Laboratory
"""
# Libraries seaborn, scipy, pandas, and GPyOpt are required; Python 3.x.
import pandas as pd
import numpy as np
import os
import matplotlib
import seaborn as sns
import GPyOpt
from scipy.integrate import simps
import ternary
import pickle
import datetime
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.tri as tri
from RGB_data import RGB_data
from plotting_v2 import triangleplot
from plotting_v2 import plotBO
def tolerance_factor(suggestion_df = None, tolerance_factor_bound = None):
    if (suggestion_df is not None and tolerance_factor_bound is None):
        # Calculate the tolerance factor for the given dataframe of suggestions.
        # Works for materials = ['CsPbI', 'MAPbI', 'FAPbI'] only!
        tolerance_factor_series = (220 + (167*suggestion_df.CsPbI +
                                          217*suggestion_df.MAPbI +
                                          253*suggestion_df.FAPbI))/(1.4142*(119+220))
        result = tolerance_factor_series
    elif (suggestion_df is None and tolerance_factor_bound is not None):
        # This constraint string is designed for these materials in this
        # specific order only: materials = ['CsPbI', 'MAPbI', 'FAPbI']
        tolerance_factor_constraint = (str(tolerance_factor_bound) +
            ' - ((220+(167*x[:,0] + 217*x[:,1] + 253*x[:,2]))/(1.4142*(119+220)))')
        result = tolerance_factor_constraint
    else:
        raise ValueError('This function is not intended to be used with these kinds of inputs.')
    return result
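# A minimal usage sketch of the two calling modes above (hypothetical values,
# following the commented-out example convention at the end of this file):
# calling with a dataframe returns a pandas Series of numeric tolerance
# factors, and calling with a bound returns a constraint string for GPyOpt.
# example_df = pd.DataFrame({'CsPbI': [1.0], 'MAPbI': [0.0], 'FAPbI': [0.0]})
# tolerance_factor(suggestion_df = example_df)       # -> Series of values
# tolerance_factor(tolerance_factor_bound = 0.80)    # -> constraint string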
#%%
###############################################################################
# These variables are related to data collection and computing the figure of merit.
# Scroll down to change the variables related to the Bayesian optimization.
# Give the paths to the folders that contain the camera CSV data. The data will
# be analyzed in this order. Include '/' or '\' at the end of each string.
original_folder = os.getcwd()
folders = [r'/20190606-R1-JT/BMP/RGB/Calibrated/',
           r'/20190622-R1-JT/BMP/RGB/Calibrated/',
           r'/20190711-R1-JT/BMP/RGB/Calibrated/',
           r'/20190723-R1-JT/BMP/RGB/Calibrated/',
           r'/20190809-R1-JT/BMP/RGB/Calibrated/']
for n in range(len(folders)):
    folders[n] = original_folder + folders[n]
# Give True if the data is calibrated, and False if the data is raw.
is_calibrated = True
# Give True if the data is RGB, and False if the data is LAB.
is_rgb = True
# Give the materials whose compositions are being optimized. Use the same
# format as in the 'Samples.csv' files of the aging tests.
materials = ['CsPbI', 'MAPbI', 'FAPbI']
# Note: The current implementation of the tolerance factor function works only
# for these materials. The current implementation of solubility/DFT works only
# for these materials in this order.
# Give the cutoff (in minutes) for analyzing the data. The first n minutes will
# be utilized and the rest of the data is dropped.
cutoff = 7000
# We need to drop the empty sample slots from the analysis. This is done by
# searching for an indicator string in 'Samples.csv', column 'Sample'. The
# default value is '---'.
indicator_for_empty = '---' # Or 'NA'
# Choose how the possible duplicate samples (same compositions several times)
# are treated. Write one of these options: 'first' (choose the first one of the
# duplicates), 'last' (choose the last one of the duplicates), 'mean'
# (calculate the mean of the duplicates, remove the original samples, and
# replace by the mean value), or 'full' (pass all the data to the BO function
# as such - the BO function takes into account every sample). This will treat
# only duplicates in each round, not between the rounds. Default value is 'full'.
duplicate_operation = 'full'
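# For example, with duplicate_operation = 'mean', two rows sharing the same
# composition with merit values 0.2 and 0.4 (hypothetical values) would be
# replaced by a single row with merit 0.3.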
###############################################################################
# Collect the data and compute the figure of merit.
rounds = len(folders)
df_compositions = [None for j in range(rounds)]
mean_RGB = [None for j in range(rounds)]
red = [None for j in range(rounds)]
blue = [None for j in range(rounds)]
green = [None for j in range(rounds)]
times = [None for j in range(rounds)]
merit_area = [None for j in range(rounds)]
merit_diff = [None for j in range(rounds)]
merit_inv_moment = [None for j in range(rounds)]
degradation_input = [None for j in range(rounds)]
compositions_input = [None for j in range(rounds)]
for k in range(rounds):
    df_compositions[k] = pd.read_csv(folders[k] + 'Samples.csv')
    # Import RGB data and sample details.
    mean_RGB[k] = RGB_data(folders[k], df_compositions[k].get("Sample"), cutoff, is_calibrated, is_rgb)
    # Get dataframes with sample as level.
    red[k], blue[k], green[k], times[k] = mean_RGB[k].preprocess()
    # Compute the figures of merit; each call returns a dataframe.
    merit_area[k] = mean_RGB[k].compute_degradation(method = 'area')
    merit_diff[k] = mean_RGB[k].compute_degradation(method = 'diff_area')
    merit_inv_moment[k] = mean_RGB[k].compute_degradation(method = 'inverted_moment')
    # Pick the merit that looks best; in this case we use merit_diff.
    # Drop the empty slots and prepare the data to be fed into GPyOpt.
    print('Round ' + str(k) + ':')
    print('These slots are empty:')
    print(merit_diff[k][merit_diff[k]['Sample'].astype(str).str.contains(indicator_for_empty)])
    degradation_input[k] = merit_diff[k][~merit_diff[k]['Sample'].astype(str).str.contains(indicator_for_empty)]
    compositions_input[k] = df_compositions[k][~df_compositions[k]['Sample'].astype(str).str.contains(indicator_for_empty)]
    # Create dataframes to report the comparison between the methods.
    merit_diff[k] = merit_diff[k][~merit_diff[k]['Sample'].astype(str).str.contains(indicator_for_empty)]
    merit_inv_moment[k] = merit_inv_moment[k][~merit_inv_moment[k]['Sample'].astype(str).str.contains(indicator_for_empty)]
    merit_area[k] = merit_area[k][~merit_area[k]['Sample'].astype(str).str.contains(indicator_for_empty)]
os.chdir(original_folder)
###############################################################################
# These variables are related to the Bayesian optimization.
num_cores = 1 # Not a parallel run
composition_total = [0.995, 1] # The sum of the amounts of the materials will be
# limited to this range. If you need everything to sum up to 100% within
# 1%-unit accuracy, choose [0.995, 1] (the default). If the code runs too long
# with this value, choose [0.99, 1] or an even wider range. The code currently
# works only for exactly three materials.
tolerance_factor_bound = 0.80 # The tolerance factor will be limited to values above this bound.
tolerance_factor_function = tolerance_factor(suggestion_df = None,
                                             tolerance_factor_bound = tolerance_factor_bound)
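# With the default bound of 0.80, tolerance_factor_function evaluates to the string:
# '0.8 - ((220+(167*x[:,0] + 217*x[:,1] + 253*x[:,2]))/(1.4142*(119+220)))'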
jitter = 0.01 # The level of exploration.
###############################################################################
#%%
# BEGIN BAYESIAN OPTIMIZATION
# Define the variables and the domain for each.
# One can also define material- or round-specific parameters here.
bounds = [None for j in range(len(materials))]
for j in range(len(materials)):
    bounds[j] = {'name': materials[j], 'type': 'continuous', 'domain': (0,1)}
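# For example, bounds[0] evaluates to
# {'name': 'CsPbI', 'type': 'continuous', 'domain': (0,1)}.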
X_rounds = [None for j in range(rounds)]
Y_rounds = [None for j in range(rounds)]
X_step = [np.empty((0,len(materials))) for j in range(rounds)]
Y_step = [np.empty((0,1)) for j in range(rounds)] # Change (0,1) to (0,2) if multiobjective
batch_size = [None for j in range(rounds)]
constraints = [None for j in range(rounds)]
for k in range(rounds):
    batch_size[k] = len(degradation_input[0])
    # The batch size, i.e., the number of suggestions the algorithm gives, is
    # the same as the number of samples that were degraded in the first round.
    constraints[k] = [{'name': 'constr_1', 'constraint': 'x[:,0] + x[:,1] + x[:,2] - ' + str(composition_total[1])},
                      {'name': 'constr_2', 'constraint': str(composition_total[0]) + ' - x[:,0] - x[:,1] - x[:,2]'},
                      {'name': 'constr_3', 'constraint': tolerance_factor_function}]
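    # GPyOpt treats a constraint as satisfied when its expression is <= 0, so
    # constr_1 enforces x0 + x1 + x2 <= composition_total[1], constr_2 enforces
    # x0 + x1 + x2 >= composition_total[0], and constr_3 keeps the tolerance
    # factor above tolerance_factor_bound.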
    # These lines perform the selected operation on the duplicate samples
    # (choose the first, choose the last, take the mean, or pass all data as is).
    df = compositions_input[k].copy()
    df['Merit'] = degradation_input[k]['Merit'].values
    if duplicate_operation == 'first':
        df = df.drop_duplicates(subset=materials, keep='first').reset_index()
    elif duplicate_operation == 'last':
        df = df.drop_duplicates(subset=materials, keep='last').reset_index()
    elif duplicate_operation == 'full':
        df = df
    elif duplicate_operation == 'mean':
        df = df.groupby(materials).mean().reset_index()
    else:
        raise Exception('The given value for treating duplicate samples is not valid. Give a valid value.')
    # X is a vector of compositions, Y is a vector of merit values.
    X_rounds[k] = df[materials].values
    # Reshaping is done to go from (n,) to (n,1), which is required by GPyOpt.
    Y_rounds[k] = np.reshape(df['Merit'].values, (df['Merit'].values.shape[0], 1))
    # For each Bayesian optimization round, we include only the data that has
    # been collected by that time.
    for j in range(rounds):
        if j >= k:
            X_step[j] = np.append(X_step[j], X_rounds[k], axis=0)
            Y_step[j] = np.append(Y_step[j], Y_rounds[k], axis=0)
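# After the loop above, X_step[j] and Y_step[j] hold the cumulative data from
# rounds 0, ..., j; e.g., X_step[2] stacks the compositions of rounds 0-2.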
# Do the Bayesian Optimization.
x_next = [None for k in range(rounds)]
suggestion_df = [None for k in range(rounds)]
BO_batch = [None for k in range(rounds)]
# These files contain DFT data that is integrated into the optimization loop as
# a soft constraint.
DFT_files = ['/phasestability/CsFA/fulldata/CsFA_T300_above.csv',
             '/phasestability/FAMA/fulldata/FAMA_T300_above.csv',
             '/phasestability/CsMA/fulldata/CsMA_T300_above.csv']
for n in range(len(DFT_files)):
    DFT_files[n] = original_folder + DFT_files[n]
for i in range(rounds):
    print('X_step and Y_step for round ' + str(i) + ':', X_step[i], Y_step[i])
    # Instantiate the Bayesian optimization object. f = None because we use
    # tabular data instead of an analytical function.
    BO_batch[i] = GPyOpt.methods.BayesianOptimization(f = None,
                                                      domain = bounds,
                                                      constraints = constraints[i],
                                                      acquisition_type = 'EI_DFT',
                                                      files = DFT_files,
                                                      normalize_Y = True,
                                                      X = X_step[i],
                                                      Y = Y_step[i],
                                                      evaluator_type = 'local_penalization',
                                                      batch_size = batch_size[i],
                                                      acquisition_jitter = jitter)
    # Suggest the next points (samples to prepare).
    x_next[i] = BO_batch[i].suggest_next_locations()
    suggestion_df[i] = pd.DataFrame(x_next[i], columns = materials)
    suggestion_df[i]['Total'] = suggestion_df[i].sum(axis = 1)
    suggestion_df[i]['Tolerance Factor'] = tolerance_factor(
        suggestion_df = suggestion_df[i],
        tolerance_factor_bound = None)
# Plot and save the results.
plotBO(rounds, suggestion_df, compositions_input, degradation_input, BO_batch, materials, X_rounds, x_next, Y_step, X_step)
print('Results are saved into folder ./Results.')
# Save the model as a backup.
# dbfile = open('Backup-model-{date:%Y%m%d%H%M%S}'.format(date=datetime.datetime.now()), 'ab')
# pickle.dump([BO_batch, suggestion_df, x_next, X_rounds, Y_rounds], dbfile)
# dbfile.close()
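# A sketch for restoring such a backup later (the file name below is
# hypothetical; use the timestamped name produced above):
# with open('Backup-model-20190809120000', 'rb') as dbfile:
#     BO_batch, suggestion_df, x_next, X_rounds, Y_rounds = pickle.load(dbfile)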