Commit 6d6072e

change loss function for composite bn
1 parent 75b370f

File tree

1 file changed (+104, -54)

bamt/utils/composite_utils/CompositeGeneticOperators.py

@@ -7,7 +7,7 @@
 from scipy.stats import norm
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import train_test_split
-
+import numpy as np
 from .CompositeModel import CompositeModel
 from .MLUtils import MlModels
 
@@ -127,63 +127,113 @@ def custom_mutation_add_model(graph: CompositeModel, **kwargs):
     return graph
 
 
-def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02):
-    data_all = data
-    data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42)
-    score, len_data = 0, len(data_train)
+
+def composite_metric(graph: CompositeModel, data: pd.DataFrame):
+    data_train, data_test = train_test_split(data, train_size=0.8, random_state=42)
+    score = 0
+    len_data = len(data_train)
+
     for node in graph.nodes:
-        data_of_node_train = data_train[node.content["name"]]
-        data_of_node_test = data_test[node.content["name"]]
-        if node.nodes_from is None or node.nodes_from == []:
-            if node.content["type"] == "cont":
-                mu, sigma = mean(data_of_node_train), std(data_of_node_train)
-                score += norm.logpdf(
-                    data_of_node_test.values, loc=mu, scale=sigma
-                ).sum()
+        node_name = node.content["name"]
+        node_type = node.content["type"]
+
+        data_of_node_train = data_train[node_name]
+        data_of_node_test = data_test[node_name]
+        index_test_dict = {k:value for value, k in enumerate(sorted(data_train[node_name].unique()))}
+
+        if not node.nodes_from:
+            if node_type == "cont":
+                mu, sigma = data_of_node_train.mean(), data_of_node_train.std()
+                score += norm.logpdf(data_of_node_test, loc=mu, scale=sigma).sum()
             else:
                 count = data_of_node_train.value_counts()
-                frequency = log(count / len_data)
-                index = frequency.index.tolist()
-                for value in data_of_node_test:
-                    if value in index:
-                        score += frequency[value]
+                frequency = np.log(count / len_data)
+                score += data_of_node_test.map(frequency).fillna(1e-7).sum()
         else:
-            model, columns, target, idx = (
-                MlModels().dict_models[node.content["parent_model"]](),
-                [n.content["name"] for n in node.nodes_from],
-                data_of_node_train.to_numpy(),
-                data_train.index.to_numpy(),
-            )
-            setattr(model, "max_iter", 100000)
-            features = data_train[columns].to_numpy()
-            if len(set(target)) == 1:
+            parent_model = MlModels().dict_models[node.content["parent_model"]]
+            model = parent_model()
+            model.max_iter = 100000
+
+            columns = [n.content["name"] for n in node.nodes_from]
+            features_train = data_train[columns].to_numpy()
+            target_train = data_of_node_train.to_numpy()
+
+            if len(set(target_train)) == 1:
                 continue
-            fitted_model = model.fit(features, target)
-
-            features = data_test[columns].to_numpy()
-            target = data_of_node_test.to_numpy()
-            if node.content["type"] == "cont":
-                predict = fitted_model.predict(features)
-                mse = mean_squared_error(target, predict, squared=False) + 0.0000001
-                a = norm.logpdf(target, loc=predict, scale=mse)
-                score += a.sum()
+
+            fitted_model = model.fit(features_train, target_train)
+
+            features_test = data_test[columns].to_numpy()
+            target_test = data_of_node_test.to_numpy()
+
+            if node_type == "cont":
+                predictions = fitted_model.predict(features_test)
+                mse = mean_squared_error(target_test, predictions, squared=False) + 1e-7
+                score += norm.logpdf(target_test, loc=predictions, scale=mse).sum()
             else:
-                predict_proba = fitted_model.predict_proba(features)
-                idx = pd.array(list(range(len(target))))
-                li = []
-
-                for i in idx:
-                    a = predict_proba[i]
-                    try:
-                        b = a[target[i]]
-                    except BaseException:
-                        b = 0.0000001
-                    if b < 0.0000001:
-                        b = 0.0000001
-                    li.append(log(b))
-                score += sum(li)
-
-    edges_count = len(graph.get_edges())
-    score -= (edges_count * percent) * log10(len_data) * edges_count
-
+                predict_proba = fitted_model.predict_proba(features_test)
+                probas = np.maximum(predict_proba[range(len(target_test)), [index_test_dict[x] for x in target_test]], 1e-7)
+                score += np.log(probas).sum()
+
     return -score
+
+# def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02):
+#     data_all = data
+#     data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42)
+#     score, len_data = 0, len(data_train)
+#     for node in graph.nodes:
+#         data_of_node_train = data_train[node.content["name"]]
+#         data_of_node_test = data_test[node.content["name"]]
+#         if node.nodes_from is None or node.nodes_from == []:
+#             if node.content["type"] == "cont":
+#                 mu, sigma = mean(data_of_node_train), std(data_of_node_train)
+#                 score += norm.logpdf(
+#                     data_of_node_test.values, loc=mu, scale=sigma
+#                 ).sum()
+#             else:
+#                 count = data_of_node_train.value_counts()
+#                 frequency = log(count / len_data)
+#                 index = frequency.index.tolist()
+#                 for value in data_of_node_test:
+#                     if value in index:
+#                         score += frequency[value]
+#         else:
+#             model, columns, target, idx = (
+#                 MlModels().dict_models[node.content["parent_model"]](),
+#                 [n.content["name"] for n in node.nodes_from],
+#                 data_of_node_train.to_numpy(),
+#                 data_train.index.to_numpy(),
+#             )
+#             setattr(model, "max_iter", 100000)
+#             features = data_train[columns].to_numpy()
+#             if len(set(target)) == 1:
+#                 continue
+#             fitted_model = model.fit(features, target)
+
+#             features = data_test[columns].to_numpy()
+#             target = data_of_node_test.to_numpy()
+#             if node.content["type"] == "cont":
+#                 predict = fitted_model.predict(features)
+#                 mse = mean_squared_error(target, predict, squared=False) + 0.0000001
+#                 a = norm.logpdf(target, loc=predict, scale=mse)
+#                 score += a.sum()
+#             else:
+#                 predict_proba = fitted_model.predict_proba(features)
+#                 idx = pd.array(list(range(len(target))))
+#                 li = []

+#                 for i in idx:
+#                     a = predict_proba[i]
+#                     try:
+#                         b = a[target[i]]
+#                     except BaseException:
+#                         b = 0.0000001
+#                     if b < 0.0000001:
+#                         b = 0.0000001
+#                     li.append(log(b))
+#                 score += sum(li)

+#     # edges_count = len(graph.get_edges())
+#     # score -= (edges_count * percent) * log10(len_data) * edges_count

+#     return -score
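The two discrete-node changes are the heart of the new loss: the per-value frequency loop becomes a vectorized map/fillna, and the per-row predict_proba try/except loop becomes one fancy-indexing expression keyed by index_test_dict. Below is a minimal standalone sketch of both tricks; the toy train/test DataFrames, the "y" column, and the LogisticRegression stand-in are invented for illustration and are not part of the commit.

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(42)
train = pd.DataFrame({"x": rng.normal(size=100),
                      "y": rng.choice(["a", "b", "c"], size=100)})
test = pd.DataFrame({"x": rng.normal(size=40),
                     "y": rng.choice(["a", "b", "c"], size=40)})

# Leaf discrete node: map each test value to its train log-frequency in one
# vectorized call; categories unseen in training fall back to the 1e-7 floor.
frequency = np.log(train["y"].value_counts() / len(train))
leaf_score = test["y"].map(frequency).fillna(1e-7).sum()

# Discrete node with parents: pick each test row's predicted probability of
# its true class via fancy indexing instead of a try/except loop.
# index_test_dict maps class labels to predict_proba column positions;
# sklearn orders those columns by sorted label, which sorted(unique())
# reproduces as long as the training split covers every label.
index_test_dict = {k: v for v, k in enumerate(sorted(train["y"].unique()))}
model = LogisticRegression().fit(train[["x"]], train["y"])
proba = model.predict_proba(test[["x"]])
cols = [index_test_dict[label] for label in test["y"]]
probas = np.maximum(proba[range(len(test)), cols], 1e-7)
node_score = np.log(probas).sum()

One detail worth noting: fillna(1e-7) applies the floor on the probability scale while the mapped values are log-frequencies, which mirrors the commit's behavior exactly.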
