|
7 | 7 | from scipy.stats import norm
|
8 | 8 | from sklearn.metrics import mean_squared_error
|
9 | 9 | from sklearn.model_selection import train_test_split
|
10 |
| - |
| 10 | +import numpy as np |
11 | 11 | from .CompositeModel import CompositeModel
|
12 | 12 | from .MLUtils import MlModels
|
13 | 13 |
|
@@ -127,63 +127,113 @@ def custom_mutation_add_model(graph: CompositeModel, **kwargs):
|
127 | 127 | return graph
|
128 | 128 |
|
129 | 129 |
|
130 |
def composite_metric(graph: CompositeModel, data: pd.DataFrame):
    """Score a composite model by held-out log-likelihood (lower is better).

    Splits *data* 80/20 with a fixed seed, then for every node in *graph*
    accumulates the log-likelihood of the test rows:

    * root nodes ("cont"): Gaussian fitted to the training column;
    * root nodes (discrete): training-split category frequencies;
    * child nodes: the node's ``parent_model`` fitted on the parents'
      training columns, scored via ``norm.logpdf`` around the predictions
      (continuous) or ``predict_proba`` (discrete).

    Returns the *negative* total log-likelihood so the value can be used
    directly as a minimization objective.
    """
    data_train, data_test = train_test_split(data, train_size=0.8, random_state=42)
    score = 0
    len_data = len(data_train)

    for node in graph.nodes:
        node_name = node.content["name"]
        node_type = node.content["type"]

        data_of_node_train = data_train[node_name]
        data_of_node_test = data_test[node_name]

        if not node.nodes_from:  # root node: no parents
            if node_type == "cont":
                mu, sigma = data_of_node_train.mean(), data_of_node_train.std()
                score += norm.logpdf(data_of_node_test, loc=mu, scale=sigma).sum()
            else:
                count = data_of_node_train.value_counts()
                log_frequency = np.log(count / len_data)
                # Categories absent from the training split get the same
                # 1e-7 probability floor as the classifier branch below.
                # NOTE: the fill value must be a LOG-probability — filling
                # with the raw 1e-7 would credit unseen categories with
                # near-certain probability.
                score += data_of_node_test.map(log_frequency).fillna(np.log(1e-7)).sum()
        else:
            model = MlModels().dict_models[node.content["parent_model"]]()
            model.max_iter = 100000

            columns = [n.content["name"] for n in node.nodes_from]
            features_train = data_train[columns].to_numpy()
            target_train = data_of_node_train.to_numpy()

            # A constant target cannot be fitted meaningfully; skip the node.
            if len(set(target_train)) == 1:
                continue

            fitted_model = model.fit(features_train, target_train)

            features_test = data_test[columns].to_numpy()
            target_test = data_of_node_test.to_numpy()

            if node_type == "cont":
                predictions = fitted_model.predict(features_test)
                # RMSE as the noise scale; the epsilon keeps logpdf finite
                # on a perfect fit.
                rmse = mean_squared_error(target_test, predictions, squared=False) + 1e-7
                score += norm.logpdf(target_test, loc=predictions, scale=rmse).sum()
            else:
                predict_proba = fitted_model.predict_proba(features_test)
                # Map each test label to its predict_proba column using the
                # fitted model's own class ordering (``classes_``) rather
                # than assuming it matches sorted training uniques. Labels
                # unseen during training fall back to the 1e-7 probability
                # floor instead of raising KeyError.
                class_index = {c: i for i, c in enumerate(fitted_model.classes_)}
                cols = np.array([class_index.get(x, -1) for x in target_test])
                rows = np.arange(len(target_test))
                picked = predict_proba[rows, np.where(cols >= 0, cols, 0)]
                probas = np.maximum(np.where(cols >= 0, picked, 0.0), 1e-7)
                score += np.log(probas).sum()

    return -score
|
| 179 | + |
| 180 | +# def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02): |
| 181 | +# data_all = data |
| 182 | +# data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42) |
| 183 | +# score, len_data = 0, len(data_train) |
| 184 | +# for node in graph.nodes: |
| 185 | +# data_of_node_train = data_train[node.content["name"]] |
| 186 | +# data_of_node_test = data_test[node.content["name"]] |
| 187 | +# if node.nodes_from is None or node.nodes_from == []: |
| 188 | +# if node.content["type"] == "cont": |
| 189 | +# mu, sigma = mean(data_of_node_train), std(data_of_node_train) |
| 190 | +# score += norm.logpdf( |
| 191 | +# data_of_node_test.values, loc=mu, scale=sigma |
| 192 | +# ).sum() |
| 193 | +# else: |
| 194 | +# count = data_of_node_train.value_counts() |
| 195 | +# frequency = log(count / len_data) |
| 196 | +# index = frequency.index.tolist() |
| 197 | +# for value in data_of_node_test: |
| 198 | +# if value in index: |
| 199 | +# score += frequency[value] |
| 200 | +# else: |
| 201 | +# model, columns, target, idx = ( |
| 202 | +# MlModels().dict_models[node.content["parent_model"]](), |
| 203 | +# [n.content["name"] for n in node.nodes_from], |
| 204 | +# data_of_node_train.to_numpy(), |
| 205 | +# data_train.index.to_numpy(), |
| 206 | +# ) |
| 207 | +# setattr(model, "max_iter", 100000) |
| 208 | +# features = data_train[columns].to_numpy() |
| 209 | +# if len(set(target)) == 1: |
| 210 | +# continue |
| 211 | +# fitted_model = model.fit(features, target) |
| 212 | + |
| 213 | +# features = data_test[columns].to_numpy() |
| 214 | +# target = data_of_node_test.to_numpy() |
| 215 | +# if node.content["type"] == "cont": |
| 216 | +# predict = fitted_model.predict(features) |
| 217 | +# mse = mean_squared_error(target, predict, squared=False) + 0.0000001 |
| 218 | +# a = norm.logpdf(target, loc=predict, scale=mse) |
| 219 | +# score += a.sum() |
| 220 | +# else: |
| 221 | +# predict_proba = fitted_model.predict_proba(features) |
| 222 | +# idx = pd.array(list(range(len(target)))) |
| 223 | +# li = [] |
| 224 | + |
| 225 | +# for i in idx: |
| 226 | +# a = predict_proba[i] |
| 227 | +# try: |
| 228 | +# b = a[target[i]] |
| 229 | +# except BaseException: |
| 230 | +# b = 0.0000001 |
| 231 | +# if b < 0.0000001: |
| 232 | +# b = 0.0000001 |
| 233 | +# li.append(log(b)) |
| 234 | +# score += sum(li) |
| 235 | + |
| 236 | +# # edges_count = len(graph.get_edges()) |
| 237 | +# # score -= (edges_count * percent) * log10(len_data) * edges_count |
| 238 | + |
| 239 | +# return -score |
0 commit comments