forked from ygxw0909/MST-SQL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluator.py
102 lines (85 loc) · 4.31 KB
/
evaluator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import numpy as np
import utils
from modeling.base_model import BaseModel
from modeling.model_factory import create_model
from featurizer import InputFeature, HydraFeaturizer, SQLDataset
class HydraEvaluator():
def __init__(self, output_path, config, hydra_featurizer: HydraFeaturizer, model:BaseModel, note=""):
self.config = config
self.model = model
self.eval_history_file = os.path.join(output_path, "eval.log")
self.bad_case_dir = os.path.join(output_path, "bad_cases")
if "DEBUG" not in config:
os.mkdir(self.bad_case_dir)
with open(self.eval_history_file, "w", encoding="utf8") as f:
f.write(note.rstrip() + "\n")
self.eval_data = {}
for eval_path in config["dev_data_path"].split("|") + config["test_data_path"].split("|"):
eval_data = SQLDataset(eval_path, config, hydra_featurizer, True)
self.eval_data[os.path.basename(eval_path)] = eval_data
print("Eval Data file {0} loaded, sample num = {1}".format(eval_path, len(eval_data)))
def _eval_imp(self, eval_data: SQLDataset, get_sq=True):
items = ["overall", "agg", "sel", "wn", "wc", "op", "val"]
acc = {k:0.0 for k in items}
sq = []
cnt = 0
model_outputs = self.model.dataset_inference(eval_data)
for input_feature, model_output in zip(eval_data.input_features, model_outputs):
cur_acc = {k:1 for k in acc if k != "overall"}
select_label = np.argmax(input_feature.select)
agg_label = input_feature.agg[select_label]
wn_label = input_feature.where_num[0]
wc_label = [i for i, w in enumerate(input_feature.where) if w == 1]
agg, select, where, conditions, _ = self.model.parse_output(input_feature, model_output, wc_label)
if agg != agg_label:
cur_acc["agg"] = 0
if select != select_label:
cur_acc["sel"] = 0
if len(where) != wn_label:
cur_acc["wn"] = 0
if set(where) != set(wc_label):
cur_acc["wc"] = 0
for w in wc_label:
_, op, vs, ve = conditions[w]
if op != input_feature.op[w]:
cur_acc["op"] = 0
if vs != input_feature.value_start[w] or ve != input_feature.value_end[w]:
cur_acc["val"] = 0
for k in cur_acc:
acc[k] += cur_acc[k]
all_correct = 0 if 0 in cur_acc.values() else 1
acc["overall"] += all_correct
if ("DEBUG" in self.config or get_sq) and not all_correct:
try:
true_sq = input_feature.output_SQ()
pred_sq = input_feature.output_SQ(agg=agg, sel=select, conditions=[conditions[w] for w in where])
task_cor_text = "".join([str(cur_acc[k]) for k in items if k in cur_acc])
sq.append([str(cnt), input_feature.question, "|".join([task_cor_text, pred_sq, true_sq])])
except:
pass
cnt += 1
result_str = []
for item in items:
result_str.append(item + ":{0:.1f}".format(acc[item] * 100.0 / cnt))
result_str = ", ".join(result_str)
return result_str, sq, acc["overall"] * 100.0 / cnt
def eval(self, epochs):
print(self.bad_case_dir)
overall_acc = 0.0
for eval_file in self.eval_data:
result_str, sq, acc = self._eval_imp(self.eval_data[eval_file])
overall_acc += acc
print(eval_file + ": " + result_str)
if "DEBUG" in self.config:
for text in sq:
print(text[0] + ":" + text[1] + "\t" + text[2])
else:
with open(self.eval_history_file, "a+", encoding="utf8") as f:
f.write("[{0}, epoch {1}] ".format(eval_file, epochs) + result_str + "\n")
bad_case_file = os.path.join(self.bad_case_dir,
"{0}_epoch_{1}.log".format(eval_file, epochs))
with open(bad_case_file, "w", encoding="utf8") as f:
for text in sq:
f.write(text[0] + ":" + text[1] + "\t" + text[2] + "\n")
return float(overall_acc / len(self.eval_data.keys()))