
Commit 0a0e02e

add gpt-4 eval code
1 parent dcbca86 commit 0a0e02e

File tree

10 files changed (+4059 -0 lines)


evaluation_set/flask_evaluation.jsonl (+1,700 lines)
Large diffs are not rendered by default.

evaluation_set/flask_hard_evaluation.jsonl (+65 lines)
Large diffs are not rendered by default.
(+94 lines)

@@ -0,0 +1,94 @@
import json
import copy
import csv
from collections import OrderedDict
import argparse

total_score = {}
max_score = {}
min_score = {}
review = []
author = []

parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-m', '--file', default='fig1')
args = parser.parse_args()

file = args.file

# Per-question review records (one JSON object per line, each with a "score" dict).
with open(file, 'r') as f:
    for line in f:
        review.append(json.loads(line))

# Matching evaluation set with the annotated difficulty level for each instruction.
difficulty = []
with open('../evaluation_set/flask_evaluation.jsonl', 'r') as f2:
    for line in f2:
        difficulty.append(json.loads(line))


# [sum of scores, count] accumulator for each difficulty level.
init = {
    "simple lifestyle knowledge": [0, 0],
    "advanced lifestyle knowledge": [0, 0],
    "formal education knowledge": [0, 0],
    "major level knowledge": [0, 0],
    "expert level knowledge": [0, 0]
}


difficulty_dict = {}
cnt = 0
for index, item in enumerate(review):

    level = difficulty[index]["difficulty_labeled"]
    if len(item["score"]) != 3:
        print("length issue!!!", item["score"], item)
    for key, score in item["score"].items():

        # Reduce each returned key to a bare skill name: 'logical <skill>' keys use
        # the second word, every other key uses the first word.
        if key.split(' ')[0] not in difficulty_dict.keys():
            if 'logical' in key:
                # print(key)
                try:
                    if key.split(' ')[1] not in difficulty_dict:
                        print(key)
                        difficulty_dict[key.split(' ')[1]] = copy.deepcopy(init)
                except:
                    print(file, key, index)
            else:
                print(key)
                difficulty_dict[key.split(' ')[0]] = copy.deepcopy(init)
        if item["score"][key] == "N/A":
            cnt += 1
        else:
            if 'logical' in key:
                try:
                    difficulty_dict[key.split(' ')[1]][str(level)][0] += float(score)
                    difficulty_dict[key.split(' ')[1]][str(level)][1] += 1
                except:
                    print(file)
            else:
                try:
                    difficulty_dict[key.split(' ')[0]][str(level)][0] += float(score)
                    difficulty_dict[key.split(' ')[0]][str(level)][1] += 1
                except:
                    print(file)

key_order = ["robustness", "correctness", "efficiency", "factuality", "commonsense", "comprehension", "insightfulness", "completeness", "metacognition", "readability", "conciseness", "harmlessness"]
ordered_dict = OrderedDict((key, difficulty_dict[key]) for key in key_order if key in difficulty_dict)

# For each skill, write one row per difficulty level: score sum, count, per-level
# average, and cumulative average over the levels written so far.
file_name = file.split('/')[1].split('.jsonl')[0]
with open("outputs/stats/" + file_name + ".csv", "w") as f1:
    write = csv.writer(f1)
    write.writerow(["score", "count", "avg", "cumu"])
    for key, dict in ordered_dict.items():
        sum_score = 0
        sum_count = 0
        for level, list in dict.items():
            sum_score += list[0]
            sum_count += list[1]
            if list[1] != 0:
                write.writerow([str(list[0]), str(list[1]), str(list[0]/list[1]), str(sum_score/sum_count)])
            elif sum_count == 0:
                write.writerow([str(list[0]), str(list[1]), "N/A", "N/A"])
            else:
                write.writerow([str(list[0]), str(list[1]), "N/A", str(sum_score/sum_count)])
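
For orientation, a minimal sketch of the record shapes this stats script expects; the field names ("score", "difficulty_labeled", and the five difficulty buckets in init) come from the code above, while the concrete values are invented for illustration.

# Illustrative only: synthetic examples of one line of the review file passed via -m/--file
# and the matching line of ../evaluation_set/flask_evaluation.jsonl. Values are made up.
import json

review_line = {
    # exactly three skill scores per question; "N/A" values are skipped and counted in cnt
    "score": {"logical robustness": 4, "logical correctness": 5, "logical efficiency": "N/A"}
}
difficulty_line = {
    # one of the five keys in init, used to bucket scores by difficulty level
    "difficulty_labeled": "formal education knowledge"
}
print(json.dumps(review_line))
print(json.dumps(difficulty_line))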

gpt_review/gpt4_eval.py (+198 lines)
@@ -0,0 +1,198 @@
import sys
sys.path.append("../")
import argparse
import json
import os

import shortuuid
import logging
from openai_concurrent import OpenAIChatCompletionConcurrent
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import re
import ast


def parse_score(review, num):
    # Extract the last {...} block from the review text and normalize it into a
    # {skill name: score} dictionary; returns {} if nothing parseable is found.
    try:
        match = re.findall(r'{[^}]+}', review)
        if len(match) > 0:
            dictionary_part = match[-1].replace("\n", "").replace('_', " ").lower()
            lines = ast.literal_eval(dictionary_part)
            for key, value in lines.items():
                if value == 'na':
                    lines[key] = 'N/A'
                elif value == 'n/a':
                    lines[key] = 'N/A'
                elif value == 'not applicable':
                    lines[key] = 'N/A'
            return lines
        else:
            return {}

    except Exception as e:
        logger.error(f'{e}\nContent: {review}\n'
                     'You must manually fix the score pair.')
        return {}


def gen_prompt(reviewer_jsons, prompt_jsons, skills_jsons, response, item):
    reviewer_idx = 1
    prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
    prompt_json = prompt_jsons[prompt_id - 1]
    assert prompt_json['prompt_id'] == prompt_id

    sys_prompt = prompt_json['system_prompt']
    prompt_template = prompt_json['prompt_template']
    defaults = prompt_json['defaults']

    # skills = metrics: append the criteria and 1-5 scoring rubric for every skill
    # annotated on this question.
    skills = ""
    metric_list = item["metrics"]
    for label in metric_list:
        for skill in skills_jsons:
            if label in skill["Skill"]:
                name = skill["Skill"]
                criteria = skill["Criteria"]
                skills += f"\n{name}: {criteria}"
                scoring = skill["Scoring"]
                skills += f"\nScore 1: {scoring['1']}"
                skills += f"\nScore 2: {scoring['2']}"
                skills += f"\nScore 3: {scoring['3']}"
                skills += f"\nScore 4: {scoring['4']}"
                skills += f"\nScore 5: {scoring['5']}\n\n"
                break
    prompt = prompt_template.format(question=item["text"], response=response, skills=skills, num=3, sample_answer=item["answer"], **defaults)
    print("@@@", prompt)
    return sys_prompt, prompt


def get_json_list(file_path):
    file_path = os.path.expanduser(file_path)
    file_extension = file_path.split('.')[-1]
    if file_extension == "jsonl":
        with open(file_path, 'r') as f:
            json_list = []
            for line in f:
                json_list.append(json.loads(line))
            return json_list
    else:
        with open(file_path, 'r') as f:
            return json.load(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-k', '--key-file', default='../openai_info/api_info.json')
    parser.add_argument('-q', '--question-file', default='../evaluation_set/flask_evaluation.jsonl')
    parser.add_argument('-s', '--skillset-file', default='../skillset_label/src/skillset.json')
    parser.add_argument('-a', '--answer-file', default='../model_output/outputs/chatgpt.jsonl')
    parser.add_argument('-p', '--prompt-file', default='src/ver3/prompt.jsonl')
    parser.add_argument('-r', '--reviewer-file', default='src/ver3/reviewer.jsonl')
    parser.add_argument('-o', '--output-review-file', default='outputs/chatgpt_review.jsonl')
    parser.add_argument('-e', '--output-error-file', default='outputs/chatgpt_review_error.jsonl')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    key_jsons = get_json_list(args.key_file)
    question_jsons = get_json_list(args.question_file)
    skills_jsons = get_json_list(args.skillset_file)
    answer_jsons = get_json_list(args.answer_file)
    reviewer_jsons = get_json_list(args.reviewer_file)
    prompt_jsons = get_json_list(args.prompt_file)

    handles = []
    review_jsons = []
    total_len = len(question_jsons)
    question_idx_list = list(range(total_len))
    question_copy = []
    answer_copy = []

    # Build one GPT-4 review request per question, pairing it with the model answer.
    requests = []
    for i in question_idx_list:
        for row in answer_jsons:
            if row.get('question_id') == question_jsons[i]['question_id']:
                answer_elem = row
                break
        answer_copy.append(answer_elem)
        assert answer_copy[i]['question_id'] == question_jsons[i]['question_id']
        question_copy.append(question_jsons[i])
        sys_prompt, prompt = gen_prompt(reviewer_jsons, prompt_jsons, skills_jsons, answer_copy[i]["text"], question_jsons[i])
        review_id = shortuuid.uuid()
        review_jsons.append({
            'review_id': review_id,
            'question_id': question_jsons[i]['question_id'],
            'metadata': {},
        })
        requests.append(
            {
                'review_id': review_id,
                'question_id': question_jsons[i]['question_id'],
                'metadata': {},
                'request': {
                    "model": "gpt-4-0613",
                    "messages": [
                        {
                            'role': 'system',
                            'content': sys_prompt
                        },
                        {
                            'role': 'user',
                            'content': prompt,
                        }
                    ]
                },
                # setting temperature 0 for reproducibility
                "temperature": 0,
                "max_tokens": args.max_tokens
            }
        )

    openai_concurrent = OpenAIChatCompletionConcurrent(api_keys=key_jsons["api_keys"], requests_per_minute=60, expected_response_seconds=5)
    responses, fails = openai_concurrent.create_many(requests)

    reviews = [response['response']['choices'][0]['message']['content'] for response in responses]
    total_tokens = [response['response']['usage']['total_tokens'] for response in responses]
    print("total_token:", sum(total_tokens))

    # Log failed requests to the error file and drop those questions from the output.
    delete_index = []
    if len(fails) > 0:
        with open(f'{args.output_error_file}', 'w') as output_error_file:
            try:
                for idx, fail in enumerate(fails):
                    print("fail:", fail)
                    for index, item in enumerate(question_copy):
                        if int(item.get("question_id")) == int(fail['question_id']):
                            delete_elem_idx = index
                    delete_index.append(delete_elem_idx)
                    output_error_file.write(json.dumps(question_copy[delete_elem_idx]) + '\n')
            except:
                print("@@@", delete_index)
                delete_index = []

    print("$$$", delete_index)
    question_copy = [item for index, item in enumerate(question_copy) if index not in delete_index]

    output_review_directory = os.path.dirname(args.output_review_file)

    if not os.path.exists(output_review_directory):
        os.makedirs(output_review_directory)

    # Attach the raw review text and the parsed scores to each question record.
    with open(f'{args.output_review_file}', 'w') as output_review_file:
        for idx, review in enumerate(reviews):
            num = 3
            scores = parse_score(review, num)
            review_jsons[idx] = question_copy[idx]
            for row in answer_jsons:
                if row.get('question_id') == question_copy[idx]['question_id']:
                    review_jsons[idx]['target_txt'] = row["text"]
            review_jsons[idx]['review'] = review
            review_jsons[idx]['score'] = scores
            review_jsons[idx]['total_tokens_step4'] = total_tokens[idx]
            try:
                output_review_file.write(json.dumps(review_jsons[idx]) + '\n')
            except Exception as e:
                output_review_file.write('\n')
                print(review_jsons[idx]['question_id'])
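
As a quick sanity check of parse_score above, the sketch below feeds it an invented review that ends with the kind of dictionary the evaluation prompt asks for. It assumes gpt4_eval.py is importable (i.e. it is run from the gpt_review/ directory with shortuuid and openai_concurrent available); the review text itself is made up.

# Hypothetical usage sketch of parse_score; the sample review text is invented.
from gpt4_eval import parse_score

sample_review = (
    "Feedback for each category goes here...\n"
    "{'Logical_Robustness': 4, 'Logical_Correctness': 5, 'Logical_Efficiency': 'N/A'}"
)
# parse_score takes the last {...} block, replaces underscores with spaces, lowercases it,
# and normalizes 'na'/'n/a'/'not applicable' values back to 'N/A'.
print(parse_score(sample_review, num=3))
# -> {'logical robustness': 4, 'logical correctness': 5, 'logical efficiency': 'N/A'}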

gpt_review/src/prompt.jsonl (+2 lines)
@@ -0,0 +1,2 @@
{"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant in checking the quality of the answer.", "prompt_template": "{prompt1}\n{skills}\n[Instruction]\n{question}\n\n[Assistant's Response]\n{response}\n\n[The End of Assistant's Response]\n\n{prompt2}\n\n[System]\n\n\n", "defaults": {"prompt1": "We would like to request your feedback on the performance of the response of the assistant to the user instruction displayed below. In the feedback, I want you to rate the quality of the response in these 3 categories according to each scoring rubric:", "prompt2": "Please give feedback on the assistant's responses. Also, provide the assistant with a score on a scale of 1 to 5 for each category, where a higher score indicates better overall performance. Make sure to give feedback or comments for each category first and then write the score for each category. Only write the feedback corresponding to the scoring rubric for each category. The scores of each category should be orthogonal, indicating that 'correctness' should not be considered for 'readability' category, for example. \n\nLastly, return a Python dictionary object that has skillset names as keys and the corresponding scores as values."}, "description": "Prompt for general questions without sample answer"}
{"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant in checking the quality of the answer.", "prompt_template": "{prompt1}\n{skills}\n[Instruction]\n{question}\n\n[Ground truth Answer]\n{sample_answer}\n\n[Assistant's Response]\n{response}\n\n[The End of Assistant's Response]\n\n{prompt2}\n\n[System]\n\n\n", "defaults": {"prompt1": "We would like to request your feedback on the performance of the response of the assistant to the user instruction displayed below. In the feedback, I want you to rate the quality of the response in these 3 categories according to each scoring rubric:", "prompt2": "Please give feedback on the assistant's responses. Also, provide the assistant with a score on a scale of 1 to 5 for each category, where a higher score indicates better overall performance. Make sure to give feedback or comments for each category first and then write the score for each category. Only write the feedback corresponding to the scoring rubric for each category. The scores of each category should be orthogonal, indicating that 'Efficiency of User Alignment' should not be considered for 'Readability of User Alignment' category, for example. \n\nLastly, return a Python dictionary object that has skillset names as keys and the corresponding scores as values."}, "description": "Prompt for general questions with sample answer"}

gpt_review/src/reviewer.jsonl (+2 lines)
@@ -0,0 +1,2 @@
{"reviewer_id": "gpt-4-0613-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions without sample answer"}
{"reviewer_id": "gpt-4-0613-default", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions with sample answer"}
