
Commit 0a0e02e

add gpt-4 eval code
1 parent dcbca86 commit 0a0e02e

File tree

10 files changed (+4059 -0 lines)


evaluation_set/flask_evaluation.jsonl (+1,700 lines)
Large diffs are not rendered by default.

evaluation_set/flask_hard_evaluation.jsonl (+65 lines)
Large diffs are not rendered by default.
(+94 lines)

@@ -0,0 +1,94 @@
import json
import copy
import csv
from collections import OrderedDict
import argparse

total_score = {}
max_score = {}
min_score = {}
review = []
author = []

parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-m', '--file', default='fig1')
args = parser.parse_args()

file = args.file

# Per-question review records (one JSON object per line, each with a "score" dict).
with open(file, 'r') as f:
    for line in f:
        review.append(json.loads(line))

# Matching evaluation set with the annotated difficulty level for each instruction.
difficulty = []
with open('../evaluation_set/flask_evaluation.jsonl', 'r') as f2:
    for line in f2:
        difficulty.append(json.loads(line))


# [sum of scores, count] accumulator for each difficulty level.
init = {
    "simple lifestyle knowledge": [0, 0],
    "advanced lifestyle knowledge": [0, 0],
    "formal education knowledge": [0, 0],
    "major level knowledge": [0, 0],
    "expert level knowledge": [0, 0]
}


difficulty_dict = {}
cnt = 0
for index, item in enumerate(review):

    level = difficulty[index]["difficulty_labeled"]
    if len(item["score"]) != 3:
        print("length issue!!!", item["score"], item)
    for key, score in item["score"].items():

        # Reduce each returned key to a bare skill name: 'logical <skill>' keys use
        # the second word, every other key uses the first word.
        if key.split(' ')[0] not in difficulty_dict.keys():
            if 'logical' in key:
                # print(key)
                try:
                    if key.split(' ')[1] not in difficulty_dict:
                        print(key)
                        difficulty_dict[key.split(' ')[1]] = copy.deepcopy(init)
                except:
                    print(file, key, index)
            else:
                print(key)
                difficulty_dict[key.split(' ')[0]] = copy.deepcopy(init)
        if item["score"][key] == "N/A":
            cnt += 1
        else:
            if 'logical' in key:
                try:
                    difficulty_dict[key.split(' ')[1]][str(level)][0] += float(score)
                    difficulty_dict[key.split(' ')[1]][str(level)][1] += 1
                except:
                    print(file)
            else:
                try:
                    difficulty_dict[key.split(' ')[0]][str(level)][0] += float(score)
                    difficulty_dict[key.split(' ')[0]][str(level)][1] += 1
                except:
                    print(file)

key_order = ["robustness", "correctness", "efficiency", "factuality", "commonsense", "comprehension", "insightfulness", "completeness", "metacognition", "readability", "conciseness", "harmlessness"]
ordered_dict = OrderedDict((key, difficulty_dict[key]) for key in key_order if key in difficulty_dict)

# For each skill, write one row per difficulty level: score sum, count, per-level
# average, and cumulative average over the levels written so far.
file_name = file.split('/')[1].split('.jsonl')[0]
with open("outputs/stats/" + file_name + ".csv", "w") as f1:
    write = csv.writer(f1)
    write.writerow(["score", "count", "avg", "cumu"])
    for key, dict in ordered_dict.items():
        sum_score = 0
        sum_count = 0
        for level, list in dict.items():
            sum_score += list[0]
            sum_count += list[1]
            if list[1] != 0:
                write.writerow([str(list[0]), str(list[1]), str(list[0]/list[1]), str(sum_score/sum_count)])
            elif sum_count == 0:
                write.writerow([str(list[0]), str(list[1]), "N/A", "N/A"])
            else:
                write.writerow([str(list[0]), str(list[1]), "N/A", str(sum_score/sum_count)])
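
For orientation, a minimal sketch of the record shapes this stats script expects; the field names ("score", "difficulty_labeled", and the five difficulty buckets in init) come from the code above, while the concrete values are invented for illustration.

# Illustrative only: synthetic examples of one line of the review file passed via -m/--file
# and the matching line of ../evaluation_set/flask_evaluation.jsonl. Values are made up.
import json

review_line = {
    # exactly three skill scores per question; "N/A" values are skipped and counted in cnt
    "score": {"logical robustness": 4, "logical correctness": 5, "logical efficiency": "N/A"}
}
difficulty_line = {
    # one of the five keys in init, used to bucket scores by difficulty level
    "difficulty_labeled": "formal education knowledge"
}
print(json.dumps(review_line))
print(json.dumps(difficulty_line))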

gpt_review/gpt4_eval.py (+198 lines)
@@ -0,0 +1,198 @@
import sys
sys.path.append("../")
import argparse
import json
import os

import shortuuid
import logging
from openai_concurrent import OpenAIChatCompletionConcurrent
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import re
import ast


def parse_score(review, num):
    # Extract the last {...} block from the review text and normalize it into a
    # {skill name: score} dictionary; returns {} if nothing parseable is found.
    try:
        match = re.findall(r'{[^}]+}', review)
        if len(match) > 0:
            dictionary_part = match[-1].replace("\n", "").replace('_', " ").lower()
            lines = ast.literal_eval(dictionary_part)
            for key, value in lines.items():
                if value == 'na':
                    lines[key] = 'N/A'
                elif value == 'n/a':
                    lines[key] = 'N/A'
                elif value == 'not applicable':
                    lines[key] = 'N/A'
            return lines
        else:
            return {}

    except Exception as e:
        logger.error(f'{e}\nContent: {review}\n'
                     'You must manually fix the score pair.')
        return {}


def gen_prompt(reviewer_jsons, prompt_jsons, skills_jsons, response, item):
    reviewer_idx = 1
    prompt_id = reviewer_jsons[reviewer_idx]['prompt_id']
    prompt_json = prompt_jsons[prompt_id - 1]
    assert prompt_json['prompt_id'] == prompt_id

    sys_prompt = prompt_json['system_prompt']
    prompt_template = prompt_json['prompt_template']
    defaults = prompt_json['defaults']

    # skills = metrics: append the criteria and 1-5 scoring rubric for every skill
    # annotated on this question.
    skills = ""
    metric_list = item["metrics"]
    for label in metric_list:
        for skill in skills_jsons:
            if label in skill["Skill"]:
                name = skill["Skill"]
                criteria = skill["Criteria"]
                skills += f"\n{name}: {criteria}"
                scoring = skill["Scoring"]
                skills += f"\nScore 1: {scoring['1']}"
                skills += f"\nScore 2: {scoring['2']}"
                skills += f"\nScore 3: {scoring['3']}"
                skills += f"\nScore 4: {scoring['4']}"
                skills += f"\nScore 5: {scoring['5']}\n\n"
                break
    prompt = prompt_template.format(question=item["text"], response=response, skills=skills, num=3, sample_answer=item["answer"], **defaults)
    print("@@@", prompt)
    return sys_prompt, prompt


def get_json_list(file_path):
    file_path = os.path.expanduser(file_path)
    file_extension = file_path.split('.')[-1]
    if file_extension == "jsonl":
        with open(file_path, 'r') as f:
            json_list = []
            for line in f:
                json_list.append(json.loads(line))
            return json_list
    else:
        with open(file_path, 'r') as f:
            return json.load(f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-k', '--key-file', default='../openai_info/api_info.json')
    parser.add_argument('-q', '--question-file', default='../evaluation_set/flask_evaluation.jsonl')
    parser.add_argument('-s', '--skillset-file', default='../skillset_label/src/skillset.json')
    parser.add_argument('-a', '--answer-file', default='../model_output/outputs/chatgpt.jsonl')
    parser.add_argument('-p', '--prompt-file', default='src/ver3/prompt.jsonl')
    parser.add_argument('-r', '--reviewer-file', default='src/ver3/reviewer.jsonl')
    parser.add_argument('-o', '--output-review-file', default='outputs/chatgpt_review.jsonl')
    parser.add_argument('-e', '--output-error-file', default='outputs/chatgpt_review_error.jsonl')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    key_jsons = get_json_list(args.key_file)
    question_jsons = get_json_list(args.question_file)
    skills_jsons = get_json_list(args.skillset_file)
    answer_jsons = get_json_list(args.answer_file)
    reviewer_jsons = get_json_list(args.reviewer_file)
    prompt_jsons = get_json_list(args.prompt_file)

    handles = []
    review_jsons = []
    total_len = len(question_jsons)
    question_idx_list = list(range(total_len))
    question_copy = []
    answer_copy = []

    # Build one GPT-4 review request per question, pairing it with the model answer.
    requests = []
    for i in question_idx_list:
        for row in answer_jsons:
            if row.get('question_id') == question_jsons[i]['question_id']:
                answer_elem = row
                break
        answer_copy.append(answer_elem)
        assert answer_copy[i]['question_id'] == question_jsons[i]['question_id']
        question_copy.append(question_jsons[i])
        sys_prompt, prompt = gen_prompt(reviewer_jsons, prompt_jsons, skills_jsons, answer_copy[i]["text"], question_jsons[i])
        review_id = shortuuid.uuid()
        review_jsons.append({
            'review_id': review_id,
            'question_id': question_jsons[i]['question_id'],
            'metadata': {},
        })
        requests.append(
            {
                'review_id': review_id,
                'question_id': question_jsons[i]['question_id'],
                'metadata': {},
                'request': {
                    "model": "gpt-4-0613",
                    "messages": [
                        {
                            'role': 'system',
                            'content': sys_prompt
                        },
                        {
                            'role': 'user',
                            'content': prompt,
                        }
                    ]
                },
                # setting temperature 0 for reproducibility
                "temperature": 0,
                "max_tokens": args.max_tokens
            }
        )

    openai_concurrent = OpenAIChatCompletionConcurrent(api_keys=key_jsons["api_keys"], requests_per_minute=60, expected_response_seconds=5)
    responses, fails = openai_concurrent.create_many(requests)

    reviews = [response['response']['choices'][0]['message']['content'] for response in responses]
    total_tokens = [response['response']['usage']['total_tokens'] for response in responses]
    print("total_token:", sum(total_tokens))

    # Log failed requests to the error file and drop those questions from the output.
    delete_index = []
    if len(fails) > 0:
        with open(f'{args.output_error_file}', 'w') as output_error_file:
            try:
                for idx, fail in enumerate(fails):
                    print("fail:", fail)
                    for index, item in enumerate(question_copy):
                        if int(item.get("question_id")) == int(fail['question_id']):
                            delete_elem_idx = index
                    delete_index.append(delete_elem_idx)
                    output_error_file.write(json.dumps(question_copy[delete_elem_idx]) + '\n')
            except:
                print("@@@", delete_index)
                delete_index = []

    print("$$$", delete_index)
    question_copy = [item for index, item in enumerate(question_copy) if index not in delete_index]

    output_review_directory = os.path.dirname(args.output_review_file)

    if not os.path.exists(output_review_directory):
        os.makedirs(output_review_directory)

    # Attach the raw review text and the parsed scores to each question record.
    with open(f'{args.output_review_file}', 'w') as output_review_file:
        for idx, review in enumerate(reviews):
            num = 3
            scores = parse_score(review, num)
            review_jsons[idx] = question_copy[idx]
            for row in answer_jsons:
                if row.get('question_id') == question_copy[idx]['question_id']:
                    review_jsons[idx]['target_txt'] = row["text"]
            review_jsons[idx]['review'] = review
            review_jsons[idx]['score'] = scores
            review_jsons[idx]['total_tokens_step4'] = total_tokens[idx]
            try:
                output_review_file.write(json.dumps(review_jsons[idx]) + '\n')
            except Exception as e:
                output_review_file.write('\n')
                print(review_jsons[idx]['question_id'])
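
As a quick sanity check of parse_score above, the sketch below feeds it an invented review that ends with the kind of dictionary the evaluation prompt asks for. It assumes gpt4_eval.py is importable (i.e. it is run from the gpt_review/ directory with shortuuid and openai_concurrent available); the review text itself is made up.

# Hypothetical usage sketch of parse_score; the sample review text is invented.
from gpt4_eval import parse_score

sample_review = (
    "Feedback for each category goes here...\n"
    "{'Logical_Robustness': 4, 'Logical_Correctness': 5, 'Logical_Efficiency': 'N/A'}"
)
# parse_score takes the last {...} block, replaces underscores with spaces, lowercases it,
# and normalizes 'na'/'n/a'/'not applicable' values back to 'N/A'.
print(parse_score(sample_review, num=3))
# -> {'logical robustness': 4, 'logical correctness': 5, 'logical efficiency': 'N/A'}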

gpt_review/src/prompt.jsonl (+2 lines)
@@ -0,0 +1,2 @@
{"prompt_id": 1, "system_prompt": "You are a helpful and precise assistant in checking the quality of the answer.", "prompt_template": "{prompt1}\n{skills}\n[Instruction]\n{question}\n\n[Assistant's Response]\n{response}\n\n[The End of Assistant's Response]\n\n{prompt2}\n\n[System]\n\n\n", "defaults": {"prompt1": "We would like to request your feedback on the performance of the response of the assistant to the user instruction displayed below. In the feedback, I want you to rate the quality of the response in these 3 categories according to each scoring rubric:", "prompt2": "Please give feedback on the assistant's responses. Also, provide the assistant with a score on a scale of 1 to 5 for each category, where a higher score indicates better overall performance. Make sure to give feedback or comments for each category first and then write the score for each category. Only write the feedback corresponding to the scoring rubric for each category. The scores of each category should be orthogonal, indicating that 'correctness' should not be considered for 'readability' category, for example. \n\nLastly, return a Python dictionary object that has skillset names as keys and the corresponding scores as values."}, "description": "Prompt for general questions without sample answer"}
{"prompt_id": 2, "system_prompt": "You are a helpful and precise assistant in checking the quality of the answer.", "prompt_template": "{prompt1}\n{skills}\n[Instruction]\n{question}\n\n[Ground truth Answer]\n{sample_answer}\n\n[Assistant's Response]\n{response}\n\n[The End of Assistant's Response]\n\n{prompt2}\n\n[System]\n\n\n", "defaults": {"prompt1": "We would like to request your feedback on the performance of the response of the assistant to the user instruction displayed below. In the feedback, I want you to rate the quality of the response in these 3 categories according to each scoring rubric:", "prompt2": "Please give feedback on the assistant's responses. Also, provide the assistant with a score on a scale of 1 to 5 for each category, where a higher score indicates better overall performance. Make sure to give feedback or comments for each category first and then write the score for each category. Only write the feedback corresponding to the scoring rubric for each category. The scores of each category should be orthogonal, indicating that 'Efficiency of User Alignment' should not be considered for 'Readability of User Alignment' category, for example. \n\nLastly, return a Python dictionary object that has skillset names as keys and the corresponding scores as values."}, "description": "Prompt for general questions with sample answer"}

gpt_review/src/reviewer.jsonl (+2 lines)
@@ -0,0 +1,2 @@
{"reviewer_id": "gpt-4-0613-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions without sample answer"}
{"reviewer_id": "gpt-4-0613-default", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions with sample answer"}
