cal_score.py
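
# Module docstring added for documentation; the summary is inferred from the code below.
"""Compute benchmark scores from per-sample evaluation result JSON files.

cal_score_single combines weighted scoring-point answers with the global judgments
(spelling, readability, logical consistency) into per-sample scores; calculate_score
aggregates them into per-subject and macro-averaged strict/relaxed scores.
"""
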
import json
import os
from tqdm import tqdm
from glob import glob
import argparse
from collections import defaultdict


def cal_score_single(eval_result):
    """Compute per-sample scores for a single evaluation result."""
    assert "global_evaluation" in eval_result and "answers" in eval_result, f"Invalid eval result: {eval_result}"
    # Sum the weights of the scoring points the judge marked as satisfied (answer == 1).
    scoring_point_scores = 0
    for idx, answer in enumerate(eval_result["answers"]):
        if answer["answer"] == 1:
            scoring_point_scores += eval_result["scoring_points"][idx]["score"]
    # Scoring-point weights are expected to sum to 1 (up to rounding).
    assert round(sum(item["score"] for item in eval_result["scoring_points"]), 4) == 1, f"Invalid sum of scoring point scores: {sum(item['score'] for item in eval_result['scoring_points'])}"
    scores = {
        "semantic_correctness": round(scoring_point_scores, 4),
        "readability": eval_result["global_evaluation"]["Clarity and Readability"]["score"],
        "logical_consistency": eval_result["global_evaluation"]["Logical Consistency"]["score"],
        "spelling": eval_result["global_evaluation"]["Spelling"]["score"],
    }
    scores["scoring_point_all_correct"] = all(item["answer"] == 1 for item in eval_result["answers"])
    # Strict score: every scoring point correct and every global dimension at its maximum (2).
    scores["strict_score"] = int(scores["scoring_point_all_correct"] and scores["spelling"] == 2 and scores["readability"] == 2 and scores["logical_consistency"] == 2)
    # Relaxed score: 70% semantic correctness plus 10% each for spelling, readability,
    # and logical consistency (each global score is rescaled from 0-2 to 0-1).
    scores["relaxed_score"] = round(scores["semantic_correctness"] * 0.7 + scores["spelling"] * 0.1 / 2 + scores["readability"] * 0.1 / 2 + scores["logical_consistency"] * 0.1 / 2, 3)
    return scores
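

# Worked example for cal_score_single (hypothetical numbers): if the satisfied scoring
# points sum to 0.8 and spelling/readability/logical consistency are all 2, then
# relaxed_score = 0.8 * 0.7 + 3 * (2 * 0.1 / 2) = 0.56 + 0.30 = 0.86, while
# strict_score is 0 because not every scoring point was satisfied (0.8 < 1).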


def calculate_score(eval_results_dir, sampled_id_path=None):
    """Aggregate per-sample scores, print a report, and return macro-averaged metrics."""
    all_score_keys = ["semantic_correctness", "spelling", "readability", "logical_consistency"]
    all_scores = defaultdict(list)
    # Optionally restrict scoring to a sampled subset of ids.
    if sampled_id_path is not None:
        with open(sampled_id_path) as f:
            sampled_ids = [x.strip() for x in f.readlines()]
    # Collect per-sample scores from every eval result JSON, grouped by subject.
    eval_results = glob(os.path.join(eval_results_dir, "**/*.json"), recursive=True)
    for eval_result_path in eval_results:
        with open(eval_result_path, "r") as f:
            eval_result = json.load(f)
        if sampled_id_path is not None and eval_result["id"] not in sampled_ids:
            continue
        scores = cal_score_single(eval_result)
        data_name = eval_result.get("subject", "all")
        all_scores[data_name].append(scores)
    data_names = sorted(list(all_scores.keys()))
    # Overall (micro) average of each score dimension across all samples.
    print("=" * 80)
    print("Each score dimension:")
    for key in all_score_keys:
        score_sum = sum(score[key] for scores in all_scores.values() for score in scores)
        score_avg = score_sum / sum(len(scores) for scores in all_scores.values())
        print(f"- {key}: {round(score_avg, 2)}")
    print("=" * 80)
    # Per-subject average of each score dimension.
    print("Each score dimension (average) for each subject:")
    for data_name, scores in all_scores.items():
        if len(scores) == 0:
            continue
        print(f"- {data_name}:")
        for key in all_score_keys:
            score_sum = sum(score[key] for score in scores)
            score_avg = score_sum / len(scores)
            print(f" {key}: {round(score_avg, 2)}")
    print("-" * 80)
    print("Total number of eval results: ", sum(len(scores) for scores in all_scores.values()))
    print("-" * 80)
    # Strict score: per-subject rate, then macro-averaged over subjects.
    print("Strict score:")
    for data_name in data_names:
        scores = all_scores[data_name]
        if len(scores) == 0:
            continue
        print(f"- {data_name}({len(scores)} samples): {round(sum(score['strict_score'] for score in scores) / len(scores) * 100, 1)}%", end=" ")
    avg_strict_score = sum(sum(score['strict_score'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    print(f"\nAverage strict score: {round(avg_strict_score * 100, 1)}%")
    print("-" * 80)
    # Relaxed score: per-subject rate, then macro-averaged over subjects.
    print("Relaxed score:")
    for data_name in data_names:
        scores = all_scores[data_name]
        if len(scores) == 0:
            continue
        print(f"- {data_name}({len(scores)} samples): {round(sum(score['relaxed_score'] for score in scores) / len(scores) * 100, 1)}%", end=" ")
    avg_relaxed_score = sum(sum(score['relaxed_score'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    print(f"\nAverage relaxed score: {round(avg_relaxed_score * 100, 1)}%")
    # Macro-averaged (per-subject) versions of the individual dimensions, returned to the caller.
    avg_semantic_correctness = sum(sum(score['semantic_correctness'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    avg_spelling = sum(sum(score['spelling'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    avg_readability = sum(sum(score['readability'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    avg_logical_consistency = sum(sum(score['logical_consistency'] for score in scores) / len(scores) for scores in all_scores.values() if len(scores) > 0) / len(all_scores)
    return {
        "strict_score": avg_strict_score,
        "relaxed_score": avg_relaxed_score,
        "semantic_correctness": avg_semantic_correctness,
        "spelling": avg_spelling,
        "readability": avg_readability,
        "logical_consistency": avg_logical_consistency,
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--eval_results_dir", type=str, default="eval_results")
    parser.add_argument("--sampled_id_path", type=str, default="data/mini_sample_ids.txt")
    parser.add_argument("--mini", action="store_true")
    args = parser.parse_args()
    # With --mini, only the sampled subset of ids is scored; otherwise every result is scored.
    if args.mini:
        calculate_score(eval_results_dir=args.eval_results_dir, sampled_id_path=args.sampled_id_path)
    else:
        calculate_score(eval_results_dir=args.eval_results_dir, sampled_id_path=None)
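
# Example invocations, derived from the argparse flags above (paths are the script defaults;
# adjust them to your local layout):
#   python cal_score.py --eval_results_dir eval_results
#   python cal_score.py --eval_results_dir eval_results --mini --sampled_id_path data/mini_sample_ids.txt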