diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py
index 532e9e7ae6..bbba109434 100644
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -107,9 +107,6 @@ def __post_init__(self) -> None:
self.generation_kwargs["temperature"] = float(
self.generation_kwargs["temperature"]
)
-
- if "until" not in self.generation_kwargs:
- self.generation_kwargs["until"] = [self.fewshot_delimiter]
else:
if self.output_type == "generate_until":
# ensure that we greedily generate in absence of explicit arguments otherwise
diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md
index 83977a4790..9fb727a784 100644
--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -97,6 +97,7 @@
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
+| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on three datasets (MMLU-Pro, AGIEval, and MATH). | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
diff --git a/lm_eval/tasks/score/README.md b/lm_eval/tasks/score/README.md
new file mode 100644
index 0000000000..4055d5f76c
--- /dev/null
+++ b/lm_eval/tasks/score/README.md
@@ -0,0 +1,89 @@
+```
+Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+```
+# SCORE: Systematic COnsistency and Robustness Evaluation for Large Language Models
+
+
+## Citation
+```bib
+[Citation placeholder]
+```
+
+## Groups
+
+- `score_robustness_mmlu_pro`: two 0-shot robustness tasks on the MMLU-Pro dataset [[1](#mmlu_pro)]
+
+- `score_robustness_agieval`: two 0-shot robustness tasks on the AGIEval [[2](#agi_eval)] multiple-choice question subsets: `'agieval-sat-math'`, `'agieval-lsat-lr'`, `'agieval-lsat-rc'`, `'agieval-logiqa-en'`, `'agieval-aqua-rat'`, `'agieval-sat-en'`, `'agieval-lsat-ar'`
+
+- `score_robustness_math`: one 0-shot robustness task on Hendrycks' MATH dataset [[3](#math)]
+
+## Tasks
+
+Both `score_robustness_mmlu_pro` and `score_robustness_agieval` contain the following 2 tasks:
+
+* Option order robustness:
+`score_option_order_robustness_mmlu_pro`,
+`score_option_order_robustness_agieval`
+
+* Prompt robustness:
+`score_prompt_robustness_mmlu_pro`,
+`score_prompt_robustness_agieval`
+
+The MATH group (`score_robustness_math`) contains only:
+* Prompt robustness:
+`score_prompt_robustness_math`
+
+
+### Option order robustness
+
+Measures the model's robustness to the position of the correct answer in the options list by swapping the correct answer with each of the other options in turn, as sketched below.
+
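+A minimal sketch of the swapping step (function and field names are illustrative; the actual preprocessing is handled by `utils.option_order_robustness_process_docs`):
+
+```python
+# Illustrative sketch: build one copy of a question per answer position,
+# swapping the gold option into that position (A, B, C, ...).
+def place_gold_at_each_position(options, gold_idx, labels=("A", "B", "C", "D")):
+    variants = []
+    for target_idx, label in enumerate(labels[: len(options)]):
+        swapped = list(options)
+        swapped[target_idx], swapped[gold_idx] = swapped[gold_idx], swapped[target_idx]
+        variants.append(
+            {"options": swapped, "answer_index": target_idx, "always_same_option": label}
+        )
+    return variants
+```
+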
+### Prompt robustness
+
+Measures the model's robustness across 10 different prompts. The list of prompts can be found in the `./prompt_templates.json` file under the key `prompt_robustness` (see the excerpt below).
+
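+Each entry pairs a prompt template with an options format string; for example, abbreviated from the AGIEval `prompt_templates.json`:
+
+```json
+{
+  "prompt_robustness": [
+    {
+      "prompt": "{question}{options}\nExamine the question and choose the correct answer from the options 'A', 'B', 'C', 'D' or 'E'. ...",
+      "options_format": "\n{letter}: {option}"
+    }
+  ]
+}
+```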
+
+## Metrics
+
+All robustness tasks report two metrics: *Accuracy* and *Consistency Rate (CR)* [[4](#cr)].
+
+$CR = \frac{1}{|Q|} \sum_{Q_k \in Q} \sum_{y_i \in Y_k} \sum_{\substack{y_j \in Y_k \\ j \neq i}}\frac{\text{sim}(y_i, y_j)}{\binom{|Y_k|}{2}}$
+
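+A minimal sketch of how CR can be computed, assuming `sim` is exact-match agreement between the final answers the model gives to the same question under different prompts (or option orderings), averaged over unordered answer pairs per question:
+
+```python
+from itertools import combinations
+
+def consistency_rate(answers_per_question):
+    """answers_per_question: question_id -> list of final answers (at least two per question)."""
+    per_question = []
+    for answers in answers_per_question.values():
+        pairs = list(combinations(answers, 2))
+        agreements = sum(a == b for a, b in pairs)
+        per_question.append(agreements / len(pairs))
+    return sum(per_question) / len(per_question)
+```
+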
+## Notes
+
+- All tasks are designed for **Instruct** models, for which we recommend passing the `--apply_chat_template` flag (see the example below).
+
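+A typical invocation might look like the following (the model name is a placeholder):
+
+```bash
+lm_eval --model hf \
+    --model_args pretrained=<your-instruct-model> \
+    --tasks score_robustness_agieval \
+    --apply_chat_template
+```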
+
+## References
+[1] Wang, et al. "Mmlu-pro: A more robust and challenging multi-task language understanding benchmark." arXiv preprint arXiv:2406.01574 (2024).
+
+[2] Zhong, et al. "Agieval: A human-centric benchmark for evaluating foundation models." arXiv preprint arXiv:2304.06364 (2023).
+
+[3] Hendrycks et al. "Measuring Mathematical Problem Solving With the MATH Dataset." arXiv:2103.03874 (2021).
+
+[4] Zhao et al. "Improving the robustness of large language models via consistency alignment." arXiv:2403.14221 (2024).
+
+## Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [-] Is the task an existing benchmark in the literature?
+ * [-] Have you referenced the original paper that introduced the task? - Will be referenced as soon as the paper is published
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
new file mode 100644
index 0000000000..166539fe81
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+task: option_order_robustness_agieval_aqua_rat
+dataset_path: hails/agieval-aqua-rat
+dataset_name: default
+output_type: generate_until
+test_split: test
+process_docs: !function utils_agieval.option_order_robustness_process_docs
+doc_to_text: !function utils_agieval.agi_eval_robustness_doc_to_text
+doc_to_target: answer
+generation_kwargs:
+ max_gen_toks: 1024
+ do_sample: False
+process_results: !function utils_agieval.option_order_robustness_process_results
+metric_list:
+ - metric: per_option_accuracy_A
+ aggregation: !function utils_agieval.per_option_accuracy_a
+ higher_is_better: true
+ - metric: per_option_accuracy_B
+ aggregation: !function utils_agieval.per_option_accuracy_b
+ higher_is_better: true
+ - metric: per_option_accuracy_C
+ aggregation: !function utils_agieval.per_option_accuracy_c
+ higher_is_better: true
+ - metric: per_option_accuracy_D
+ aggregation: !function utils_agieval.per_option_accuracy_d
+ higher_is_better: true
+ - metric: options_consistency_rate
+ aggregation: !function utils_agieval.options_consistency_rate
+ higher_is_better: true
+metadata:
+ version: 1.0
+dataset_kwargs:
+ trust_remote_code: true
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_logiqa_en.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_logiqa_en.yaml
new file mode 100644
index 0000000000..7d9f7d5445
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_logiqa_en.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_logiqa_en
+dataset_path: hails/agieval-logiqa-en
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_ar.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_ar.yaml
new file mode 100644
index 0000000000..1d897edbb0
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_ar.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_lsat_ar
+dataset_path: hails/agieval-lsat-ar
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_lr.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_lr.yaml
new file mode 100644
index 0000000000..27dca84912
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_lr.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_lsat_lr
+dataset_path: hails/agieval-lsat-lr
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_rc.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_rc.yaml
new file mode 100644
index 0000000000..f476a079fd
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_lsat_rc.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_lsat_rc
+dataset_path: hails/agieval-lsat-rc
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_en.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_en.yaml
new file mode 100644
index 0000000000..4b90fd203e
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_en.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_sat_en
+dataset_path: hails/agieval-sat-en
diff --git a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_math.yaml b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_math.yaml
new file mode 100644
index 0000000000..3b0d82e894
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_sat_math.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: option_order_robustness_agieval_aqua_rat.yaml
+task: option_order_robustness_agieval_sat_math
+dataset_path: hails/agieval-sat-math
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
new file mode 100644
index 0000000000..afba761205
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+task: prompt_robustness_agieval_aqua_rat
+dataset_path: hails/agieval-aqua-rat
+dataset_name: default
+output_type: generate_until
+test_split: test
+process_docs: !function utils_agieval.prompt_robustness_process_docs
+doc_to_text: !function utils_agieval.agi_eval_robustness_doc_to_text
+doc_to_target: answer
+generation_kwargs:
+ max_gen_toks: 1024
+ do_sample: False
+process_results: !function utils_agieval.prompt_robustness_process_results
+metric_list:
+ - metric: 0_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_0
+ higher_is_better: true
+ - metric: 1_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_1
+ higher_is_better: true
+ - metric: 2_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_2
+ higher_is_better: true
+ - metric: 3_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_3
+ higher_is_better: true
+ - metric: 4_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_4
+ higher_is_better: true
+ - metric: 5_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_5
+ higher_is_better: true
+ - metric: 6_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_6
+ higher_is_better: true
+ - metric: 7_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_7
+ higher_is_better: true
+ - metric: 8_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_8
+ higher_is_better: true
+ - metric: 9_accuracy
+ aggregation: !function utils_agieval.per_prompt_accuracy_9
+ higher_is_better: true
+ - metric: consistency_rate
+ aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
+ higher_is_better: true
+metadata:
+ version: 1.0
+dataset_kwargs:
+ trust_remote_code: true
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_logiqa_en.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_logiqa_en.yaml
new file mode 100644
index 0000000000..417998c653
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_logiqa_en.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_logiqa_en
+dataset_path: hails/agieval-logiqa-en
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_rc.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_rc.yaml
new file mode 100644
index 0000000000..4c4a749aa8
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_rc.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_lsat_rc
+dataset_path: hails/agieval-lsat-rc
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_ar.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_ar.yaml
new file mode 100644
index 0000000000..9c69244d10
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_ar.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_lsat_ar
+dataset_path: hails/agieval-lsat-ar
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_lr.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_lr.yaml
new file mode 100644
index 0000000000..3add875cba
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_lsat_lr.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_lsat_lr
+dataset_path: hails/agieval-lsat-lr
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_en.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_en.yaml
new file mode 100644
index 0000000000..5f6a78544a
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_en.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_sat_en
+dataset_path: hails/agieval-sat-en
diff --git a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_math.yaml b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_math.yaml
new file mode 100644
index 0000000000..65af591eae
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_sat_math.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_agieval_aqua_rat.yaml
+task: prompt_robustness_agieval_sat_math
+dataset_path: hails/agieval-sat-math
diff --git a/lm_eval/tasks/score/agi_eval/prompt_templates.json b/lm_eval/tasks/score/agi_eval/prompt_templates.json
new file mode 100644
index 0000000000..720a66355e
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/prompt_templates.json
@@ -0,0 +1,52 @@
+{
+ "option_order_robustness":{
+ "prompt": "For the multiple-choice question, which option (A-E) is correct?.\n\nQuestion: {question}{options}\n\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D' or 'E'.",
+ "options_format": "\n{letter}: {option}"
+ },
+
+
+ "prompt_robustness":[
+ {
+ "prompt": "{question}{options}\nExamine the question and choose the correct answer from the options 'A', 'B', 'C', 'D' or 'E'. End your answer with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is a letter from A to E.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "{question}{options}\nAnswer the multiple-choice question by selecting the correct option from A to E. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of A, B, C, D, E.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "You must reply with only a single letter from A, B, C, D or E to this question. Conclude with:\nThe best answer is answer_letter where the answer_letter is a single letter from A to E.\n{question}{options}",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "From the options A-E, select the correct answer to the following question. End the answer with - The best answer is answer_letter, where answer_letter is one of A, B, C, D or E.\nQuestion: {question}{options}",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "For the multiple-choice question, which option (A-E) is correct?.\n\nQuestion:{question}{options}\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D' or 'E'.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Evaluate the multiple-choice question and select the most fitting response from 'A', 'B', 'C', 'D', 'E'. \nQuestion:{question}{options}\nAlways conclude with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is one of A, B, C, D or E.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Answer to the following question by selecting the correct option A, B, C, D or E. {question}{options}\nThe answer should end with:\nThe best answer is [the_answer_letter] where [the_answer_letter] is one of letters A to E. Let's think step by step.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Select the correct answer from the options 'A', 'B', 'C', 'D', 'E' for the question provided below. Conclude by stating: The best answer is answer_letter where answer_letter is one of 'A', 'B', 'C', 'D' or 'E'.\nQuestion: {question}{options}\nLet's think step by step.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "{question}{options}\nFor this question with 10 possible answers A, B, C, D, E, choose the one that answers the question. If the problem is simple or straightforward, just provide the answer. If the answer is more complex, use a step-by-step approach and for each step briefly explain your reasoning. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of 'A', 'B', 'C', 'D', 'E'. Let's think step by step.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Read the question and options below, then determine the correct answer choice (A-E)\nQuestion: {question}{options}\n\nFor simple questions, provide a quick answer. For complicated ones, think step by step, break down the question into smaller problems and reach to a conclusion\nEnd your answer by stating:\nThe best answer is [the_answer_letter].\nwhere [the_answer_letter] is one of A, B, C, D or E.",
+ "options_format": "\n{letter}: {option}"
+ }
+
+ ]
+
+}
diff --git a/lm_eval/tasks/score/agi_eval/score_option_order_robustness_agieval.yaml b/lm_eval/tasks/score/agi_eval/score_option_order_robustness_agieval.yaml
new file mode 100644
index 0000000000..f27cc1ddb3
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/score_option_order_robustness_agieval.yaml
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_option_order_robustness_agieval
+task:
+ - option_order_robustness_agieval_aqua_rat
+ - option_order_robustness_agieval_logiqa_en
+ - option_order_robustness_agieval_lsat_ar
+ - option_order_robustness_agieval_lsat_lr
+ - option_order_robustness_agieval_lsat_rc
+ - option_order_robustness_agieval_sat_en
+ - option_order_robustness_agieval_sat_math
+
+aggregate_metric_list:
+ - metric: per_option_accuracy_A
+ aggregation: mean
+ weight_by_size: true
+ - metric: per_option_accuracy_B
+ aggregation: mean
+ weight_by_size: true
+ - metric: per_option_accuracy_C
+ aggregation: mean
+ weight_by_size: true
+ - metric: per_option_accuracy_D
+ aggregation: mean
+ weight_by_size: true
+ - metric: options_consistency_rate
+ aggregation: mean
+ weight_by_size: true
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/agi_eval/score_prompt_robustness_agieval.yaml b/lm_eval/tasks/score/agi_eval/score_prompt_robustness_agieval.yaml
new file mode 100644
index 0000000000..36b5376e4d
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/score_prompt_robustness_agieval.yaml
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_prompt_robustness_agieval
+task:
+ - prompt_robustness_agieval_aqua_rat
+ - prompt_robustness_agieval_logiqa_en
+ - prompt_robustness_agieval_lsat_ar
+ - prompt_robustness_agieval_lsat_lr
+ - prompt_robustness_agieval_lsat_rc
+ - prompt_robustness_agieval_sat_en
+ - prompt_robustness_agieval_sat_math
+
+aggregate_metric_list:
+ - metric: 0_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 1_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 2_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 3_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 4_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 5_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 6_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 7_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 8_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 9_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: consistency_rate
+ aggregation: mean
+ weight_by_size: true
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/agi_eval/score_robustness_agieval.yaml b/lm_eval/tasks/score/agi_eval/score_robustness_agieval.yaml
new file mode 100644
index 0000000000..354cb5675c
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/score_robustness_agieval.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_robustness_agieval
+task:
+ - score_prompt_robustness_agieval
+ - score_option_order_robustness_agieval
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/agi_eval/utils_agieval.py b/lm_eval/tasks/score/agi_eval/utils_agieval.py
new file mode 100644
index 0000000000..b8034259d8
--- /dev/null
+++ b/lm_eval/tasks/score/agi_eval/utils_agieval.py
@@ -0,0 +1,183 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from functools import partial
+from typing import Any, Dict, List
+
+import numpy as np
+from datasets import Dataset
+
+from lm_eval.tasks.score import utils
+from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
+from lm_eval.utils import eval_logger
+
+
+TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
+
+PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
+OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
+
+QUESTION_KEY = "query"
+ANSWER_INDEX_KEY = "gold"
+OPTIONS_KEY = "choices"
+
+LABELS = ["A", "B", "C", "D", "E"]
+
+agi_eval_prompt_consistency_rate = prompt_consistency_rate
+agi_eval_robustness_doc_to_text = robustness_doc_to_text
+
+
+def initial_process_docs(doc: Dataset) -> Dataset:
+ """
+ Normalize the AGIEval docs: extract the plain question text, add question_id, answer_index and answer fields, and strip option-label prefixes (e.g. "(A)") from the choices.
+ """
+
+ bracket_pattern = r"^\([A-E]\)"
+ letter_space = r"^[A-E] "
+ letter_question_space = r"^[A-E]\? "
+
+ def __process(_doc, idx):
+ if "question" not in _doc:
+ question = _doc[QUESTION_KEY].split(" Answer Choices:")[0]
+ if question.startswith("Q: "):
+ question = question[3:]
+ _doc["question"] = question
+ if "question_id" not in _doc:
+ _doc["question_id"] = idx
+ if "answer_index" not in _doc:
+ _doc["answer_index"] = _doc[ANSWER_INDEX_KEY][0]
+ if "answer" not in _doc:
+ _doc["answer"] = LABELS[_doc["answer_index"]]
+ if "options" not in _doc:
+ prepared_options = []
+ for option in _doc[OPTIONS_KEY]:
+ if re.match(bracket_pattern, option):
+ prepared_options.append(option[3:])
+ elif re.match(letter_space, option):
+ prepared_options.append(option[2:])
+ elif re.match(letter_question_space, option):
+ prepared_options.append(option[3:])
+ else:
+ prepared_options.append(option)
+ _doc["options"] = prepared_options
+ return _doc
+
+ return doc.map(__process, with_indices=True)
+
+
+prompt_robustness_process_docs = partial(
+ utils.process_docs_add_prompts,
+ templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
+ template_file_path=TEMPLATE_FILE_PATH,
+ dataset_specific_preprocess=initial_process_docs,
+)
+
+option_order_robustness_process_docs = partial(
+ utils.option_order_robustness_process_docs,
+ template_file_path=TEMPLATE_FILE_PATH,
+ templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
+ labels=LABELS[:-1],
+ dataset_specific_preprocess=initial_process_docs,
+)
+
+
+def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
+ final_answer = utils.__postprocess_pred(results[0])
+ final_answer = utils.translate_model_answer_to_labels(
+ final_answer, option_format=doc["options_format"], labels=LABELS
+ )
+ gt = LABELS[doc["answer_index"]]
+ prompt_id = doc["prompt_id"]
+ question_id = doc["question_id"]
+ return {
+ f"{prompt_id}_accuracy": (question_id, prompt_id, final_answer, gt),
+ "consistency_rate": (question_id, prompt_id, final_answer, gt),
+ }
+
+
+def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
+ final_answer = utils.__postprocess_pred(results[0])
+ final_answer = utils.translate_model_answer_to_labels(
+ final_answer, option_format=doc["options_format"], labels=LABELS
+ )
+ gt = LABELS[doc["answer_index"]]
+ always_same_option = doc["always_same_option"]
+ question_id = doc["question_id"]
+ original_answer_index = doc["original_answer_index"]
+ answer_index = doc["answer_index"]
+ return {
+ f"per_option_accuracy_{always_same_option}": (
+ question_id,
+ always_same_option,
+ final_answer,
+ gt,
+ ),
+ "options_consistency_rate": (
+ question_id,
+ always_same_option,
+ final_answer,
+ original_answer_index,
+ answer_index,
+ ),
+ }
+
+
+def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
+ accuracies = []
+ for result in results:
+ question_id, prompt_id, final_answer, gt = result
+ if prompt_id != p_id:
+ continue
+ accuracies.append(final_answer == gt)
+
+ accuracy = sum(accuracies) / len(accuracies)
+ eval_logger.info(f"Prompt - {p_id} accuracy: {accuracy}")
+
+ return np.round(accuracy, 4)
+
+
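+# One aggregation callable per prompt id; the metric_list entries in the
+# prompt-robustness YAML configs reference these names directly.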
+per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
+per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
+per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
+per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
+per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
+per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
+per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
+per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
+per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
+per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
+
+
+def per_option_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
+ accuracies = []
+ for result in results:
+ question_id, always_same_option, final_answer, gt = result
+ if always_opt != always_same_option:
+ continue
+ accuracies.append(int(final_answer == gt))
+
+ accuracy = sum(accuracies) / len(accuracies)
+ eval_logger.info(f"Option - {always_opt.upper()} accuracy: {accuracy}")
+
+ return np.round(accuracy, 4)
+
+
+per_option_accuracy_a = partial(per_option_accuracy, always_opt="A")
+per_option_accuracy_b = partial(per_option_accuracy, always_opt="B")
+per_option_accuracy_c = partial(per_option_accuracy, always_opt="C")
+per_option_accuracy_d = partial(per_option_accuracy, always_opt="D")
+
+options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
diff --git a/lm_eval/tasks/score/math/math_grader.py b/lm_eval/tasks/score/math/math_grader.py
new file mode 100644
index 0000000000..156e739e0f
--- /dev/null
+++ b/lm_eval/tasks/score/math/math_grader.py
@@ -0,0 +1,654 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) Microsoft Corporation.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE
+
+# Copyright (c) 2023 OpenAI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Copyright (c) 2021 Dan Hendrycks
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+"""
+This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
+- https://github.com/microsoft/ToRA/blob/main/src/eval/grader.py
+- https://github.com/microsoft/ProphetNet/tree/master/CRITIC
+- https://github.com/openai/prm800k
+"""
+
+import contextlib
+import re
+import signal
+from importlib.metadata import PackageNotFoundError, version
+from math import isclose
+from typing import Union
+
+
+def _check_antlr_version():
+ "Function for checking the antlr package version."
+ # Check antlr version
+ PACKAGE_NAME = "antlr4-python3-runtime"
+ REQUIRED_VERSION = "4.11.0"
+
+ try:
+ installed_version = version(PACKAGE_NAME)
+ if installed_version != REQUIRED_VERSION:
+ raise RuntimeError(
+ f"Package {PACKAGE_NAME} version mismatch: {installed_version} (required: {REQUIRED_VERSION})"
+ )
+ except PackageNotFoundError:
+ raise RuntimeError(
+ f"Package {PACKAGE_NAME} not found. Please install antlr4-python3-runtime==4.11.0."
+ )
+
+
+def _fix_fracs(string):
+ # replacing all extra spaces
+ while "\\frac " in string:
+ string = string.replace("\\frac ", "\\frac")
+ substrs = string.split("\\frac")
+ new_str = substrs[0]
+ if len(substrs) > 1:
+ substrs = substrs[1:]
+ for substr in substrs:
+ new_str += "\\frac"
+ if len(substr) > 0 and substr[0] == "{":
+ new_str += substr
+ else:
+ try:
+ assert len(substr) >= 2
+ except AssertionError:
+ return string
+ a = substr[0]
+ b = substr[1]
+ if b != "{":
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}{" + b + "}" + post_substr
+ else:
+ new_str += "{" + a + "}{" + b + "}"
+ else:
+ if len(substr) > 2:
+ post_substr = substr[2:]
+ new_str += "{" + a + "}" + b + post_substr
+ else:
+ new_str += "{" + a + "}" + b
+ string = new_str
+ return string
+
+
+def _str_is_int(x: str) -> bool:
+ try:
+ x = _strip_properly_formatted_commas(x)
+ x = float(x)
+ return abs(x - int(round(x))) <= 1e-7
+ except Exception:
+ return False
+
+
+def _str_to_int(x: str) -> int:
+ x = x.replace(",", "")
+ if "_" in x:
+ # Due to base
+ x = x.split("_")[0]
+ x = float(x)
+ return int(x)
+
+
+def _inject_implicit_mixed_number(step: str):
+ """
+ Automatically make a mixed number evalable
+ e.g. 7 3/4 => 7+3/4
+ """
+ p1 = re.compile("([0-9]) +([0-9])")
+ step = p1.sub("\\1+\\2", step) # insert "+" between whole part and fraction
+ return step
+
+
+def _strip_properly_formatted_commas(expr: str):
+ # We want to be careful because we don't want to strip tuple commas
+ p1 = re.compile(r"(\d)(,)(\d\d\d)($|\D)")
+ while True:
+ next_expr = p1.sub("\\1\\3\\4", expr)
+ if next_expr == expr:
+ break
+ expr = next_expr
+ return next_expr
+
+
+def _remove_right_units(expr):
+ # "\\text{ " only ever occurs (at least in the val set) when describing units
+ if "\\text" in expr:
+ try:
+ splits = re.split(r"\\text\s*{\s*", expr)
+ # print(splits)
+ assert len(splits) == 2 and splits[0] not in ("", "(")
+ return splits[0]
+ except AssertionError:
+ pass
+
+ if "\\text{" in expr:
+ return re.sub(r"\\text{([^}]+)}", r"\1", expr)
+ elif "\\mbox{" in expr:
+ splits = expr.split("\\mbox{")
+ assert len(splits) == 2
+ return splits[0]
+ else:
+ return expr
+
+
+def _process_and_or_inside_text(string):
+ string = re.sub(r"\s*\\text{\s*(or|and)\s*}\s*", ",", string)
+ string = re.sub(r",\s*,", ",", string)
+ return string
+
+
+def _remove_left_and_right(expr):
+ """Remove the right and left latex commands."""
+ expr = re.sub(r"\\left", "", expr)
+ expr = re.sub(r"\\right", "", expr)
+ return expr
+
+
+def _fix_sqrt(string):
+ _string = re.sub(r"\\sqrt(\s*\w+)", r"\\sqrt{\1}", string)
+ return _string
+
+
+def _fix_interval(expr):
+ """Fix interval expression."""
+ if "\\in " in expr:
+ return expr.split("\\in ")[1].strip()
+
+ return expr
+
+
+def _inject_implicit_mixed_fraction(step: str):
+ """
+ Automatically make a mixed number evalable
+ e.g. 7 \\frac{3}{4} => 7+3/4
+ """
+ p1 = re.compile(r"(\d+) *\\frac{(\d+)}{(\d+)}")
+
+ def replacer(match):
+ whole_part = match.group(1)
+ numerator = match.group(2)
+ denominator = match.group(3)
+
+ if whole_part:
+ return f"{whole_part} + {numerator}/{denominator}"
+ else:
+ return f"{numerator}/{denominator}"
+
+ step = p1.sub(replacer, step)
+ return step
+
+
+def normalize_answer_string(expr: str) -> str:
+ """Normalize answer expressions."""
+ if expr is None:
+ return None
+
+ # Remove enclosing `\text{}`.
+
+ expr = _remove_left_and_right(expr)
+ expr = _process_and_or_inside_text(expr)
+ expr = _remove_right_units(expr)
+ expr = _fix_interval(expr)
+ for surround_str in [
+ "\\\\text",
+ "\\\\mathrm",
+ "\\\\mathcal",
+ "\\\\textbf",
+ "\\\\textit",
+ ]:
+ expr = expr.replace(surround_str, "")
+ pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
+ m = re.search(pattern, expr)
+ if m is not None:
+ expr = m.group("text")
+
+ expr = expr.replace("\!", "")
+ expr = expr.replace("\\%", "%")
+ expr = expr.replace("\\$", "$")
+ expr = expr.replace("$", "")
+ expr = expr.replace("%", "")
+ expr = expr.replace("^{\\circ}", "")
+
+ expr = expr.replace(" or ", " , ")
+ expr = expr.replace(" and ", " , ")
+
+ expr = expr.replace("million", "*10^6")
+ expr = expr.replace("billion", "*10^9")
+ expr = expr.replace("trillion", "*10^12")
+
+ for unit in [
+ "degree",
+ "cm",
+ "centimeter",
+ "meter",
+ "mile",
+ "second",
+ "minute",
+ "hour",
+ "week",
+ "month",
+ "year",
+ "foot",
+ "feet",
+ "inch",
+ "yard",
+ "p.m.",
+ "PM",
+ ]:
+ expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+
+ if "day" in expr:
+ days = [
+ "Monday",
+ "Tuesday",
+ "Wednesday",
+ "Thursday",
+ "Friday",
+ "Saturday",
+ "Sunday",
+ ]
+ weekday_expressed = False
+ for day in days:
+ if day in expr:
+ weekday_expressed = True
+ break
+
+ if not weekday_expressed:
+ expr = re.sub("day(s)?", "", expr)
+
+ expr = re.sub("\^ *\\\\circ", "", expr)
+
+ if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
+ expr = expr[1:-1]
+
+ expr = _fix_sqrt(expr)
+
+ # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+ expr = _fix_fracs(expr)
+
+ # edge case with mixed numbers and negative signs
+ expr = re.sub("- *", "-", expr)
+ expr = _inject_implicit_mixed_number(expr)
+ expr = _inject_implicit_mixed_fraction(expr)
+ expr = expr.replace(" ", "")
+
+ if _str_is_int(expr):
+ expr = str(_str_to_int(expr))
+
+ return expr
+
+
+def is_digit(s):
+ try:
+ if "{,}" in str(s):
+ num = float(str(s).replace("{,}", ""))
+ return True, num
+
+ num = float(str(s).replace(",", ""))
+ return True, num
+ except ValueError:
+ return False, None
+
+
+def normalize(answer) -> str:
+ # checking if answer is $ and removing $ in that case to compare
+ if isinstance(answer, str) and bool(re.match(r"\$\d+(\.\d+)?", answer)):
+ return answer[1:]
+
+ # checking if answer is % or \\% and removing %
+ if isinstance(answer, str) and (
+ bool(re.match(r"^\d+(\.\d+)?%$", answer))
+ or bool(re.match(r"^\d+(\.\d+)?\\%$", answer))
+ ):
+ return answer.replace("\\%", "").replace("%", "")
+
+ return answer
+
+
+def math_equal(
+ prediction: Union[bool, float, str],
+ reference: Union[float, str],
+ include_percentage: bool = True,
+ tolerance: float = 1e-4,
+ timeout: float = 10.0,
+) -> bool:
+ """
+ Exact match of math if and only if:
+ 1. numerical equal: both can convert to float and are equal
+ 2. symbolic equal: both can convert to sympy expression and are equal
+ """
+
+ # Check that the right antlr version is installed.
+ _check_antlr_version()
+
+ from sympy.parsing.sympy_parser import parse_expr
+
+ prediction = normalize(prediction)
+ reference = normalize(reference)
+
+ # another round of normalization
+ prediction = normalize_answer_string(prediction)
+ reference = normalize_answer_string(reference)
+
+ if (
+ isinstance(prediction, str) and len(prediction) > 1000
+ ): # handling weird corner-cases
+ prediction = prediction[:1000]
+
+ # 0. string comparison
+ if isinstance(prediction, str) and isinstance(reference, str):
+ if prediction.strip().lower() == reference.strip().lower():
+ return True
+ if prediction.replace(" ", "") == reference.replace(" ", ""):
+ return True
+
+ try: # 1. numerical equal
+ if is_digit(prediction)[0] and is_digit(reference)[0]:
+ prediction = is_digit(prediction)[1]
+ reference = is_digit(reference)[1]
+ # number questions
+ if include_percentage:
+ gt_result = [reference / 100, reference, reference * 100]
+ else:
+ gt_result = [reference]
+ for item in gt_result:
+ try:
+ if isclose(item, prediction, rel_tol=tolerance):
+ return True
+ except Exception:
+ continue
+ return False
+ except Exception:
+ pass
+
+ if not prediction and prediction not in [0, False]:
+ return False
+
+ # 2. symbolic equal
+ reference = str(reference).strip()
+ prediction = str(prediction).strip()
+
+ ## deal with [], (), {}
+ prediction = format_intervals(prediction)
+
+ pred_str, ref_str = prediction, reference
+ if (
+ prediction.startswith("[")
+ and prediction.endswith("]")
+ and not reference.startswith("(")
+ ) or (
+ prediction.startswith("(")
+ and prediction.endswith(")")
+ and not reference.startswith("[")
+ ):
+ pred_str = pred_str.strip("[]()")
+ ref_str = ref_str.strip("[]()")
+ for s in ["{", "}", "(", ")"]:
+ ref_str = ref_str.replace(s, "")
+ pred_str = pred_str.replace(s, "")
+ if pred_str == ref_str:
+ return True
+
+ ## [a, b] vs. [c, d], return a==c and b==d
+ if (
+ prediction
+ and reference
+ and prediction[0] in "(["
+ and prediction[-1] in ")]"
+ and prediction[0] == reference[0]
+ and prediction[-1] == reference[-1]
+ ):
+ pred_parts = prediction[1:-1].split(",")
+ ref_parts = reference[1:-1].split(",")
+ if len(pred_parts) == len(ref_parts):
+ if all(
+ [
+ math_equal(pred_pt, ref_pt, include_percentage, tolerance)
+ for pred_pt, ref_pt in zip(pred_parts, ref_parts)
+ ]
+ ):
+ return True
+
+ if "," in prediction and "," in reference:
+ pred_parts = [item.strip() for item in prediction.split(",")]
+ ref_parts = [item.strip() for item in reference.split(",")]
+
+ if len(pred_parts) == len(ref_parts):
+ if all(
+ [
+ math_equal(
+ pred_parts[i], ref_parts[i], include_percentage, tolerance
+ )
+ for i in range(len(pred_parts))
+ ]
+ ):
+ return True
+ else:
+ return False
+
+ # if we have point == tuple of values
+ if prediction.startswith("Point") and reference[0] == "(" and reference[-1] == ")":
+ pred_parts = prediction[prediction.find("(") + 1 : -1].split(",")
+ ref_parts = reference[1:-1].split(",")
+ if len(pred_parts) == len(ref_parts):
+ if all(
+ [
+ math_equal(pred_pt, ref_pt, include_percentage, tolerance)
+ for pred_pt, ref_pt in zip(pred_parts, ref_parts)
+ ]
+ ):
+ return True
+
+ # if reference is a matrix
+ if reference.startswith("\\begin{pmatrix}") and prediction.startswith("Matrix"):
+ try:
+ pred_matrix = parse_expr(prediction)
+ ref_matrix_items = reference.split()[1:-1:2]
+ if len(pred_matrix) == len(ref_matrix_items):
+ if all(
+ [
+ math_equal(ref, pred, include_percentage, tolerance)
+ for ref, pred in zip(ref_matrix_items, pred_matrix)
+ ]
+ ):
+ return True
+ except Exception:
+ pass
+
+ return symbolic_equal(prediction, reference, tolerance, timeout)
+
+
+def symbolic_equal(a, b, tolerance, timeout=10.0):
+ import sympy
+ from sympy.parsing.latex import parse_latex
+ from sympy.parsing.sympy_parser import parse_expr
+
+ def _parse(s):
+ for f in [parse_expr, parse_latex]:
+ try:
+ with time_limit(timeout):
+ return f(s)
+ except Exception:
+ pass
+ return s
+
+ a = _parse(a)
+ b = _parse(b)
+
+ try:
+ with time_limit(timeout):
+ if sympy.simplify(a - b) == 0:
+ return True
+ except Exception:
+ pass
+
+ try:
+ with time_limit(timeout):
+ if isclose(sympy.N(a), sympy.N(b), rel_tol=tolerance):
+ return True
+ except Exception:
+ pass
+ return False
+
+
+def extract_answer(
+ string: str,
+ extract_from_boxed: bool = True,
+ extract_regex: str = r"The final answer is (.+)$",
+):
+ """Extract Answer String from \\boxed expression or based on regex"""
+ if not extract_from_boxed:
+ match = re.search(extract_regex, string)
+ if match:
+ return match.group(1)
+ return None
+
+ if "\\boxed" not in string:
+ return None
+
+ idx = string.rfind("\\boxed")
+ if idx < 0:
+ idx = string.rfind("\\fbox")
+ if idx < 0:
+ return None
+
+ i = idx
+ right_brace_idx = None
+ num_left_braces_open = 0
+ while i < len(string):
+ if string[i] == "{":
+ num_left_braces_open += 1
+ if string[i] == "}":
+ num_left_braces_open -= 1
+ if num_left_braces_open == 0:
+ right_brace_idx = i
+ break
+ i += 1
+
+ if right_brace_idx is None:
+ retval = None
+ else:
+ retval = string[idx : right_brace_idx + 1]
+
+ if retval:
+ left = "\\boxed{"
+ try:
+ assert retval[: len(left)] == left
+ assert retval[-1] == "}"
+ return retval[len(left) : -1]
+ except AssertionError:
+ return None
+
+ return None
+
+
+class TimeoutException(Exception):
+ pass
+
+
+@contextlib.contextmanager
+def time_limit(seconds: float):
+ def signal_handler(signum, frame):
+ raise TimeoutException("Timed out!")
+
+ signal.setitimer(signal.ITIMER_REAL, seconds)
+ signal.signal(signal.SIGALRM, signal_handler)
+ try:
+ yield
+ finally:
+ signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+def format_intervals(prediction):
+ patterns = {
+ "Interval(": r"^Interval\((.*)\)$",
+ "Interval.Ropen(": r"^Interval\.Ropen\((.*)\)$",
+ "Interval.Lopen(": r"^Interval\.Lopen\((.*)\)$",
+ "Interval.open(": r"^Interval\.open\((.*)\)$",
+ }
+
+ for key, pattern in patterns.items():
+ match = re.match(pattern, prediction)
+ if match:
+ inner_content = match.group(1)
+
+ if key == "Interval(": # Interval(a, b) == [a, b]
+ return f"[{inner_content}]"
+ elif key == "Interval.Ropen(": # Interval.Ropen(a, b) == [a, b)
+ return f"[{inner_content})"
+ elif key == "Interval.Lopen(": # Interval.Lopen(a, b) == (a, b]
+ return f"({inner_content}]"
+ elif key == "Interval.open(": # Interval.open(a, b) == (a, b)
+ return f"({inner_content})"
+
+ return prediction
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
new file mode 100644
index 0000000000..29e769eb04
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
@@ -0,0 +1,65 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+task: prompt_robustness_math_algebra
+dataset_path: EleutherAI/hendrycks_math
+process_docs: !function utils_math.prompt_robustness_process_docs
+dataset_name: algebra
+output_type: generate_until
+test_split: test
+doc_to_text: !function utils_math.math_robustness_doc_to_text
+process_results: !function utils_math.process_results
+doc_to_target: answer
+generation_kwargs:
+ do_sample: false
+ temperature: 0
+ max_gen_toks: 1024
+metric_list:
+ - metric: 0_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_0
+ higher_is_better: true
+ - metric: 1_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_1
+ higher_is_better: true
+ - metric: 2_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_2
+ higher_is_better: true
+ - metric: 3_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_3
+ higher_is_better: true
+ - metric: 4_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_4
+ higher_is_better: true
+ - metric: 5_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_5
+ higher_is_better: true
+ - metric: 6_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_6
+ higher_is_better: true
+ - metric: 7_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_7
+ higher_is_better: true
+ - metric: 8_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_8
+ higher_is_better: true
+ - metric: 9_accuracy
+ aggregation: !function utils_math.per_prompt_accuracy_9
+ higher_is_better: true
+ - metric: consistency_rate
+ aggregation: !function utils_math.math_prompt_consistency_rate
+ higher_is_better: true
+metadata:
+ version: 1.0
+dataset_kwargs:
+ trust_remote_code: true
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_counting_and_prob.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_counting_and_prob.yaml
new file mode 100644
index 0000000000..6162fa9936
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_counting_and_prob.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: counting_and_probability
+task: prompt_robustness_math_counting_and_prob
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_geometry.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_geometry.yaml
new file mode 100644
index 0000000000..1ffa8438b0
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_geometry.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: geometry
+task: prompt_robustness_math_geometry
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_intermediate_algebra.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_intermediate_algebra.yaml
new file mode 100644
index 0000000000..ce65abf54e
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_intermediate_algebra.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: intermediate_algebra
+task: prompt_robustness_math_intermediate_algebra
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_num_theory.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_num_theory.yaml
new file mode 100644
index 0000000000..fde9802693
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_num_theory.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: number_theory
+task: prompt_robustness_math_num_theory
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_prealgebra.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_prealgebra.yaml
new file mode 100644
index 0000000000..9387f7df3d
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_prealgebra.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: prealgebra
+task: prompt_robustness_math_prealgebra
diff --git a/lm_eval/tasks/score/math/prompt_robustness_math_precalc.yaml b/lm_eval/tasks/score/math/prompt_robustness_math_precalc.yaml
new file mode 100644
index 0000000000..ca84fca7fc
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_precalc.yaml
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include: prompt_robustness_math_algebra.yaml
+dataset_name: precalculus
+task: prompt_robustness_math_precalc
diff --git a/lm_eval/tasks/score/math/prompt_templates.json b/lm_eval/tasks/score/math/prompt_templates.json
new file mode 100644
index 0000000000..072f574034
--- /dev/null
+++ b/lm_eval/tasks/score/math/prompt_templates.json
@@ -0,0 +1,35 @@
+{
+ "prompt_robustness": [
+ {
+ "prompt": "Efficiently solve the following math challenge. Explain your approach step-by-step\nThe answer should end with: The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}\nLets think step by step"
+ },
+ {
+ "prompt": "You should solve this math problem.\nIf the problem is easy, provide a brief solution with little explanation.\nFor more difficult problems, follow this structured format\n## Step 1: [Brief description]\n[Simple explanation and calculations]\n\n## Step 2: [Brief description]\n[Simple explanation and calculations]\n\nRepeat steps until your reach a solution\n\nProblem: {question}\nEnd with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
+ },
+ {
+ "prompt": "Solve this math problem. Your anwer should end with 'The final answer is: $\\boxed{{answer}}$' where [answer] is just the final number or expression that solves the problem\nProblem: {question}"
+ },
+ {
+ "prompt": "Analyze and solve the math task.\nProblem: {question}\nEnd the answer with:\nThe final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
+ },
+ {
+ "prompt": "{question}\nFind the solution to this math problem. Your answer should end with - The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
+ },
+ {
+ "prompt": "Calculate the answer to this math problem\nProblem: {question}\nConclude your answer with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
+ },
+ {
+ "prompt": "{question}\nPlease solve this math problem efficiently. Finish with: The final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
+ },
+ {
+ "prompt": "{question}\nSolve the following math problem\nShow each step of your solution\nConclude with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nLets think step by step"
+ },
+ {
+ "prompt": "Find the answer to the following math question. Conclude with: 'The final answer is: $\\boxed{{answer}}$'\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}"
+ },
+ {
+ "prompt": "Please solve the math problem. For simple problems offer a quick solution with minimal details. For more challenging problems, explain your approach step-by-step. Finish with\nThe final answer is: $\\boxed{{answer}}$.\nwhere [answer] is just the final number or expression that solves the problem.\nProblem: {question}\nLets think step by step."
+ }
+ ]
+
+}
diff --git a/lm_eval/tasks/score/math/score_prompt_robustness_math.yaml b/lm_eval/tasks/score/math/score_prompt_robustness_math.yaml
new file mode 100644
index 0000000000..b22e23b987
--- /dev/null
+++ b/lm_eval/tasks/score/math/score_prompt_robustness_math.yaml
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_prompt_robustness_math
+task:
+ - prompt_robustness_math_algebra
+ - prompt_robustness_math_counting_and_prob
+ - prompt_robustness_math_geometry
+ - prompt_robustness_math_intermediate_algebra
+ - prompt_robustness_math_num_theory
+ - prompt_robustness_math_prealgebra
+ - prompt_robustness_math_precalc
+
+aggregate_metric_list:
+ - metric: 0_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 1_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 2_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 3_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 4_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 5_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 6_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 7_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 8_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: 9_accuracy
+ aggregation: mean
+ weight_by_size: true
+ - metric: consistency_rate
+ aggregation: mean
+ weight_by_size: true
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/math/score_robustness_math.yaml b/lm_eval/tasks/score/math/score_robustness_math.yaml
new file mode 100644
index 0000000000..f3b733667b
--- /dev/null
+++ b/lm_eval/tasks/score/math/score_robustness_math.yaml
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_robustness_math
+task:
+ - score_prompt_robustness_math
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/math/to_be_fixed_questions.json b/lm_eval/tasks/score/math/to_be_fixed_questions.json
new file mode 100644
index 0000000000..020d07b956
--- /dev/null
+++ b/lm_eval/tasks/score/math/to_be_fixed_questions.json
@@ -0,0 +1,57 @@
+[
+ {
+ "id": "test/prealgebra/1088.json",
+ "problem": "Simplify $(5x+3) - 2(2x-4)$.",
+ "answer": "x+11"
+ },
+ {
+ "id": "test/algebra/1197.json",
+ "problem": "Two positive numbers $p$ and $q$ have the property that their sum is equal to their product. If their difference is $7$, what is $\\frac{1}{\\frac{1}{p^2}+\\frac{1}{q^2}}$? Your answer will be of the form $\\frac{a+b\\sqrt{c}}{d}$, where $a$ and $b$ don't both share the same common factor with $d$ and $c$ has no square as a factor. Find $a+b+c+d$.",
+ "answer": "161"
+ },
+ {
+ "id": "test/geometry/66.json",
+ "problem": "Square $ABCD$ has side lengths of 13 units. Point $E$ lies in the interior of the square such that $AE = 5$ units and $BE = 12$ units. What is the distance from $E$ to side $AD$?",
+ "answer": "\\frac{25}{13}"
+ },
+ {
+ "id": "test/geometry/1125.json",
+ "problem": "An aquarium has a rectangular base that measures 100 cm by 40 cm and has a height of 50 cm. The aquarium is filled with water to a depth of 37 cm. A rock with volume $1000 \\text{cm}^3$ is then placed in the aquarium and completely submerged. By how many centimeters does the water level rise? Express your answer as a decimal to the nearest 100th.",
+ "answer": "0.25\\text{ cm}"
+ },
+ {
+ "id": "test/prealgebra/1407.json",
+ "problem": "What number must be placed in the box in the equation below to produce an equation that has more than one solution: \\[4x + 6 + 7x - 9 = 12x - 7 - x + \\boxed{\\phantom{2}}?\\]",
+ "answer": "4"
+ },
+ {
+ "id": "test/prealgebra/224.json",
+ "problem": "I am going to buy exotic fruits. Dragonfruit costs $x-4$ dollars. Starfruit is five dollars less expensive than rambutan. Rambutan costs $2x$ dollars more than dragonfruit. How much does it cost to buy one rambutan, two starfruit, and three dragonfruit? Your answer will be an expression that depends on $x$.",
+ "answer": "-34 + 12x"
+ },
+ {
+ "id": "test/prealgebra/177.json",
+ "problem": "Let $\\boxed{N}$ mean the number of whole number divisors of $N$. For example, $\\boxed{3}=2$, because $3$ has two divisors, $1$ and $3.$ Find the value of \\[\\boxed{\\boxed{11}\\times\\boxed{20}}\\]",
+ "answer": "12"
+ },
+ {
+ "id": "test/number_theory/459.json",
+ "problem": "On a particular map, $3$ inches on the map equates to $10$ miles in real life. If you know that the real life distance between two buildings on the map is $53.25$ miles, what would the distance between the buildings be (in inches) on the map, expressed as a fraction?",
+ "answer": "\\frac{639}{40}"
+ },
+ {
+ "id": "test/intermediate_algebra/702.json",
+ "problem": "Find the coordinates of either of the vertices of the hyperbola \\[16x^2+16x-4y^2-20y-85=0.\\](Enter your answer as an ordered pair. Enter the coordinates of one of the vertices, not both.)",
+ "answer": "\\left(-\\tfrac52, -\\tfrac52\\right)"
+ },
+ {
+ "id": "test/intermediate_algebra/25.json",
+ "problem": "Find the coordinates of one of the foci of the hyperbola \\[x^2 - 10x = 4y^2 - 5.\\](Enter your answer as an ordered pair. Enter only one of the foci, not both.)",
+ "answer": "(0,0)"
+ },
+ {
+ "id": "test/intermediate_algebra/747.json",
+ "problem": "The graph of $y = f(x)$ passes through the point $(-3,5).$ If $f(x)$ is an odd function, then what other point must the graph pass through? Enter your answer as an ordered pair.",
+ "answer": "(0,0)"
+ }
+]
diff --git a/lm_eval/tasks/score/math/utils_math.py b/lm_eval/tasks/score/math/utils_math.py
new file mode 100644
index 0000000000..cf41473ae8
--- /dev/null
+++ b/lm_eval/tasks/score/math/utils_math.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from functools import partial
+from itertools import combinations
+from typing import Any, Dict, List
+
+import datasets
+import numpy as np
+
+from lm_eval.tasks.score import utils
+from lm_eval.tasks.score.math.math_grader import (
+ extract_answer,
+ math_equal,
+ normalize_answer_string,
+)
+from lm_eval.tasks.score.utils import robustness_doc_to_text
+from lm_eval.utils import eval_logger
+
+
+TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
+
+PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
+
+math_robustness_doc_to_text = robustness_doc_to_text
+
+
+def find_boxed_entries(answer_str):
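+    """Return the normalized contents of every \\boxed{...} entry in a reference solution; raise ValueError if none are found."""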
+ stack = []
+ results = []
+ i = 0
+
+ while i < len(answer_str):
+ if answer_str[i : i + 7] == "\\boxed{":
+ stack.append(i + 7)
+ i += 7
+ elif answer_str[i] == "{":
+ if stack:
+ stack.append(i + 1)
+ i += 1
+ elif answer_str[i] == "}":
+ if stack:
+ start = stack.pop()
+ if not stack:
+ results.append(answer_str[start:i])
+ i += 1
+ else:
+ i += 1
+
+ if len(results) == 0:
+ raise ValueError("Not enough boxed entries")
+ else:
+ results = [normalize_answer_string(result) for result in results]
+
+ if len(results) == 1:
+ # Single boxed entry, trivial case
+ return results
+
+ else:
+ # Multiple boxed entries. There are two cases possible
+ # (a) The reference solution has the same question answered in multiple ways
+ # (b) The answer is split across multiple boxed entries and we need to merge
+ result_equal = True
+ for idx in range(len(results) - 1):
+ if not (results[idx] == results[idx + 1]):
+ result_equal = False
+ break
+
+ if result_equal:
+ # Same problem solved in multiple ways
+ return [results[0]]
+ else:
+ return results
+
+
+def extract_answer_dataset(solution: str, problem: str, corrected_answers: list) -> str:
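+    """Extract the reference answer from a MATH solution, consulting the manually corrected answers when several distinct boxed entries are present."""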
+ entries = find_boxed_entries(solution)
+
+ if len(entries) == 1:
+ parsed_answer = entries[0]
+
+ if len(entries) > 1:
+ for item in corrected_answers:
+ if item["problem"] == problem:
+ parsed_answer = item["answer"]
+ break
+ else:
+ parsed_answer = ", ".join(entries)
+
+ if not (
+ ("Find the equation" in problem)
+ or ("Enter the equation" in problem)
+ or ("What is the equation" in problem)
+ or ("described by the equation" in problem)
+ or ("Find an equation" in problem)
+ ) and ("=" in parsed_answer):
+ if parsed_answer.count("=") == 1:
+ # For greater count, it means we're just predicting values of multiple variables
+ parsed_answer = parsed_answer.split("=")[1]
+ return parsed_answer
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
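+    """Attach a question_id and the extracted reference answer to every MATH document."""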
+ def _process_doc(doc: dict, idx, corrected_answer) -> dict:
+ out_doc = {
+ "question": doc["problem"],
+ "question_id": idx,
+ "solution": doc["solution"],
+ "answer": extract_answer_dataset(
+ doc["solution"], doc["problem"], corrected_answer
+ ),
+ }
+ return out_doc
+
+ corrected_answer_path = os.path.join(
+ os.path.dirname(__file__), "to_be_fixed_questions.json"
+ )
+
+ with open(corrected_answer_path, "r") as f:
+ corrected_answers = json.load(f)
+
+ return dataset.map(
+ partial(_process_doc, corrected_answer=corrected_answers), with_indices=True
+ )
+
+
+def prompt_robustness_process_docs(doc: datasets.Dataset) -> datasets.Dataset:
+ doc = process_docs(doc)
+ return utils.process_docs_add_prompts(
+ doc,
+ PROMPT_ROBUSTNESS_TEMPLATE_KEY,
+ TEMPLATE_FILE_PATH,
+ )
+
+
+def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
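+    """Grade the generation against the reference answer and emit per-prompt accuracy and consistency-rate entries."""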
+ answer = extract_answer(results[0])
+
+ if math_equal(answer, doc["answer"]):
+ retval = 1
+ else:
+ retval = 0
+
+ prompt_id = doc["prompt_id"]
+
+ results = {
+ f"{prompt_id}_accuracy": (prompt_id, retval),
+ "consistency_rate": (doc["question_id"], answer),
+ }
+ return results
+
+
+def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
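+    """Mean accuracy over all documents answered with prompt `p_id`."""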
+ accuracies = []
+ for result in results:
+ prompt_id, retval = result
+ if prompt_id != p_id:
+ continue
+ accuracies.append(retval)
+
+ accuracy = sum(accuracies) / len(accuracies)
+    eval_logger.info(f"Prompt - {p_id} accuracy: {accuracy}")
+
+ return np.round(accuracy, 4)
+
+
+per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
+per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
+per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
+per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
+per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
+per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
+per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
+per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
+per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
+per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
+
+
+def calculate_consistency_rate(responses: List[List[str]]) -> float:
+ """
+ Calculate the Consistency Rate (CR) for a given set of responses.
+
+ Args:
+ responses: List of lists, where each inner list contains responses to the same question.
+
+ Returns:
+ The consistency rate as a float.
+ """
+ total_similarity = 0
+ total_combinations = 0
+
+ for response_set in responses:
+ pairs = combinations(response_set, 2)
+ num_pairs = len(response_set) * (len(response_set) - 1) / 2
+ total_combinations += num_pairs
+ for answer1, answer2 in pairs:
+ total_similarity += int(math_equal(answer1, answer2))
+
+ return total_similarity / total_combinations if total_combinations > 0 else 0.0
+
+
+def math_prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
+ """
+    Calculate the Consistency Rate (CR) of answers across prompt variants.
+
+    Args:
+        results: List of (question_id, answer) tuples, one entry per generation.
+
+ Returns:
+ The consistency rate as a float.
+ """
+ question_answers_dict = {}
+
+ for result in results:
+ question_id, answer = result
+ if question_id not in question_answers_dict:
+ question_answers_dict[question_id] = []
+ question_answers_dict[question_id].append(answer)
+
+ question_answers_list = [answers for answers in question_answers_dict.values()]
+
+ return calculate_consistency_rate(question_answers_list)
diff --git a/lm_eval/tasks/score/mmlu_pro/prompt_templates.json b/lm_eval/tasks/score/mmlu_pro/prompt_templates.json
new file mode 100644
index 0000000000..57278cd17e
--- /dev/null
+++ b/lm_eval/tasks/score/mmlu_pro/prompt_templates.json
@@ -0,0 +1,51 @@
+{
+ "option_order_robustness":{
+ "prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion: {question}{options}\n\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
+ "options_format": "\n{letter}: {option}"
+ },
+
+ "prompt_robustness": [
+ {
+ "prompt": "{question}{options}\nExamine the question and choose the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. End your answer with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is a letter from A to J.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "{question}{options}\nAnswer the multiple-choice question about {category} by selecting the correct option from A to J. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of A, B, C, D, E, F, G, H, I, J.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "You must reply with only a single letter from A, B, C, D, E, F, G, H, I or J to this question. Conclude with:\nThe best answer is answer_letter where the answer_letter is a single letter from A to J. \n{question}{options}",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "From the options A-J, select the correct answer to the following question. End the answer with - The best answer is answer_letter, where answer_letter is one of A, B, C, D, E, F, G, H, I, or J.\nQuestion: {question}{options}",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion:{question}{options}\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Evaluate the multiple-choice question and select the most fitting response from 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'. \nQuestion:{question}{options}\nAlways conclude with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is one of A, B, C, D, E, F, G, H, I or J.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Answer to the following question about {category} by selecting the correct option A, B, C, D, E, F, G, H, I or J. {question}{options}\nThe answer should end with:\nThe best answer is [the_answer_letter] where [the_answer_letter] is one of the letters A to J. Let's think step by step.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Select the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J' for the question provided below. Conclude by stating: The best answer is answer_letter where answer_letter is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. Let's think step by step.\nQuestion: {question}{options}",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "{question}{options}\nFor this question about {category} with 10 possible answers A, B, C, D, E, F, G, H, I, J choose the one that answers the question. If the problem is simple or straightforward, just provide the answer. If the answer is more complex, use a step-by-step approach and for each step briefly explain your reasoning. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J'. Let's think step by step.",
+ "options_format": "\n{letter}: {option}"
+ },
+ {
+ "prompt": "Read the question and options below, then determine the correct answer choice (A-J)\nQuestion: {question}{options}\n\nFor simple questions, provide a quick answer. For complicated ones, think step by step, break down the question into smaller problems and reach to a conclusion\nEnd your answer by stating:\nThe best answer is [the_answer_letter].\nwhere [the_answer_letter] is one of A, B, C, D, E, F, G, H, I, or J.",
+ "options_format": "\n{letter}: {option}"
+ }
+
+ ]
+
+}
diff --git a/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml b/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
new file mode 100644
index 0000000000..4df6e432f9
--- /dev/null
+++ b/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
@@ -0,0 +1,66 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+tag: score_robustness_mmlu_pro
+task: score_option_order_robustness_mmlu_pro
+dataset_path: TIGER-Lab/MMLU-Pro
+dataset_name: default
+output_type: generate_until
+validation_split: validation
+test_split: test
+process_docs: !function utils_mmlu_pro.option_order_robustness_process_docs
+doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
+doc_to_target: answer
+generation_kwargs:
+ max_gen_toks: 1024
+ do_sample: False
+process_results: !function utils_mmlu_pro.option_order_robustness_process_results
+metric_list:
+ - metric: per_option_macro_accuracy_A
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_B
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_C
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_D
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_E
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_F
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_G
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_H
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_I
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
+ higher_is_better: true
+ - metric: per_option_macro_accuracy_J
+ aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
+ higher_is_better: true
+ - metric: options_consistency_rate
+ aggregation: !function utils_mmlu_pro.options_consistency_rate
+ higher_is_better: true
+metadata:
+ version: 1.0
+dataset_kwargs:
+ trust_remote_code: true
diff --git a/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml b/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
new file mode 100644
index 0000000000..735d642a55
--- /dev/null
+++ b/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
@@ -0,0 +1,66 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+tag: score_robustness_mmlu_pro
+task: score_prompt_robustness_mmlu_pro
+dataset_path: TIGER-Lab/MMLU-Pro
+dataset_name: default
+output_type: generate_until
+validation_split: validation
+test_split: test
+process_docs: !function utils_mmlu_pro.prompt_robustness_process_docs
+doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
+doc_to_target: answer
+generation_kwargs:
+ max_gen_toks: 1024
+ do_sample: False
+process_results: !function utils_mmlu_pro.prompt_robustness_process_results
+metric_list:
+ - metric: 0_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
+ higher_is_better: true
+ - metric: 1_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
+ higher_is_better: true
+ - metric: 2_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
+ higher_is_better: true
+ - metric: 3_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
+ higher_is_better: true
+ - metric: 4_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
+ higher_is_better: true
+ - metric: 5_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
+ higher_is_better: true
+ - metric: 6_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
+ higher_is_better: true
+ - metric: 7_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
+ higher_is_better: true
+ - metric: 8_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
+ higher_is_better: true
+ - metric: 9_macro_accuracy
+ aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
+ higher_is_better: true
+ - metric: consistency_rate
+ aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
+ higher_is_better: true
+metadata:
+ version: 1.0
+dataset_kwargs:
+ trust_remote_code: true
diff --git a/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py b/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py
new file mode 100644
index 0000000000..4dd4b65703
--- /dev/null
+++ b/lm_eval/tasks/score/mmlu_pro/utils_mmlu_pro.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from functools import partial
+from typing import Any, Dict, List
+
+import numpy as np
+
+from lm_eval.tasks.score import utils
+from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
+from lm_eval.utils import eval_logger
+
+
+TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
+
+PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
+OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
+
+QUESTION_KEY = "question"
+
+LABELS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
+
+mmlu_pro_prompt_consistency_rate = prompt_consistency_rate
+mmlu_pro_robustness_doc_to_text = robustness_doc_to_text
+
+
+prompt_robustness_process_docs = partial(
+ utils.process_docs_add_prompts,
+ templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
+ template_file_path=TEMPLATE_FILE_PATH,
+)
+
+option_order_robustness_process_docs = partial(
+ utils.option_order_robustness_process_docs,
+ template_file_path=TEMPLATE_FILE_PATH,
+ templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
+ labels=LABELS,
+)
+
+
+def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
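+    """Extract the predicted option letter and emit per-prompt macro-accuracy and consistency-rate entries."""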
+ final_answer = utils.__postprocess_pred(results[0])
+ final_answer = utils.translate_model_answer_to_labels(
+ final_answer, option_format=doc["options_format"], labels=LABELS
+ )
+ gt = LABELS[doc["answer_index"]]
+ prompt_id = doc["prompt_id"]
+ question_id = doc["question_id"]
+ category = doc["category"]
+ return {
+ f"{prompt_id}_macro_accuracy": (
+ question_id,
+ prompt_id,
+ final_answer,
+ gt,
+ category,
+ ),
+ "consistency_rate": (question_id, prompt_id, final_answer, gt),
+ }
+
+
+def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
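+    """Extract the predicted option letter and emit per-option accuracy and options-consistency entries for the answer-position swap."""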
+ final_answer = utils.__postprocess_pred(results[0])
+ final_answer = utils.translate_model_answer_to_labels(
+ final_answer, option_format=doc["options_format"], labels=LABELS
+ )
+ gt = LABELS[doc["answer_index"]]
+ always_same_option = doc["always_same_option"]
+ question_id = doc["question_id"]
+ original_answer_index = doc["original_answer_index"]
+    answer_index = doc["answer_index"]
+ category = doc["category"]
+ return {
+ f"per_option_macro_accuracy_{always_same_option}": (
+ question_id,
+ always_same_option,
+ final_answer,
+ gt,
+ category,
+ ),
+ "options_consistency_rate": (
+ question_id,
+ always_same_option,
+ final_answer,
+ original_answer_index,
+ answer_index,
+ ),
+ }
+
+
+def per_prompt_macro_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
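+    """Category-macro accuracy for prompt `p_id`: per-category accuracies averaged with equal weight."""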
+ accuracies = {}
+ for result in results:
+ question_id, prompt_id, final_answer, gt, category = result
+ if prompt_id != p_id:
+ continue
+ if category not in accuracies:
+ accuracies[category] = []
+ accuracies[category].append(final_answer == gt)
+
+ for key in accuracies:
+ accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
+ eval_logger.info(
+ f"Prompt - {prompt_id}, category - {key} accuracy: {accuracies[key]}"
+ )
+
+ return np.round(np.mean([v for v in accuracies.values()]), 4)
+
+
+per_prompt_accuracy_0 = partial(per_prompt_macro_accuracy, p_id=0)
+per_prompt_accuracy_1 = partial(per_prompt_macro_accuracy, p_id=1)
+per_prompt_accuracy_2 = partial(per_prompt_macro_accuracy, p_id=2)
+per_prompt_accuracy_3 = partial(per_prompt_macro_accuracy, p_id=3)
+per_prompt_accuracy_4 = partial(per_prompt_macro_accuracy, p_id=4)
+per_prompt_accuracy_5 = partial(per_prompt_macro_accuracy, p_id=5)
+per_prompt_accuracy_6 = partial(per_prompt_macro_accuracy, p_id=6)
+per_prompt_accuracy_7 = partial(per_prompt_macro_accuracy, p_id=7)
+per_prompt_accuracy_8 = partial(per_prompt_macro_accuracy, p_id=8)
+per_prompt_accuracy_9 = partial(per_prompt_macro_accuracy, p_id=9)
+
+
+def per_option_macro_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
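+    """Category-macro accuracy over questions whose correct answer was placed at option `always_opt`."""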
+ accuracies = {}
+ for result in results:
+ question_id, always_same_option, final_answer, gt, category = result
+ if always_opt != always_same_option:
+ continue
+ if category not in accuracies:
+ accuracies[category] = []
+ accuracies[category].append(int(final_answer == gt))
+
+ for key in accuracies:
+ accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
+ eval_logger.info(
+ f"Prompt - {always_opt.upper()}, category - {key} accuracy: {accuracies[key]}"
+ )
+
+ return np.round(np.mean([v for v in accuracies.values()]), 4)
+
+
+per_option_macro_accuracy_a = partial(per_option_macro_accuracy, always_opt="A")
+per_option_macro_accuracy_b = partial(per_option_macro_accuracy, always_opt="B")
+per_option_macro_accuracy_c = partial(per_option_macro_accuracy, always_opt="C")
+per_option_macro_accuracy_d = partial(per_option_macro_accuracy, always_opt="D")
+per_option_macro_accuracy_e = partial(per_option_macro_accuracy, always_opt="E")
+per_option_macro_accuracy_f = partial(per_option_macro_accuracy, always_opt="F")
+per_option_macro_accuracy_g = partial(per_option_macro_accuracy, always_opt="G")
+per_option_macro_accuracy_h = partial(per_option_macro_accuracy, always_opt="H")
+per_option_macro_accuracy_i = partial(per_option_macro_accuracy, always_opt="I")
+per_option_macro_accuracy_j = partial(per_option_macro_accuracy, always_opt="J")
+
+options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
diff --git a/lm_eval/tasks/score/score_robustness.yaml b/lm_eval/tasks/score/score_robustness.yaml
new file mode 100644
index 0000000000..602f6d7d3e
--- /dev/null
+++ b/lm_eval/tasks/score/score_robustness.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+group: score_robustness
+task:
+ - score_robustness_agieval
+ - score_robustness_mmlu_pro
+ - score_robustness_math
+metadata:
+ version: 1.0
diff --git a/lm_eval/tasks/score/utils.py b/lm_eval/tasks/score/utils.py
new file mode 100644
index 0000000000..5a7174f1ee
--- /dev/null
+++ b/lm_eval/tasks/score/utils.py
@@ -0,0 +1,263 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import json
+import re
+import string
+import sys
+from functools import partial
+from itertools import combinations
+from typing import Any, Dict, List
+
+import numpy as np
+from datasets import Dataset
+
+from lm_eval.utils import eval_logger
+
+
+NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
+ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]
+
+
+def __repeat_elements(lst, n):
+ result = []
+ for element in lst:
+ result.extend([element] * n)
+ return result
+
+
+def process_docs_add_prompts(
+ doc: Dataset,
+ templates_key: str,
+ template_file_path: str,
+ dataset_specific_preprocess: callable = None,
+) -> Dataset:
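+    """Repeat each document once per prompt template, attaching prompt_id, prompt and, when present, options_format."""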
+ try:
+ with open(template_file_path) as f:
+ prompt_templates = json.load(f)[templates_key]
+ except FileNotFoundError:
+ eval_logger.error("Prompt templates not found")
+ sys.exit()
+ if dataset_specific_preprocess is not None:
+ doc = dataset_specific_preprocess(doc)
+
+ def process_batch(batch):
+ n = len(prompt_templates)
+ initial_len = len(next(iter(batch.values())))
+
+ result = {key: __repeat_elements(values, n) for key, values in batch.items()}
+ result["prompt_id"] = list(range(n)) * initial_len
+ result["prompt"] = [prompt_templates[i]["prompt"] for i in result["prompt_id"]]
+ if "options_format" in prompt_templates[0]:
+ result["options_format"] = [
+ prompt_templates[i]["options_format"] for i in result["prompt_id"]
+ ]
+ return result
+
+ return doc.map(process_batch, batched=True)
+
+
+def option_order_robustness_process_docs(
+ doc: Dataset,
+ template_file_path: str,
+ templates_key: str,
+ labels: list,
+ dataset_specific_preprocess: callable = None,
+) -> Dataset:
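+    """Repeat each document once per option label, swapping the correct answer into that position and keeping the original index."""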
+ try:
+ with open(template_file_path) as f:
+ prompt_template = json.load(f)[templates_key]
+ prompt = prompt_template["prompt"]
+ options_format = prompt_template["options_format"]
+ except FileNotFoundError:
+ eval_logger.error("Prompt templates not found")
+ sys.exit()
+
+ if dataset_specific_preprocess is not None:
+ doc = dataset_specific_preprocess(doc)
+
+ def repeat_doc_swap_correct_answer(batched_docs):
+ initial_len = len(next(iter(batched_docs.values())))
+ keys = list(batched_docs.keys())
+ new_batched_docs = {key: [] for key in keys}
+ new_batched_docs["always_same_option"] = []
+ new_batched_docs["prompt"] = []
+ new_batched_docs["options_format"] = []
+ new_batched_docs["original_answer_index"] = []
+
+ for doc_ind in range(initial_len):
+ for label_ind, label in enumerate(labels):
+ new_batched_docs["original_answer_index"].append(
+ batched_docs["answer_index"][doc_ind]
+ )
+ for key in keys:
+ new_batched_docs[key].append(
+ copy.deepcopy(batched_docs[key][doc_ind])
+ )
+ if label_ind < len(batched_docs["options"][doc_ind]):
+ if key == "options":
+ # Swap correct answer with label_ind option
+ new_batched_docs[key][-1][label_ind] = batched_docs[
+ "options"
+ ][doc_ind][batched_docs["answer_index"][doc_ind]]
+ new_batched_docs[key][-1][
+ batched_docs["answer_index"][doc_ind]
+ ] = batched_docs["options"][doc_ind][label_ind]
+
+ if key == "answer_index":
+ new_batched_docs[key][-1] = label_ind
+
+ if key == "answer":
+ new_batched_docs[key][-1] = label
+
+ new_batched_docs["always_same_option"].append(label)
+ new_batched_docs["prompt"].append(prompt)
+ new_batched_docs["options_format"].append(options_format)
+ return new_batched_docs
+
+ return doc.map(repeat_doc_swap_correct_answer, batched=True)
+
+
+def robustness_doc_to_text(doc: Dataset) -> str:
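+    """Render the document's prompt template, formatting its options with the template's options_format."""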
+ upper_case = string.ascii_uppercase
+ lower_case = string.ascii_lowercase
+ prompt = doc["prompt"]
+ options_format = doc.get("options_format", "")
+ question = doc["question"]
+    category = doc.get("category", "")
+ options = None
+ if options_format:
+ options = "".join(
+ [
+ options_format.format(
+ letter=upper_case[i],
+ option=doc["options"][i],
+ numeral=NUMERALS[i],
+ roman_numeral=ROMAN_NUMERALS[i],
+ lower_case_letter=lower_case[i],
+ )
+ for i in range(len(doc["options"]))
+ ]
+ )
+    return prompt.format(question=question, options=options, category=category)
+
+
+def __postprocess_pred(pred):
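+    """Pull out the token following 'the best answer is'; return the raw prediction when the phrase is absent."""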
+ if "the best answer is" not in pred.lower():
+ return pred
+ pred_proc = (
+ pred.lower().split("the best answer is ")[-1].split("\n")[0].split(" ")[0]
+ )
+ pred_proc = re.sub(r"[^a-zA-Z0-9]", "", pred_proc).strip()
+ return pred_proc.upper()
+
+
+def translate_model_answer_to_labels(answer, labels, option_format=None):
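+    """Map numeral or Roman-numeral answers back to their letter labels when the options were presented that way."""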
+ answer = answer.upper()
+
+ if option_format is None:
+ return answer
+
+ elif "numeral" in option_format:
+ if "roman" in option_format:
+ if answer not in ROMAN_NUMERALS:
+ return answer
+ else:
+ return labels[ROMAN_NUMERALS.index(answer)]
+
+ if answer not in NUMERALS:
+ return answer
+ else:
+ return labels[NUMERALS.index(answer)]
+
+ return answer
+
+
+def calculate_consistency_rate(responses: List[List[str]]) -> float:
+ """
+ Calculate the Consistency Rate (CR) for a given set of responses.
+
+ Args:
+ responses: List of lists, where each inner list contains responses to the same question.
+
+ Returns:
+ The consistency rate as a float.
+ """
+ total_similarity = 0
+ total_combinations = 0
+
+ for response_set in responses:
+ pairs = combinations(response_set, 2)
+ num_pairs = len(response_set) * (len(response_set) - 1) / 2
+ total_combinations += num_pairs
+ for answer1, answer2 in pairs:
+ total_similarity += int(answer1 == answer2)
+
+ return total_similarity / total_combinations if total_combinations > 0 else 0.0
+
+
+def prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
+ """
+    Calculate the Consistency Rate (CR) of answers across prompt variants.
+
+    Args:
+        results: List of (question_id, prompt_id, final_answer, gt) tuples, one entry per generation.
+
+ Returns:
+ The consistency rate as a float.
+ """
+ question_answers_dict = {}
+
+ for result in results:
+ question_id, prompt_id, final_answer, gt = result
+ if question_id not in question_answers_dict:
+ question_answers_dict[question_id] = []
+ question_answers_dict[question_id].append(final_answer)
+
+ question_answers_list = [answers for answers in question_answers_dict.values()]
+
+ return calculate_consistency_rate(question_answers_list)
+
+
+def options_consistency_rate(results: List[Dict[str, Any]], labels) -> float:
+ """
+    Calculate the Consistency Rate (CR) of answers across option-order perturbations.
+
+    Args:
+        results: List of (question_id, always_same_option, final_answer, original_answer_index, answer_index) tuples.
+        labels: Option letters used by the task.
+
+ Returns:
+ The consistency rate as a float.
+ """
+ question_answers_dict = {}
+ for result in results:
+ (
+ question_id,
+ always_same_option,
+ final_answer,
+ original_answer_index,
+ answer_index,
+ ) = result
+        # Map the two swapped positions onto each other so answers are comparable across placements;
+        # elif prevents the second remapping from undoing the first.
+        if final_answer == labels[original_answer_index]:
+            final_answer = always_same_option
+        elif final_answer == always_same_option:
+            final_answer = labels[original_answer_index]
+ if question_id not in question_answers_dict:
+ question_answers_dict[question_id] = []
+ question_answers_dict[question_id].append(final_answer)
+
+ question_answers_list = [answers for answers in question_answers_dict.values()]
+
+ return calculate_consistency_rate(question_answers_list)