Commit 0822bb2

Merge pull request #99 from OpenGPTX/mlmm
New MLMM tasks
2 parents: 5556094 + c8b5316

6 files changed: +607 -0 lines changed

lm_eval/tasks/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -64,6 +64,12 @@
 from .aam.all_tasks_registry import TASK_REGISTRY as AAM_TASK_REGISTRY
 from .opengptx.all_tasks_registry import TASK_REGISTRY as OGX_TASK_REGISTRY
 
+from .mlmm import multilingual_arc
+from .mlmm import multilingual_hellaswag
+from .mlmm import multilingual_mmlu
+from .mlmm import multilingual_truthfulqa
+
+
 ########################################
 # Translation tasks
 ########################################

@@ -328,6 +334,11 @@
     **tmp_new_pawsx.construct_tasks(),
     **tmp_new_xnli.construct_tasks(),
     **mgsm.construct_tasks(),
+    # Multilingual OpenLLM Evaluation
+    **multilingual_arc.create_all_tasks(),
+    **multilingual_mmlu.create_all_tasks(),
+    **multilingual_truthfulqa.create_all_tasks(),
+    **multilingual_hellaswag.create_all_tasks(),
 }
 
 # append the luminous (eg. Aleph-Alpha implemented) tasks to the whole registry
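
A minimal usage sketch (not part of the commit) of what these new registry entries provide, assuming this fork of lm-evaluation-harness is installed so that lm_eval.tasks.mlmm is importable:

from lm_eval.tasks.mlmm import multilingual_arc

# create_all_tasks() returns one MultipleChoiceTask subclass per language,
# keyed by the name that gets merged into the task registry above.
tasks = multilingual_arc.create_all_tasks()
print(sorted(tasks)[:3])        # ['mlmm_arc_ar', 'mlmm_arc_bn', 'mlmm_arc_ca']
TaskCls = tasks["mlmm_arc_vi"]  # a class; constructing it would fetch the malteos/m_arc data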

lm_eval/tasks/mlmm/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
"""
Tasks from "Multilingual Large Language Models Evaluation Benchmark"

Source: https://github.com/nlp-uoregon/mlmm-evaluation

This repo contains benchmark datasets and evaluation scripts for Multilingual Large Language Models (LLMs). These datasets can be used to evaluate the models across 26 different languages and encompass three distinct tasks: ARC, HellaSwag, and MMLU. This is released as a part of our [Okapi framework](https://github.com/nlp-uoregon/Okapi) for multilingual instruction-tuned LLMs with reinforcement learning from human feedback.

- [**ARC**](https://allenai.org/data/arc): A dataset with 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in advanced question-answering.
- [**HellaSwag**](https://allenai.org/data/hellaswag): HellaSwag is a dataset for studying grounded commonsense inference. It consists of 70k multiple-choice questions about grounded situations: each question comes from one of two domains, *activitynet* or *wikihow*, with four answer choices about what might happen next in the scene. The correct answer is the (real) sentence for the next event; the three incorrect answers are adversarially generated and human-verified, so as to fool machines but not humans.
- [**MMLU**](https://arxiv.org/pdf/2009.03300.pdf): This dataset contains multiple-choice questions derived from diverse fields of knowledge. The test covers subjects in the humanities, social sciences, hard sciences, and other essential areas of learning for certain individuals.

Currently, our datasets support 26 languages: Russian, German, Chinese, French, Spanish, Italian, Dutch, Vietnamese, Indonesian, Arabic, Hungarian, Romanian, Danish, Slovak, Ukrainian, Catalan, Serbian, Croatian, Hindi, Bengali, Tamil, Nepali, Malayalam, Marathi, Telugu, and Kannada.
"""
lm_eval/tasks/mlmm/multilingual_arc.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
"""
Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge
https://arxiv.org/pdf/1803.05457.pdf

The ARC dataset consists of 7,787 science exam questions drawn from a variety
of sources, including science questions provided under license by a research
partner affiliated with AI2. These are text-only, English language exam questions
that span several grade levels as indicated in the files. Each question has a
multiple choice structure (typically 4 answer options). The questions are sorted
into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and
a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions.

Homepage: https://allenai.org/data/arc
"""
from lm_eval.base import MultipleChoiceTask

_CITATION = """
@article{Clark2018ThinkYH,
    title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
    author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
    journal={ArXiv},
    year={2018},
    volume={abs/1803.05457}
}
"""

LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(
    ","
)


def create_all_tasks():
    """Creates a dictionary of tasks from the list of supported languages
    :return: {task_name: task}
        e.g. {mlmm_arc_vi: Task, mlmm_arc_bn: Task}
    """
    return {f"mlmm_arc_{lang}": create_task(lang) for lang in LANGS}


def create_task(lang):
    class ATest(MultilingualARC):
        def __init__(self):
            super().__init__(lang)

    return ATest


class MultilingualARC(MultipleChoiceTask):
    def __init__(self, lang, **kwargs):
        self.VERSION = 0
        self.lang = lang
        self.DATASET_NAME = f"arc_{lang}"
        self.DATASET_PATH = "malteos/m_arc"
        self.NUM_FEW_SHOT = 25
        super().__init__(**kwargs)

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def test_docs(self):
        return map(self._process_doc, self.dataset["test"])

    def _process_doc(self, doc):
        # NOTE: answerKey is expected to be a letter; map A-E to a 0-based choice index.
        out_doc = {
            "id": doc["id"],
            "query": "Question: " + doc["question"] + "\nAnswer:",
            "choices": doc["choices"],
            "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
        }
        return out_doc

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]
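
A hypothetical record, invented here purely for illustration with the field names _process_doc reads, and how the class maps it to a multiple-choice doc (the module path follows the import added in lm_eval/tasks/__init__.py):

from lm_eval.tasks.mlmm.multilingual_arc import MultilingualARC

doc = {
    "id": "example-0",
    "question": "Which body does the Earth orbit?",
    "choices": ["The Moon", "The Sun", "Mars", "Venus"],
    "answerKey": "B",
}
# _process_doc does not touch self, so it can be called unbound for illustration.
out = MultilingualARC._process_doc(None, doc)
print(out["query"])  # "Question: Which body does the Earth orbit?\nAnswer:"
print(out["gold"])   # 1, the index of "B" in ["A", "B", "C", "D", "E"]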
lm_eval/tasks/mlmm/multilingual_hellaswag.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
"""
HellaSwag: Can a Machine Really Finish Your Sentence?
https://arxiv.org/pdf/1905.07830.pdf

HellaSwag is a commonsense inference challenge dataset. Though its questions are
trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is
achieved via Adversarial Filtering (AF), a data collection paradigm wherein a
series of discriminators iteratively select an adversarial set of machine-generated
wrong answers. AF proves to be surprisingly robust. The key insight is to scale up
the length and complexity of the dataset examples towards a critical 'Goldilocks'
zone wherein generated text is ridiculous to humans, yet often misclassified by
state-of-the-art models.

Homepage: https://rowanzellers.com/hellaswag/
"""
import re
from lm_eval.base import MultipleChoiceTask

_CITATION = """
@inproceedings{zellers2019hellaswag,
    title={HellaSwag: Can a Machine Really Finish Your Sentence?},
    author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
    booktitle={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
    year={2019}
}
"""

LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(
    ","
)


def create_all_tasks():
    """Creates a dictionary of tasks from the list of supported languages
    :return: {task_name: task}
        e.g. {mlmm_hellaswag_vi: Task, mlmm_hellaswag_de: Task}
    """
    return {f"mlmm_hellaswag_{lang}": create_task(lang) for lang in LANGS}


def create_task(lang):
    class ATest(HellaSwag):
        def __init__(self):
            super().__init__(lang)

    return ATest


class HellaSwag(MultipleChoiceTask):
    def __init__(self, lang, **kwargs):
        self.VERSION = 1
        self.lang = lang
        self.DATASET_NAME = f"hellaswag_{lang}"
        self.DATASET_PATH = "malteos/m_hellaswag"
        self.NUM_FEW_SHOT = 0
        super().__init__(**kwargs)

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self._training_docs is None:
            self._training_docs = list(map(self._process_doc, self.dataset["train"]))
        return self._training_docs

    def validation_docs(self):
        return map(self._process_doc, self.dataset["validation"])

    def _process_doc(self, doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        out_doc = {
            "query": self.preprocess(doc["activity_label"] + ": " + ctx),
            "choices": [self.preprocess(ending) for ending in doc["endings"]],
            "gold": int(doc["label"]),
        }
        return out_doc

    @classmethod
    def preprocess(cls, text):
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    def doc_to_text(self, doc):
        return doc["query"]

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["query"]
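
A quick check (hypothetical input string, not from the dataset) of what preprocess does to the WikiHow-style bracket markers; preprocess is a classmethod, so it can be exercised without instantiating a task:

from lm_eval.tasks.mlmm.multilingual_hellaswag import HellaSwag

raw = "[header] How to grow herbs [title] Choose a sunny spot. [step] Water the soil regularly."
print(HellaSwag.preprocess(raw))
# -> " How to grow herbs. Choose a sunny spot. Water the soil regularly."
#    (bracketed markers stripped, double spaces collapsed; the leading space survives
#     because strip() runs before the substitutions)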
