From 8756403fc8fc74cc0011cc30d8ef1453ce5e191e Mon Sep 17 00:00:00 2001 From: issam <mr.issam.yahiaoui@gmail.com> Date: Sat, 16 Dec 2023 19:29:03 +0400 Subject: [PATCH] add new tasks... --- lm_eval/tasks/medmcqa/README.md | 61 +++++++++++++++++++ lm_eval/tasks/medmcqa/medmcqa.yaml | 11 ++++ lm_eval/tasks/medmcqa/utils.py | 27 ++++++++ lm_eval/tasks/medqa/README.md | 58 ++++++++++++++++++ lm_eval/tasks/medqa/medqa.yaml | 13 ++++ lm_eval/tasks/medqa/utils.py | 24 ++++++++ lm_eval/tasks/medqa_sample_exam/README.md | 49 +++++++++++++++ .../medqa_usmle_exam_step_1.yaml | 11 ++++ .../medqa_usmle_exam_step_2.yaml | 11 ++++ .../medqa_usmle_exam_step_3.yaml | 11 ++++ lm_eval/tasks/medqa_sample_exam/utils.py | 24 ++++++++ .../tasks/medqa_usmle_assessment/README.md | 41 +++++++++++++ .../medqa_usmle_assessment.yaml | 12 ++++ lm_eval/tasks/medqa_usmle_assessment/utils.py | 27 ++++++++ 14 files changed, 380 insertions(+) create mode 100644 lm_eval/tasks/medmcqa/README.md create mode 100644 lm_eval/tasks/medmcqa/medmcqa.yaml create mode 100644 lm_eval/tasks/medmcqa/utils.py create mode 100644 lm_eval/tasks/medqa/README.md create mode 100644 lm_eval/tasks/medqa/medqa.yaml create mode 100644 lm_eval/tasks/medqa/utils.py create mode 100644 lm_eval/tasks/medqa_sample_exam/README.md create mode 100644 lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_1.yaml create mode 100644 lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_2.yaml create mode 100644 lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_3.yaml create mode 100644 lm_eval/tasks/medqa_sample_exam/utils.py create mode 100644 lm_eval/tasks/medqa_usmle_assessment/README.md create mode 100644 lm_eval/tasks/medqa_usmle_assessment/medqa_usmle_assessment.yaml create mode 100644 lm_eval/tasks/medqa_usmle_assessment/utils.py diff --git a/lm_eval/tasks/medmcqa/README.md b/lm_eval/tasks/medmcqa/README.md new file mode 100644 index 0000000000..b0ddd41e3f --- /dev/null +++ b/lm_eval/tasks/medmcqa/README.md @@ -0,0 +1,61 @@ +# MedMCQA + +### Paper + +Title: `MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering` + +Abstract: `https://arxiv.org/pdf/2203.14371.pdf` + +`This paper introduces MedMCQA, a new large-scale, Multiple-Choice Question Answering (MCQA) dataset +designed to address real-world medical entrance exam questions. More than 194k high-quality AIIMS +& NEET PG entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected +with an average token length of 12.77 and high topical diversity. Each sample contains a question, +correct answer(s), and other options which requires a deeper language understanding as it tests the +10+ reasoning abilities of a model across a wide range of medical subjects & topics. 
A detailed explanation +of the solution, along with the above information, is provided in this study.` + +Homepage: `https://medmcqa.github.io` + + +### Citation + +``` +@InProceedings{pmlr-v174-pal22a, + title = {MedMCQA: A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering}, + author = {Pal, Ankit and Umapathi, Logesh Kumar and Sankarasubbu, Malaikannan}, + booktitle = {Proceedings of the Conference on Health, Inference, and Learning}, + pages = {248--260}, + year = {2022}, + editor = {Flores, Gerardo and Chen, George H and Pollard, Tom and Ho, Joyce C and Naumann, Tristan}, + volume = {174}, + series = {Proceedings of Machine Learning Research}, + month = {07--08 Apr}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v174/pal22a/pal22a.pdf}, + url = {https://proceedings.mlr.press/v174/pal22a.html}, + abstract = {This paper introduces MedMCQA, a new large-scale, Multiple-Choice Question Answering (MCQA) dataset designed to address real-world medical entrance exam questions. More than 194k high-quality AIIMS & NEET PG entrance exam MCQs covering 2.4k healthcare topics and 21 medical subjects are collected with an average token length of 12.77 and high topical diversity. Each sample contains a question, correct answer(s), and other options which requires a deeper language understanding as it tests the 10+ reasoning abilities of a model across a wide range of medical subjects & topics. A detailed explanation of the solution, along with the above information, is provided in this study.} +} +``` + +### Groups and Tasks + +#### Groups + +* `group_name`: `Short description` + +#### Tasks + +* `MedMCQA`: `A Large-scale Multi-Subject Multi-Choice Dataset for Medical domain Question Answering` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/medmcqa/medmcqa.yaml b/lm_eval/tasks/medmcqa/medmcqa.yaml new file mode 100644 index 0000000000..c512242f41 --- /dev/null +++ b/lm_eval/tasks/medmcqa/medmcqa.yaml @@ -0,0 +1,11 @@ +task: medmcqa +dataset_path: medmcqa +output_type: multiple_choice +validation_split: validation +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +doc_to_choice: "{{choices}}" +metric_list: + - metric: acc + - metric: acc_norm \ No newline at end of file diff --git a/lm_eval/tasks/medmcqa/utils.py b/lm_eval/tasks/medmcqa/utils.py new file mode 100644 index 0000000000..f69d359998 --- /dev/null +++ b/lm_eval/tasks/medmcqa/utils.py @@ -0,0 +1,27 @@ +def process_docs(dataset): + def _process(doc): + choices = [doc['opa'], doc['opb'], doc['opc'], doc['opd']] + out_doc = { + "query": doc["question"], + "choices": choices, + "gold": doc['cop'], + "subject": doc['subject_name'] + } + return out_doc + + return dataset.map(_process) + + +def doc_to_text(doc): + # Followed the prompt used in openai Medmcqa evaluation https://github.com/openai/evals/blob/main/evals/registry/data/medmcqa/convert.js + return ( + "You are a highly intelligent doctor who answers the following multiple choice question correctly.\nOnly write the answer down." + "\n\n**Subject:**" + doc["subject"] + + "\n\n**Question:**" + doc["query"] + "\n\n" + + ",".join(doc['choices']) + + "\n\n**Answer:**" + ) + + +def doc_to_target(doc): + return " " + doc["choices"][doc["gold"]] + "\n\n" diff --git a/lm_eval/tasks/medqa/README.md b/lm_eval/tasks/medqa/README.md new file mode 100644 index 0000000000..f1d463b25d --- /dev/null +++ b/lm_eval/tasks/medqa/README.md @@ -0,0 +1,58 @@ +# MedQA + +### Paper + +Title: `MedQA: A Large-scale Open Domain Question Answering Dataset from Medical Exams` + +Abstract: `https://arxiv.org/abs/2009.13081` + +`Open domain question answering (OpenQA) tasks have been recently attracting +more and more attention from the natural language processing (NLP) community. +In this work, we present the first free-form multiple-choice OpenQA dataset +for solving medical problems, MedQA, collected from the professional medical +board exams. It covers three languages: English, simplified Chinese, and traditional +Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, +respectively. We implement both rule-based and popular neural methods by sequentially +combining a document retriever and a machine comprehension model. Through experiments, +we find that even the current best method can only achieve 36.7\%, 42.0\%, and 70.1\% +of test accuracy on the English, traditional Chinese, and simplified Chinese questions, +respectively. We expect MedQA to present great challenges to existing OpenQA systems and +hope that it can serve as a platform to promote much stronger OpenQA models from the NLP +community in the future.` + +Homepage: `https://github.com/jind11/MedQA` + + +### Citation + +``` +@article{jin2020disease, + title={What Disease does this Patient Have? 
A Large-scale Open Domain Question Answering Dataset from Medical Exams},
+  author={Jin, Di and Pan, Eileen and Oufattole, Nassim and Weng, Wei-Hung and Fang, Hanyi and Szolovits, Peter},
+  journal={arXiv preprint arXiv:2009.13081},
+  year={2020}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `group_name`: `Short description`
+
+#### Tasks
+
+* `MedQA`: `A Large-scale Open Domain Question Answering Dataset from Medical Exams`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/medqa/medqa.yaml b/lm_eval/tasks/medqa/medqa.yaml
new file mode 100644
index 0000000000..9c421a8c83
--- /dev/null
+++ b/lm_eval/tasks/medqa/medqa.yaml
@@ -0,0 +1,13 @@
+task: medqa
+dataset_path: bigbio/med_qa
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: "{{choices}}"
+metric_list:
+  - metric: acc
+  - metric: acc_norm
\ No newline at end of file
diff --git a/lm_eval/tasks/medqa/utils.py b/lm_eval/tasks/medqa/utils.py
new file mode 100644
index 0000000000..8ff7f7d7ca
--- /dev/null
+++ b/lm_eval/tasks/medqa/utils.py
@@ -0,0 +1,24 @@
+def process_docs(dataset):
+    def _process(doc):
+        choices = doc["choices"]
+
+        return {
+            "query": doc["question"],  # The query prompt.
+            "choices": choices,  # The list of choices.
+            "gold": doc["choices"].index(doc["answer"][0]),  # Index of the correct answer.
+        }
+
+    return dataset.map(_process)
+
+
+def doc_to_text(doc):
+    return (
+        "You are a highly intelligent doctor who answers the following multiple choice question correctly.\nOnly write the answer down."
+        + "\n\n**Question:**" + doc["query"] + "\n\n"
+        + ",".join(doc["choices"])
+        + "\n\n**Answer:**"
+    )
+
+
+def doc_to_target(doc):
+    return " " + doc["choices"][doc["gold"]] + "\n\n"
diff --git a/lm_eval/tasks/medqa_sample_exam/README.md b/lm_eval/tasks/medqa_sample_exam/README.md
new file mode 100644
index 0000000000..bcc1454f5f
--- /dev/null
+++ b/lm_eval/tasks/medqa_sample_exam/README.md
@@ -0,0 +1,49 @@
+# MedQA Sample Exam
+
+### Paper
+
+Title: `MedQA Sample Exam: A Large-scale Open Domain Question Answering Dataset from Medical Exams`
+
+Abstract: `https://arxiv.org/abs/2009.13081`
+
+`What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams.
+This is the English portion of the full MedQA dataset: 12,723 multiple-choice (4-option) questions from the US medical licensing exam.`
+
+Homepage: `https://paperswithcode.com/dataset/medqa-usmle`
+
+
+### Citation
+
+```
+@misc{jin2020disease,
+      title={What Disease does this Patient Have? A Large-scale Open Domain Question Answering Dataset from Medical Exams},
+      author={Di Jin and Eileen Pan and Nassim Oufattole and Wei-Hung Weng and Hanyi Fang and Peter Szolovits},
+      year={2020},
+      eprint={2009.13081},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `group_name`: `Short description`
+
+#### Tasks
+
+* `MedQA Sample Exam`: `A Large-scale Open Domain Question Answering Dataset from Medical Exams`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_1.yaml b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_1.yaml
new file mode 100644
index 0000000000..d3161f5922
--- /dev/null
+++ b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_1.yaml
@@ -0,0 +1,11 @@
+task: medqa_usmle_exam_step_1
+dataset_path: augtoma/usmle_step_1
+output_type: multiple_choice
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: "{{choices}}"
+metric_list:
+  - metric: acc
+  - metric: acc_norm
\ No newline at end of file
diff --git a/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_2.yaml b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_2.yaml
new file mode 100644
index 0000000000..99f96bbc5b
--- /dev/null
+++ b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_2.yaml
@@ -0,0 +1,11 @@
+task: medqa_usmle_exam_step_2
+dataset_path: augtoma/usmle_step_2
+output_type: multiple_choice
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: "{{choices}}"
+metric_list:
+  - metric: acc
+  - metric: acc_norm
\ No newline at end of file
diff --git a/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_3.yaml b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_3.yaml
new file mode 100644
index 0000000000..99f96bbc5b
--- /dev/null
+++ b/lm_eval/tasks/medqa_sample_exam/medqa_usmle_exam_step_3.yaml
@@ -0,0 +1,11 @@
+task: medqa_usmle_exam_step_3
+dataset_path: augtoma/usmle_step_3
+output_type: multiple_choice
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: "{{choices}}"
+metric_list:
+  - metric: acc
+  - metric: acc_norm
\ No newline at end of file
diff --git a/lm_eval/tasks/medqa_sample_exam/utils.py b/lm_eval/tasks/medqa_sample_exam/utils.py
new file mode 100644
index 0000000000..86a0ff4858
--- /dev/null
+++ b/lm_eval/tasks/medqa_sample_exam/utils.py
@@ -0,0 +1,24 @@
+def process_docs(dataset):
+    def _process(doc):
+        choices = [choice for choice in doc["options"].values() if choice is not None]
+        out_doc = {
+            "query": doc["question"],
+            "choices": choices,
+            "gold": ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'].index(doc['answer_idx']),  # answer letter -> choice index
+        }
+        return out_doc
+
+    return dataset.map(_process)
+
+
+def doc_to_text(doc):
+    return (
+        "You are a highly intelligent doctor who answers the following multiple choice question correctly.\nOnly write the answer down."
+        + "\n\n**Question:**" + doc["query"] + "\n\n"
+        + ",".join(choice for choice in doc['choices'] if choice is not None)
+        + "\n\n**Answer:**"
+    )
+
+
+def doc_to_target(doc):
+    return " " + doc["choices"][doc["gold"]] + "\n\n"
diff --git a/lm_eval/tasks/medqa_usmle_assessment/README.md b/lm_eval/tasks/medqa_usmle_assessment/README.md
new file mode 100644
index 0000000000..4167e6af33
--- /dev/null
+++ b/lm_eval/tasks/medqa_usmle_assessment/README.md
@@ -0,0 +1,41 @@
+# MedQA USMLE Assessment
+
+### Paper
+
+Title: `N/A`
+
+Abstract: `N/A`
+
+`N/A`
+
+Homepage: `N/A`
+
+
+### Citation
+
+```
+`N/A`
+```
+
+### Groups and Tasks
+
+#### Groups
+
+* `group_name`: `Short description`
+
+#### Tasks
+
+* `Task Name`: `N/A`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/medqa_usmle_assessment/medqa_usmle_assessment.yaml b/lm_eval/tasks/medqa_usmle_assessment/medqa_usmle_assessment.yaml
new file mode 100644
index 0000000000..fed9ed1dec
--- /dev/null
+++ b/lm_eval/tasks/medqa_usmle_assessment/medqa_usmle_assessment.yaml
@@ -0,0 +1,12 @@
+task: medqa_usmle_assessment
+dataset_path: augtoma/medqa_usmle
+output_type: multiple_choice
+training_split: train
+test_split: test
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: "{{choices}}"
+metric_list:
+  - metric: acc
+  - metric: acc_norm
\ No newline at end of file
diff --git a/lm_eval/tasks/medqa_usmle_assessment/utils.py b/lm_eval/tasks/medqa_usmle_assessment/utils.py
new file mode 100644
index 0000000000..f69d359998
--- /dev/null
+++ b/lm_eval/tasks/medqa_usmle_assessment/utils.py
@@ -0,0 +1,27 @@
+def process_docs(dataset):
+    def _process(doc):
+        choices = [doc['opa'], doc['opb'], doc['opc'], doc['opd']]
+        out_doc = {
+            "query": doc["question"],
+            "choices": choices,
+            "gold": doc['cop'],
+            "subject": doc['subject_name']
+        }
+        return out_doc
+
+    return dataset.map(_process)
+
+
+def doc_to_text(doc):
+    # Followed the prompt used in openai Medmcqa evaluation https://github.com/openai/evals/blob/main/evals/registry/data/medmcqa/convert.js
+    return (
+        "You are a highly intelligent doctor who answers the following multiple choice question correctly.\nOnly write the answer down."
+        "\n\n**Subject:**" + doc["subject"]
+        + "\n\n**Question:**" + doc["query"] + "\n\n"
+        + ",".join(doc['choices'])
+        + "\n\n**Answer:**"
+    )
+
+
+def doc_to_target(doc):
+    return " " + doc["choices"][doc["gold"]] + "\n\n"
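
Note for reviewers: below is a minimal, stand-alone sketch of what the `medqa` request looks like once `process_docs` has run. It simply mirrors the `doc_to_text` / `doc_to_target` logic from `lm_eval/tasks/medqa/utils.py` on a hand-written toy doc (not a real dataset record), so it runs without loading the dataset or the harness.

```python
# Stand-alone illustration of the medqa prompt format; the toy doc below is
# hypothetical and only mimics the fields produced by utils.process_docs.
def render_prompt(doc):
    # Mirrors lm_eval/tasks/medqa/utils.py::doc_to_text
    return (
        "You are a highly intelligent doctor who answers the following multiple choice question correctly.\nOnly write the answer down."
        + "\n\n**Question:**" + doc["query"] + "\n\n"
        + ",".join(doc["choices"])
        + "\n\n**Answer:**"
    )


def render_target(doc):
    # Mirrors lm_eval/tasks/medqa/utils.py::doc_to_target
    return " " + doc["choices"][doc["gold"]] + "\n\n"


toy_doc = {
    "query": "Which vitamin deficiency causes scurvy?",
    "choices": ["Vitamin A", "Vitamin B12", "Vitamin C", "Vitamin D"],
    "gold": 2,
}

print(render_prompt(toy_doc))        # context shown to the model
print(repr(render_target(toy_doc)))  # ' Vitamin C\n\n' -- the gold continuation
```

Because the tasks use `output_type: multiple_choice`, the harness scores each entry of `doc_to_choice` as a continuation of this same context, and `acc` / `acc_norm` compare the per-choice log-likelihoods.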