From 21d4ae98c1e5bab15ccd71a239fc08c8fbf02c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20B=C3=A4uerle?= Date: Wed, 20 Dec 2023 15:12:08 +0100 Subject: [PATCH 01/25] feat: add option to upload results to Zeno (#990) * feat: add option to upload results to Zeno * config-based upload supporting different task types and metrics * upload tasks as individual projects * wording * readme * add example notebook * Update documentation for Zeno integration * Make zeno deps an extra * Update README.md * Document extra deps installation * Update zeno_visualize.py * fix: balance parens * fix typo * fix merge commit I botched * Update zeno_visualize.py * Update logger warning stmt * fix whitespace --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- README.md | 40 +++++++ examples/visualize-zeno.ipynb | 114 ++++++++++++++++++ pyproject.toml | 2 + scripts/zeno_visualize.py | 210 ++++++++++++++++++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 examples/visualize-zeno.ipynb create mode 100644 scripts/zeno_visualize.py diff --git a/README.md b/README.md index 099959690b..ef3dae54d8 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ We also provide a number of optional dependencies for extended functionality. Ex | promptsource | For using PromtSource prompts | | sentencepiece | For using the sentencepiece tokenizer | | vllm | For loading models with vLLM | +| zeno | For visualizing results with Zeno | | all | Loads all extras | ## Basic Usage @@ -225,6 +226,45 @@ Additionally, one can provide a directory with `--use_cache` to cache the result For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation! +## Visualizing Results + +You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs. + +First, head to [hub.zenoml.com](hub.zenoml.com) to create an account and get an API key [on your account page](hub.zenoml.com/account). +Add this key as an environment variable: + +```bash +export ZENO_API_KEY=[your api key] +``` + +You'll also need to install the `lm_eval[zeno]` package extra. + +To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. +We expect `output_path` to contain multiple folders that represent individual model names. +You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno. + +```bash +lm_eval \ + --model hf \ + --model_args pretrained=EleutherAI/gpt-j-6B \ + --tasks hellaswag \ + --device cuda:0 \ + --batch_size 8 \ + --log_samples \ + --output_path output/gpt-j-6B +``` + +Then, you can upload the resulting data using the `zeno_visualize` script: + +```bash +python scripts/zeno_visualize.py \ + --data_path output \ + --project_name "Eleuther Project" +``` + +This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. +If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task. + ## How to Contribute or Learn More? For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help. 
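As a quick reference for the workflow added in this patch: below is a minimal sketch of installing the new `zeno` extra and of the on-disk layout that `scripts/zeno_visualize.py` expects under `--data_path`. The folder and file names are illustrative placeholders (not output of a real run); the `<model_args>_<task>.jsonl` naming follows the `re.sub("/|=", "__", model_args)` logic in the script itself.

```bash
# Install the optional Zeno dependencies (pandas, zeno-client) added in pyproject.toml.
pip install -e ".[zeno]"

# zeno_visualize.py reads one subfolder per model under --data_path; each folder
# must contain results.json plus a <model_args>_<task>.jsonl samples file written
# by running lm_eval with --log_samples and --output_path.
ls output/gpt-j-6B
# results.json
# pretrained__EleutherAI__gpt-j-6B_hellaswag.jsonl   (example name only)
```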
diff --git a/examples/visualize-zeno.ipynb b/examples/visualize-zeno.ipynb new file mode 100644 index 0000000000..48beeddff4 --- /dev/null +++ b/examples/visualize-zeno.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualizing Results in Zeno\n", + "\n", + "Benchmarking your models is the first step towards making sure your model performs well.\n", + "However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n", + "\n", + "All of this can be done in [Zeno](https://zenoml.com)!\n", + "Zeno is super easy to use with the eval harness, let's explore how you can easily upload and visualize your eval results.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n", + "!pip install .." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Eval Harness\n", + "\n", + "To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!lm_eval \\\n", + " --model hf \\\n", + " --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n", + " --tasks hellaswag,wikitext \\\n", + " --batch_size 8 \\\n", + " --device mps \\\n", + " --log_samples \\\n", + " --output_path output/gpt-neo-2.7B \\\n", + " --limit 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set your API Key\n", + "\n", + "This is so you can be authenticated with Zeno.\n", + "If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n", + "After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env ZENO_API_KEY=YOUR_API_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize Eval Results\n", + "\n", + "You can now use the `zeno_visualize` script to upload the results to Zeno.\n", + "\n", + "This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. 
If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python ../scripts/zeno_visualize.py --data_path output --project_name \"Zeno Upload Test\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "zeno_projects", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 6ddd76e8b6..5a4d191d7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ anthropic = ["anthropic"] openai = ["openai==1.3.9", "tiktoken"] vllm = ["vllm"] ifeval = ["langdetect", "immutabledict"] +zeno = ["pandas", "zeno-client"] all = [ "lm_eval[dev]", "lm_eval[testing]", @@ -85,4 +86,5 @@ all = [ "lm_eval[openai]", "lm_eval[vllm]", "lm_eval[ifeval]", + "lm_eval[zeno]", ] diff --git a/scripts/zeno_visualize.py b/scripts/zeno_visualize.py new file mode 100644 index 0000000000..5c0dad713f --- /dev/null +++ b/scripts/zeno_visualize.py @@ -0,0 +1,210 @@ +import argparse +import json +import os +import re +from pathlib import Path + +import pandas as pd +from zeno_client import ZenoClient, ZenoMetric + +from lm_eval.utils import eval_logger + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk." + ) + parser.add_argument( + "--data_path", + required=True, + help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.", + ) + parser.add_argument( + "--project_name", + required=True, + help="The name of the generated Zeno project.", + ) + return parser.parse_args() + + +def main(): + """Upload the results of your benchmark tasks to the Zeno AI evaluation platform. + + This scripts expects your results to live in a data folder where subfolders contain results of individual models. + """ + args = parse_args() + + client = ZenoClient(os.environ["ZENO_API_KEY"]) + + # Get all model subfolders from the parent data folder. + models = [ + os.path.basename(os.path.normpath(f)) + for f in os.scandir(Path(args.data_path)) + if f.is_dir() + ] + + assert len(models) > 0, "No model directories found in the data_path." + + tasks = set(tasks_for_model(models[0], args.data_path)) + + for model in models: # Make sure that all models have the same tasks. + old_tasks = tasks.copy() + task_count = len(tasks) + + model_tasks = tasks_for_model(model, args.data_path) + tasks.intersection(set(model_tasks)) + + if task_count != len(tasks): + eval_logger.warning( + f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}" + ) + + assert ( + len(tasks) > 0 + ), "Must provide at least one task in common amongst models to compare." 
+ + for task in tasks: + # Upload data for all models + for model_index, model in enumerate(models): + model_args = re.sub( + "/|=", + "__", + json.load(open(Path(args.data_path, model, "results.json")))["config"][ + "model_args" + ], + ) + with open( + Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r" + ) as file: + data = json.loads(file.read()) + + configs = json.load(open(Path(args.data_path, model, "results.json")))[ + "configs" + ] + config = configs[task] + + if model_index == 0: # Only need to assemble data for the first model + metrics = [] + for metric in config["metric_list"]: + metrics.append( + ZenoMetric( + name=metric["metric"], + type="mean", + columns=[metric["metric"]], + ) + ) + project = client.create_project( + name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""), + view="text-classification", + metrics=metrics, + ) + project.upload_dataset( + generate_dataset(data, config), + id_column="id", + data_column="data", + label_column="labels", + ) + + project.upload_system( + generate_system_df(data, config), + name=model, + id_column="id", + output_column="output", + ) + + +def tasks_for_model(model: str, data_path: str): + """Get the tasks for a specific model. + + Args: + model (str): The name of the model. + data_path (str): The path to the data. + + Returns: + list: A list of tasks for the model. + """ + dir_path = Path(data_path, model) + config = (json.load(open(Path(dir_path, "results.json")))["configs"],) + return list(config[0].keys()) + + +def generate_dataset( + data, + config, +): + """Generate a Zeno dataset from evaluation data. + + Args: + data: The data to generate a dataset for. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno. + """ + ids = [x["doc_id"] for x in data] + labels = [x["target"] for x in data] + instance = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + instance = [x["arguments"][0][0] for x in data] + labels = [x["arguments"][0][1] for x in data] + elif config["output_type"] == "multiple_choice": + instance = [ + x["arguments"][0][0] + + "\n\n" + + "\n".join([f"- {y[1]}" for y in x["arguments"]]) + for x in data + ] + elif config["output_type"] == "loglikelihood_rolling": + instance = [x["arguments"][0][0] for x in data] + elif config["output_type"] == "generate_until": + instance = [x["arguments"][0][0] for x in data] + + return pd.DataFrame( + { + "id": ids, + "data": instance, + "labels": labels, + "output_type": config["output_type"], + } + ) + + +def generate_system_df(data, config): + """Generate a dataframe for a specific system to be uploaded to Zeno. + + Args: + data: The data to generate a dataframe from. + config: The configuration of the task. + + Returns: + pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system. 
+ """ + ids = [x["doc_id"] for x in data] + answers = [""] * len(ids) + + if config["output_type"] == "loglikelihood": + answers = [ + "correct" if x["filtered_resps"][0][1] is True else "incorrect" + for x in data + ] + elif config["output_type"] == "multiple_choice": + answers = [", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data] + elif config["output_type"] == "loglikelihood_rolling": + answers = [str(x["filtered_resps"][0]) for x in data] + elif config["output_type"] == "generate_until": + answers = [str(x["filtered_resps"][0]) for x in data] + + metrics = {} + for metric in config["metric_list"]: + if "aggregation" in metric and metric["aggregation"] == "mean": + metrics[metric["metric"]] = [x[metric["metric"]] for x in data] + + system_dict = {"id": ids, "output": answers} + system_dict.update(metrics) + system_df = pd.DataFrame(system_dict) + return system_df + + +if __name__ == "__main__": + main() From 65b8761db922513dada0320b860fabb1b4f01dc3 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:14:46 +0500 Subject: [PATCH 02/25] Switch Linting to `ruff` (#1166) * add ruff and isort. remove black and flake8 * remove unnecessary dependencies * remove dependency from table * change order * ran ruff * check 3.9 * exclude evaluator * update CI workflow * use ruff config in pyproject.toml * test * add isort rules to ruff * sort imports * import `make_table` * try stages for no-commit-to-branch * turn on mypy for pre-commit * test * test * test * change no-commit-to-branch to default * nits * fixed dependency --- .github/workflows/new_tasks.yml | 2 +- .github/workflows/unit_tests.yml | 29 +++---- .pre-commit-config.yaml | 16 ++-- README.md | 5 +- lm_eval/__main__.py | 19 ++-- lm_eval/api/filter.py | 4 +- lm_eval/api/metrics.py | 9 +- lm_eval/api/model.py | 10 +-- lm_eval/api/registry.py | 10 +-- lm_eval/api/samplers.py | 10 +-- lm_eval/api/task.py | 86 ++++++++----------- lm_eval/decontamination/archiver.py | 13 +-- lm_eval/decontamination/decontaminate.py | 20 ++--- lm_eval/decontamination/janitor.py | 6 +- lm_eval/evaluator.py | 14 ++- lm_eval/filters/__init__.py | 2 +- lm_eval/models/anthropic_llms.py | 11 ++- lm_eval/models/dummy.py | 1 + lm_eval/models/gguf.py | 7 +- lm_eval/models/huggingface.py | 50 ++++++----- lm_eval/models/openai_completions.py | 38 ++++---- lm_eval/models/textsynth.py | 6 +- lm_eval/models/vllm_causallms.py | 28 +++--- lm_eval/prompts/__init__.py | 2 - lm_eval/tasks/__init__.py | 1 - lm_eval/tasks/bbh/_generate_configs.py | 2 - lm_eval/tasks/belebele/_generate_configs.py | 1 - lm_eval/tasks/bigbench/generate_tasks.py | 1 - lm_eval/tasks/blimp/generate_configs.py | 1 - lm_eval/tasks/ceval/_generate_configs.py | 5 +- lm_eval/tasks/cmmlu/_generate_configs.py | 5 +- lm_eval/tasks/code_x_glue/code-text/bleu.py | 11 +-- lm_eval/tasks/code_x_glue/code-text/utils.py | 2 - lm_eval/tasks/coqa/utils.py | 4 +- lm_eval/tasks/csatqa/_generate_configs.py | 2 - lm_eval/tasks/drop/utils.py | 1 - lm_eval/tasks/ifeval/instructions_registry.py | 27 ++---- lm_eval/tasks/ifeval/instructions_util.py | 1 - lm_eval/tasks/mgsm/utils.py | 1 - lm_eval/tasks/mmlu/_generate_configs.py | 3 - .../advanced_ai_risk/_generate_configs.py | 2 - .../persona/_generate_configs.py | 2 - lm_eval/tasks/paws-x/_generate_config.py | 1 - lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py | 3 +- lm_eval/tasks/qasper/utils.py | 1 - lm_eval/tasks/scrolls/task.py | 1 - lm_eval/tasks/squadv2/task.py | 5 +- 
lm_eval/tasks/super_glue/cb/t5_utils.py | 2 - lm_eval/tasks/super_glue/multirc/t5_utils.py | 3 - lm_eval/tasks/super_glue/record/t5_utils.py | 5 +- lm_eval/tasks/super_glue/wsc/t5_utils.py | 4 - lm_eval/tasks/translation/utils.py | 2 - lm_eval/tasks/truthfulqa/utils.py | 3 - lm_eval/tasks/xnli/utils.py | 1 - lm_eval/utils.py | 36 ++++---- mypy.ini | 34 ++++---- pyproject.toml | 23 +++-- scripts/build_benchmark.py | 10 +-- .../compress_and_package.py | 10 +-- .../clean_training_data/generate_13_grams.py | 19 ++-- .../clean_training_data/investigate_pile.py | 9 +- .../process_sorted_buckets.py | 12 ++- .../sort_13_gram_buckets.py | 11 ++- scripts/cost_estimate.py | 4 +- scripts/get_prompts.py | 4 +- scripts/make_gpt2_test_cases.py | 5 +- scripts/make_table_results.py | 5 +- scripts/make_table_tasks.py | 4 +- scripts/model_comparator.py | 14 +-- scripts/regression.py | 10 ++- scripts/write_out.py | 10 ++- setup.py | 1 + tests/models/test_gguf.py | 7 +- tests/models/test_huggingface.py | 20 +++-- tests/models/test_vllm.py | 9 +- tests/test_evaluator.py | 12 ++- tests/test_janitor.py | 12 +-- tests/test_misc.py | 4 +- tests/test_tasks.py | 6 +- tests/tests_master/test_description.py | 4 +- tests/tests_master/test_generate_13_grams.py | 10 +-- tests/tests_master/test_models.py | 5 +- tests/tests_master/test_version_stable.py | 16 ++-- tests/utils.py | 8 +- 84 files changed, 389 insertions(+), 446 deletions(-) diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 76ab1be15b..0c4490f53a 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 07a85864b3..f981798fdf 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -17,29 +17,22 @@ jobs: linter: name: Linters runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 5 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: 3.8 cache: pip - cache-dependency-path: setup.py - - name: Install dependencies - run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu ; export SKIP=no-commit-to-branch # env var deactivates --no-commit-to-branch + cache-dependency-path: pyproject.toml - name: Pre-Commit + env: + SKIP: "no-commit-to-branch,mypy" + uses: pre-commit/action@v3.0.0 - - name: Lint with pylint - run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics # # mypy turned off for now # - name: Lint with mypy # run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable @@ -53,17 +46,17 @@ jobs: timeout-minutes: 30 steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: pip - cache-dependency-path: setup.py + cache-dependency-path: pyproject.toml - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9b4ae822c1..b5386cfda3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,14 +27,16 @@ repos: args: [--remove] - id: mixed-line-ending args: [--fix=lf] - - repo: https://github.com/pycqa/flake8 - rev: 3.7.9 + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.1.8 hooks: - - id: flake8 - - repo: https://github.com/psf/black - rev: 22.3.0 - hooks: - - id: black + # Run the linter. + - id: ruff + args: + - --fix + # Run the formatter. + - id: ruff-format - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/README.md b/README.md index ef3dae54d8..9dbee8fbb2 100644 --- a/README.md +++ b/README.md @@ -49,11 +49,10 @@ pip install -e . We also provide a number of optional dependencies for extended functionality. 
Extras can be installed via `pip install -e ".[NAME]"` | Name | Use | -| ------------- | ------------------------------------- | +|---------------|---------------------------------------| | anthropic | For using Anthropic's models | -| dev | You probably don't want to use this | | gptq | For loading models with GPTQ | -| testing | You probably don't want to use this | +| dev | You probably don't want to use this | | multilingual | For multilingual tokenizers | | openai | For using OpenAI's models | | promptsource | For using PromtSource prompts | diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py index ebb1b6c4ab..7fbee0dc73 100644 --- a/lm_eval/__main__.py +++ b/lm_eval/__main__.py @@ -1,17 +1,18 @@ +import argparse +import json +import logging import os import re import sys -import json -import logging -import argparse -import numpy as np - from pathlib import Path from typing import Union +import numpy as np + from lm_eval import evaluator, utils -from lm_eval.tasks import initialize_tasks, include_path from lm_eval.api.registry import ALL_TASKS +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import make_table def _handle_non_serializable(o): @@ -170,7 +171,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: task_names = ALL_TASKS elif args.tasks == "list": eval_logger.info( - "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS))) + "Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))) ) sys.exit() else: @@ -271,9 +272,9 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" ) - print(evaluator.make_table(results)) + print(make_table(results)) if "groups" in results: - print(evaluator.make_table(results, "groups")) + print(make_table(results, "groups")) if __name__ == "__main__": diff --git a/lm_eval/api/filter.py b/lm_eval/api/filter.py index ac69aa8ffd..bc26a1a637 100644 --- a/lm_eval/api/filter.py +++ b/lm_eval/api/filter.py @@ -1,9 +1,10 @@ from dataclasses import dataclass from typing import List -from lm_eval.api.instance import Instance from datasets import Dataset +from lm_eval.api.instance import Instance + class Filter: """ @@ -42,7 +43,6 @@ class FilterEnsemble: filters: List[Filter] def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: - resps = [ inst.resps for inst in instances ] # operate just on the model responses diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 4eb68585b6..85a944c888 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -1,18 +1,19 @@ +import logging import math +import random from collections.abc import Iterable +import evaluate import numpy as np import sacrebleu import sklearn.metrics -import random -import evaluate -from lm_eval.api.registry import register_metric, register_aggregation +from lm_eval.api.registry import register_aggregation, register_metric -import logging eval_logger = logging.getLogger("lm-eval") + # Register Aggregations First @register_aggregation("mean") def mean(arr): diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 0f67095879..df829af592 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -1,17 +1,15 @@ import abc +import hashlib +import json +import logging import os +from typing import List, Optional, Tuple, Type, TypeVar -import torch -from typing 
import Union, List, Tuple, Optional, Type, TypeVar from sqlitedict import SqliteDict -import json -import hashlib - from tqdm import tqdm from lm_eval import utils -import logging eval_logger = logging.getLogger("lm-eval") diff --git a/lm_eval/api/registry.py b/lm_eval/api/registry.py index 7d73ae6c5f..5fb9c011fc 100644 --- a/lm_eval/api/registry.py +++ b/lm_eval/api/registry.py @@ -1,8 +1,9 @@ -import os +import logging + import evaluate + from lm_eval.api.model import LM -import logging eval_logger = logging.getLogger("lm-eval") @@ -91,7 +92,6 @@ def decorate(fn): def register_metric(**args): # TODO: do we want to enforce a certain interface to registered metrics? def decorate(fn): - assert "metric" in args name = args["metric"] @@ -100,7 +100,6 @@ def decorate(fn): ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), ("aggregation", METRIC_AGGREGATION_REGISTRY), ]: - if key in args: value = args[key] assert ( @@ -120,7 +119,6 @@ def decorate(fn): def get_metric(name, hf_evaluate_metric=False): - if not hf_evaluate_metric: if name in METRIC_REGISTRY: return METRIC_REGISTRY[name] @@ -151,7 +149,6 @@ def decorate(fn): def get_aggregation(name): - try: return AGGREGATION_REGISTRY[name] except KeyError: @@ -161,7 +158,6 @@ def get_aggregation(name): def get_metric_aggregation(name): - try: return METRIC_AGGREGATION_REGISTRY[name] except KeyError: diff --git a/lm_eval/api/samplers.py b/lm_eval/api/samplers.py index 8a0d1e334d..57e3a6f1a4 100644 --- a/lm_eval/api/samplers.py +++ b/lm_eval/api/samplers.py @@ -40,18 +40,18 @@ def get_context(self, doc, num_fewshot): self.doc_to_text(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_text(doc)) is str + or isinstance(self.doc_to_text(doc), str) ) else self.doc_to_choice(doc)[self.doc_to_text(doc)] ) + self.target_delimiter + ( str(self.doc_to_target(doc)[0]) - if type(self.doc_to_target(doc)) is list + if isinstance(self.doc_to_target(doc), list) else self.doc_to_target(doc) if ( self.config.doc_to_choice is None - or type(self.doc_to_target(doc)) is str + or isinstance(self.doc_to_target(doc), str) ) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) ) @@ -77,8 +77,8 @@ def sample(self, n) -> None: Draw the first `n` samples in order from the specified split. Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. """ - assert n <= len( - self.docs + assert ( + n <= len(self.docs) ), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." 
return self.docs[:n] diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index 88ca412923..217349426c 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -1,45 +1,35 @@ import abc -from dataclasses import dataclass, field, asdict - -import os -import re import ast -import yaml import logging -import evaluate +import os import random -import itertools -import functools -from tqdm import tqdm +import re +from collections.abc import Callable +from dataclasses import asdict, dataclass +from typing import Any, List, Literal, Tuple, Union import datasets import numpy as np -from typing import Union, List, Any, Tuple, Literal -from collections.abc import Callable - from lm_eval import utils from lm_eval.api import samplers from lm_eval.api.instance import Instance -from lm_eval.api.filter import FilterEnsemble - -from lm_eval.prompts import get_prompt -from lm_eval.filters import build_filter_ensemble from lm_eval.api.metrics import ( + bits_per_byte, mean, weighted_perplexity, - bits_per_byte, - metric_max_over_ground_truths, ) from lm_eval.api.registry import ( - get_metric, + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, get_aggregation, + get_metric, get_metric_aggregation, is_higher_better, - DEFAULT_METRIC_REGISTRY, - OUTPUT_TYPE_REGISTRY, - AGGREGATION_REGISTRY, ) +from lm_eval.filters import build_filter_ensemble +from lm_eval.prompts import get_prompt + ALL_OUTPUT_TYPES = [ "loglikelihood", @@ -349,9 +339,7 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None: elif self.has_validation_docs(): docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" eval_logger.info(f"Building contexts for task on rank {rank}...") @@ -603,9 +591,9 @@ def __init__( if "aggregation" in metric_config: agg_name = metric_config["aggregation"] - if type(agg_name) == str: + if isinstance(agg_name, str): self._aggregation_list[metric_name] = get_aggregation(agg_name) - elif callable(agg_name): + elif callable(agg_name): # noqa: E721 self._aggregation_list[metric_name] = metric_config[ "aggregation" ] @@ -672,9 +660,7 @@ def __init__( elif self.has_validation_docs(): self.task_docs = self.validation_docs() else: - assert ( - False - ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
# Test One Doc self.features = list(self.task_docs.features.keys()) @@ -686,20 +672,20 @@ def __init__( if self.config.doc_to_choice is not None: test_choice = self.doc_to_choice(test_doc) - if type(test_choice) is not list: + if not isinstance(test_choice, list): eval_logger.error("doc_to_choice must return list") else: num_choice = len(test_choice) - if type(test_text) is int: + if isinstance(test_text, int): self.multiple_input = num_choice else: test_choice = None - if type(test_target) is list: + if isinstance(test_target, list): self.multiple_target = len(test_target) else: - if (type(test_target) is int) and (test_choice is not None): + if (isinstance(test_target, int)) and (test_choice is not None): test_target = test_choice[test_target] else: test_target = str(test_target) @@ -808,11 +794,11 @@ def fewshot_context(self, doc, num_fewshot): ) example = self.doc_to_text(doc) - if type(example) == str: + if isinstance(example, str): return labeled_examples + example - elif type(example) == list: + elif isinstance(example, list): return [labeled_examples + ex for ex in example] - elif type(example) == int: + elif isinstance(example, int): if self.config.doc_to_choice is not None: choices = self.doc_to_choice(doc) return labeled_examples + choices[example] @@ -864,9 +850,9 @@ def doc_to_text(self, doc): else: doc_to_text = self.config.doc_to_text - if type(doc_to_text) == int: + if isinstance(doc_to_text, int): return doc_to_text - elif type(doc_to_text) == str: + elif isinstance(doc_to_text, str): if doc_to_text in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_text]] @@ -898,9 +884,9 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: else: doc_to_target = self.config.doc_to_target - if type(doc_to_target) == int: + if isinstance(doc_to_target, int): return doc_to_target - elif type(doc_to_target) == str: + elif isinstance(doc_to_target, str): if doc_to_target in self.features: # if self.config.doc_to_choice is not None: # return self.doc_to_choice(doc)[doc[doc_to_target]] @@ -921,7 +907,7 @@ def doc_to_target(self, doc: dict) -> Union[int, str, list]: return target_string else: return target_string - elif type(doc_to_target) == list: + elif isinstance(doc_to_target, list): return doc_to_target elif callable(doc_to_target): return doc_to_target(doc) @@ -944,14 +930,14 @@ def doc_to_choice(self, doc: Any) -> List[str]: else: doc_to_choice = self.config.doc_to_choice - if type(doc_to_choice) == str: + if isinstance(doc_to_choice, str): if doc_to_choice in self.features: return doc[doc_to_choice] else: return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) - elif type(doc_to_choice) == list: + elif isinstance(doc_to_choice, list): return doc_to_choice - elif type(doc_to_choice) == dict: + elif isinstance(doc_to_choice, dict): return list(doc_to_choice.values()) elif callable(doc_to_choice): return doc_to_choice(doc) @@ -1078,14 +1064,14 @@ def process_results(self, doc, results): gold = self.doc_to_target(doc) gold_index_error = False - if type(gold) is list: + if isinstance(gold, list): gold = [i if i < len(choices) else -100 for i in gold] if -100 in gold: gold_index_error = True else: - if type(gold) is int: + if isinstance(gold, int): gold = gold if gold < len(choices) else -100 - elif type(gold) is str: + elif isinstance(gold, str): gold = choices.index(gold) if gold in choices else -100 if gold == -100: @@ -1175,9 +1161,7 @@ def process_results(self, doc, results): predictions=[result], 
**self._metric_fn_kwargs[metric], ) - except ( - TypeError - ): # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics result_score = self._metric_fn_list[metric]([gold, result]) if isinstance(result_score, dict): # TODO: this handles the case where HF evaluate returns a dict. diff --git a/lm_eval/decontamination/archiver.py b/lm_eval/decontamination/archiver.py index 3b5f09f525..e6bff33f0c 100644 --- a/lm_eval/decontamination/archiver.py +++ b/lm_eval/decontamination/archiver.py @@ -1,13 +1,14 @@ +import datetime +import io +import json +import mmap import os +from pathlib import Path from typing import Any -import zstandard -import json + import jsonlines -import io -import datetime -import mmap import tqdm -from pathlib import Path +import zstandard def json_serial(obj: Any) -> str: diff --git a/lm_eval/decontamination/decontaminate.py b/lm_eval/decontamination/decontaminate.py index 447eae52bf..f5b4157c67 100644 --- a/lm_eval/decontamination/decontaminate.py +++ b/lm_eval/decontamination/decontaminate.py @@ -1,13 +1,13 @@ -import time -import random -import pickle -import json +import collections import glob +import json import os -import collections +import pickle +import random +import time -from .janitor import Janitor, word_ngrams from .archiver import ZStdTextReader +from .janitor import Janitor, word_ngrams # Was used for testing the evaluator decoupled from the full logic below @@ -109,7 +109,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: print(f"Merging lookups took {elapsed:0.5f} seconds.") print(f"{ngrams_n_size} grams files found in {ngrams_path}:") - files = glob.glob(os.path.join(ngrams_path, f"*.sorted.zst")) + files = glob.glob(os.path.join(ngrams_path, "*.sorted.zst")) print(files) for file in files: @@ -135,11 +135,7 @@ def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str: matching_unique += 1 for task_name, task_set, doc_ids in merged_lookup[ngram]: task_doc_set = duplicates[(task_name, task_set)] - for ( - doc_id - ) in ( - doc_ids - ): # Record contamination across all relevant task/set combos + for doc_id in doc_ids: # Record contamination across all relevant task/set combos task_doc_set.add(doc_id) del merged_lookup[ngram] # No point matching again else: diff --git a/lm_eval/decontamination/janitor.py b/lm_eval/decontamination/janitor.py index 5ad84d13df..cedf8a5717 100644 --- a/lm_eval/decontamination/janitor.py +++ b/lm_eval/decontamination/janitor.py @@ -1,9 +1,9 @@ +import pickle import re import string -import pickle import traceback -from pprint import pprint -from typing import Iterator, Sequence, TypeVar, List, Tuple +from typing import Iterator, List, Sequence, Tuple, TypeVar + # This is a cpp module. 
Compile janitor_util.cpp with: # c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index cb5d0f53ae..5d277a6bf7 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -1,8 +1,6 @@ import random import itertools -import json import collections -import sys import torch @@ -17,8 +15,6 @@ from lm_eval.utils import ( positional_deprecated, run_task_tests, - make_table, - create_iterator, get_git_commit_hash, simple_parse_args_string, eval_logger, @@ -91,7 +87,7 @@ def simple_evaluate( if gen_kwargs is not None: gen_kwargs = simple_parse_args_string(gen_kwargs) eval_logger.warning( - f"generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." + "generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks." ) if gen_kwargs == "": gen_kwargs = None @@ -118,7 +114,9 @@ def simple_evaluate( use_cache # each rank receives a different cache db. # necessary to avoid multiple writes to cache at once - + "_rank" + str(lm.rank) + ".db", + + "_rank" + + str(lm.rank) + + ".db", ) task_dict = lm_eval.tasks.get_task_dict(tasks) @@ -513,9 +511,7 @@ def evaluate( ) + total_size * current_size / ( (total_size + current_size) * (total_size + current_size - 1) - ) * ( - results[group][metric] - metric_score - ) ** 2 + ) * (results[group][metric] - metric_score) ** 2 else: results[group][metric] = metric_score results[group][stderr] = var_score diff --git a/lm_eval/filters/__init__.py b/lm_eval/filters/__init__.py index c74ac01593..76eb78467e 100644 --- a/lm_eval/filters/__init__.py +++ b/lm_eval/filters/__init__.py @@ -32,7 +32,7 @@ def build_filter_ensemble(filter_name, components): Create a filtering pipeline. 
""" filters = [] - for (function, kwargs) in components: + for function, kwargs in components: if kwargs is None: f = get_filter(function)() else: diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index 18b1b70a38..6e5b437875 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -1,9 +1,12 @@ -from lm_eval.api.model import LM -from lm_eval.api.registry import register_model -from tqdm import tqdm import time +from typing import Any, List, Tuple + +from tqdm import tqdm + from lm_eval import utils -from typing import List, Any, Tuple +from lm_eval.api.model import LM +from lm_eval.api.registry import register_model + eval_logger = utils.eval_logger diff --git a/lm_eval/models/dummy.py b/lm_eval/models/dummy.py index b13a3900f9..d28435f7ea 100644 --- a/lm_eval/models/dummy.py +++ b/lm_eval/models/dummy.py @@ -1,4 +1,5 @@ import random + from lm_eval.api.model import LM from lm_eval.api.registry import register_model diff --git a/lm_eval/models/gguf.py b/lm_eval/models/gguf.py index 5ae154f39a..8eebc2e04f 100644 --- a/lm_eval/models/gguf.py +++ b/lm_eval/models/gguf.py @@ -1,11 +1,14 @@ -import requests import logging import time -from tqdm import tqdm + +import requests from requests.exceptions import RequestException +from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model + logger = logging.getLogger(__name__) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index b32ffc34e3..dc243a1a5c 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -1,29 +1,28 @@ +import copy import os -from packaging import version +from collections import defaultdict +from pathlib import Path +from typing import List, Literal, Optional, Tuple, Union + import torch +import torch.nn.functional as F import transformers +from accelerate import Accelerator, DistributedType, find_executable_batch_size +from packaging import version +from peft import PeftModel +from peft import __version__ as PEFT_VERSION +from tqdm import tqdm from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, ) -from peft import __version__ as PEFT_VERSION, PeftModel - -import copy -from collections import defaultdict -from tqdm import tqdm -from pathlib import Path - -import torch.nn.functional as F from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import stop_sequences_criteria -from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria - -from accelerate import Accelerator, find_executable_batch_size, DistributedType -from typing import List, Optional, Union, Tuple, Literal eval_logger = utils.eval_logger @@ -107,9 +106,7 @@ def __init__( eval_logger.warning( "`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way." ) - assert ( - not parallelize - ), "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" + assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`" self._model = pretrained self._device = self._model.device @@ -279,10 +276,13 @@ def __init__( "with 'accelerate launch *script*'. 
" f"Current run will proceed with {accelerator.num_processes} devices." ) - assert accelerator.distributed_type in [ - DistributedType.FSDP, - DistributedType.MULTI_GPU, - ], "Unsupported distributed type provided. Only DDP and FSDP are supported." + assert ( + accelerator.distributed_type + in [ + DistributedType.FSDP, + DistributedType.MULTI_GPU, + ] + ), "Unsupported distributed type provided. Only DDP and FSDP are supported." if accelerator.distributed_type == DistributedType.FSDP: self._model = accelerator.prepare(self.model) else: @@ -417,7 +417,6 @@ def _get_config( revision: str = "main", trust_remote_code: bool = False, ) -> None: - self._config = transformers.AutoConfig.from_pretrained( pretrained, revision=revision, @@ -751,8 +750,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -995,9 +995,7 @@ def _collate(x): greedy_tokens = logits.argmax(dim=-1) cont_toks = torch.tensor( cont_toks, dtype=torch.long, device=self.device - ).unsqueeze( - 0 - ) # [1, seq] + ).unsqueeze(0) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index b0f6a8f170..d63f8ab12a 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -1,9 +1,10 @@ +import copy import os import time -from typing import List, Tuple, Optional - -import copy from collections import defaultdict +from importlib.util import find_spec +from typing import List, Optional, Tuple + from tqdm import tqdm from lm_eval import utils @@ -44,13 +45,13 @@ def oa_completion(**kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai backoff_time = 3 while True: @@ -88,7 +89,8 @@ def __init__( super().__init__() self.seed = seed try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. 
\ @@ -154,8 +156,9 @@ def loglikelihood(self, requests) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) @@ -326,13 +329,13 @@ def oa_chat_completion(client, **kwargs): Retry with back-off until they respond """ - try: - import openai, tiktoken # noqa: E401 - except ModuleNotFoundError: + if not find_spec("openai") or not find_spec("tiktoken"): raise Exception( - "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ -please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", + "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. " + "Please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`" ) + else: + import openai async def _get_completions(**kwargs): chat_completions = await client.chat.completions.create(**kwargs) @@ -364,7 +367,8 @@ def __init__( """ super().__init__() try: - import openai, tiktoken # noqa: E401 + import openai # noqa: E401 + import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \ diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py index 379f11b902..32917d692c 100644 --- a/lm_eval/models/textsynth.py +++ b/lm_eval/models/textsynth.py @@ -13,9 +13,11 @@ """ import logging import os -import requests as _requests import time + +import requests as _requests from tqdm import tqdm + from lm_eval.api.model import LM from lm_eval.api.registry import register_model @@ -149,7 +151,7 @@ def generate_until(self, requests): self.cache_hook.add_partial("generate_until", (inp, request_args), s) else: logger.error( - f"The following response does not contain generated `text`. " + "The following response does not contain generated `text`. " "Got:\n{resp}" ) assert False diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index edab369411..e6a75ceb21 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,16 +1,19 @@ +import copy from collections import defaultdict -from typing import List, Tuple, Optional, Literal, Union, Any -from transformers import AutoTokenizer +from importlib.util import find_spec +from typing import List, Literal, Optional, Tuple, Union + +from tqdm import tqdm + +from lm_eval import utils from lm_eval.api.instance import Instance from lm_eval.api.model import LM -import copy -from tqdm import tqdm from lm_eval.api.registry import register_model -from lm_eval import utils + try: - from vllm import LLM, SamplingParams from ray.util.multiprocessing import Pool + from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer except ModuleNotFoundError: pass @@ -54,12 +57,10 @@ def __init__( ): super().__init__() - try: - import vllm - except ModuleNotFoundError: + if not find_spec("vllm"): raise Exception( - "attempted to use 'vllm' LM type, but package `vllm` is not installed. \ -please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`", + "attempted to use 'vllm' LM type, but package `vllm` is not installed. 
" + "Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`" ) assert "cuda" in device or device is None, "vLLM only supports CUDA" @@ -193,8 +194,9 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: for context, continuation in [req.args for req in requests]: if context == "": # end of text as context - context_enc, continuation_enc = [self.eot_token_id], self.tok_encode( - continuation + context_enc, continuation_enc = ( + [self.eot_token_id], + self.tok_encode(continuation), ) else: context_enc, continuation_enc = self._encode_pair(context, continuation) diff --git a/lm_eval/prompts/__init__.py b/lm_eval/prompts/__init__.py index d058a48776..d8b62e7deb 100644 --- a/lm_eval/prompts/__init__.py +++ b/lm_eval/prompts/__init__.py @@ -69,7 +69,6 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None def load_prompt_list( use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs ): - category_name, prompt_name = use_prompt.split(":") if category_name == "promptsource": @@ -113,7 +112,6 @@ def __init__(self, prompt_string): self.prompt_string = prompt_string def apply(self, doc): - doc_to_text = self.prompt_string["doc_to_text"] doc_to_target = self.prompt_string["doc_to_target"] diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index 28563de6bf..ed92bd9755 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -180,7 +180,6 @@ def include_path(task_dir): def initialize_tasks(verbosity="INFO"): - eval_logger.setLevel(getattr(logging, f"{verbosity}")) task_dir = os.path.dirname(os.path.abspath(__file__)) + "/" diff --git a/lm_eval/tasks/bbh/_generate_configs.py b/lm_eval/tasks/bbh/_generate_configs.py index d2a53cfb6c..18a55c705a 100644 --- a/lm_eval/tasks/bbh/_generate_configs.py +++ b/lm_eval/tasks/bbh/_generate_configs.py @@ -24,7 +24,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -37,7 +36,6 @@ def parse_args(): dataset_path = "lukaemon/bbh" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): - resp = requests.get( f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt" ).content.decode("utf-8") diff --git a/lm_eval/tasks/belebele/_generate_configs.py b/lm_eval/tasks/belebele/_generate_configs.py index 9df56f5feb..052d55bea2 100644 --- a/lm_eval/tasks/belebele/_generate_configs.py +++ b/lm_eval/tasks/belebele/_generate_configs.py @@ -23,7 +23,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
diff --git a/lm_eval/tasks/bigbench/generate_tasks.py b/lm_eval/tasks/bigbench/generate_tasks.py index fa8619f40c..3fd5cd6c2b 100644 --- a/lm_eval/tasks/bigbench/generate_tasks.py +++ b/lm_eval/tasks/bigbench/generate_tasks.py @@ -173,7 +173,6 @@ def main() -> None: - for path, task_type in zip( ["multiple_choice", "generate_until"], ["multiple_choice_template_yaml", "generate_until_template_yaml"], diff --git a/lm_eval/tasks/blimp/generate_configs.py b/lm_eval/tasks/blimp/generate_configs.py index 4fa45db4d2..dfc4b4dc95 100644 --- a/lm_eval/tasks/blimp/generate_configs.py +++ b/lm_eval/tasks/blimp/generate_configs.py @@ -73,7 +73,6 @@ def main() -> None: for task in all_subtasks: - file_name = f"{task}.yaml" try: with open(f"{file_name}", "w") as f: diff --git a/lm_eval/tasks/ceval/_generate_configs.py b/lm_eval/tasks/ceval/_generate_configs.py index deaa0372c8..2b96e00713 100644 --- a/lm_eval/tasks/ceval/_generate_configs.py +++ b/lm_eval/tasks/ceval/_generate_configs.py @@ -75,7 +75,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -93,7 +92,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + description = ( + f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/cmmlu/_generate_configs.py b/lm_eval/tasks/cmmlu/_generate_configs.py index 4b3dba75b1..07553bb1ea 100644 --- a/lm_eval/tasks/cmmlu/_generate_configs.py +++ b/lm_eval/tasks/cmmlu/_generate_configs.py @@ -90,7 +90,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. 
@@ -108,7 +107,9 @@ def parse_args(): if args.cot_prompt_path is not None: description = cot_file[subject_eng] else: - description = f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + description = ( + f"以下是关于{subject_zh}的单项选择题,请直接给出正确答案的选项。\n\n" + ) yaml_dict = { "include": base_yaml_name, diff --git a/lm_eval/tasks/code_x_glue/code-text/bleu.py b/lm_eval/tasks/code_x_glue/code-text/bleu.py index 310c626c73..a90fc46b17 100644 --- a/lm_eval/tasks/code_x_glue/code-text/bleu.py +++ b/lm_eval/tasks/code_x_glue/code-text/bleu.py @@ -1,9 +1,7 @@ #!/usr/bin/python -import os import re import sys import math -import subprocess import xml.sax.saxutils from typing import List, Pattern, Tuple, Union, Dict, Any, Optional @@ -65,14 +63,14 @@ def normalize(s): if type(s) is not str: s = " ".join(s) # language-independent part: - for (pattern, replace) in normalize1: + for pattern, replace in normalize1: s = re.sub(pattern, replace, s) s = xml.sax.saxutils.unescape(s, {""": '"'}) # language-dependent part (assuming Western languages): s = " %s " % s if not preserve_case: s = s.lower() # this might not be identical to the original - for (pattern, replace) in normalize2: + for pattern, replace in normalize2: s = re.sub(pattern, replace, s) return s.split() @@ -95,7 +93,7 @@ def cook_refs(refs, n=4): maxcounts: Dict[Tuple[str], int] = {} for ref in refs: counts = count_ngrams(ref, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) return ([len(ref) for ref in refs], maxcounts) @@ -125,7 +123,7 @@ def cook_test(test, item, n=4): result["correct"] = [0] * n counts = count_ngrams(test, n) - for (ngram, count) in counts.items(): + for ngram, count in counts.items(): result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) return result @@ -222,7 +220,6 @@ def bleuFromMaps(m1, m2): def smoothed_bleu_4(references, predictions, **kwargs): - predictionMap = {} goldMap = {} diff --git a/lm_eval/tasks/code_x_glue/code-text/utils.py b/lm_eval/tasks/code_x_glue/code-text/utils.py index 981a00b912..6975684259 100644 --- a/lm_eval/tasks/code_x_glue/code-text/utils.py +++ b/lm_eval/tasks/code_x_glue/code-text/utils.py @@ -1,5 +1,4 @@ def doc_to_text(doc): - inputs = " ".join(doc["code_tokens"]).replace("\n", " ") inputs = " ".join(inputs.strip().split()) @@ -7,7 +6,6 @@ def doc_to_text(doc): def doc_to_target(doc): - targets = " ".join(doc["docstring_tokens"]).replace("\n", "") targets = " ".join(targets.strip().split()) diff --git a/lm_eval/tasks/coqa/utils.py b/lm_eval/tasks/coqa/utils.py index 4fed8ff8c2..29911cfec5 100644 --- a/lm_eval/tasks/coqa/utils.py +++ b/lm_eval/tasks/coqa/utils.py @@ -7,7 +7,7 @@ def doc_to_text(doc): # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} # and a question qi, the task is to predict the answer ai doc_text = doc["story"] + "\n\n" - for (q, a) in zip_longest( + for q, a in zip_longest( doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] ): # omit target answer ai question = f"Q: {q}\n\n" @@ -17,7 +17,6 @@ def doc_to_text(doc): def doc_to_target(doc): - turn_id = len(doc["questions"]["input_text"]) # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). 
answers = [] @@ -71,7 +70,6 @@ def compute_scores(gold_list, pred): def process_results(doc, results): - gold_list = doc_to_target(doc) pred = results[0].strip().split("\n")[0] diff --git a/lm_eval/tasks/csatqa/_generate_configs.py b/lm_eval/tasks/csatqa/_generate_configs.py index ca2bfc436e..56fe825a90 100644 --- a/lm_eval/tasks/csatqa/_generate_configs.py +++ b/lm_eval/tasks/csatqa/_generate_configs.py @@ -21,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our other YAMLs. @@ -30,7 +29,6 @@ def parse_args(): base_yaml = yaml.full_load(f) for name in tqdm(SUBSETS): - yaml_dict = { "include": base_yaml_name, "task": f"csatqa_{args.task_prefix}_{name}" diff --git a/lm_eval/tasks/drop/utils.py b/lm_eval/tasks/drop/utils.py index 1e2888ce3e..03f7218c90 100644 --- a/lm_eval/tasks/drop/utils.py +++ b/lm_eval/tasks/drop/utils.py @@ -62,7 +62,6 @@ def parse_answer(answer): def process_results(doc, results): - preds, golds = results, doc["answers"] max_em = 0 max_f1 = 0 diff --git a/lm_eval/tasks/ifeval/instructions_registry.py b/lm_eval/tasks/ifeval/instructions_registry.py index 1056b139e2..ecb20e9b23 100644 --- a/lm_eval/tasks/ifeval/instructions_registry.py +++ b/lm_eval/tasks/ifeval/instructions_registry.py @@ -78,8 +78,7 @@ # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, - _LANGUAGE - + "response_language": { + _LANGUAGE + "response_language": { _LANGUAGE + "response_language", _FORMAT + "multiple_sections", _KEYWORD + "existence", @@ -90,16 +89,14 @@ _CHANGE_CASES + "english_lowercase", }, _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, - _LENGTH - + "number_paragraphs": { + _LENGTH + "number_paragraphs": { _LENGTH + "number_paragraphs", _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_sentences", _LENGTH + "nth_paragraph_first_word", }, _LENGTH + "number_words": {_LENGTH + "number_words"}, - _LENGTH - + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word": { _LENGTH + "nth_paragraph_first_word", _LENGTH + "number_paragraphs", }, @@ -110,23 +107,20 @@ # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, - _FORMAT - + "multiple_sections": { + _FORMAT + "multiple_sections": { _FORMAT + "multiple_sections", _LANGUAGE + "response_language", _FORMAT + "number_highlighted_sections", }, # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. # _FORMAT + "rephrase": instructions.RephraseChecker, - _FORMAT - + "json_format": set(INSTRUCTION_DICT.keys()).difference( + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} ), _FORMAT + "title": {_FORMAT + "title"}, # TODO(tianjianlu): Re-enable with specific prompts. 
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, - _COMBINATION - + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( { _KEYWORD + "forbidden_words", _KEYWORD + "existence", @@ -135,20 +129,17 @@ _PUNCTUATION + "no_comma", } ), - _COMBINATION - + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} ), _STARTEND + "end_checker": {_STARTEND + "end_checker"}, - _CHANGE_CASES - + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency": { _CHANGE_CASES + "capital_word_frequency", _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, - _CHANGE_CASES - + "english_lowercase": { + _CHANGE_CASES + "english_lowercase": { _CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_capital", }, diff --git a/lm_eval/tasks/ifeval/instructions_util.py b/lm_eval/tasks/ifeval/instructions_util.py index 2390cba305..ccb531f96e 100644 --- a/lm_eval/tasks/ifeval/instructions_util.py +++ b/lm_eval/tasks/ifeval/instructions_util.py @@ -17,7 +17,6 @@ import functools import random import re -from typing import List import immutabledict import nltk diff --git a/lm_eval/tasks/mgsm/utils.py b/lm_eval/tasks/mgsm/utils.py index 97affac765..3edc78ab28 100644 --- a/lm_eval/tasks/mgsm/utils.py +++ b/lm_eval/tasks/mgsm/utils.py @@ -94,7 +94,6 @@ def add_regex_pattern(regex_pattern): - if regex_pattern is None: return {} return { diff --git a/lm_eval/tasks/mmlu/_generate_configs.py b/lm_eval/tasks/mmlu/_generate_configs.py index 2bf27ac0f7..e6271bc4c2 100644 --- a/lm_eval/tasks/mmlu/_generate_configs.py +++ b/lm_eval/tasks/mmlu/_generate_configs.py @@ -7,7 +7,6 @@ from tqdm import tqdm -from lm_eval import utils from lm_eval.logger import eval_logger SUBJECTS = { @@ -82,7 +81,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs. 
@@ -98,7 +96,6 @@ def parse_args(): ALL_CATEGORIES = [] for subject, category in tqdm(SUBJECTS.items()): - if category not in ALL_CATEGORIES: ALL_CATEGORIES.append(category) diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py index ca199226a8..aecb40a5eb 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/advanced_ai_risk" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py index a21f28309b..7aff892f03 100644 --- a/lm_eval/tasks/model_written_evals/persona/_generate_configs.py +++ b/lm_eval/tasks/model_written_evals/persona/_generate_configs.py @@ -1,12 +1,10 @@ import yaml -import inspect import datasets from tqdm import tqdm def main() -> None: - dataset_path = "EleutherAI/persona" for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): file_name = f"{task}.yaml" diff --git a/lm_eval/tasks/paws-x/_generate_config.py b/lm_eval/tasks/paws-x/_generate_config.py index bff82e4ff0..a1341fec89 100644 --- a/lm_eval/tasks/paws-x/_generate_config.py +++ b/lm_eval/tasks/paws-x/_generate_config.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py index 51c198703f..0dccf9408a 100644 --- a/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py +++ b/lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py @@ -1,5 +1,6 @@ def doc_to_text(doc) -> str: ctxs = "\n".join(doc["CONTEXTS"]) return "Abstract: {}\nQuestion: {}\nAnswer:".format( - ctxs, doc["QUESTION"], doc["final_decision"] + ctxs, + doc["QUESTION"], ) diff --git a/lm_eval/tasks/qasper/utils.py b/lm_eval/tasks/qasper/utils.py index be6f79dcad..7a02237a78 100644 --- a/lm_eval/tasks/qasper/utils.py +++ b/lm_eval/tasks/qasper/utils.py @@ -3,7 +3,6 @@ def process_docs(dataset, set_answer_type="bool"): - FEATURES = ["title", "abstract", "question", "answer", "answer_type"] def _categorise_answer(answer_blob): diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index e44296a4e0..829e97bde0 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -235,7 +235,6 @@ def process_results(self, doc, results): } def construct_requests(self, doc, ctx, **kwargs): - request_list = [ Instance( request_type="loglikelihood", diff --git a/lm_eval/tasks/squadv2/task.py b/lm_eval/tasks/squadv2/task.py index 4630e2a161..ba308acd43 100644 --- a/lm_eval/tasks/squadv2/task.py +++ b/lm_eval/tasks/squadv2/task.py @@ -14,7 +14,6 @@ Homepage: https://rajpurkar.github.io/SQuAD-explorer/ """ import datasets -from evaluate import load from math import exp from functools import partial @@ -120,14 +119,14 @@ def construct_requests(self, doc, ctx, **kwargs): doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, - **kwargs + **kwargs, ), Instance( request_type="loglikelihood", doc=doc, arguments=(ctx, " " + "unanswerable"), idx=0, - **kwargs + **kwargs, ), ] diff --git a/lm_eval/tasks/super_glue/cb/t5_utils.py b/lm_eval/tasks/super_glue/cb/t5_utils.py index 43eafce9d6..ec02e34538 100644 --- 
a/lm_eval/tasks/super_glue/cb/t5_utils.py +++ b/lm_eval/tasks/super_glue/cb/t5_utils.py @@ -2,7 +2,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function - string_label = ["entailment", "contradiction", "neutral"] predictions = ( string_label.index(predictions[0]) if predictions[0] in string_label else 0 @@ -13,7 +12,6 @@ def mean_3class_f1(predictions, references): # This is a passthrough function def agg_mean_3class_f1(items): - predictions, references = zip(*items) """Computes the unweighted average of the F1 per class.""" diff --git a/lm_eval/tasks/super_glue/multirc/t5_utils.py b/lm_eval/tasks/super_glue/multirc/t5_utils.py index ac99aaf962..d17d498fa2 100644 --- a/lm_eval/tasks/super_glue/multirc/t5_utils.py +++ b/lm_eval/tasks/super_glue/multirc/t5_utils.py @@ -5,7 +5,6 @@ def f1(predictions, references): # This is a passthrough function - _prediction = predictions[0] _reference = references[0].split("_")[-1] string_label = ["False", "True"] @@ -20,7 +19,6 @@ def f1(predictions, references): # This is a passthrough function def agg_f1(items): - predictions, references = zip(*items) references, predictions = np.asarray(references), np.asarray(predictions) @@ -28,7 +26,6 @@ def agg_f1(items): def em(predictions, references): # This is a passthrough function - _prediction = predictions[0] _group, _reference = references[0].split("_") string_label = ["False", "True"] diff --git a/lm_eval/tasks/super_glue/record/t5_utils.py b/lm_eval/tasks/super_glue/record/t5_utils.py index 98730cacd4..68301b18b3 100644 --- a/lm_eval/tasks/super_glue/record/t5_utils.py +++ b/lm_eval/tasks/super_glue/record/t5_utils.py @@ -3,14 +3,12 @@ import collections import numpy as np -from tqdm import tqdm -from datasets import Dataset, concatenate_datasets +from datasets import Dataset from lm_eval.api.metrics import metric_max_over_ground_truths def doc_to_text(doc): - passage = doc["passage"] passage = re.sub(r"(\.|\?|\!|\"|\')\n@highlight\n", r"\1 ", passage) passage = re.sub(r"\n@highlight\n", ". 
", passage) @@ -34,7 +32,6 @@ def split_answers(doc): } answers = doc.pop("answers") for idx, answer in enumerate(answers): - for key in split_doc.keys(): if key in doc: split_doc[key].append(doc[key]) diff --git a/lm_eval/tasks/super_glue/wsc/t5_utils.py b/lm_eval/tasks/super_glue/wsc/t5_utils.py index 7e55a52a7b..eb5331a42a 100644 --- a/lm_eval/tasks/super_glue/wsc/t5_utils.py +++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py @@ -8,7 +8,6 @@ def doc_to_text(x): def _wsc_inputs(x): - words = x["text"].split(" ") # We would need some special logic to handle the case where the pronoun is the @@ -55,7 +54,6 @@ def create_input(): class WSCPostprocess(Filter): def __init__(self, **kwargs): - self.determiners = { "a", "an", @@ -86,10 +84,8 @@ def clean(self, s): return " ".join([w for w in s.split(" ") if w not in self.determiners]) def apply(self, resps, docs): - filtered_resps = [] for prediction, reference in zip(*(resps, docs["span1_text"])): - prediction = self.clean(prediction[0]) reference = self.clean(reference) diff --git a/lm_eval/tasks/translation/utils.py b/lm_eval/tasks/translation/utils.py index f80ae89a4f..f30c4d8625 100644 --- a/lm_eval/tasks/translation/utils.py +++ b/lm_eval/tasks/translation/utils.py @@ -1,9 +1,7 @@ import argparse -from typing import Dict, List import yaml -import sacrebleu try: import pycountry diff --git a/lm_eval/tasks/truthfulqa/utils.py b/lm_eval/tasks/truthfulqa/utils.py index 8c011d2d10..8e2ab43fe8 100644 --- a/lm_eval/tasks/truthfulqa/utils.py +++ b/lm_eval/tasks/truthfulqa/utils.py @@ -6,7 +6,6 @@ def process_results_mc2(doc, results): - lls, is_greedy = zip(*results) # Split on the first `0` as everything before it is true (`1`). @@ -20,7 +19,6 @@ def process_results_mc2(doc, results): def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: - return dataset.map(preprocess_function) @@ -49,7 +47,6 @@ def _format_answers(answers): def process_results_gen(doc, results): - completion = results[0] true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] all_refs = true_refs + false_refs diff --git a/lm_eval/tasks/xnli/utils.py b/lm_eval/tasks/xnli/utils.py index fa7806fc74..2844d1d7c8 100644 --- a/lm_eval/tasks/xnli/utils.py +++ b/lm_eval/tasks/xnli/utils.py @@ -1,5 +1,4 @@ import argparse -from typing import Dict, List import yaml diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 4067669c0d..74f4f482da 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -1,25 +1,23 @@ -import os -import re -import sys -import yaml +import collections +import fnmatch +import functools +import gc +import importlib.util import inspect +import logging +import os import pathlib -import functools +import re import subprocess -import collections -import importlib.util -import fnmatch - -from typing import Iterator, List, Literal, Union, Any, Callable +import sys +from itertools import islice +from typing import Any, Callable, Iterator, List, Literal, Union -import gc import torch import transformers - +import yaml from jinja2 import BaseLoader, Environment, StrictUndefined -from itertools import islice -import logging logging.basicConfig( format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", @@ -143,7 +141,7 @@ def __init__(self, choices) -> None: def __contains__(self, values) -> bool: for value in values.split(","): if len(fnmatch.filter(self.choices, value)) == 0: - eval_logger.info(f"Available tasks to choose:") + eval_logger.info("Available tasks to choose:") for choice in self.choices: 
eval_logger.info(f" - {choice}") raise ValueError("'{}' is not in task list".format(value)) @@ -157,7 +155,7 @@ def __iter__(self) -> Iterator: # Returns a list containing all values of the source_list that # match at least one of the patterns def pattern_match(patterns, source_list): - if type(patterns) == str: + if isinstance(patterns, str): patterns = [patterns] task_names = set() @@ -332,7 +330,7 @@ def get_original(self, grouped_dict): def make_table(result_dict, column: str = "results"): """Generate table of results.""" - from pytablewriter import MarkdownTableWriter, LatexTableWriter + from pytablewriter import LatexTableWriter, MarkdownTableWriter if column == "results": column_name = "Tasks" @@ -466,7 +464,7 @@ def import_function(loader, node): yaml_path = os.path.dirname(loader.name) *module_name, function_name = function_name.split(".") - if type(module_name) == list: + if isinstance(module_name, list): module_name = ".".join(module_name) module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name))) @@ -496,7 +494,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): include_path = yaml_config["include"] del yaml_config["include"] - if type(include_path) == str: + if isinstance(include_path, str): include_path = [include_path] # Load from the last one first diff --git a/mypy.ini b/mypy.ini index 76a0c86452..2d20dd2cc5 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,21 +9,19 @@ warn_unused_ignores = True warn_redundant_casts = True # We ignore errors everywhere to gradually add type annotations - -[mypy-lm_eval.*] -ignore_errors = True - -[mypy-lm_eval.api.*] -ignore_errors = True - -[mypy-lm_eval.prompts.*] -ignore_errors = True - -[mypy-lm_eval.models.*] -ignore_errors = True - -[mypy-scripts.*] -ignore_errors = True - -[mypy-main] -ignore_errors = True +# [mypy-lm_eval.*] +# ignore_errors = True +# +# [mypy-lm_eval.api.*] +# ignore_errors = True +# +# [mypy-lm_eval.prompts.*] +# ignore_errors = True +# +# [mypy-lm_eval.models.*] +# ignore_errors = True +# +# [mypy-scripts.*] +# ignore_errors = True +# +# [mypy-main] diff --git a/pyproject.toml b/pyproject.toml index 5a4d191d7c..87eefc72d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,14 +54,7 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" Repository = "https://github.com/EleutherAI/lm-evaluation-harness" [project.optional-dependencies] -dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"] -linting = [ - "flake8", - "pylint", - "mypy", - "pre-commit", -] -testing = ["pytest", "pytest-cov", "pytest-xdist"] +dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"] multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] @@ -88,3 +81,17 @@ all = [ "lm_eval[ifeval]", "lm_eval[zeno]", ] + +[tool.ruff] +extend-exclude = ["lm_eval/evaluator.py", "lm_eval/tasks/*.py"] + +[tool.ruff.lint] +extend-select = ["I"] + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["lm_eval"] + +[tool.ruff.extend-per-file-ignores] +"__init__.py" = ["F401","F402","F403","I"] +"lm_eval/tasks/*"= ["E721"] diff --git a/scripts/build_benchmark.py b/scripts/build_benchmark.py index 4cd07dd3eb..ce4b661681 100644 --- a/scripts/build_benchmark.py +++ b/scripts/build_benchmark.py @@ -1,15 +1,14 @@ -import os -import yaml import argparse +import os -from tqdm import tqdm +import yaml from promptsource.templates import DatasetTemplates - 
-from lm_eval import utils +from tqdm import tqdm # from lm_eval.api.registry import ALL_TASKS from lm_eval.logger import eval_logger + # from lm_eval.tasks import include_task_folder @@ -22,7 +21,6 @@ def parse_args(): if __name__ == "__main__": - args = parse_args() with open(args.benchmark_path) as file: diff --git a/scripts/clean_training_data/compress_and_package.py b/scripts/clean_training_data/compress_and_package.py index c9e7f2593c..d4af5ba5f3 100644 --- a/scripts/clean_training_data/compress_and_package.py +++ b/scripts/clean_training_data/compress_and_package.py @@ -1,15 +1,15 @@ -import glob import argparse +import glob +import logging import os -import subprocess import shutil +import subprocess from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def compress_and_move(working_directory, output_directory, process_count): tasks = [] bucket_file_paths = glob.glob( - os.path.join(working_directory, "output", f"*.bkt.txt.sorted") + os.path.join(working_directory, "output", "*.bkt.txt.sorted") ) for bucket_file_path in bucket_file_paths: task = (process_task, (working_directory, output_directory, bucket_file_path)) diff --git a/scripts/clean_training_data/generate_13_grams.py b/scripts/clean_training_data/generate_13_grams.py index 27037e394d..66fa0ff45b 100644 --- a/scripts/clean_training_data/generate_13_grams.py +++ b/scripts/clean_training_data/generate_13_grams.py @@ -21,22 +21,22 @@ """ import argparse +import glob import json -import pickle +import logging import os +import pickle +import signal import sys from pathlib import Path -import glob -import signal from signal import SIGINT from tqdm import tqdm +from tqdm_multiprocess.logger import setup_logger_tqdm +from lm_eval.decontamination.archiver import Reader, TextArchive from lm_eval.decontamination.janitor import Janitor, word_ngrams -from lm_eval.decontamination.archiver import TextArchive, Reader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -89,7 +89,7 @@ def __init__(self, directory, num_buckets): os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets) ] self.buckets = list(map(TextArchive, self.bucket_files)) - self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt") + self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt") if os.path.exists(self.checkpoint_file): self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb")) @@ -119,7 +119,6 @@ def close_buckets(self): def do_ngrams_in_buckets(n_value, working_directory, bucket_count): - pile_statistics = json.load(open("pile_statistics.json", "r")) pile_document_count = pile_statistics["Document Count"] start_offsets = pile_statistics["File Start Offsets"] @@ -130,13 +129,13 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count): logger.info(f"Generating {n_value}-grams and bucketing.") # Done file - done_file = os.path.join(output_directory, f"ngram_buckets.done") + done_file = os.path.join(output_directory, "ngram_buckets.done") if os.path.exists(done_file): logger.info("ngrams already generated and bucketed, skipping") return # Checkpoint - checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt") + checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt") if os.path.exists(checkpoint_file): checkpoint_offset = pickle.load(open(checkpoint_file, "rb")) 
iterate = True diff --git a/scripts/clean_training_data/investigate_pile.py b/scripts/clean_training_data/investigate_pile.py index dccd3abe70..c1d348d463 100644 --- a/scripts/clean_training_data/investigate_pile.py +++ b/scripts/clean_training_data/investigate_pile.py @@ -1,12 +1,13 @@ -from lm_eval.decontamination.archiver import Reader -import os +import glob import json +import os from functools import reduce -import glob -import tqdm +import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from lm_eval.decontamination.archiver import Reader + def get_file_stats(file_path, tqdm_func, global_tqdm): reader = Reader() diff --git a/scripts/clean_training_data/process_sorted_buckets.py b/scripts/clean_training_data/process_sorted_buckets.py index 1e145f9198..9d345d8e86 100644 --- a/scripts/clean_training_data/process_sorted_buckets.py +++ b/scripts/clean_training_data/process_sorted_buckets.py @@ -15,18 +15,18 @@ import argparse import glob +import logging import os -from pathlib import Path import re import shutil +from pathlib import Path from tqdm import tqdm from tqdm_multiprocess import TqdmMultiProcessPool +from tqdm_multiprocess.logger import setup_logger_tqdm -from scripts.clean_training_data.archiver import TextReader, TextArchive +from scripts.clean_training_data.archiver import TextArchive, TextReader -import logging -from tqdm_multiprocess.logger import setup_logger_tqdm logger = logging.getLogger(__name__) @@ -35,7 +35,6 @@ def process_bucket( bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm ): - bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 done_file = os.path.join( processed_directory, f"ngram_bucket_processing_{bucket_id}.done" @@ -96,7 +95,7 @@ def process_bucket( def process_sorted_buckets(working_directory, move_dir, process_count): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted")) processed_directory = os.path.join(working_directory, "processed") os.makedirs(processed_directory, exist_ok=True) @@ -123,7 +122,6 @@ def on_error(_): parser.add_argument("-procs", "--process_count", type=int, default=4) if __name__ == "__main__": - logfile_path = "process13grams.log" setup_logger_tqdm(logfile_path) diff --git a/scripts/clean_training_data/sort_13_gram_buckets.py b/scripts/clean_training_data/sort_13_gram_buckets.py index 07a2eedcd0..83990de822 100644 --- a/scripts/clean_training_data/sort_13_gram_buckets.py +++ b/scripts/clean_training_data/sort_13_gram_buckets.py @@ -8,18 +8,18 @@ directory and the unsorted buckets are removed after. 
""" -import glob import argparse +import glob +import logging import os import signal -from signal import SIGINT import subprocess +from signal import SIGINT from tqdm import tqdm - -import logging from tqdm_multiprocess.logger import setup_logger_tqdm + logger = logging.getLogger(__name__) terminate = False @@ -31,7 +31,7 @@ def handler(signal_received, frame): def sort_13_gram_buckets(working_directory): - bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) + bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt")) for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): sorted_file_path = bucket_file_path + ".sorted" @@ -49,7 +49,6 @@ def sort_13_gram_buckets(working_directory): parser.add_argument("-dir", "--working_directory", default="") if __name__ == "__main__": - version = 1.00 print(f"Running version {version}") diff --git a/scripts/cost_estimate.py b/scripts/cost_estimate.py index 72b8d4b358..6fb64504e8 100644 --- a/scripts/cost_estimate.py +++ b/scripts/cost_estimate.py @@ -1,6 +1,8 @@ import random + import transformers -from lm_eval import tasks, evaluator + +from lm_eval import evaluator, tasks from lm_eval.base import LM diff --git a/scripts/get_prompts.py b/scripts/get_prompts.py index 06e2f89c13..d262ec37e4 100644 --- a/scripts/get_prompts.py +++ b/scripts/get_prompts.py @@ -1,6 +1,8 @@ -from lm_eval import tasks from itertools import islice +from lm_eval import tasks + + ct = 3 for ( diff --git a/scripts/make_gpt2_test_cases.py b/scripts/make_gpt2_test_cases.py index 361bc2ecd6..0c1a4bffe0 100644 --- a/scripts/make_gpt2_test_cases.py +++ b/scripts/make_gpt2_test_cases.py @@ -1,8 +1,9 @@ -import transformers +import random import torch import torch.nn.functional as F -import random +import transformers + random.seed(42) diff --git a/scripts/make_table_results.py b/scripts/make_table_results.py index 690658ccea..72af524ffe 100644 --- a/scripts/make_table_results.py +++ b/scripts/make_table_results.py @@ -2,10 +2,11 @@ Usage: python make_table_tasks.py --output """ +import json import logging -from pytablewriter import MarkdownTableWriter, LatexTableWriter import os -import json + +from pytablewriter import LatexTableWriter, MarkdownTableWriter logging.basicConfig(level=logging.INFO) diff --git a/scripts/make_table_tasks.py b/scripts/make_table_tasks.py index d68d8fe219..ded7c1a596 100644 --- a/scripts/make_table_tasks.py +++ b/scripts/make_table_tasks.py @@ -4,9 +4,11 @@ """ import argparse import logging -from lm_eval import tasks + from pytablewriter import MarkdownTableWriter +from lm_eval import tasks + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/scripts/model_comparator.py b/scripts/model_comparator.py index f3cbd320f4..b1aeb142b7 100644 --- a/scripts/model_comparator.py +++ b/scripts/model_comparator.py @@ -1,13 +1,15 @@ import argparse +import os +from typing import Dict, List, Tuple + import numpy as np -import lm_eval.evaluator -from lm_eval import tasks -from lm_eval import utils -import scipy.stats -from typing import Tuple, Dict, List import pandas as pd +import scipy.stats import torch -import os + +import lm_eval.evaluator +from lm_eval import tasks, utils + os.environ["TOKENIZERS_PARALLELISM"] = "false" eval_logger = utils.eval_logger diff --git a/scripts/regression.py b/scripts/regression.py index ef85d0c75e..2b8167c0eb 100644 --- a/scripts/regression.py +++ b/scripts/regression.py @@ -5,7 +5,7 @@ import time from pathlib import Path -from lm_eval import 
evaluator, utils +from lm_eval import utils from lm_eval.api.registry import ALL_TASKS @@ -136,14 +136,16 @@ def main(): args = parse_args() args.branches = ( - args.branches.split(",") if type(args.branches) == str else args.branches + args.branches.split(",") if isinstance(args.branches, str) else args.branches + ) + args.models = ( + args.models.split(",") if isinstance(args.models, str) else args.models ) - args.models = args.models.split(",") if type(args.models) == str else args.models args.tasks = ( ALL_TASKS if args.tasks == "all_tasks" else utils.pattern_match(args.tasks.split(","), ALL_TASKS) - if type(args.tasks) == str + if isinstance(args.tasks, str) else args.tasks ) diff --git a/scripts/write_out.py b/scripts/write_out.py index eb81e6732b..360b0b6271 100644 --- a/scripts/write_out.py +++ b/scripts/write_out.py @@ -1,11 +1,13 @@ import argparse -import numpy as np -import json import os import random + +import numpy as np + from lm_eval import tasks -from lm_eval.utils import join_iters, eval_logger -from lm_eval.tasks import initialize_tasks, include_path +from lm_eval.tasks import include_path, initialize_tasks +from lm_eval.utils import eval_logger, join_iters + EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n" diff --git a/setup.py b/setup.py index dbe4675d06..b5d8fabb86 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ import setuptools + # This is to make sure that the package supports editable installs setuptools.setup() diff --git a/tests/models/test_gguf.py b/tests/models/test_gguf.py index 6d186676fe..186b2305e6 100644 --- a/tests/models/test_gguf.py +++ b/tests/models/test_gguf.py @@ -1,12 +1,13 @@ -import unittest -from unittest.mock import patch import hashlib import json import os import pickle -from lm_eval.models.gguf import GGUFLM +import unittest +from unittest.mock import patch from lm_eval.api.instance import Instance +from lm_eval.models.gguf import GGUFLM + base_url = "https://matthoffner-ggml-llm-api.hf.space" diff --git a/tests/models/test_huggingface.py b/tests/models/test_huggingface.py index 557ad05124..323d664af8 100644 --- a/tests/models/test_huggingface.py +++ b/tests/models/test_huggingface.py @@ -1,13 +1,16 @@ from __future__ import annotations -import pytest + +import sys from pathlib import Path + import numpy as np -from lm_eval.models.huggingface import HFLM -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance +from lm_eval.models.huggingface import HFLM + + tasks.initialize_tasks() @@ -106,9 +109,10 @@ def test_logliklihood(self) -> None: f.write("\n".join(str(x) for x in _res)) assert np.allclose(_res, _RES, atol=1e-2) # check indices for Multiple Choice - argmax_RES, argmax_res = np.argmax( - np.array(_RES).reshape(-1, 4), axis=1 - ), np.argmax(np.array(_res).reshape(-1, 4), axis=1) + argmax_RES, argmax_res = ( + np.argmax(np.array(_RES).reshape(-1, 4), axis=1), + np.argmax(np.array(_res).reshape(-1, 4), axis=1), + ) assert (argmax_RES == argmax_res).all() def test_generate_until(self) -> None: diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py index 61a024ce71..1da8a48762 100644 --- a/tests/models/test_vllm.py +++ b/tests/models/test_vllm.py @@ -1,10 +1,11 @@ -import pytest from typing import List -from lm_eval.api.instance import Instance -import lm_eval.tasks as tasks -import sys + +import pytest import torch +import lm_eval.tasks as tasks +from lm_eval.api.instance import Instance + 
@pytest.mark.skip(reason="requires CUDA") class TEST_VLLM: diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index 7f30e21f43..825f57413d 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -1,15 +1,13 @@ -import os - # import lm_eval.base as base -import lm_eval.api.registry as registry -import lm_eval.tasks as tasks +from typing import List + +import pytest # import lm_eval.models as models import lm_eval.api as api import lm_eval.evaluator as evaluator -from typing import List -import random -import pytest +import lm_eval.tasks as tasks + tasks.initialize_tasks() diff --git a/tests/test_janitor.py b/tests/test_janitor.py index b496bfadd1..19ba611dfb 100644 --- a/tests/test_janitor.py +++ b/tests/test_janitor.py @@ -1,11 +1,10 @@ -import re from collections import defaultdict from lm_eval.decontamination.janitor import ( Janitor, form_ngrams, - word_ngrams, split_indices, + word_ngrams, word_ngrams_indices, ) @@ -81,7 +80,6 @@ def test_split_indices(): def test_word_ngrams_indices(): - sequence = ( "Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some" " more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much." @@ -119,9 +117,9 @@ def test_word_ngrams_indices(): # Assumptions from GPT3 Paper: # the 200 characters to remove include punctuation and is actually a half-window + # All tests below initially test without any registered contaminants, expecting the same sequence back. def test_janitor1(): - # First test using a 1gram and expected the first block before the filth to have some remaining # characters, but the second block should be completely removed. @@ -165,7 +163,6 @@ def test_janitor1(): def test_janitor2(): - # Second test using a 1gram and expected the first block before the filth to have some remaining # characters, and the second block is longer then 200 characters so should also have some remaining. @@ -214,7 +211,6 @@ def test_janitor2(): def test_janitor3(): - # Same test as above but with a 6gram. sequence = ( @@ -262,7 +258,6 @@ def test_janitor3(): def test_janitor4(): - # This test adds another block to that from the previous. The middle block should be entirely # removed as the 200 characters are removed from each side. @@ -318,7 +313,6 @@ def test_janitor4(): def test_janitor5(): - # Same as above but using multiple different filth 6grams. sequence = ( @@ -374,7 +368,6 @@ def test_janitor5(): def test_janitor6(): - # Same as above but now we add 10 filths and expect the same result, the following test does 11. sequence = ( @@ -438,7 +431,6 @@ def test_janitor6(): def test_janitor7(): - # Same as above but now we add 9 filths and expect the same result, the following test does 10. 
sequence = ( diff --git a/tests/test_misc.py b/tests/test_misc.py index 149a65f4c3..30267f63d0 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1,6 +1,8 @@ +import random + import pytest + import lm_eval.api.metrics as metrics -import random def test_bootstrapping(): diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 41504430d5..3651fd5ab3 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -1,9 +1,13 @@ from itertools import islice + import pytest -from .utils import new_tasks + import lm_eval.tasks as tasks from lm_eval.api.task import ConfigurableTask +from .utils import new_tasks + + tasks.initialize_tasks() # Default Task TASKS = ["arc_easy"] diff --git a/tests/tests_master/test_description.py b/tests/tests_master/test_description.py index fdf7bf5db0..2503bcea4b 100644 --- a/tests/tests_master/test_description.py +++ b/tests/tests_master/test_description.py @@ -1,6 +1,7 @@ import random -import lm_eval.tasks + import lm_eval.models +import lm_eval.tasks def test_description(): @@ -14,7 +15,6 @@ def test_description(): task_dict = lm_eval.tasks.get_task_dict(task_names) for task_name, task in task_dict.items(): - # patch description field in task (# TODO: make this much more cleaned up) task._config.description = description_dict[task_name] diff --git a/tests/tests_master/test_generate_13_grams.py b/tests/tests_master/test_generate_13_grams.py index 26cd890369..722e69a77e 100644 --- a/tests/tests_master/test_generate_13_grams.py +++ b/tests/tests_master/test_generate_13_grams.py @@ -1,13 +1,13 @@ +import glob +import logging import os -from collections import Counter import shutil -import glob +from collections import Counter +from lm_eval.decontamination.archiver import Archive, TextReader from lm_eval.decontamination.janitor import Janitor, word_ngrams from scripts.clean_training_data.generate_13_grams import do_ngrams_in_buckets -from lm_eval.decontamination.archiver import Archive, TextReader -import logging logger = logging.getLogger(__name__) @@ -57,7 +57,7 @@ def test_generate_13_grams_1(caplog): print("rebuild") rebuilt_ngrams = [] bucket_file_paths = glob.glob( - os.path.join(test_working_directory, "output", f"*.bkt.txt") + os.path.join(test_working_directory, "output", "*.bkt.txt") ) for bucket_file_path in bucket_file_paths: reader = TextReader(bucket_file_path) diff --git a/tests/tests_master/test_models.py b/tests/tests_master/test_models.py index 11ea5a8b46..e56dcaf8e4 100644 --- a/tests/tests_master/test_models.py +++ b/tests/tests_master/test_models.py @@ -2,12 +2,13 @@ import json import os import pickle -import pytest import unittest.mock as mock +import pytest +from openai import OpenAI + import lm_eval.models as models -from openai import OpenAI client = OpenAI() diff --git a/tests/tests_master/test_version_stable.py b/tests/tests_master/test_version_stable.py index 2eba83c6c6..34073d0a69 100644 --- a/tests/tests_master/test_version_stable.py +++ b/tests/tests_master/test_version_stable.py @@ -1,12 +1,14 @@ -import lm_eval.tasks as tasks -import lm_eval.models as models -import lm_eval.evaluator as evaluator +import collections +import hashlib +import json +import os import random + import pytest -import os -import json -import hashlib -import collections + +import lm_eval.evaluator as evaluator +import lm_eval.models as models +import lm_eval.tasks as tasks os.makedirs("tests/testdata", exist_ok=True) diff --git a/tests/utils.py b/tests/utils.py index 3555541e71..fbdbb6a7fb 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,8 
+1,8 @@ -from typing import List -from lm_eval.utils import load_yaml_config -from pathlib import Path -from typing import Union import os +from pathlib import Path +from typing import List, Union + +from lm_eval.utils import load_yaml_config # {{{CI}}} From 12f2c5ea15d793b7f1aa8f3611ccc8db5eb1a9a5 Mon Sep 17 00:00:00 2001 From: GUIJIN SON Date: Thu, 21 Dec 2023 05:02:02 +0900 Subject: [PATCH 03/25] Error in --num_fewshot option for K-MMLU Evaluation Harness (#1178) * update kmmlu default formatting * Update _default_kmmlu_yaml * Delete lm_eval/tasks/kmmlu/utils.py --- lm_eval/tasks/kmmlu/_default_kmmlu_yaml | 9 ++++----- lm_eval/tasks/kmmlu/utils.py | 20 -------------------- 2 files changed, 4 insertions(+), 25 deletions(-) delete mode 100644 lm_eval/tasks/kmmlu/utils.py diff --git a/lm_eval/tasks/kmmlu/_default_kmmlu_yaml b/lm_eval/tasks/kmmlu/_default_kmmlu_yaml index 25c7009cff..2d36f159bf 100644 --- a/lm_eval/tasks/kmmlu/_default_kmmlu_yaml +++ b/lm_eval/tasks/kmmlu/_default_kmmlu_yaml @@ -6,10 +6,9 @@ validation_split: dev test_split: test fewshot_split: dev output_type: multiple_choice -process_docs: !function utils.process_docs -doc_to_text: "{{question}}" -doc_to_choice: "{{choices}}" -doc_to_target: "{{gold}}" +doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}" metric_list: - metric: acc aggregation: mean @@ -18,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - version: 1.0 + version: 1.1 diff --git a/lm_eval/tasks/kmmlu/utils.py b/lm_eval/tasks/kmmlu/utils.py deleted file mode 100644 index 690fea9299..0000000000 --- a/lm_eval/tasks/kmmlu/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import datasets - - -def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: - def _process_doc(doc): - instruction = ( - f"다음을 읽고 정답으로 알맞은 것을 고르시요.\n" - f"### Question: {doc['question']}\n" - f"### Options:\n" - f"(1) {doc['A']}\n(2) {doc['B']}\n(3) {doc['C']}\n(4) {doc['D']}\n" - f"### Answer: 주어진 문제의 정답은" - ) - out_doc = { - "question": instruction, - "choices": ["(1)", "(2)", "(3)", "(4)"], - "gold": int(doc["answer"]) - 1, - } - return out_doc - - return dataset.map(_process_doc) From fcfc0c6044480d4caccdcbcfd1c221eb48fcccef Mon Sep 17 00:00:00 2001 From: Vicki Boykis Date: Wed, 20 Dec 2023 15:49:23 -0500 Subject: [PATCH 04/25] Implementing local OpenAI API-style chat completions on any given inference server (#1174) * LocalChatCompletionsLM add * clean up completions class * clean up completions class * update tokens * README * fix constructor * eos token * folding local-chat-completions into OpenAIChatCompletions * refactoring to include gen_kwargs as passable option * add todo on chat completion kwarg validation * Ruff and README fix * generalize to **kwargs * remove unnecessary kwargs * README and remove kwargs * README --- README.md | 25 ++++++---- lm_eval/models/openai_completions.py | 75 +++++++++++++++++----------- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 9dbee8fbb2..ed47dba83b 100644 --- a/README.md +++ b/README.md @@ -155,19 +155,24 @@ lm_eval --model openai-completions \ --tasks lambada_openai,hellaswag ``` +We also support using your own local inference server with an implemented version of the OpenAI ChatCompletions endpoint and passing trained HuggingFace artifacts and tokenizers. 
+ +```bash +lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1 +``` Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. -| API or Inference Server | Implemented? | `--model ` name | Models supported: | Request Types: | -|-----------------------------|---------------------------------|--------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|----------------------------------------------------------| -| OpenAI Completions | :heavy_check_mark: | `openai-completions` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| OpenAI ChatCompletions | :x: Not yet - needs testing! | N/A | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | -| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | -| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Your inference server here! | ... | ... | ... | ... | | ... | +| API or Inference Server | Implemented? 
| `--model ` name | Models supported: | Request Types: | +|---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------| +| OpenAI Completions | :heavy_check_mark: | `openai-completions` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) | +| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | +| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Your local inference server! | :heavy_check_mark: | `local-chat-completions` (using `openai-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface | `generate_until` | | ... | It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models. 
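For readers skimming the code diff that follows: the new `local-chat-completions` path boils down to pointing the standard `openai` Python client at a user-supplied `base_url` and forwarding generation kwargs to the ChatCompletions call, with a HuggingFace tokenizer substituted for tiktoken when a local model is used. Below is a minimal sketch of that idea outside the harness; the server address and model name are placeholders, not values taken from this patch.

```python
# Minimal sketch (not part of this patch): the mechanism `local-chat-completions` relies on.
# Assumes an OpenAI-compatible server is already running at base_url; both the URL and
# the model name below are placeholders.
import openai

client = openai.OpenAI(
    base_url="http://localhost:8000/v1",  # placeholder local server address
    api_key="EMPTY",  # local servers typically ignore the key, but the client requires one
)

response = client.chat.completions.create(
    model="facebook/opt-125m",  # placeholder model name served by the local endpoint
    messages=[{"role": "user", "content": "Answer with a single word: what is 2 + 2?"}],
    temperature=0.0,  # generation kwargs are forwarded to the endpoint much like this
    max_tokens=16,
)
print(response.choices[0].message.content)
```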
diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index d63f8ab12a..7feb83f51e 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -5,6 +5,7 @@ from importlib.util import find_spec from typing import List, Optional, Tuple +import transformers from tqdm import tqdm from lm_eval import utils @@ -104,7 +105,7 @@ def __init__( self._max_gen_toks = max_gen_toks self._max_length = max_length - # Read from environment variable OPENAI_API_SECRET_KEY + # Read from environment variable OPENAI_API_KEY openai.api_key = os.environ["OPENAI_API_KEY"] @property @@ -353,15 +354,26 @@ async def _get_completions(**kwargs): backoff_time *= 1.5 -@register_model("openai-chat-completions") +@register_model("openai-chat-completions", "local-chat-completions") class OpenaiChatCompletionsLM(LM): def __init__( - self, model: str = "gpt-3.5-turbo", truncate: bool = False, batch_size: int = 1 + self, + model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths + base_url: str = None, + truncate: bool = False, + revision: Optional[str] = "main", + trust_remote_code: Optional[bool] = False, + use_fast_tokenizer: Optional[bool] = True, + **kwargs, ) -> None: """ :param model: str + Implements an OpenAI-style chat completion API for + accessing both OpenAI OR locally-hosted models using + HuggingFace Tokenizer OpenAI API model (e.g. gpt-3.5-turbo) + using the **gen_kwargs passed on init :param truncate: bool Truncate input if too long (if False and input is too long, throw error) """ @@ -375,19 +387,34 @@ def __init__( please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`", ) self.model = model - self.frequency_penalty = 0 - self.logit_bias = None - self.n = 1 - self.presence_penalty = 0 - self.temperature = 1 - self.top_p = 1 - self.tokenizer = tiktoken.encoding_for_model(self.model) - self.vocab_size = self.tokenizer.n_vocab + self.base_url = base_url self.truncate = truncate - self.end_of_text_token_id = self.tokenizer.eot_token + + # if we have a local model, use HF tokenizer over tiktoken + if self.base_url: + self.revision = revision + self.trust_remote_code = trust_remote_code + self.use_fast_tokenizer = use_fast_tokenizer + + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + self.model, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + use_fast_tokenizer=self.use_fast_tokenizer, + ) + self.vocab_size = self.tokenizer.vocab + self.end_of_text_token_id = self.tokenizer.eos_token + else: + self.tokenizer = tiktoken.encoding_for_model(self.model) + self.vocab_size = self.tokenizer.n_vocab + self.end_of_text_token_id = self.tokenizer.eot_token # Read from environment variable OPENAI_API_KEY - self.client = openai.OpenAI() # openai.AsyncOpenAI() + # Set to EMPTY for local + if self.base_url: + self.client = openai.OpenAI(base_url=self.base_url) + else: + self.client = openai.OpenAI() # openai.AsyncOpenAI() @property def eot_token_id(self): @@ -474,35 +501,23 @@ def sameuntil_chunks(xs, size): until = None if isinstance(gen_kwargs, dict): kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "do_sample" in kwargs.keys(): + kwargs.pop("do_sample") if "until" in kwargs.keys(): until = kwargs.pop("until") if isinstance(until, str): until = [kwargs] elif not isinstance(until, list): raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + f"Expected repr(kwargs['until']) to be of type Union[str, 
list] but got {until}" ) else: raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {kwargs}" + f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}" ) - if "max_gen_toks" in kwargs.keys(): - max_gen_toks = kwargs.pop("max_gen_toks") - else: - max_gen_toks = self.max_gen_toks - response = oa_chat_completion( - client=self.client, - messages=inps, - model=self.model, - frequency_penalty=self.frequency_penalty, - # logit_bias=self.logit_bias, - max_tokens=max_gen_toks, - n=self.n, - presence_penalty=self.presence_penalty, - temperature=self.temperature, - top_p=self.top_p, + client=self.client, messages=inps, model=self.model, **kwargs ) for resp, (context, args_) in zip(response.choices, chunk): From e548d94d4c57bc5bd1ed6efea3203edb9e22eaee Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Thu, 21 Dec 2023 11:39:49 +0000 Subject: [PATCH 05/25] Update README.md (#1184) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ed47dba83b..747d47c927 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ To call a hosted model, use: ```bash export OPENAI_API_KEY=YOUR_KEY_HERE lm_eval --model openai-completions \ - --model_args engine=davinci \ + --model_args model=davinci \ --tasks lambada_openai,hellaswag ``` From 2b0b6fd823345dab6db9ee1c341023b50c8d7dd5 Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Thu, 21 Dec 2023 11:42:35 +0000 Subject: [PATCH 06/25] Update README.md (#1183) --- docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/README.md b/docs/README.md index c416af6c81..d0c498f08a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,7 +4,7 @@ Welcome to the docs for the LM Evaluation Harness! ## Table of Contents -* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/user_guide.md) +* To learn about the public interface of the library, as well as how to evaluate via the commandline or as integrated into an external library, see the [Interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/interface.md) * To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/model_guide.md). * For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md). * To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/task_guide.md). 
From a0cfe3f631a5a889ded400f7bc7bc514cc01fb5d Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Thu, 21 Dec 2023 14:04:07 +0000 Subject: [PATCH 07/25] Add tokenizer backend (#1186) * separate local flag * tokenizer_backend * import order --- lm_eval/models/openai_completions.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index 7feb83f51e..d99365a635 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -3,7 +3,7 @@ import time from collections import defaultdict from importlib.util import find_spec -from typing import List, Optional, Tuple +from typing import List, Literal, Optional, Tuple import transformers from tqdm import tqdm @@ -360,6 +360,7 @@ def __init__( self, model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths base_url: str = None, + tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken", truncate: bool = False, revision: Optional[str] = "main", trust_remote_code: Optional[bool] = False, @@ -388,10 +389,11 @@ def __init__( ) self.model = model self.base_url = base_url + self.tokenizer_backend = tokenizer_backend self.truncate = truncate # if we have a local model, use HF tokenizer over tiktoken - if self.base_url: + if self.tokenizer_backend == "huggingface": self.revision = revision self.trust_remote_code = trust_remote_code self.use_fast_tokenizer = use_fast_tokenizer @@ -404,10 +406,14 @@ def __init__( ) self.vocab_size = self.tokenizer.vocab self.end_of_text_token_id = self.tokenizer.eos_token - else: + elif self.tokenizer_backend == "tiktoken": self.tokenizer = tiktoken.encoding_for_model(self.model) self.vocab_size = self.tokenizer.n_vocab self.end_of_text_token_id = self.tokenizer.eot_token + else: + raise ValueError( + f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}" + ) # Read from environment variable OPENAI_API_KEY # Set to EMPTY for local From 9cd798974542f361960b164f0e41cc14cb61d436 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Thu, 21 Dec 2023 09:20:40 -0500 Subject: [PATCH 08/25] Correctly Print Task Versioning (#1173) * change version field formatting in metadata * mention versioning in new task guide * add instructions for changelog * run linters --- docs/new_task_guide.md | 19 +++++++++++++++++++ lm_eval/tasks/anli/anli_r1.yaml | 2 +- lm_eval/tasks/arc/arc_easy.yaml | 2 +- lm_eval/tasks/arithmetic/arithmetic_1dc.yaml | 2 +- lm_eval/tasks/asdiv/default.yaml | 2 +- lm_eval/tasks/babi/babi.yaml | 2 +- .../cot_fewshot/_cot_fewshot_template_yaml | 2 +- .../cot_zeroshot/_cot_zeroshot_template_yaml | 2 +- .../tasks/bbh/fewshot/_fewshot_template_yaml | 2 +- .../bbh/zeroshot/_zeroshot_template_yaml | 2 +- lm_eval/tasks/belebele/_default_template_yaml | 2 +- .../bigbench/generate_until_template_yaml | 2 +- .../multiple_choice/causal_judgement.yaml | 4 ++++ .../bigbench/multiple_choice_template_yaml | 2 +- lm_eval/tasks/blimp/_template_yaml | 2 +- lm_eval/tasks/ceval/_default_ceval_yaml | 2 +- lm_eval/tasks/cmmlu/_default_template_yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/go.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/java.yaml | 2 +- .../code_x_glue/code-text/javascript.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/php.yaml | 2 +- .../tasks/code_x_glue/code-text/python.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/ruby.yaml | 2 +- 
lm_eval/tasks/coqa/default.yaml | 2 +- .../crows_pairs/crows_pairs_english.yaml | 2 +- lm_eval/tasks/csatqa/_default_csatqa_yaml | 2 +- lm_eval/tasks/drop/default.yaml | 2 +- lm_eval/tasks/fld/fld_default.yaml | 2 ++ lm_eval/tasks/glue/cola/default.yaml | 2 +- lm_eval/tasks/glue/mnli/default.yaml | 2 +- lm_eval/tasks/glue/mrpc/default.yaml | 2 +- lm_eval/tasks/glue/qnli/default.yaml | 2 +- lm_eval/tasks/glue/qqp/default.yaml | 2 +- lm_eval/tasks/glue/rte/default.yaml | 2 +- lm_eval/tasks/glue/sst2/default.yaml | 2 +- lm_eval/tasks/glue/wnli/default.yaml | 2 +- .../gsm8k/gsm8k-cot-self-consistency.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k.yaml | 2 +- lm_eval/tasks/headqa/headqa_en.yaml | 2 +- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 +- .../tasks/hendrycks_ethics/commonsense.yaml | 2 +- .../tasks/hendrycks_ethics/deontology.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/justice.yaml | 2 +- .../hendrycks_ethics/utilitarianism.yaml | 2 +- .../utilitarianism_original_yaml | 2 +- lm_eval/tasks/hendrycks_ethics/virtue.yaml | 2 +- lm_eval/tasks/ifeval/ifeval.yaml | 2 +- lm_eval/tasks/lambada/lambada_openai.yaml | 2 +- lm_eval/tasks/lambada/lambada_standard.yaml | 2 +- .../lambada_cloze/lambada_openai_cloze.yaml | 2 +- .../lambada_cloze/lambada_standard_cloze.yaml | 2 +- .../lambada_multilingual/lambada_mt_en.yaml | 2 +- lm_eval/tasks/logiqa/logiqa.yaml | 2 +- lm_eval/tasks/logiqa2/logieval.yaml | 2 +- lm_eval/tasks/logiqa2/logiqa2.yaml | 2 +- lm_eval/tasks/mathqa/mathqa.yaml | 2 +- lm_eval/tasks/mc_taco/default.yaml | 2 +- lm_eval/tasks/mgsm/direct/direct_yaml | 2 +- lm_eval/tasks/mgsm/en_cot/cot_yaml | 2 +- lm_eval/tasks/mgsm/native_cot/cot_yaml | 2 +- .../minerva_math/minerva_math_algebra.yaml | 2 +- .../tasks/mmlu/default/_default_template_yaml | 2 +- .../_mmlu_flan_cot_fewshot_template_yaml | 2 +- .../_mmlu_flan_cot_zeroshot_template_yaml | 2 +- .../_mmlu_flan_generative_template_yaml | 2 +- .../_mmlu_flan_loglikelihood_template_yaml | 2 +- .../advanced_ai_risk/_template_yaml | 2 +- .../persona/_template_yaml | 2 +- .../sycophancy/sycophancy_on_nlp_survey.yaml | 2 +- .../sycophancy_on_philpapers2020.yaml | 2 +- ...sycophancy_on_political_typology_quiz.yaml | 2 +- .../winogenerated/_template_yaml | 2 +- lm_eval/tasks/mutual/mutual.yaml | 2 +- lm_eval/tasks/nq_open/nq_open.yaml | 2 +- lm_eval/tasks/openbookqa/openbookqa.yaml | 2 +- lm_eval/tasks/paws-x/pawsx_template_yaml | 2 +- lm_eval/tasks/pile/pile_arxiv.yaml | 2 +- lm_eval/tasks/piqa/piqa.yaml | 2 +- lm_eval/tasks/polemo2/polemo2_in.yaml | 2 +- lm_eval/tasks/prost/corypaik_prost.yaml | 2 +- lm_eval/tasks/pubmedqa/pubmedqa.yaml | 2 +- lm_eval/tasks/qa4mre/qa4mre_2011.yaml | 2 +- lm_eval/tasks/qasper/bool.yaml | 2 +- lm_eval/tasks/qasper/freeform.yaml | 2 +- lm_eval/tasks/race/race.yaml | 2 +- .../realtoxicityprompts.yaml | 2 +- lm_eval/tasks/sciq/sciq.yaml | 2 +- lm_eval/tasks/siqa/default.yml | 2 +- lm_eval/tasks/storycloze/storycloze_2016.yaml | 2 +- lm_eval/tasks/super_glue/boolq/default.yaml | 2 +- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 2 +- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/cb/default.yaml | 2 +- lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/copa/default.yaml | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/multirc/default.yaml | 2 +- .../tasks/super_glue/multirc/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/record/default.yaml | 2 +- .../tasks/super_glue/record/t5-prompt.yaml | 2 +- 
lm_eval/tasks/super_glue/rte/default.yaml | 2 +- lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wic/default.yaml | 2 +- lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wsc/default.yaml | 2 +- lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 2 +- lm_eval/tasks/swag/swag.yaml | 2 +- lm_eval/tasks/toxigen/toxigen.yaml | 2 +- lm_eval/tasks/translation/wmt_common_yaml | 2 +- lm_eval/tasks/triviaqa/default.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml | 2 +- lm_eval/tasks/unscramble/anagrams1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams2.yaml | 2 +- lm_eval/tasks/unscramble/cycle_letters.yaml | 2 +- .../tasks/unscramble/random_insertion.yaml | 2 +- lm_eval/tasks/unscramble/reversed_words.yaml | 2 +- lm_eval/tasks/webqs/webqs.yaml | 2 +- lm_eval/tasks/wikitext/wikitext.yaml | 2 +- lm_eval/tasks/winogrande/default.yaml | 2 +- lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml | 2 +- lm_eval/tasks/wsc273/default.yaml | 2 +- lm_eval/tasks/xcopa/default_et.yaml | 2 +- lm_eval/tasks/xnli/xnli_common_yaml | 2 +- lm_eval/tasks/xstorycloze/default_ar.yaml | 2 +- lm_eval/tasks/xwinograd/xwinograd_common_yaml | 2 +- 128 files changed, 150 insertions(+), 125 deletions(-) create mode 100644 lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 26ffd3aa4f..cfcf0e4d98 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -315,6 +315,25 @@ python -m scripts.write_out \ Open the file specified at the `--output_base_path ` and ensure it passes a simple eye test. +## Versioning + +One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. + +This version info can be provided by adding the following to your new task config file: + +``` +metadata: + version: 0 +``` + +Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can differentiate the different task iterations and versions. + +If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change. + +for example, + +* \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance. + ## Checking performance + equivalence It's now time to check models' performance on your task! In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible. 
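The long run of task-YAML diffs that follows makes one mechanical change everywhere: `version` becomes a plain scalar key under `metadata` instead of a single-item list, matching the convention documented in the guide above. Below is a rough sketch of how such a convention could be checked across configs; the script name and glob pattern are hypothetical and not part of this patch.

```python
# check_task_versions.py (hypothetical helper, not included in this patch):
# flag task configs whose metadata.version is not a plain scalar.
import glob

import yaml

bad = []
for path in glob.glob("lm_eval/tasks/**/*.yaml", recursive=True):
    try:
        with open(path) as f:
            cfg = yaml.full_load(f)
    except yaml.YAMLError:
        continue  # skip configs using custom tags such as !function
    if not isinstance(cfg, dict):
        continue
    meta = cfg.get("metadata")
    # Old style `metadata:` followed by `- version: 1.0` parses as a list of dicts;
    # new style `metadata:` followed by `version: 1.0` parses as a dict with a number.
    if isinstance(meta, list) or (
        isinstance(meta, dict) and not isinstance(meta.get("version"), (int, float))
    ):
        bad.append(path)

for path in bad:
    print(f"non-scalar metadata.version: {path}")
```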
diff --git a/lm_eval/tasks/anli/anli_r1.yaml b/lm_eval/tasks/anli/anli_r1.yaml index 493a3a3f24..bcf7674ee1 100644 --- a/lm_eval/tasks/anli/anli_r1.yaml +++ b/lm_eval/tasks/anli/anli_r1.yaml @@ -23,4 +23,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/arc/arc_easy.yaml b/lm_eval/tasks/arc/arc_easy.yaml index 1ec12090c5..9c0d312bac 100644 --- a/lm_eval/tasks/arc/arc_easy.yaml +++ b/lm_eval/tasks/arc/arc_easy.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml index 6efbb2cc8b..0e2c7ac8dd 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/asdiv/default.yaml b/lm_eval/tasks/asdiv/default.yaml index d448e867c5..350198be39 100644 --- a/lm_eval/tasks/asdiv/default.yaml +++ b/lm_eval/tasks/asdiv/default.yaml @@ -11,4 +11,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/babi/babi.yaml b/lm_eval/tasks/babi/babi.yaml index 31c421f50f..d1193ec859 100644 --- a/lm_eval/tasks/babi/babi.yaml +++ b/lm_eval/tasks/babi/babi.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml index 30f1aafd1e..50bf5e8b36 100644 --- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml @@ -27,4 +27,4 @@ filter_list: - function: "take_first" num_fewshot: 0 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml index c6bffa31a7..650f91bfb5 100644 --- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml @@ -24,4 +24,4 @@ filter_list: - function: "take_first" num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml index 6134c86f05..6bc65079c0 100644 --- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml @@ -18,4 +18,4 @@ generation_kwargs: temperature: 0.0 num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml index a0734aeceb..94a671409d 100644 --- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml @@ -18,4 +18,4 @@ generation_kwargs: temperature: 0.0 num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml index a16d1ad1fc..ef7c1a2374 100644 --- a/lm_eval/tasks/belebele/_default_template_yaml +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bigbench/generate_until_template_yaml 
b/lm_eval/tasks/bigbench/generate_until_template_yaml index 7dff331292..b370418953 100644 --- a/lm_eval/tasks/bigbench/generate_until_template_yaml +++ b/lm_eval/tasks/bigbench/generate_until_template_yaml @@ -15,4 +15,4 @@ metric_list: higher_is_better: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml b/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml new file mode 100644 index 0000000000..e8011772b9 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: causal_judgment_zero_shot +include: ../multiple_choice_template_yaml +task: bigbench_causal_judgement_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 7f299060fe..10fce5c1c3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc # TODO: brier score and other metrics metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/blimp/_template_yaml b/lm_eval/tasks/blimp/_template_yaml index 920076c72a..fb1dd31360 100644 --- a/lm_eval/tasks/blimp/_template_yaml +++ b/lm_eval/tasks/blimp/_template_yaml @@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/ceval/_default_ceval_yaml b/lm_eval/tasks/ceval/_default_ceval_yaml index 11392f0526..a94d87cb54 100644 --- a/lm_eval/tasks/ceval/_default_ceval_yaml +++ b/lm_eval/tasks/ceval/_default_ceval_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/cmmlu/_default_template_yaml b/lm_eval/tasks/cmmlu/_default_template_yaml index 0c8bc28d3c..d2e0a8876c 100644 --- a/lm_eval/tasks/cmmlu/_default_template_yaml +++ b/lm_eval/tasks/cmmlu/_default_template_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/go.yaml b/lm_eval/tasks/code_x_glue/code-text/go.yaml index 8b004f5f54..c88067458e 100644 --- a/lm_eval/tasks/code_x_glue/code-text/go.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml index 36585cdf2d..ac1ad955cf 100644 --- a/lm_eval/tasks/code_x_glue/code-text/java.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml index 93002a57fc..ec8b0a6bd0 100644 --- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/php.yaml b/lm_eval/tasks/code_x_glue/code-text/php.yaml index 6f1861aa93..ebc3691afb 100644 --- a/lm_eval/tasks/code_x_glue/code-text/php.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml 
@@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/python.yaml b/lm_eval/tasks/code_x_glue/code-text/python.yaml index 8faeebe4b7..92768f9bea 100644 --- a/lm_eval/tasks/code_x_glue/code-text/python.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml index 124c644c4e..c2c939b63a 100644 --- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml index 4154ac528a..f9494d5db8 100644 --- a/lm_eval/tasks/coqa/default.yaml +++ b/lm_eval/tasks/coqa/default.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml index 929e0a6205..d95c83d01c 100644 --- a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml +++ b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/csatqa/_default_csatqa_yaml b/lm_eval/tasks/csatqa/_default_csatqa_yaml index 98c23e559e..a4a5db84b5 100644 --- a/lm_eval/tasks/csatqa/_default_csatqa_yaml +++ b/lm_eval/tasks/csatqa/_default_csatqa_yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml index 4b8848072d..7e425660ac 100644 --- a/lm_eval/tasks/drop/default.yaml +++ b/lm_eval/tasks/drop/default.yaml @@ -21,4 +21,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/fld/fld_default.yaml b/lm_eval/tasks/fld/fld_default.yaml index afcbebd03e..ee84f73bc5 100644 --- a/lm_eval/tasks/fld/fld_default.yaml +++ b/lm_eval/tasks/fld/fld_default.yaml @@ -12,3 +12,5 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/glue/cola/default.yaml b/lm_eval/tasks/glue/cola/default.yaml index 291c94e2ac..a46003c276 100644 --- a/lm_eval/tasks/glue/cola/default.yaml +++ b/lm_eval/tasks/glue/cola/default.yaml @@ -13,4 +13,4 @@ doc_to_decontamination_query: sentence metric_list: - metric: mcc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/mnli/default.yaml b/lm_eval/tasks/glue/mnli/default.yaml index 81de19a3a8..6caffa85a2 100644 --- a/lm_eval/tasks/glue/mnli/default.yaml +++ b/lm_eval/tasks/glue/mnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["True", "Neither", "False"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/mrpc/default.yaml b/lm_eval/tasks/glue/mrpc/default.yaml index 455ef682b0..f0bc24510c 100644 --- a/lm_eval/tasks/glue/mrpc/default.yaml +++ b/lm_eval/tasks/glue/mrpc/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/qnli/default.yaml 
b/lm_eval/tasks/glue/qnli/default.yaml index b31e16259a..49a6216a5e 100644 --- a/lm_eval/tasks/glue/qnli/default.yaml +++ b/lm_eval/tasks/glue/qnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["yes", "no"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/qqp/default.yaml b/lm_eval/tasks/glue/qqp/default.yaml index 1fa7a796ea..34b6e10375 100644 --- a/lm_eval/tasks/glue/qqp/default.yaml +++ b/lm_eval/tasks/glue/qqp/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/rte/default.yaml b/lm_eval/tasks/glue/rte/default.yaml index c9cc837d6d..7b12096a46 100644 --- a/lm_eval/tasks/glue/rte/default.yaml +++ b/lm_eval/tasks/glue/rte/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["True", "False"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/sst2/default.yaml b/lm_eval/tasks/glue/sst2/default.yaml index f561d76566..838afeb218 100644 --- a/lm_eval/tasks/glue/sst2/default.yaml +++ b/lm_eval/tasks/glue/sst2/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["negative", "positive"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/wnli/default.yaml b/lm_eval/tasks/glue/wnli/default.yaml index 8b3a8e7fc5..a8e57a35d6 100644 --- a/lm_eval/tasks/glue/wnli/default.yaml +++ b/lm_eval/tasks/glue/wnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["False", "True"] metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml index 080dc34cf7..b076d4efbe 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml @@ -31,4 +31,4 @@ filter_list: - function: "majority_vote" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml index 65da50575b..2df407b772 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml @@ -41,4 +41,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)." 
- function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml index eb3c9af777..ccf6a5a341 100644 --- a/lm_eval/tasks/gsm8k/gsm8k.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k.yaml @@ -34,4 +34,4 @@ filter_list: regex_pattern: "#### (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/headqa/headqa_en.yaml b/lm_eval/tasks/headqa/headqa_en.yaml index 96eaa2f109..eeb2ff12dd 100644 --- a/lm_eval/tasks/headqa/headqa_en.yaml +++ b/lm_eval/tasks/headqa/headqa_en.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hellaswag/hellaswag.yaml b/lm_eval/tasks/hellaswag/hellaswag.yaml index 4df871e0c2..ec627da7d4 100644 --- a/lm_eval/tasks/hellaswag/hellaswag.yaml +++ b/lm_eval/tasks/hellaswag/hellaswag.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/commonsense.yaml b/lm_eval/tasks/hendrycks_ethics/commonsense.yaml index 0dbd29a7da..f127074367 100644 --- a/lm_eval/tasks/hendrycks_ethics/commonsense.yaml +++ b/lm_eval/tasks/hendrycks_ethics/commonsense.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/deontology.yaml b/lm_eval/tasks/hendrycks_ethics/deontology.yaml index 05493d9456..a2f3685222 100644 --- a/lm_eval/tasks/hendrycks_ethics/deontology.yaml +++ b/lm_eval/tasks/hendrycks_ethics/deontology.yaml @@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl doc_to_target: label doc_to_choice: ['unreasonable', 'reasonable'] metadata: - - version: 1.0 + version: 1.0 # TODO: implement exact-match metric for this subset diff --git a/lm_eval/tasks/hendrycks_ethics/justice.yaml b/lm_eval/tasks/hendrycks_ethics/justice.yaml index 668f05ed82..2ba79d16b3 100644 --- a/lm_eval/tasks/hendrycks_ethics/justice.yaml +++ b/lm_eval/tasks/hendrycks_ethics/justice.yaml @@ -6,4 +6,4 @@ dataset_name: justice doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:" # TODO: impl. 
exact match for this and deontology metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml index 99b464683f..8960a31934 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml @@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml index 65b95a4f28..5583c1b682 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml @@ -13,4 +13,4 @@ # - metric: acc # TODO: we want this to be implemented as a winograd_schema task type, actually # metadata: -# - version: 1.0 +# version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/virtue.yaml b/lm_eval/tasks/hendrycks_ethics/virtue.yaml index 85bb552660..8c236a983d 100644 --- a/lm_eval/tasks/hendrycks_ethics/virtue.yaml +++ b/lm_eval/tasks/hendrycks_ethics/virtue.yaml @@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente doc_to_target: label doc_to_choice: ['no', 'yes'] metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/ifeval/ifeval.yaml b/lm_eval/tasks/ifeval/ifeval.yaml index bbaaa2f2a2..7913549cea 100644 --- a/lm_eval/tasks/ifeval/ifeval.yaml +++ b/lm_eval/tasks/ifeval/ifeval.yaml @@ -26,4 +26,4 @@ metric_list: aggregation: !function utils.agg_inst_level_acc higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada/lambada_openai.yaml b/lm_eval/tasks/lambada/lambada_openai.yaml index d9a9ccc37d..2fcccbd59f 100644 --- a/lm_eval/tasks/lambada/lambada_openai.yaml +++ b/lm_eval/tasks/lambada/lambada_openai.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada/lambada_standard.yaml b/lm_eval/tasks/lambada/lambada_standard.yaml index 3521053e50..900e181163 100644 --- a/lm_eval/tasks/lambada/lambada_standard.yaml +++ b/lm_eval/tasks/lambada/lambada_standard.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml b/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml index 42aea6de7d..d25e26d9ef 100644 --- a/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml +++ b/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml b/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml index 3e412d63d1..7cde8fdebc 100644 --- a/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml +++ b/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml b/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml index 768b0dd7e3..7e63a6d1bc 100644 --- a/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml +++ b/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 
+ version: 1.0 diff --git a/lm_eval/tasks/logiqa/logiqa.yaml b/lm_eval/tasks/logiqa/logiqa.yaml index 912de6342f..181ef4d8c7 100644 --- a/lm_eval/tasks/logiqa/logiqa.yaml +++ b/lm_eval/tasks/logiqa/logiqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml index f0552b7c5b..f2593beb77 100644 --- a/lm_eval/tasks/logiqa2/logieval.yaml +++ b/lm_eval/tasks/logiqa2/logieval.yaml @@ -24,4 +24,4 @@ filter_list: regex_pattern: "^\\s*([A-D])" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/logiqa2/logiqa2.yaml b/lm_eval/tasks/logiqa2/logiqa2.yaml index 568692b01a..0bcd97b131 100644 --- a/lm_eval/tasks/logiqa2/logiqa2.yaml +++ b/lm_eval/tasks/logiqa2/logiqa2.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mathqa/mathqa.yaml b/lm_eval/tasks/mathqa/mathqa.yaml index 73439072af..e37ba11807 100644 --- a/lm_eval/tasks/mathqa/mathqa.yaml +++ b/lm_eval/tasks/mathqa/mathqa.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/mc_taco/default.yaml b/lm_eval/tasks/mc_taco/default.yaml index e3708e3224..16aee3f7e7 100644 --- a/lm_eval/tasks/mc_taco/default.yaml +++ b/lm_eval/tasks/mc_taco/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/mgsm/direct/direct_yaml b/lm_eval/tasks/mgsm/direct/direct_yaml index 58af06d5d1..6cb89f90e8 100644 --- a/lm_eval/tasks/mgsm/direct/direct_yaml +++ b/lm_eval/tasks/mgsm/direct/direct_yaml @@ -26,4 +26,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mgsm/en_cot/cot_yaml b/lm_eval/tasks/mgsm/en_cot/cot_yaml index ec7937860d..a6307e3d7e 100644 --- a/lm_eval/tasks/mgsm/en_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml @@ -28,4 +28,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mgsm/native_cot/cot_yaml b/lm_eval/tasks/mgsm/native_cot/cot_yaml index 4d4e6fb380..e6f96160aa 100644 --- a/lm_eval/tasks/mgsm/native_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml @@ -28,4 +28,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml index 8dc9b34a38..65b5a6442f 100644 --- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml +++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -21,4 +21,4 @@ metric_list: higher_is_better: true num_fewshot: 0 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index e2b54acf29..37e8bb1649 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -12,4 +12,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml 
b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 0f75fa3001..87662b3c17 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml index ae8214d859..c2c1ff67f1 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml index 863d85cc87..d480001132 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index 5828843577..4bd5e44e45 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml index 78ce72cfd8..6409360bdc 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml @@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/persona/_template_yaml b/lm_eval/tasks/model_written_evals/persona/_template_yaml index fc4babb2b6..5702df8e0b 100644 --- a/lm_eval/tasks/model_written_evals/persona/_template_yaml +++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml @@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml index f06c9959f8..303e33906a 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml index 26d2e3f6b1..2339894b1e 100644 --- 
a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml index f96fbde1f1..c7772c1d67 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml index 0dfa3d3163..6b16788923 100644 --- a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml +++ b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml @@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mutual/mutual.yaml b/lm_eval/tasks/mutual/mutual.yaml index dae7b374f7..f313010182 100644 --- a/lm_eval/tasks/mutual/mutual.yaml +++ b/lm_eval/tasks/mutual/mutual.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/nq_open/nq_open.yaml b/lm_eval/tasks/nq_open/nq_open.yaml index 4051c0630c..99ee83327a 100644 --- a/lm_eval/tasks/nq_open/nq_open.yaml +++ b/lm_eval/tasks/nq_open/nq_open.yaml @@ -29,4 +29,4 @@ metric_list: regexes_to_ignore: - "\ban|a|the\b" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/openbookqa/openbookqa.yaml b/lm_eval/tasks/openbookqa/openbookqa.yaml index 401bb03fd3..bdfcd19635 100644 --- a/lm_eval/tasks/openbookqa/openbookqa.yaml +++ b/lm_eval/tasks/openbookqa/openbookqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/paws-x/pawsx_template_yaml b/lm_eval/tasks/paws-x/pawsx_template_yaml index a393f625b8..4756473829 100644 --- a/lm_eval/tasks/paws-x/pawsx_template_yaml +++ b/lm_eval/tasks/paws-x/pawsx_template_yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/pile/pile_arxiv.yaml b/lm_eval/tasks/pile/pile_arxiv.yaml index 2328665deb..58760cc86e 100644 --- a/lm_eval/tasks/pile/pile_arxiv.yaml +++ b/lm_eval/tasks/pile/pile_arxiv.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: bits_per_byte higher_is_better: false metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/piqa/piqa.yaml b/lm_eval/tasks/piqa/piqa.yaml index 23a523ebfa..5a07250ab9 100644 --- a/lm_eval/tasks/piqa/piqa.yaml +++ b/lm_eval/tasks/piqa/piqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/polemo2/polemo2_in.yaml b/lm_eval/tasks/polemo2/polemo2_in.yaml index c1da822bc3..6fc02fc908 100644 --- a/lm_eval/tasks/polemo2/polemo2_in.yaml +++ b/lm_eval/tasks/polemo2/polemo2_in.yaml @@ -42,4 +42,4 @@ 
metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/prost/corypaik_prost.yaml b/lm_eval/tasks/prost/corypaik_prost.yaml index b6f3e60e05..adf7a8d232 100644 --- a/lm_eval/tasks/prost/corypaik_prost.yaml +++ b/lm_eval/tasks/prost/corypaik_prost.yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/pubmedqa/pubmedqa.yaml b/lm_eval/tasks/pubmedqa/pubmedqa.yaml index 9d2d19606a..47de2fa098 100644 --- a/lm_eval/tasks/pubmedqa/pubmedqa.yaml +++ b/lm_eval/tasks/pubmedqa/pubmedqa.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml index 5e585b5927..b9ceb78094 100644 --- a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml +++ b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qasper/bool.yaml b/lm_eval/tasks/qasper/bool.yaml index 3446121944..468da5c6d1 100644 --- a/lm_eval/tasks/qasper/bool.yaml +++ b/lm_eval/tasks/qasper/bool.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["no", "yes"] metric_list: - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qasper/freeform.yaml b/lm_eval/tasks/qasper/freeform.yaml index 0d9e8f94da..248aede8b4 100644 --- a/lm_eval/tasks/qasper/freeform.yaml +++ b/lm_eval/tasks/qasper/freeform.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/race/race.yaml b/lm_eval/tasks/race/race.yaml index 04ffaef4b2..56707fbf15 100644 --- a/lm_eval/tasks/race/race.yaml +++ b/lm_eval/tasks/race/race.yaml @@ -11,4 +11,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml b/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml index 5053792670..658c6cdba3 100644 --- a/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml +++ b/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml @@ -14,4 +14,4 @@ generation_kwargs: do_sample: false temperature: 0.0 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/sciq/sciq.yaml b/lm_eval/tasks/sciq/sciq.yaml index d7ed2eacfb..926d66b180 100644 --- a/lm_eval/tasks/sciq/sciq.yaml +++ b/lm_eval/tasks/sciq/sciq.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/siqa/default.yml b/lm_eval/tasks/siqa/default.yml index f31929b5bc..35b14599d6 100644 --- a/lm_eval/tasks/siqa/default.yml +++ b/lm_eval/tasks/siqa/default.yml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/storycloze/storycloze_2016.yaml b/lm_eval/tasks/storycloze/storycloze_2016.yaml index e17c7d5f06..df1c2629cb 100644 --- a/lm_eval/tasks/storycloze/storycloze_2016.yaml +++ b/lm_eval/tasks/storycloze/storycloze_2016.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/boolq/default.yaml b/lm_eval/tasks/super_glue/boolq/default.yaml index bb63aa3f2e..f26e4682c4 100644 --- a/lm_eval/tasks/super_glue/boolq/default.yaml 
+++ b/lm_eval/tasks/super_glue/boolq/default.yaml @@ -14,4 +14,4 @@ doc_to_decontamination_query: passage metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml index c283d5ea11..569316cb31 100644 --- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml +++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml index 17f626fe03..7089381ad8 100644 --- a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/cb/default.yaml b/lm_eval/tasks/super_glue/cb/default.yaml index 6c333b6d38..c575e9872a 100644 --- a/lm_eval/tasks/super_glue/cb/default.yaml +++ b/lm_eval/tasks/super_glue/cb/default.yaml @@ -14,4 +14,4 @@ metric_list: - metric: f1 aggregation: !function "aggregate.cb_multi_fi" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml index 2a6130dba5..984e17935a 100644 --- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: !function "t5_utils.agg_mean_3class_f1" higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/copa/default.yaml b/lm_eval/tasks/super_glue/copa/default.yaml index 2efb6070ae..1af5dbf472 100644 --- a/lm_eval/tasks/super_glue/copa/default.yaml +++ b/lm_eval/tasks/super_glue/copa/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: !function utils.doc_to_choice metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml index 6c8f52a729..20a90db98d 100644 --- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/multirc/default.yaml b/lm_eval/tasks/super_glue/multirc/default.yaml index 7489d0679b..5a388299f6 100644 --- a/lm_eval/tasks/super_glue/multirc/default.yaml +++ b/lm_eval/tasks/super_glue/multirc/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: "['''{{answer}}\\nIs the answer correct? 
yes''', '''{{answer}}\\n metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml index 442a345075..927a357158 100644 --- a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: !function t5_utils.agg_em higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/record/default.yaml b/lm_eval/tasks/super_glue/record/default.yaml index ff9a823b32..54f871c9d5 100644 --- a/lm_eval/tasks/super_glue/record/default.yaml +++ b/lm_eval/tasks/super_glue/record/default.yaml @@ -17,4 +17,4 @@ metric_list: higher_is_better: True aggregation: mean metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/record/t5-prompt.yaml b/lm_eval/tasks/super_glue/record/t5-prompt.yaml index 356d922170..c999bc9030 100644 --- a/lm_eval/tasks/super_glue/record/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/record/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: !function t5_utils.squad_f1_agg higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/rte/default.yaml b/lm_eval/tasks/super_glue/rte/default.yaml index d77ede0725..6754af1a1e 100644 --- a/lm_eval/tasks/super_glue/rte/default.yaml +++ b/lm_eval/tasks/super_glue/rte/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['True', 'False'] metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml index 389450777f..9e80686e2a 100644 --- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/wic/default.yaml b/lm_eval/tasks/super_glue/wic/default.yaml index 7e53ab4280..0f86855a78 100644 --- a/lm_eval/tasks/super_glue/wic/default.yaml +++ b/lm_eval/tasks/super_glue/wic/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml index 79bc518f93..3a0dbb2f7f 100644 --- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/wsc/default.yaml b/lm_eval/tasks/super_glue/wsc/default.yaml index 0e93ad09f2..b9c7ec347c 100644 --- a/lm_eval/tasks/super_glue/wsc/default.yaml +++ b/lm_eval/tasks/super_glue/wsc/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml index 01183727d1..5e18acbbfb 100644 --- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml @@ -20,4 +20,4 @@ filter_list: filter: - function: !function t5_utils.WSCPostprocess metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/swag/swag.yaml b/lm_eval/tasks/swag/swag.yaml index dab13f10dc..13e30566ea 100644 --- a/lm_eval/tasks/swag/swag.yaml +++ b/lm_eval/tasks/swag/swag.yaml @@ -16,4 +16,4 @@ 
metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/toxigen/toxigen.yaml b/lm_eval/tasks/toxigen/toxigen.yaml index 691376e7f0..8b840b426d 100644 --- a/lm_eval/tasks/toxigen/toxigen.yaml +++ b/lm_eval/tasks/toxigen/toxigen.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/translation/wmt_common_yaml b/lm_eval/tasks/translation/wmt_common_yaml index 7ef6a0ea4f..3e3c395ad6 100644 --- a/lm_eval/tasks/translation/wmt_common_yaml +++ b/lm_eval/tasks/translation/wmt_common_yaml @@ -14,4 +14,4 @@ generation_kwargs: temperature: 0.0 repeats: 1 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/triviaqa/default.yaml b/lm_eval/tasks/triviaqa/default.yaml index dcfcf3ddc0..106c0290cc 100644 --- a/lm_eval/tasks/triviaqa/default.yaml +++ b/lm_eval/tasks/triviaqa/default.yaml @@ -28,4 +28,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml index afad7eab3b..2a1e6108f1 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml @@ -76,4 +76,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml index 9ae8092b69..d9d3a696aa 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml @@ -33,4 +33,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml index 45a1ef293b..0599b9d6be 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml @@ -10,4 +10,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml index c41c225eef..b6a123ec98 100644 --- a/lm_eval/tasks/unscramble/anagrams1.yaml +++ b/lm_eval/tasks/unscramble/anagrams1.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml index 72a3cb39db..fea6e11006 100644 --- a/lm_eval/tasks/unscramble/anagrams2.yaml +++ b/lm_eval/tasks/unscramble/anagrams2.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml index d86e6fdb99..063c7d3f0c 100644 --- a/lm_eval/tasks/unscramble/cycle_letters.yaml +++ b/lm_eval/tasks/unscramble/cycle_letters.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml index a843c9f494..7b08b8330f 100644 --- a/lm_eval/tasks/unscramble/random_insertion.yaml +++ b/lm_eval/tasks/unscramble/random_insertion.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 
diff --git a/lm_eval/tasks/unscramble/reversed_words.yaml b/lm_eval/tasks/unscramble/reversed_words.yaml index 9a909bb0cd..0c698c3dc9 100644 --- a/lm_eval/tasks/unscramble/reversed_words.yaml +++ b/lm_eval/tasks/unscramble/reversed_words.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/webqs/webqs.yaml b/lm_eval/tasks/webqs/webqs.yaml index 2490944ea1..32893edfb1 100644 --- a/lm_eval/tasks/webqs/webqs.yaml +++ b/lm_eval/tasks/webqs/webqs.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/wikitext/wikitext.yaml b/lm_eval/tasks/wikitext/wikitext.yaml index 06b7d981e9..c31d920dde 100644 --- a/lm_eval/tasks/wikitext/wikitext.yaml +++ b/lm_eval/tasks/wikitext/wikitext.yaml @@ -15,4 +15,4 @@ metric_list: - metric: byte_perplexity - metric: bits_per_byte metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/winogrande/default.yaml b/lm_eval/tasks/winogrande/default.yaml index 1927059905..213f0727fe 100644 --- a/lm_eval/tasks/winogrande/default.yaml +++ b/lm_eval/tasks/winogrande/default.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml index f6d7a9230a..aa14b66413 100644 --- a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml +++ b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml @@ -16,4 +16,4 @@ metric_list: aggregation: !function metrics.agg_bleu higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/wsc273/default.yaml b/lm_eval/tasks/wsc273/default.yaml index 8584c49502..c6f7335700 100644 --- a/lm_eval/tasks/wsc273/default.yaml +++ b/lm_eval/tasks/wsc273/default.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xcopa/default_et.yaml b/lm_eval/tasks/xcopa/default_et.yaml index 4484f61803..9f2b0b73b5 100644 --- a/lm_eval/tasks/xcopa/default_et.yaml +++ b/lm_eval/tasks/xcopa/default_et.yaml @@ -11,4 +11,4 @@ doc_to_choice: !function utils.doc_to_choice metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xnli/xnli_common_yaml b/lm_eval/tasks/xnli/xnli_common_yaml index f76b39f5bd..0201459d35 100644 --- a/lm_eval/tasks/xnli/xnli_common_yaml +++ b/lm_eval/tasks/xnli/xnli_common_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xstorycloze/default_ar.yaml b/lm_eval/tasks/xstorycloze/default_ar.yaml index 1718863bf4..2a52966d5a 100644 --- a/lm_eval/tasks/xstorycloze/default_ar.yaml +++ b/lm_eval/tasks/xstorycloze/default_ar.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xwinograd/xwinograd_common_yaml b/lm_eval/tasks/xwinograd/xwinograd_common_yaml index 2e22d706e0..86554820e9 100644 --- a/lm_eval/tasks/xwinograd/xwinograd_common_yaml +++ b/lm_eval/tasks/xwinograd/xwinograd_common_yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 From 84790e9931dfc453bab4c0dc17c2f69d6bdce6aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20B=C3=A4uerle?= Date: Thu, 21 Dec 2023 16:37:34 +0100 Subject: [PATCH 09/25] update 
Zeno example and reference in README (#1190) --- README.md | 2 ++ examples/visualize-zeno.ipynb | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 747d47c927..f12946ed8c 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,8 @@ python scripts/zeno_visualize.py \ This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task. +You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb). + ## How to Contribute or Learn More? For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help. diff --git a/examples/visualize-zeno.ipynb b/examples/visualize-zeno.ipynb index 48beeddff4..4ceabbf425 100644 --- a/examples/visualize-zeno.ipynb +++ b/examples/visualize-zeno.ipynb @@ -20,7 +20,8 @@ "outputs": [], "source": [ "# Install this project if you did not already do that. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n", - "!pip install .." + "!pip install -e ..\n", + "!pip install -e ..[zeno]" ] }, { From da0a5e36e238f41ad5d0f3bba921fcfbf21022ca Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Thu, 21 Dec 2023 21:07:34 +0000 Subject: [PATCH 10/25] Remove tokenizer for openai chat completions (#1191) * remove tokenizer for openai chat completions * reordering function * linter * remove tiktoken import --- lm_eval/models/openai_completions.py | 76 ++-------------------------- 1 file changed, 4 insertions(+), 72 deletions(-) diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index d99365a635..63703ddea4 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -3,9 +3,8 @@ import time from collections import defaultdict from importlib.util import find_spec -from typing import List, Literal, Optional, Tuple +from typing import List, Optional, Tuple -import transformers from tqdm import tqdm from lm_eval import utils @@ -360,11 +359,7 @@ def __init__( self, model: str = "gpt-3.5-turbo", # GPT model or Local model using HuggingFace model paths base_url: str = None, - tokenizer_backend: Literal["tiktoken", "huggingface"] = "tiktoken", truncate: bool = False, - revision: Optional[str] = "main", - trust_remote_code: Optional[bool] = False, - use_fast_tokenizer: Optional[bool] = True, **kwargs, ) -> None: """ @@ -381,7 +376,6 @@ def __init__( super().__init__() try: import openai # noqa: E401 - import tiktoken except ModuleNotFoundError: raise Exception( "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. 
\ @@ -389,32 +383,8 @@ def __init__( ) self.model = model self.base_url = base_url - self.tokenizer_backend = tokenizer_backend self.truncate = truncate - # if we have a local model, use HF tokenizer over tiktoken - if self.tokenizer_backend == "huggingface": - self.revision = revision - self.trust_remote_code = trust_remote_code - self.use_fast_tokenizer = use_fast_tokenizer - - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - self.model, - revision=self.revision, - trust_remote_code=self.trust_remote_code, - use_fast_tokenizer=self.use_fast_tokenizer, - ) - self.vocab_size = self.tokenizer.vocab - self.end_of_text_token_id = self.tokenizer.eos_token - elif self.tokenizer_backend == "tiktoken": - self.tokenizer = tiktoken.encoding_for_model(self.model) - self.vocab_size = self.tokenizer.n_vocab - self.end_of_text_token_id = self.tokenizer.eot_token - else: - raise ValueError( - f"Expected tokenizer_backend to be one of ['tiktoken', 'huggingface'] but got {self.tokenizer_backend}" - ) - # Read from environment variable OPENAI_API_KEY # Set to EMPTY for local if self.base_url: @@ -422,10 +392,6 @@ def __init__( else: self.client = openai.OpenAI() # openai.AsyncOpenAI() - @property - def eot_token_id(self): - return self.end_of_text_token_id - @property def max_length(self) -> int: # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token @@ -445,53 +411,19 @@ def device(self): # Isn't used because we override _loglikelihood_tokens raise NotImplementedError() - def tok_encode(self, string: str) -> List[int]: - return self.tokenizer.encode(string) - - def tok_decode(self, tokens: List[int]) -> str: - return self.tokenizer.decode(tokens) - - def _encode_pair( - self, context: str, continuation: str - ) -> Tuple[List[int], List[int]]: - n_spaces = len(context) - len(context.rstrip()) - if n_spaces > 0: - continuation = context[-n_spaces:] + continuation - context = context[:-n_spaces] - whole_enc = self.tok_encode(context + continuation) - context_enc = self.tok_encode(context) - context_enc_len = len(context_enc) - continuation_enc = whole_enc[context_enc_len:] - return context_enc, continuation_enc - def generate_until(self, requests) -> List[str]: res = defaultdict(list) re_ords = {} - def _collate(x): - toks = self.tok_encode(x[0]) - return -len(toks), x[0] - # we group requests by their generation_kwargs, # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling # in the same batch. grouper = utils.Grouper(requests, lambda x: str(x.args[1])) for key, reqs in grouper.get_grouped().items(): # within each set of reqs for given kwargs, we reorder by token length, descending. 
- re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) - - def sameuntil_chunks(xs, size): - ret = [] - lastuntil = xs[0][1] - for x in xs: - if len(ret) >= size or x[1] != lastuntil: - yield ret, lastuntil - ret = [] - lastuntil = x[1] - ret.append(x) - - if ret: - yield ret, lastuntil + re_ords[key] = utils.Reorderer( + [req.args for req in reqs], lambda x: (-len(x[0]), x[0]) + ) pbar = tqdm(total=len(requests), disable=(self.rank != 0)) for key, re_ord in re_ords.items(): From 9267354e651a406a5314e16bcb9bbbe28170f9f4 Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Thu, 21 Dec 2023 21:08:59 +0000 Subject: [PATCH 11/25] Update README.md (#1181) * Update README.md Add a not about running on apple arm gpus * Update README.md * Update README.md --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f12946ed8c..d8f7bc388d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ New updates and features include: Please see our updated documentation pages in `docs/` for more details. -Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](discord.gg/eleutherai)! +Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)! ## Overview @@ -65,7 +65,7 @@ We also provide a number of optional dependencies for extended functionality. Ex ### Hugging Face `transformers` -To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command: +To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. 
GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU): ```bash lm_eval --model hf \ From 09493fd24d5c5947dfae07853b05d19c79bc16a4 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Fri, 22 Dec 2023 02:47:31 +0500 Subject: [PATCH 12/25] disable `mypy` (#1193) --- mypy.ini | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mypy.ini b/mypy.ini index 2d20dd2cc5..76a0c86452 100644 --- a/mypy.ini +++ b/mypy.ini @@ -9,19 +9,21 @@ warn_unused_ignores = True warn_redundant_casts = True # We ignore errors everywhere to gradually add type annotations -# [mypy-lm_eval.*] -# ignore_errors = True -# -# [mypy-lm_eval.api.*] -# ignore_errors = True -# -# [mypy-lm_eval.prompts.*] -# ignore_errors = True -# -# [mypy-lm_eval.models.*] -# ignore_errors = True -# -# [mypy-scripts.*] -# ignore_errors = True -# -# [mypy-main] + +[mypy-lm_eval.*] +ignore_errors = True + +[mypy-lm_eval.api.*] +ignore_errors = True + +[mypy-lm_eval.prompts.*] +ignore_errors = True + +[mypy-lm_eval.models.*] +ignore_errors = True + +[mypy-scripts.*] +ignore_errors = True + +[mypy-main] +ignore_errors = True From 046ea6e242f29d8afa44f5b0a072d065c84a9d7f Mon Sep 17 00:00:00 2001 From: Zach Schillaci Date: Thu, 21 Dec 2023 23:11:18 -0500 Subject: [PATCH 13/25] Generic decorator for handling rate limit errors (#1109) * Add retry error handler * fixup! Add retry error handler * Move to utils.py * Run isort on utils.py * Catch multiple exceptions * Update LMs with exception handler * Fixes to anthropic retry handler * fix callback kwarg * Update textsynth.py * fix python 3.8 incompatibility * fix indenterror I introduced * placate linter? * Update on_exception_callback kwarg name * fixup! Merge branch 'main' into add-retry-error-handler * fixup! fixup! Merge branch 'main' into add-retry-error-handler * Merge conflicts are fun * Run pre-commit --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/models/anthropic_llms.py | 54 +++++++++++++++++----------- lm_eval/models/openai_completions.py | 48 +++++++++++++------------ lm_eval/models/textsynth.py | 27 ++++++++------ lm_eval/utils.py | 44 ++++++++++++++++++++++- 4 files changed, 118 insertions(+), 55 deletions(-) diff --git a/lm_eval/models/anthropic_llms.py b/lm_eval/models/anthropic_llms.py index 6e5b437875..8a3695c3e2 100644 --- a/lm_eval/models/anthropic_llms.py +++ b/lm_eval/models/anthropic_llms.py @@ -1,4 +1,3 @@ -import time from typing import Any, List, Tuple from tqdm import tqdm @@ -6,6 +5,7 @@ from lm_eval import utils from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import retry_on_specific_exceptions eval_logger = utils.eval_logger @@ -48,26 +48,30 @@ def anthropic_completion( please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", ) - backoff_time: float = 3 - while True: - try: - response = client.completions.create( - prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}", - model=model, - # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences - # (e.g. gsm8k's ":") may truncate a lot of the input. 
- stop_sequences=[anthropic.HUMAN_PROMPT] + stop, - max_tokens_to_sample=max_tokens_to_sample, - temperature=temperature, - **kwargs, - ) - return response.completion - except anthropic.RateLimitError as e: - eval_logger.warning( - f"RateLimitError occurred: {e.__cause__}\n Retrying in {backoff_time} seconds" - ) - time.sleep(backoff_time) - backoff_time *= 1.5 + def _exception_callback(e: Exception, sleep_time: float) -> None: + eval_logger.warning( + f"RateLimitError occurred: {e.__cause__}\n Retrying in {sleep_time} seconds" + ) + + @retry_on_specific_exceptions( + on_exceptions=[anthropic.RateLimitError], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + response = client.completions.create( + prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}", + model=model, + # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences + # (e.g. gsm8k's ":") may truncate a lot of the input. + stop_sequences=[anthropic.HUMAN_PROMPT] + stop, + max_tokens_to_sample=max_tokens_to_sample, + temperature=temperature, + **kwargs, + ) + return response.completion + + return completion() @register_model("anthropic") @@ -144,6 +148,14 @@ def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False): raise NotImplementedError("No support for logits.") def generate_until(self, requests) -> List[str]: + try: + import anthropic + except ModuleNotFoundError: + raise Exception( + "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \ +please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`", + ) + if not requests: return [] diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index 63703ddea4..11f2cf33aa 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -1,6 +1,5 @@ import copy import os -import time from collections import defaultdict from importlib.util import find_spec from typing import List, Optional, Tuple @@ -10,6 +9,7 @@ from lm_eval import utils from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import retry_on_specific_exceptions def get_result(response, ctxlen: int) -> Tuple[float, bool]: @@ -53,16 +53,20 @@ def oa_completion(**kwargs): else: import openai - backoff_time = 3 - while True: - try: - return openai.completions.create(**kwargs) - except openai.OpenAIError: - import traceback + def _exception_callback(e: Exception, sleep_time: float) -> None: + import traceback + + traceback.print_exc() + + @retry_on_specific_exceptions( + on_exceptions=[openai.OpenAIError], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + return openai.completions.create(**kwargs) - traceback.print_exc() - time.sleep(backoff_time) - backoff_time *= 1.5 + return completion() @register_model("openai-completions") @@ -337,20 +341,20 @@ def oa_chat_completion(client, **kwargs): else: import openai - async def _get_completions(**kwargs): - chat_completions = await client.chat.completions.create(**kwargs) - return chat_completions + def _exception_callback(e: Exception, sleep_time: float) -> None: + import traceback - backoff_time = 3 - while True: - try: - return client.chat.completions.create(**kwargs) - except openai.OpenAIError: - import traceback + traceback.print_exc() + + @retry_on_specific_exceptions( + on_exceptions=[openai.OpenAIError], + 
max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + return client.chat.completions.create(**kwargs) - traceback.print_exc() - time.sleep(backoff_time) - backoff_time *= 1.5 + return completion() @register_model("openai-chat-completions", "local-chat-completions") diff --git a/lm_eval/models/textsynth.py b/lm_eval/models/textsynth.py index 32917d692c..a75e6d8d19 100644 --- a/lm_eval/models/textsynth.py +++ b/lm_eval/models/textsynth.py @@ -13,13 +13,13 @@ """ import logging import os -import time import requests as _requests from tqdm import tqdm from lm_eval.api.model import LM from lm_eval.api.registry import register_model +from lm_eval.utils import retry_on_specific_exceptions logger = logging.getLogger(__name__) @@ -29,21 +29,26 @@ def textsynth_completion(**kwargs): """Query TextSynth API for completion. Retry with back-off until they respond. """ - backoff_time = 3 - while True: - try: - return _requests.post(**kwargs) - except _requests.exceptions.RequestException: - import traceback - traceback.print_exc() - time.sleep(backoff_time) - backoff_time *= 1.5 + def _exception_callback(e: Exception, sleep_time: float) -> None: + import traceback + + traceback.print_exc() + + @retry_on_specific_exceptions( + on_exceptions=[_requests.exceptions.RequestException], + max_retries=None, # retry forever, consider changing + on_exception_callback=_exception_callback, + ) + def completion(): + return _requests.post(**kwargs) + + return completion() @register_model("textsynth") class TextSynthLM(LM): - def __init__(self, engine, truncate: bool = False) -> None: + def __init__(self, engine, truncate: bool = False, **kwargs) -> None: """ :param engine: str TextSynth API engine (e.g. `gptj_6B`) diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 74f4f482da..29d6947a2c 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -10,8 +10,10 @@ import re import subprocess import sys +import time +from functools import wraps from itertools import islice -from typing import Any, Callable, Iterator, List, Literal, Union +from typing import Any, Callable, Iterator, List, Literal, Optional, Type, Union import torch import transformers @@ -714,3 +716,43 @@ def divide(iterable, n) -> List[Iterator]: ret.append(iter(seq[start:stop])) return ret + + +def retry_on_specific_exceptions( + on_exceptions: List[Type[Exception]], + max_retries: Optional[int] = None, + backoff_time: float = 3.0, + backoff_multiplier: float = 1.5, + on_exception_callback: Optional[Callable[[Exception, float], Any]] = None, +): + """Retry on an LLM Provider's rate limit error with exponential backoff + For example, to use for OpenAI, do the following: + ``` + from openai import RateLimitError + + # Recommend specifying max_retries to avoid infinite loops! + @retry_on_specific_exceptions([RateLimitError], max_retries=3) + def completion(...): + # Wrap OpenAI completion function here + ... 
+ ``` + """ + + def decorator(func: Callable): + @wraps(func) + def wrapper(*args, **kwargs): + sleep_time = backoff_time + attempt = 0 + while max_retries is None or attempt < max_retries: + try: + return func(*args, **kwargs) + except tuple(on_exceptions) as e: + if on_exception_callback is not None: + on_exception_callback(e, sleep_time) + time.sleep(sleep_time) + sleep_time *= backoff_multiplier + attempt += 1 + + return wrapper + + return decorator From 25cefbc1b37a9c4d10acab51a9ec283f2ff34772 Mon Sep 17 00:00:00 2001 From: Bram Vanroy <2779410+BramVanroy@users.noreply.github.com> Date: Fri, 22 Dec 2023 10:25:02 +0100 Subject: [PATCH 14/25] Refer in README to main branch (#1200) --- README.md | 2 +- docs/new_task_guide.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d8f7bc388d..2f4fb02fa2 100644 --- a/README.md +++ b/README.md @@ -273,7 +273,7 @@ You can find an example of this workflow in [examples/visualize-zeno.ipynb](exam ## How to Contribute or Learn More? -For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/big-refactor/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help. +For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help. ### Implementing new tasks diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index cfcf0e4d98..b6be316284 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -359,4 +359,4 @@ It is recommended to include a filled-out copy of this checklist in the README.m ## Submitting your task -You're all set! Now push your work and make a pull request to the `big-refactor` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord! +You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord! 
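
For context on how the `retry_on_specific_exceptions` helper introduced in PATCH 13 above is meant to be used outside the bundled API backends, a minimal sketch follows. `FakeRateLimitError`, `flaky_call`, and the bounded `max_retries=3` / short `backoff_time` are illustrative stand-ins, not part of the patch; the library's own call sites pass real provider exceptions (e.g. `anthropic.RateLimitError`) and retry forever.

```python
# Sketch only: exercising lm_eval.utils.retry_on_specific_exceptions (added in PATCH 13).
from lm_eval.utils import retry_on_specific_exceptions


class FakeRateLimitError(Exception):
    """Hypothetical error type standing in for a provider's rate-limit exception."""


def _log_retry(e: Exception, sleep_time: float) -> None:
    # matches the (Exception, float) callback signature the decorator expects
    print(f"{type(e).__name__} raised; retrying in {sleep_time:.1f}s")


_calls = {"n": 0}


@retry_on_specific_exceptions(
    on_exceptions=[FakeRateLimitError],
    max_retries=3,          # bounded here; the library call sites use None (retry forever)
    backoff_time=0.1,       # short backoff so the sketch runs quickly
    backoff_multiplier=1.5,
    on_exception_callback=_log_retry,
)
def flaky_call() -> str:
    # fails twice, then succeeds, to show the retry loop in action
    _calls["n"] += 1
    if _calls["n"] < 3:
        raise FakeRateLimitError("simulated 429")
    return f"ok after {_calls['n']} attempts"


if __name__ == "__main__":
    print(flaky_call())
```
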
From b69ca72ec3a0294638382e0f90cf32f90d761b44 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 22 Dec 2023 08:57:11 -0500 Subject: [PATCH 15/25] Update minerva_math_algebra.yaml (#1189) From 5503b274e0b3f494ee13d740e29833330436ea03 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 22 Dec 2023 10:00:07 -0500 Subject: [PATCH 16/25] Upstream Mamba Support (`mamba_ssm`) (#1110) * modularize HFLM code * pass through extra kwargs to AutoModel.from_pretrained call * remove explicit model_kwargs * rename gptq -> autogptq * fix tokenizer pad token errors * ensure model always respects device_map and autogptq's selected devices * add a _get_config helper fn * add mambaLMWrapper * add mamba extra * add mamba extra * fix conditional import * Fix botched merge commit * Remove beginning-of-file comment for consistency * Add docstring for mambaLM re: supported kwargs * Alphabetize extras * Update extras table * appease precommit * run precommit on mamba_lm --- README.md | 15 +++-- lm_eval/models/__init__.py | 1 + lm_eval/models/mamba_lm.py | 125 +++++++++++++++++++++++++++++++++++++ pyproject.toml | 28 +++++---- 4 files changed, 152 insertions(+), 17 deletions(-) create mode 100644 lm_eval/models/mamba_lm.py diff --git a/README.md b/README.md index 2f4fb02fa2..4709fb6738 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,20 @@ We also provide a number of optional dependencies for extended functionality. Ex | Name | Use | |---------------|---------------------------------------| | anthropic | For using Anthropic's models | +| dev | For linting PRs and contributions | | gptq | For loading models with GPTQ | -| dev | You probably don't want to use this | +| ifeval | For running the IFEval task | +| mamba | For loading Mamba SSM models | +| math | For running math task answer checking | | multilingual | For multilingual tokenizers | | openai | For using OpenAI's models | -| promptsource | For using PromtSource prompts | +| promptsource | For using PromptSource prompts | | sentencepiece | For using the sentencepiece tokenizer | +| testing | For running library test suite | | vllm | For loading models with vLLM | | zeno | For visualizing results with Zeno | -| all | Loads all extras | +|---------------|---------------------------------------| +| all | Loads all extras (not recommended) | ## Basic Usage @@ -162,7 +167,6 @@ lm_eval --model local-chat-completions --tasks gsm8k --model_args model=facebook ``` Note that for externally hosted models, configs such as `--device` and `--batch_size` should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support. - | API or Inference Server | Implemented? 
| `--model ` name | Models supported: | Request Types: | |---------------------------------------------------------------------------------------------------------------------------|---------------------------------|---------------------------------------------------------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------------| | OpenAI Completions | :heavy_check_mark: | `openai-completions` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | @@ -172,7 +176,8 @@ Note that for externally hosted models, configs such as `--device` and `--batch_ | Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| Your local inference server! | :heavy_check_mark: | `local-chat-completions` (using `openai-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface | `generate_until` | | ... | +| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| Your local inference server! | :heavy_check_mark: | `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface | `generate_until` | | ... | It is on our roadmap to create task variants designed to enable models which do not serve logprobs/loglikelihoods to be compared with generation performance of open-source models. diff --git a/lm_eval/models/__init__.py b/lm_eval/models/__init__.py index 201bff3ba6..f994bdebf5 100644 --- a/lm_eval/models/__init__.py +++ b/lm_eval/models/__init__.py @@ -5,5 +5,6 @@ from . import anthropic_llms from . import gguf from . import vllm_causallms +from . 
import mamba_lm # TODO: implement __all__ diff --git a/lm_eval/models/mamba_lm.py b/lm_eval/models/mamba_lm.py new file mode 100644 index 0000000000..fc7769fd59 --- /dev/null +++ b/lm_eval/models/mamba_lm.py @@ -0,0 +1,125 @@ +from typing import Optional, Union + +import torch + +from lm_eval import utils +from lm_eval.api.registry import register_model +from lm_eval.models.huggingface import HFLM + + +@register_model("mamba_ssm") +class MambaLMWrapper(HFLM): + def __init__( + self, + pretrained="state-spaces/mamba-130m", + **kwargs, + ) -> None: + """ + Mamba (via the `mamba_ssm` package) supports the following args: + ``` + d_model: int, + n_layer: int, + vocab_size: int, + initializer_cfg=None, + pad_vocab_size_multiple: int = 1, + ssm_cfg=None, + norm_epsilon: float = 1e-5, + rms_norm: bool = False, + initializer_cfg=None, + fused_add_norm=False, + residual_in_fp32=False, + ``` + + See https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py#L175 for more info. + The above can all be passed via `--model_args` or to this __init__() directly + but we recommend placing many of these within the config.json file uploaded alongside your + Mamba model to the HF Hub instead. + All other HuggingFace from_pretrained() kwargs + such as those related to + `parallelize=True`, PEFT, autoGPTQ, + or any sub-configurations of these advanced args, + are unsupported by the `mamba_ssm` package. + + The HFLM arguments + + `backend`, `revision`, `subfolder`, `tokenizer`, `truncation`, `max_length`, + `device`, `dtype`, `batch_size`, `max_batch_size`, `trust_remote_code`, `use_fast_tokenizer` + + Are all supported by Mamba where they do not conflict + with Mamba-specific restrictions such as causal LMs only. + """ + + if "backend" in kwargs: + # mamba currently only supports causal models + assert kwargs["backend"] == "causal" + + super().__init__( + pretrained=pretrained, + # set appropriate defaults for tokenizer, max length, etc + backend=kwargs.get("backend", "causal"), + tokenizer=kwargs.get("tokenizer", "EleutherAI/gpt-neox-20b"), + max_length=kwargs.get("max_length", 2048), + **kwargs, + ) + + def _get_config( + self, + pretrained: str, + **kwargs, + ) -> None: + try: + from mamba_ssm.utils.hf import load_config_hf # noqa: F811 + except ModuleNotFoundError: + raise Exception( + "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \ +please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`", + ) + + self._config = load_config_hf(pretrained) + + def _create_model( + self, + pretrained: str, + dtype: Optional[Union[str, torch.dtype]] = "float16", + # no `parallelize=True` options + # no PEFT and quantization options + # Mamba does not support arbitrary HF from_pretrained() args + **kwargs, + ) -> None: + try: + from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811 + except ModuleNotFoundError: + raise Exception( + "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. 
\ +please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`", + ) + + self._model = MambaLMHeadModel.from_pretrained( + pretrained, + device=self._device, + dtype=torch.float16 if dtype == "auto" else utils.get_dtype(dtype), + **kwargs, + ) + + def _model_generate(self, context, max_length, stop, **generation_kwargs): + for key in ("do_sample", "attention_mask"): + if key in generation_kwargs: + generation_kwargs.pop(key) + + # mamba's custom GenerationMixin currently does not support + # passing stopping criteria. + # for the time being, we simply generate to max length, + # then truncate (equivalent result) + # -- this should be revisited to speed up generation + # stopping_criteria = stop_sequences_criteria( + # self.tokenizer, stop, 1, context.shape[0] + # ) + + return self.model.generate( + input_ids=context, + max_length=max_length, + # stopping_criteria=stopping_criteria, + # pad_token_id=self.tokenizer.pad_token_id, + # use_cache=True, + **generation_kwargs, + ) diff --git a/pyproject.toml b/pyproject.toml index 87eefc72d3..42edbb61db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,31 +54,35 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness" Repository = "https://github.com/EleutherAI/lm-evaluation-harness" [project.optional-dependencies] +anthropic = ["anthropic"] dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"] -multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] +gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"] +ifeval = ["langdetect", "immutabledict"] +mamba = ["mamba_ssm", "causal-conv1d==1.0.2"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] -sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] +multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] +openai = ["openai==1.3.9", "tiktoken"] promptsource = [ "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource" ] -gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"] -anthropic = ["anthropic"] -openai = ["openai==1.3.9", "tiktoken"] +sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] +testing = ["pytest", "pytest-cov", "pytest-xdist"] vllm = ["vllm"] -ifeval = ["langdetect", "immutabledict"] zeno = ["pandas", "zeno-client"] all = [ + "lm_eval[anthropic]", "lm_eval[dev]", - "lm_eval[testing]", + "lm_eval[gptq]", + "lm_eval[ifeval]", "lm_eval[linting]", + "lm_eval[mamba]", + "lm_eval[math]", "lm_eval[multilingual]", - "lm_eval[sentencepiece]", - "lm_eval[promptsource]", - "lm_eval[gptq]", - "lm_eval[anthropic]", "lm_eval[openai]", + "lm_eval[promptsource]", + "lm_eval[sentencepiece]", + "lm_eval[testing]", "lm_eval[vllm]", - "lm_eval[ifeval]", "lm_eval[zeno]", ] From 8286b1d99289c62b059150e44455c9dcec993dff Mon Sep 17 00:00:00 2001 From: Anjor Kanekar Date: Fri, 22 Dec 2023 19:37:46 +0000 Subject: [PATCH 17/25] Fixes https://github.com/EleutherAI/lm-evaluation-harness/issues/437 (#1180) --- lm_eval/models/huggingface.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index dc243a1a5c..64d24ca5f9 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -142,9 +142,7 @@ def __init__( + [f"cuda:{i}" for i in range(torch.cuda.device_count())] + ["mps", "mps:0"] ) - if device: - if device not in device_list: - device = int(device) + if device and device in device_list: self._device = torch.device(device) eval_logger.info(f"Using 
device '{device}'") if device in ("mps", "mps:0") and version.parse( From b12bb1d4c0718a3f001fc49b91a46d261921775d Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Sat, 23 Dec 2023 07:43:31 -0500 Subject: [PATCH 18/25] Fix documentation in API table (#1203) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4709fb6738..edbf39b402 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,7 @@ Note that for externally hosted models, configs such as `--device` and `--batch_ | Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) | | Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | -| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | +| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood`, (perplexity evaluation not yet implemented) | | vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` | | Your local inference server! | :heavy_check_mark: | `local-chat-completions` (using `openai-chat-completions` model type) | Any server address that accepts GET requests using HF models and mirror's OpenAI's ChatCompletions interface | `generate_until` | | ... 
| From 9fb2ebababd78986f445cbe10e97c499213c407a Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Sat, 23 Dec 2023 22:58:50 +0500 Subject: [PATCH 19/25] Consolidate batching (#1197) * refactor dataloader * cleanup + add docs * change arg * renamed Collator and added testing * parametrized test for Collator * appease pre-commit * added edge case batch 0 (no batching) * fix typos --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/models/huggingface.py | 211 +++++++++++++++++----------------- lm_eval/utils.py | 167 ++++++++++++++++++++++++++- tests/test_utils.py | 77 ++++++++++++- 3 files changed, 346 insertions(+), 109 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 64d24ca5f9..565b20177f 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -1,6 +1,5 @@ import copy import os -from collections import defaultdict from pathlib import Path from typing import List, Literal, Optional, Tuple, Union @@ -21,7 +20,7 @@ from lm_eval.api.instance import Instance from lm_eval.api.model import LM from lm_eval.api.registry import register_model -from lm_eval.utils import stop_sequences_criteria +from lm_eval.utils import Collator, stop_sequences_criteria eval_logger = utils.eval_logger @@ -632,7 +631,7 @@ def tok_batch_encode( padding_side: str = "left", left_truncate_len: int = None, truncation: bool = False, - ) -> Tuple[List[int], List[int]]: + ) -> Tuple[torch.Tensor, torch.Tensor]: # encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode. old_padding_side = self.tokenizer.padding_side self.tokenizer.padding_side = padding_side @@ -842,6 +841,7 @@ def _loglikelihood_tokens( res = [] def _collate(x): + """Defines the key for the sorted method""" # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning # - to know the size of a batch when going through the list, you know the first one is always the batch @@ -852,26 +852,27 @@ def _collate(x): toks = x[1] + x[2] return -len(toks), tuple(toks) - re_ord = utils.Reorderer(requests, _collate) + re_ord = Collator(requests, sort_fn=_collate) - n_reordered_requests = len(re_ord.get_reordered()) # automatic (variable) batch size detection for vectorization # pull longest context sample from request - - chunks = utils.chunks( - re_ord.get_reordered(), - n=self.batch_size + n_reordered_requests = len(re_ord) + batch_size = ( + self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None - else 0, - fn=self._batch_scheduler + else 0 + ) + batch_fn = ( + self._batch_scheduler if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs - else None, + else None ) + chunks = re_ord.get_batched(n=batch_size, batch_fn=batch_fn) pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0))) for chunk in chunks: inps = [] @@ -1015,10 +1016,10 @@ def _collate(x): return re_ord.get_original(res) def generate_until(self, requests: List[Instance]) -> List[str]: - res = defaultdict(list) - re_ords = {} + res = [] def _collate(x): + """Defines the key for the sorted method""" # the negative sign on len(toks) sorts descending - this has a few advantages: # - time estimates will always be over not underestimates, which is more useful for planning # - to know the size of a batch when going through the 
list, you know the first one is always the batch @@ -1028,14 +1029,6 @@ def _collate(x): toks = self.tok_encode(x[0]) return -len(toks), x[0] - # we group requests by their generation_kwargs, - # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling - # in the same batch. - grouper = utils.Grouper(requests, lambda x: str(x.args[1])) - for key, reqs in grouper.get_grouped().items(): - # within each set of reqs for given kwargs, we reorder by token length, descending. - re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate) - pbar = tqdm(total=len(requests), disable=(self.rank != 0)) if self.batch_size == "auto": # using rolling window with maximum context @@ -1044,98 +1037,102 @@ def _collate(x): print(f"Determined Largest batch size: {batch_size}") adaptive_batch_size = batch_size # for each different set of kwargs, we execute all requests, by batch. - for key, re_ord in re_ords.items(): - chunks = utils.chunks( - re_ord.get_reordered(), - n=self.batch_size - if self.batch_size != "auto" - else adaptive_batch_size - if adaptive_batch_size is not None - else 0, - fn=self._batch_scheduler - if self.batch_size == "auto" and not adaptive_batch_size - else None, - ) - for chunk in chunks: - contexts, all_gen_kwargs = zip(*chunk) - # we assume all gen kwargs in the batch are the same - # this is safe to assume because the `grouper` object ensures it. - gen_kwargs = all_gen_kwargs[0] - # unpack our keyword arguments. - until = None - if isinstance(gen_kwargs, dict): - kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 - if "until" in kwargs.keys(): - until = kwargs.pop("until") - if isinstance(until, str): - until = [kwargs] - elif not isinstance(until, list): - raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" - ) - else: - raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {kwargs}" - ) - if not until: - until = [self.tok_decode(self.eot_token_id)] - if "max_gen_toks" in kwargs.keys(): - max_gen_toks = kwargs.pop("max_gen_toks") - else: - max_gen_toks = self.max_gen_toks + batch_size = ( + self.batch_size + if self.batch_size != "auto" + else adaptive_batch_size + if adaptive_batch_size is not None + else 0 + ) + batch_fn = ( + self._batch_scheduler + if self.batch_size == "auto" and not adaptive_batch_size + else None + ) - # set the max length in tokens of inputs ("context_enc") - if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - # max len for inputs = max length, minus room to generate the max new tokens - max_ctx_len = self.max_length - max_gen_toks - elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - # max len for inputs = encoder's whole max_length - max_ctx_len = self.max_length - - # encode, pad, and truncate contexts for this batch - context_enc, attn_masks = self.tok_batch_encode( - contexts, - left_truncate_len=max_ctx_len, - truncation=self.truncation, - ) - context_enc = context_enc.to(self.device) - attn_masks = attn_masks.to(self.device) - - if "max_length" not in kwargs: - kwargs["max_length"] = context_enc.shape[1] + max_gen_toks - - # perform batched generation - cont = self._model_generate( - context=context_enc, - attention_mask=attn_masks, - stop=until, - **kwargs, + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. 
+ re_ords = Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=batch_size, batch_fn=batch_fn) + for chunk in chunks: + contexts, all_gen_kwargs = zip(*chunk) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + # unpack our keyword arguments. + until = None + if isinstance(gen_kwargs, dict): + kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if "until" in kwargs.keys(): + until = kwargs.pop("until") + if isinstance(until, str): + until = [kwargs] + elif not isinstance(until, list): + raise ValueError( + f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" + ) + else: + raise ValueError( + f"Expected `kwargs` to be of type `dict` but got {kwargs}" ) + if not until: + until = [self.tok_decode(self.eot_token_id)] + if "max_gen_toks" in kwargs.keys(): + max_gen_toks = kwargs.pop("max_gen_toks") + else: + max_gen_toks = self.max_gen_toks + + # set the max length in tokens of inputs ("context_enc") + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + # max len for inputs = max length, minus room to generate the max new tokens + max_ctx_len = self.max_length - max_gen_toks + elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: + # max len for inputs = encoder's whole max_length + max_ctx_len = self.max_length + + # encode, pad, and truncate contexts for this batch + context_enc, attn_masks = self.tok_batch_encode( + contexts, + left_truncate_len=max_ctx_len, + truncation=self.truncation, + ) + context_enc = context_enc.to(self.device) + attn_masks = attn_masks.to(self.device) - cont_toks_list = cont.tolist() - for cont_toks, context in zip(cont_toks_list, contexts): - # discard context + left-padding toks if using causal decoder-only LM - if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - cont_toks = cont_toks[context_enc.shape[1] :] + if "max_length" not in kwargs: + kwargs["max_length"] = context_enc.shape[1] + max_gen_toks - s = self.tok_decode(cont_toks) + # perform batched generation + cont = self._model_generate( + context=context_enc, + attention_mask=attn_masks, + stop=until, + **kwargs, + ) - # use secondary stop seqs to cut off should-have-been-stopped content post-hoc - for term in until: - if len(term) > 0: - # ignore '' separator, - # for seq2seq case where self.tok_decode(self.eot_token_id) = '' - s = s.split(term)[0] + cont_toks_list = cont.tolist() + for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only LM + if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: + cont_toks = cont_toks[context_enc.shape[1] :] - res[key].append(s) + s = self.tok_decode(cont_toks) - self.cache_hook.add_partial( - "generate_until", (context, gen_kwargs), s - ) - pbar.update(1) - # reorder this group of results back to original unsorted form - res[key] = re_ord.get_original(res[key]) + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + for term in until: + if len(term) > 0: + # ignore '' separator, + # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + s = s.split(term)[0] + + res.append(s) + + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) pbar.close() - return grouper.get_original(res) + return res diff --git 
a/lm_eval/utils.py b/lm_eval/utils.py index 29d6947a2c..d407cf7995 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -13,7 +13,18 @@ import time from functools import wraps from itertools import islice -from typing import Any, Callable, Iterator, List, Literal, Optional, Type, Union +from typing import ( + Any, + Callable, + Iterable, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, +) import torch import transformers @@ -756,3 +767,157 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +class Collator: + """ + A class for reordering and batching elements of an array. + + This class allows for sorting an array based on a provided sorting function, grouping elements based on a grouping function, and generating batches from the sorted and grouped data. + """ + + def __init__( + self, + arr: List, + sort_fn: Callable, + group_fn: Callable = lambda x: x[1], + grouping: bool = False, + ) -> None: + self.grouping = grouping + self.fn = sort_fn + self.group_fn = lambda x: group_fn(x[1]) # first index are enumerated indices + self.reorder_indices: List = [] + self.size = len(arr) + self.arr_with_indices: Iterable[Any] = tuple(enumerate(arr)) # [indices, (arr)] + if self.grouping is True: + self.group_by_index() + + def group_by_index(self) -> None: + self.arr_with_indices = self.group( + self.arr_with_indices, fn=self.group_fn, values=False + ) + + def get_batched(self, n: int = 1, batch_fn: Optional[Callable] = None) -> Iterator: + """ + Generates and yields batches from the reordered array. + + Parameters: + - n (int): The size of each batch. Defaults to 1. + - batch_fn (Optional[Callable[[int, Iterable], int]]): A function to determine the size of each batch. Defaults to None. + + Yields: + Iterator: An iterator over batches of reordered elements. + """ + if self.grouping: + for ( + key, + values, + ) in self.arr_with_indices.items(): # type: ignore + values = self._reorder(values) + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + else: + values = self._reorder(self.arr_with_indices) # type: ignore + batch = self.get_chunks(values, n=n, fn=batch_fn) + yield from batch + + def _reorder(self, arr: Union[List, Tuple[Tuple[int, Any], ...]]) -> List: + """ + Reorders the elements in the array based on the sorting function. + + Parameters: + - arr (Union[List, Tuple[Tuple[int, Any], ...]]): The array or iterable to be reordered. + + Yields: + List: Yields reordered elements one by one. + """ + arr = sorted(arr, key=lambda x: self.fn(x[1])) + self.reorder_indices.extend([x[0] for x in arr]) + yield from [x[1] for x in arr] + + def get_original(self, newarr: List) -> List: + """ + Restores the original order of elements from the reordered list. + + Parameters: + - newarr (List): The reordered array. + + Returns: + List: The array with elements restored to their original order. + """ + res = [None] * self.size + cov = [False] * self.size + + for ind, v in zip(self.reorder_indices, newarr): + res[ind] = v + cov[ind] = True + + assert all(cov) + + return res + + def __len__(self): + return self.size + + def group(self, arr: Iterable, fn: Callable, values: bool = False) -> Iterable: + """ + Groups elements of an iterable based on a provided function. + + Parameters: + - arr (Iterable): The iterable to be grouped. + - fn (Callable): The function to determine the grouping. + - values (bool): If True, returns the values of the group. Defaults to False. + + Returns: + Iterable: An iterable of grouped elements. 
+ """ + res = collections.defaultdict(list) + for ob in arr: + try: + hashable_dict = tuple( + (key, tuple(value) if isinstance(value, list) else value) + for key, value in sorted(ob[1][1].items()) + ) + res[hashable_dict].append(ob) + except TypeError: + res[fn(ob)].append(ob) + if not values: + return res + return res.values() + + def get_chunks(self, iter, n: int = 0, fn=None): + """ + Divides an iterable into chunks of specified size or based on a given function. + Useful for batching + + Parameters: + - iter: The input iterable to be divided into chunks. + - n: An integer representing the size of each chunk. Default is 0. + - fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None. + + Returns: + An iterator that yields chunks of the input iterable. + + Example usage: + ``` + data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + for chunk in chunks(data, 3): + print(chunk) + ``` + Output: + ``` + [1, 2, 3] + [4, 5, 6] + [7, 8, 9] + [10] + ``` + """ + arr = [] + for i, x in enumerate(iter): + arr.append(x) + if len(arr) == (fn(i, iter) if fn else n): + yield arr + arr = [] + + if arr: + yield arr diff --git a/tests/test_utils.py b/tests/test_utils.py index 454e153237..c50b16781f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,6 @@ -from lm_eval.utils import get_rolling_token_windows, make_disjoint_window +import pytest + +from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window # noinspection DuplicatedCode @@ -220,3 +222,76 @@ def test_make_disjoint_window(): ) assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6]) assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6]) + + +class TestCollator: + def make_generate_sample(self, end=10): + strings = ["x" * i for i in range(1, end + 1)] + gen_kwargs1, gen_kwargs2 = ( + {"temperature": 0}, + {"temperature": 0, "until": ["nn", "\n\n"]}, + ) + args = [ + (string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2) + for i, string in enumerate(strings) + ] + + return args + + def make_loglikelihood_sample(self, end=11): + samples = [ + (("x", "x"), list(range(1, total_length + 1))) + for total_length in range(1, end + 1) + ] + return samples + + @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)]) + def test_generations(self, batch_size, end): + _collate_gen = lambda x: (-len(x[0]), x[0]) # noqa: E731 + + generation_samples = self.make_generate_sample(int(end)) + gens = Collator(generation_samples, _collate_gen, grouping=True) + chunks = gens.get_batched(n=int(batch_size), batch_fn=None) + output = [] + for chunks in chunks: + # check batching + group_one = end // 2 + group_two = end - end // 2 + assert ( + len(chunks) <= batch_size + if batch_size != 0 + else len(chunks) in [group_one, group_two] + ) + # check if reorder-er is working correctly + assert all( + len(chunks[i][0]) <= len(chunks[i - 1][0]) + for i in range(1, len(chunks)) + ) + # check if grouping correctly + assert all(x[1] == chunks[0][1] for x in chunks) + for x in chunks: + output.append(x) + reordered_output = gens.get_original(output) + # check get original + assert reordered_output == generation_samples + + @pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)]) + def test_loglikelihood(self, batch_size, end): + _collate_log = lambda x: (-len(x[1]), tuple(x[1])) # noqa: E731 + loglikelihood_samples = self.make_loglikelihood_sample(int(end)) + loglikelihoods = 
Collator(loglikelihood_samples, _collate_log, grouping=False) + chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None) + output = [] + for chunks in chunks: + # check batching + assert len(chunks) <= batch_size if batch_size != 0 else len(chunks) == end + # check reorder + assert all( + len(chunks[i][1]) <= len(chunks[i - 1][1]) + for i in range(1, len(chunks)) + ) + for x in chunks: + output.append(x[1]) + # check indices + reordered_output = loglikelihoods.get_original(output) + assert reordered_output == [x[1] for x in loglikelihood_samples] From 8ffbe58a63d2ce89002a788c3ea733c4d983d778 Mon Sep 17 00:00:00 2001 From: MorishT <106973776+MorishT@users.noreply.github.com> Date: Sun, 24 Dec 2023 22:03:24 +0900 Subject: [PATCH 20/25] Add remove_whitespace to FLD benchmark (#1206) * Add remove_whitespace to FLD benchmark * bump task version --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/tasks/fld/fld_default.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lm_eval/tasks/fld/fld_default.yaml b/lm_eval/tasks/fld/fld_default.yaml index ee84f73bc5..114efa83c9 100644 --- a/lm_eval/tasks/fld/fld_default.yaml +++ b/lm_eval/tasks/fld/fld_default.yaml @@ -12,5 +12,10 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true +filter_list: + - name: remove_whitespace + filter: + - function: remove_whitespace + - function: take_first metadata: - version: 0.0 + version: 1.0 From e4970d817ae1f8ad1fccab3d77e9ef844d332239 Mon Sep 17 00:00:00 2001 From: Yuliang Li <40186387+xTayEx@users.noreply.github.com> Date: Sun, 24 Dec 2023 23:24:48 +0800 Subject: [PATCH 21/25] fix: incorrect argument order in `utils.divide` doc (#1208) --- lm_eval/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lm_eval/utils.py b/lm_eval/utils.py index d407cf7995..3613095afd 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -682,7 +682,7 @@ def divide(iterable, n) -> List[Iterator]: """Divide the elements from *iterable* into *n* parts, maintaining order. 
- >>> group_1, group_2 = divide(2, [1, 2, 3, 4, 5, 6]) + >>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2) >>> list(group_1) [1, 2, 3] >>> list(group_2) @@ -691,14 +691,14 @@ def divide(iterable, n) -> List[Iterator]: If the length of *iterable* is not evenly divisible by *n*, then the length of the returned iterables will not be identical: - >>> children = divide(3, [1, 2, 3, 4, 5, 6, 7]) + >>> children = divide([1, 2, 3, 4, 5, 6, 7], 3) >>> [list(c) for c in children] [[1, 2, 3], [4, 5], [6, 7]] If the length of the iterable is smaller than n, then the last returned iterables will be empty: - >>> children = divide(5, [1, 2, 3]) + >>> children = divide([1, 2, 3], 5) >>> [list(c) for c in children] [[1], [2], [3], [], []] From af74a93d1abd9a35fee9d628c2637ae922a82541 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 25 Dec 2023 10:56:53 -0500 Subject: [PATCH 22/25] pin vllm at < 0.2.6 (#1212) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 42edbb61db..466f708429 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ promptsource = [ ] sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] testing = ["pytest", "pytest-cov", "pytest-xdist"] -vllm = ["vllm"] +vllm = ["vllm<=0.2.5"] zeno = ["pandas", "zeno-client"] all = [ "lm_eval[anthropic]", From f28539959d1bd18373446d6e0ee8fd926b89432e Mon Sep 17 00:00:00 2001 From: Jaewoo Yang Date: Wed, 27 Dec 2023 22:08:03 +0900 Subject: [PATCH 23/25] fix unbounded local variable (#1218) --- lm_eval/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 565b20177f..97bcc8d2ab 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -201,7 +201,7 @@ def __init__( self.model.eval() self.model.tie_weights() - if (gpus >= 1 or str(self.device) == "mps") and isinstance(pretrained, str): + if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"): if not (parallelize or autogptq or ("device_map" in kwargs)): # place model onto device requested manually, # if not using HF Accelerate or device_map From 6a1c19ed7406184da887c78fa755e647ce6febcf Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Wed, 27 Dec 2023 18:29:02 +0500 Subject: [PATCH 24/25] nits + fix siqa (#1216) * fix group * siqa: default.yml -> default.yaml * max_gen_toks -> self.max_gen_toks * add ids to task tests * fix siqa * fix gen_kwargs for openai-chat --- lm_eval/models/openai_completions.py | 6 ++++-- lm_eval/tasks/siqa/{default.yml => siqa.yaml} | 7 +++++-- lm_eval/utils.py | 19 +++++++++++++------ tests/test_tasks.py | 2 +- 4 files changed, 23 insertions(+), 11 deletions(-) rename lm_eval/tasks/siqa/{default.yml => siqa.yaml} (74%) diff --git a/lm_eval/models/openai_completions.py b/lm_eval/models/openai_completions.py index 11f2cf33aa..c690765d95 100644 --- a/lm_eval/models/openai_completions.py +++ b/lm_eval/models/openai_completions.py @@ -254,6 +254,7 @@ def sameuntil_chunks(xs, size): list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)) ): inps = [] + self._max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks) for context, _ in chunk: context_enc = self.tok_encode(context) inp = context_enc[-(self.max_length - self.max_gen_toks) :] @@ -441,8 +442,7 @@ def generate_until(self, requests) -> List[str]: gen_kwargs = 
all_gen_kwargs[0] until = None - if isinstance(gen_kwargs, dict): - kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1 + if isinstance(kwargs := copy.deepcopy(gen_kwargs), dict): if "do_sample" in kwargs.keys(): kwargs.pop("do_sample") if "until" in kwargs.keys(): @@ -453,6 +453,8 @@ def generate_until(self, requests) -> List[str]: raise ValueError( f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}" ) + kwargs["stop"] = until + kwargs["max_tokens"] = kwargs.pop("max_gen_toks", self.max_gen_toks) else: raise ValueError( f"Expected repr(kwargs) to be of type repr(dict) but got {kwargs}" diff --git a/lm_eval/tasks/siqa/default.yml b/lm_eval/tasks/siqa/siqa.yaml similarity index 74% rename from lm_eval/tasks/siqa/default.yml rename to lm_eval/tasks/siqa/siqa.yaml index 35b14599d6..191ffa8d30 100644 --- a/lm_eval/tasks/siqa/default.yml +++ b/lm_eval/tasks/siqa/siqa.yaml @@ -6,8 +6,11 @@ training_split: train validation_split: validation doc_to_text: "Q: {{context}} {{question}}\nA:" target_delimiter: " " -doc_to_choice: ["{{answerA}}", "{{answerB}}", "{{answerC}}"] -doc_to_target: "{{label}}" +doc_to_choice: + - "{{answerA}}" + - "{{answerB}}" + - "{{answerC}}" +doc_to_target: "{{ (label|int) - 1 }}" metric_list: - metric: acc aggregation: mean diff --git a/lm_eval/utils.py b/lm_eval/utils.py index 3613095afd..e08fefa285 100644 --- a/lm_eval/utils.py +++ b/lm_eval/utils.py @@ -859,7 +859,8 @@ def get_original(self, newarr: List) -> List: def __len__(self): return self.size - def group(self, arr: Iterable, fn: Callable, values: bool = False) -> Iterable: + @staticmethod + def group(arr: Iterable, fn: Callable, values: bool = False) -> Iterable: """ Groups elements of an iterable based on a provided function. @@ -875,8 +876,13 @@ def group(self, arr: Iterable, fn: Callable, values: bool = False) -> Iterable: for ob in arr: try: hashable_dict = tuple( - (key, tuple(value) if isinstance(value, list) else value) - for key, value in sorted(ob[1][1].items()) + ( + key, + tuple(value) + if isinstance(value, collections.abc.Iterable) + else value, + ) + for key, value in sorted(fn(ob).items()) ) res[hashable_dict].append(ob) except TypeError: @@ -885,7 +891,8 @@ def group(self, arr: Iterable, fn: Callable, values: bool = False) -> Iterable: return res return res.values() - def get_chunks(self, iter, n: int = 0, fn=None): + @staticmethod + def get_chunks(_iter, n: int = 0, fn=None): """ Divides an iterable into chunks of specified size or based on a given function. 
Useful for batching @@ -913,9 +920,9 @@ def get_chunks(self, iter, n: int = 0, fn=None): ``` """ arr = [] - for i, x in enumerate(iter): + for i, x in enumerate(_iter): arr.append(x) - if len(arr) == (fn(i, iter) if fn else n): + if len(arr) == (fn(i, _iter) if fn else n): yield arr arr = [] diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 3651fd5ab3..a0a6c7c2b3 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -30,7 +30,7 @@ def limit() -> int: # Tests -@pytest.mark.parametrize("task_class", task_class()) +@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}") class TestNewTasks: def test_download(self, task_class: ConfigurableTask): task_class.download() From 46c796644913cd99da7eee868e64f9ed6af33407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20B=C3=A4uerle?= Date: Thu, 28 Dec 2023 15:34:20 +0100 Subject: [PATCH 25/25] add length of strings and answer options to metadata (#1222) --- scripts/zeno_visualize.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/zeno_visualize.py b/scripts/zeno_visualize.py index 5c0dad713f..f4012afeca 100644 --- a/scripts/zeno_visualize.py +++ b/scripts/zeno_visualize.py @@ -164,6 +164,7 @@ def generate_dataset( { "id": ids, "data": instance, + "input_len": [len(x) for x in instance], "labels": labels, "output_type": config["output_type"], } @@ -181,26 +182,30 @@ def generate_system_df(data, config): pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system. """ ids = [x["doc_id"] for x in data] - answers = [""] * len(ids) + system_dict = {"id": ids} + system_dict["output"] = [""] * len(ids) if config["output_type"] == "loglikelihood": - answers = [ + system_dict["output"] = [ "correct" if x["filtered_resps"][0][1] is True else "incorrect" for x in data ] elif config["output_type"] == "multiple_choice": - answers = [", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data] + system_dict["output"] = [ + ", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data + ] + system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data] elif config["output_type"] == "loglikelihood_rolling": - answers = [str(x["filtered_resps"][0]) for x in data] + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] elif config["output_type"] == "generate_until": - answers = [str(x["filtered_resps"][0]) for x in data] + system_dict["output"] = [str(x["filtered_resps"][0]) for x in data] + system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data] metrics = {} for metric in config["metric_list"]: if "aggregation" in metric and metric["aggregation"] == "mean": metrics[metric["metric"]] = [x[metric["metric"]] for x in data] - system_dict = {"id": ids, "output": answers} system_dict.update(metrics) system_df = pd.DataFrame(system_dict) return system_df
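
To illustrate the shape of the system dataframe after PATCH 25 above, a small sketch for the `generate_until` branch follows. The two sample records and their values are invented for demonstration; only the column construction (`id`, `output`, the new `output_length`, plus one column per mean-aggregated metric) mirrors what `generate_system_df()` builds before upload.

```python
import pandas as pd

# Invented sample records; in the real script these come from the logged samples on disk.
data = [
    {"doc_id": 0, "filtered_resps": ["Paris"], "exact_match": 1.0},
    {"doc_id": 1, "filtered_resps": ["I am not sure, but"], "exact_match": 0.0},
]

system_dict = {
    "id": [x["doc_id"] for x in data],
    "output": [str(x["filtered_resps"][0]) for x in data],
    # new metadata column: answer length, so Zeno can slice and filter on it
    "output_length": [len(str(x["filtered_resps"][0])) for x in data],
    # mean-aggregated metrics are appended as one column per metric
    "exact_match": [x["exact_match"] for x in data],
}

print(pd.DataFrame(system_dict))
```
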