Adds Llama 3.2 example on Modal with a fun experiment #706

Open · wants to merge 4 commits into main
71 changes: 71 additions & 0 deletions recipes/3p_integrations/modal/many-llamas-human-eval/README.md
@@ -0,0 +1,71 @@
# Many-Llamas Human-Eval

In this directory, we run an experiment answering the question:

*If we run enough Llama models in parallel, can they outperform GPT-4o on HumanEval?*

It seeks to increase model performance not by scaling up the model's parameters, but by scaling up inference-time compute.

### Technical Blog

This experiment was built by the team at [Modal](https://modal.com) and is described in the following blog post:

[Beat GPT-4o at Python by searching with 100 small Llamas](https://modal.com/blog/llama-human-eval)

The experiment has since been upgraded to use the [Llama 3.2 3B Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) model, and is runnable end-to-end on the Modal serverless platform.

## Run it yourself

### Install the Modal CLI
From within your virtual environment, run:
```bash
pip install modal
```
And if you're new to Modal, authenticate with:
```bash
modal setup
# or if that doesn't work, try
# python -m modal setup
```

That's all!

This CLI will execute your Modal apps, which build and run containers in the cloud on your GPU of choice.
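
If you want to confirm the setup works before moving on, you can run a minimal Modal app like the sketch below (the file name `hello_modal.py` is just an example and is not part of this recipe) with `modal run hello_modal.py`:

```python
# hello_modal.py -- a minimal sanity check, not part of this recipe
import modal

app = modal.App("hello-modal")


@app.function()
def square(x: int) -> int:
    # This body runs in a container in the cloud
    return x * x


@app.local_entrypoint()
def main():
    # .remote() executes the function on Modal rather than locally
    print("2 squared in the cloud is:", square.remote(2))
```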

### HuggingFace Pull Access

To download the model, you'll first need to accept the [Llama 3.2 License](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on HuggingFace and be approved for access.

Then, create a [Modal Secret](https://modal.com/secrets) named `huggingface`, to which you'll add your HuggingFace access token as the `HF_TOKEN` environment variable.
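
As a quick sanity check (this snippet is illustrative and not part of the recipe; the file name `check_secret.py` is made up), a Modal Function with the secret attached can read the token from its environment:

```python
# check_secret.py -- illustrative only; confirms the `huggingface` secret exposes HF_TOKEN
import os

import modal

app = modal.App("check-hf-secret")


@app.function(secrets=[modal.Secret.from_name("huggingface")])
def check_token():
    token = os.environ["HF_TOKEN"]
    # Don't print the token itself, just confirm it's present
    print("HF_TOKEN is set:", bool(token))


@app.local_entrypoint()
def main():
    check_token.remote()
```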

### Run The Experiment

This command will run every step for you:
```bash
bash run_e2e.sh
```

Or if you prefer to run it manually, you can step through each of the modal commands in [the script](./run_e2e.sh).

This will execute:
1. Downloading the Llama 3.2 3B Instruct model to a cloud volume
2. Deploying a vLLM inference server to GPUs
3. Running hundreds of parallel generations on the HumanEval test set (the fan-out pattern this relies on is sketched after this list)
4. Running the evaluation script to compute pass@k and fail@k
5. Generating graphs of pass@k and fail@k
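
The generation step relies on Modal fanning work out across many containers at once. The recipe's own generation script handles this against the deployed vLLM server; as a rough illustration of the pattern only (the app name, function, and stub body below are made up, not the recipe's actual code), `Function.map` fans inputs out in parallel:

```python
# Illustrative sketch of the fan-out pattern used in step 3 -- not the recipe's generate script.
import modal

app = modal.App("many-llamas-fanout-sketch")


@app.function()
def generate_one(prompt: str) -> str:
    # In the real recipe this would query the deployed vLLM inference server;
    # here we just echo the prompt to illustrate the pattern.
    return f"# completion for: {prompt[:30]}..."


@app.local_entrypoint()
def main():
    prompts = [f"HumanEval problem {i}" for i in range(10)]
    # .map() fans the inputs out across many containers in parallel and streams results back
    for completion in generate_one.map(prompts):
        print(completion)
```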

### Results

The resulting plots of the evals will be saved locally to:
- `/tmp/plot-pass-k.jpeg`
- `/tmp/plot-fail-k.jpeg`

`/tmp/plot-pass-k.jpeg` shows pass@k for the Llama 3.2 3B Instruct model vs pass@1 for GPT-4o.

![plot-pass-k](https://github.com/user-attachments/assets/11e9dc6e-4322-4d44-b928-4ed7c4ce8262)

You'll see that at 100 generations, the Llama model performs on par with GPT-4o. At higher sample counts, it outperforms GPT-4o.

`/tmp/plot-fail-k.jpeg` shows fail@k on a log scale, demonstrating that this method scales smoothly.

![plot-fail-k](https://github.com/user-attachments/assets/7286e4ff-5090-4288-bd62-8a078c6dc5a1)
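
For reference, pass@k is presumably the standard unbiased estimator from the HumanEval paper (Chen et al., 2021), and fail@k is taken here as its complement, which is an assumption about how these plots are produced. A minimal sketch:

```python
# Minimal sketch of the pass@k estimator from the HumanEval paper (Chen et al., 2021).
# fail@k is assumed here to be its complement, 1 - pass@k.
import math


def pass_at_k(n: int, c: int, k: int) -> float:
    """n = total samples per problem, c = samples that pass, k = sampling budget."""
    if n - c < k:
        return 1.0
    # 1 - C(n - c, k) / C(n, k), written in a numerically stable product form
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))


def fail_at_k(n: int, c: int, k: int) -> float:
    return 1.0 - pass_at_k(n, c, k)


# Example: 100 samples, 3 correct -- pass@100 = 1.0, pass@1 ≈ 0.03
print(pass_at_k(100, 3, 100), pass_at_k(100, 3, 1))
```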
64 changes: 64 additions & 0 deletions recipes/3p_integrations/modal/many-llamas-human-eval/download.py
@@ -0,0 +1,64 @@
# ## Downloading Llama 3.2 3B Instruct Model
# This script uses a Modal Function to download the model into a cloud Volume.
#
# Run it with:
# modal run download

import modal

MODELS_DIR = "/llamas"
DEFAULT_NAME = "meta-llama/Llama-3.2-3B-Instruct"

MINUTES = 60
HOURS = 60 * MINUTES

# Create a modal Volume to store the model
volume = modal.Volume.from_name("llamas", create_if_missing=True)

# This defines the image to use for the modal function
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster with Rust
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

# We run the function from a Modal App, which will have our HF_TOKEN env var set.
# Add your HuggingFace secret access token here: https://modal.com/secrets
# secret name: huggingface
# env var name: HF_TOKEN
app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])

# This function will be run in the cloud, with the volume mounted.
@app.function(volumes={MODELS_DIR: volume}, timeout=4 * HOURS)
def download_model(model_name, force_download=False):
    from huggingface_hub import snapshot_download

    # Make sure the local view of the Volume is up to date before writing
    volume.reload()

    snapshot_download(
        model_name,
        local_dir=MODELS_DIR + "/" + model_name,
        ignore_patterns=[
            "*.pt",
            "*.bin",
            "*.pth",
            "original/*",
        ],  # Ensure safetensors
        force_download=force_download,
    )

    # Persist the downloaded files so other Modal functions can read them
    volume.commit()

    print("Model successfully downloaded")

@app.local_entrypoint()
def main(
    model_name: str = DEFAULT_NAME,
    force_download: bool = False,
):
    download_model.remote(model_name, force_download)
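
To spot-check that the weights actually landed in the Volume, one option (a sketch, not part of the recipe; the file name `verify_download.py` is made up) is a small function that mounts the same Volume and walks its contents:

```python
# verify_download.py -- illustrative sketch; mounts the same "llamas" Volume and lists it
import modal

MODELS_DIR = "/llamas"

volume = modal.Volume.from_name("llamas", create_if_missing=False)

app = modal.App("verify-llama-download")


@app.function(volumes={MODELS_DIR: volume})
def list_models():
    import os

    volume.reload()
    for root, _dirs, files in os.walk(MODELS_DIR):
        for name in files:
            print(os.path.join(root, name))


@app.local_entrypoint()
def main():
    list_models.remote()
```
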
96 changes: 96 additions & 0 deletions recipes/3p_integrations/modal/many-llamas-human-eval/eval.py
@@ -0,0 +1,96 @@
# ## Evaluating HumanEval Results using Modal Sandboxes
# This script will take generated results and evaluate them.
# We use Modal Sandboxes to safely evaluate LLM-generated results.
#
# Run it with:
# modal run eval

from pathlib import Path

import modal
from modal.exception import FunctionTimeoutError

app = modal.App("many-llamas-human-eval")

volume = modal.Volume.from_name("humaneval", create_if_missing=True)

sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)

MINUTES = 60

@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_single_task(sample_file_path: str, problem_file_path: str):
    # Stage the samples and problems in an ephemeral Volume that the Sandbox will mount
    with modal.Volume.ephemeral() as vol:
        with vol.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")

        print(f"Starting sandbox for {sample_file_path}")
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": vol},
            timeout=10 * MINUTES,
            cpu=32,
        )

        try:
            sandbox.wait()
            print(f"Finished sandbox for {sample_file_path}")
        except FunctionTimeoutError:
            print("Sandbox timed out")

        if sandbox.returncode == 0:
            print(sandbox.stdout.read())
            # Copy the results file out of the ephemeral Volume so it persists
            # next to the samples on the mounted humaneval Volume
            data = b""
            for chunk in vol.read_file("samples.jsonl_results.jsonl"):
                data += chunk
            with open(f"{sample_file_path}_results.jsonl", "wb") as f:
                f.write(data)
        else:
            print(f"Tests failed with code {sandbox.returncode}")
            print(sandbox.stderr.read())


@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def eval_all_tasks():
    import os

    volume.reload()

    # Find all files matching /humaneval/{env}/{run}/{id}.jsonl
    envs = [element for element in Path("/humaneval").iterdir() if element.is_dir()]
    for env in envs:
        print(f"looking in {env}")
        problem_file = env / "data.jsonl"

        pattern = "*/*.jsonl"
        handles = []
        for file_path in env.glob(pattern):
            # Skip files that end with _results.jsonl
            if str(file_path).endswith("_results.jsonl"):
                continue

            print(f"Checking {file_path}")
            # Check if the corresponding results file exists
            results_file = f"{file_path}_results.jsonl"
            if not os.path.exists(results_file):
                # If it doesn't exist, spawn eval_single_task for it
                print("Spawning on", file_path, problem_file)
                handles.append(eval_single_task.spawn(file_path, problem_file))

        # Wait for this env's evaluations to finish
        for handle in handles:
            handle.get()


@app.local_entrypoint()
def main():
    eval_all_tasks.remote()
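
Downstream, the plotting step consumes these `*_results.jsonl` files. As an illustrative sketch only (not part of the recipe), and assuming human-eval's standard `task_id`/`passed` fields, per-problem pass counts could be tallied like this:

```python
# tally_results.py -- illustrative sketch for aggregating human-eval result files
import json
from collections import defaultdict
from pathlib import Path


def tally(results_dir: str) -> dict:
    """Return {task_id: (num_correct, num_samples)} across all *_results.jsonl files."""
    correct = defaultdict(int)
    total = defaultdict(int)
    for path in Path(results_dir).glob("**/*_results.jsonl"):
        for line in path.read_text().splitlines():
            record = json.loads(line)
            total[record["task_id"]] += 1
            correct[record["task_id"]] += bool(record["passed"])
    return {task: (correct[task], total[task]) for task in total}
```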