From ef7a31cc339ad28f432b0d72ad109717c56346b2 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 6 Sep 2024 09:55:02 -0700
Subject: [PATCH 01/89] batch convert checkpoint

---
 log.txt                              |  10 +++
 scripts/convert_checkpoints.py       |  75 +++++++++++++++
 scripts/convert_checkpoints_batch.py | 118 +++++++++++++++++++++++++++
 3 files changed, 203 insertions(+)
 create mode 100644 log.txt
 create mode 100644 scripts/convert_checkpoints.py
 create mode 100644 scripts/convert_checkpoints_batch.py

diff --git a/log.txt b/log.txt
new file mode 100644
index 000000000..a26fb9f41
--- /dev/null
+++ b/log.txt
@@ -0,0 +1,10 @@
+
+ o=======[]
+ __ _ _ _ |_ []
+ / _` | __ _ _ _ | |_ _ _ | || | []
+ \__, | / _` | | ' \ | _| | '_| \_, | _/ ]_
+ |___/ \__,_| |_||_| _\__| _|_|_ _|__/ |_____|
+_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"|
+ `---------------------------------------------'
+
+Experiment submitted, see progress at https://beaker.org/ex/01J743RVEV62XWWKXSVSFQHXH6
diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py
new file mode 100644
index 000000000..c9c9c4a3e
--- /dev/null
+++ b/scripts/convert_checkpoints.py
@@ -0,0 +1,75 @@
+# This script must be run from the root of the OLMo repo.
+# Requires the AWS CLI and Beaker Gantry to be installed and configured.
+
+
+import argparse
+import subprocess
+
+# Beaker secret keys
+AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID'
+AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY'
+
+SANITY_CHECK = False
+
+def convert_checkpoint(checkpoint_paths):
+
+    for cp in checkpoint_paths:
+        retain_path_name = cp.replace('s3://', '').strip('/')
+        load_dir = "/data/input"
+        weka_loc = f"{load_dir}/{retain_path_name}-hf/"
+        log_file = "log.txt"
+
+        cmd = f"gantry run " \
+              f"--description 'Converting {cp}' " \
+              f"--allow-dirty " \
+              f"--no-python " \
+              f"--workspace ai2/cheap-decisions " \
+              f"--priority normal " \
+              f"--gpus 0 " \
+              f"--preemptible " \
+              f"--cluster 'ai2/jupiter-cirrascale-2' " \
+              f"--budget ai2/oe-eval " \
+              f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \
+              f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \
+              f"--shared-memory 10GiB " \
+              f"--weka=oe-eval-default:{load_dir} " \
+              f"--yes " \
+              f"-- /bin/bash -c python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts"
+
+        #f"--mount weka://oe-eval-default={load_dir} "
+        # FIX THIS
+        if SANITY_CHECK:
+            print(cmd)
+        else:
+            try:
+                with open(log_file,'w') as fout:
+                    subprocess.run(cmd, shell=True, check=True, stdout=fout, stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                print(e.output)
+
+
+def read_checkpoints(f):
+    with open(f,'r') as fin:
+        checkpoints = [line for line in fin if line and line != '']
+    return checkpoints
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Unshard checkpoint and convert to HF format. Run via Gantry. Invoke this script from the root of the OLMo repo."
+    )
+
+    group_batch = parser.add_mutually_exclusive_group(required=True)
+    group_batch.add_argument("--checkpoint_path", help="path to sharded checkpoint", type=str)
+    group_batch.add_argument("--checkpoint_path_file", help="file that lists sharded checkpoint paths (batch run option)", type=str)
+
+    args = parser.parse_args()
+
+    if args.checkpoint_path is not None:
+        convert_checkpoint([args.checkpoint_path])
+    else:
+        convert_checkpoint(read_checkpoints(args.checkpoint_path_file))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py
new file mode 100644
index 000000000..3692e885c
--- /dev/null
+++ b/scripts/convert_checkpoints_batch.py
@@ -0,0 +1,118 @@
+"""
+Modification of s3_unshard_to_hf.py
+Wrapper for hf_olmo/convert_olmo_to_hf.py
+
+Takes a model checkpoint stored on S3, unshards it, and converts it to HF format.
+Saves the converted checkpoints to Weka.
+Requires the AWS CLI to be installed and configured.
+"""
+
+import argparse
+import pathlib
+import shutil
+import subprocess
+import os
+
+
+def convert_to_hf(args):
+    # Ensure local directory exists
+    if not os.path.exists(local_file_dir):
+        os.makedirs(local_file_dir)
+
+    # Convert to old-style checkpoint.
+    hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir} --destination-dir {local_file_dir}"
+    subprocess.run(hf_cmd, shell=True, check=True)
+
+    # Move to Weka
+    if not os.path.exists(weka_file_dir):
+        os.makedirs(weka_file_dir)
+
+
+
+    # Move the HF files from the unsharded dir to their own.
+    for fname in [
+        "config.json",
+        "pytorch_model.bin",
+        "special_tokens_map.json",
+        "tokenizer.json",
+        "tokenizer_config.json",
+    ]:
+        (unsharded_dir / fname).rename(hf_dir / fname)
+
+    # Upload the unsharded and HF files back to S3.
+    print("Uploading files back to S3.")
+    if not args.already_unsharded:
+        upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args)
+        subprocess.run(upload_unsharded_cmd, shell=True, check=True)
+
+    upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args)
+    subprocess.run(upload_hf_cmd, shell=True, check=True)
+
+def make_parser():
+    parser = argparse.ArgumentParser(
+        description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo."
+    )
+    parser.add_argument("--sharded_bucket", help="S3 bucket with sharded checkpoint.", type=str)
+    parser.add_argument(
+        "--unsharded_bucket",
+        help="S3 bucket to save the unsharded checkpoint.",
+        type=str,
+    )
+    parser.add_argument(
+        "--already_downloaded",
+        action="store_true",
+        help="Use this flag if the unsharded S3 checkpoint is already downloaded, but still needs to be unsharded.",
+    )
+    parser.add_argument(
+        "--already_unsharded",
+        action="store_true",
+        help="If given, the checkpoint has already been unsharded; just convert to HF.",
+    )
+    parser.add_argument("--hf_bucket", help="S3 bucket to save the HF-converted checkpoint.", type=str)
+    parser.add_argument(
+        "--local_dir",
+        help="""Directory to store checkpoints locally.""",
+        type=pathlib.Path,
+    )
+    parser.add_argument(
+        "--cleanup_local_dir",
+        action="store_true",
+        help="If given, remove the local directory if everything runs successfully to free up space on NFS.",
+    )
+    parser.add_argument(
+        "--checkpoint_style",
+        default="hf_olmo",
+        choices=["hf_olmo", "transformers"],
+        help="""Checkpoint style. The `transformers` style works with HF transformers as-is, while
+        `hf_olmo` relies on the `hf_olmo` package for conversion.
In general, use + `transformers` for external releases and `hf_olmo` for internal model + development.""", + ) + parser.add_argument( + "--hf_olmo", + action="store_true", + help="If given, convert to 'hf-olmo' style checkpoints.", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="If given, don't show progress for AWS commands.", + ) + parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.") + parser.add_argument("--model_only", action="store_true", help="If given, only unshard the model.") + return parser + +def main(): + parser = make_parser() + args = parser.parse_args() + args.local_dir.mkdir(exist_ok=True, parents=True) + + s3_unshard_to_hf(args) + + if args.cleanup_local_dir: + # Clear out temp dir if we got here (everything ran without error). + shutil.rmtree(args.tmp_dir) + + +if __name__ == "__main__": + main() From 62d9a1de78670db1c0492d50b5ebfd2955c145d8 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:11:22 -0700 Subject: [PATCH 02/89] batch convert checkpoint --- log.txt | 2 +- scripts/convert_checkpoints.py | 6 ++---- scripts/convert_checkpoints.sh | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 scripts/convert_checkpoints.sh diff --git a/log.txt b/log.txt index a26fb9f41..16a1a62e7 100644 --- a/log.txt +++ b/log.txt @@ -7,4 +7,4 @@ _|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| `---------------------------------------------' -Experiment submitted, see progress at https://beaker.org/ex/01J743RVEV62XWWKXSVSFQHXH6 +Experiment submitted, see progress at https://beaker.org/ex/01J7446KB7EXZ35D8NST0JTNTY diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index c9c9c4a3e..2cbbcd301 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -9,7 +9,7 @@ AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' -SANITY_CHECK = False +SANITY_CHECK = True def convert_checkpoint(checkpoint_paths): @@ -17,7 +17,6 @@ def convert_checkpoint(checkpoint_paths): retain_path_name = cp.replace('s3://', '').strip('/') load_dir = "/data/input" weka_loc = f"{load_dir}/{retain_path_name}-hf/" - log_file = "log.txt" cmd = f"gantry run " \ f"--description 'Converting {cp}' " \ @@ -42,8 +41,7 @@ def convert_checkpoint(checkpoint_paths): print(cmd) else: try: - with open(log_file,'w') as fout: - subprocess.run(cmd, shell=True, check=True, stdout=fout, stderr=subprocess.STDOUT) + subprocess.run(cmd, shell=True, check=True) except subprocess.CalledProcessError as e: print(e.output) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh new file mode 100644 index 000000000..8caffa73d --- /dev/null +++ b/scripts/convert_checkpoints.sh @@ -0,0 +1,16 @@ +gantry run \ + --description "Converting ${CHECKPOINT_PATH}" \ + --allow-dirty \ + --no-python \ + --workspace ai2/cheap-decisions \ + --priority normal \ + --gpus 0 \ + --preemptible \ + --cluster ai2/jupiter-cirrascale-2 \ + --budget ai2/oe-eval \ + --env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} \ + --env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} \ + --shared-memory 10GiB \ + --weka=oe-eval-default:/data/input \ + --yes \ + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" \ No newline at end of file From dba011b2e435df2167e8639259e04fb617befb68 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:12:34 -0700 
Subject: [PATCH 03/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 8caffa73d..3e822ee85 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -8,8 +8,8 @@ gantry run \ --preemptible \ --cluster ai2/jupiter-cirrascale-2 \ --budget ai2/oe-eval \ - --env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} \ - --env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} \ + --env-secret AWS_ACCESS_KEY_ID=JENA_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KE \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ From 84a4f1b7e9353f4586ecd20e3489a1244c625603 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:13:04 -0700 Subject: [PATCH 04/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 3e822ee85..12c9e36e6 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -9,7 +9,7 @@ gantry run \ --cluster ai2/jupiter-cirrascale-2 \ --budget ai2/oe-eval \ --env-secret AWS_ACCESS_KEY_ID=JENA_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KE \ + --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ From 9a7b03b6df674973026e1be45338b30c0edd6bdf Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:09:52 -0700 Subject: [PATCH 05/89] batch convert checkpoint --- requirements.txt | 3 +++ requirements.txt~ | 1 + scripts/convert_checkpoints.sh | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 requirements.txt create mode 100644 requirements.txt~ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..c32a09983 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +torch +omegaconf +tqdm diff --git a/requirements.txt~ b/requirements.txt~ new file mode 100644 index 000000000..12c6d5d5e --- /dev/null +++ b/requirements.txt~ @@ -0,0 +1 @@ +torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 12c9e36e6..d067b4681 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -13,4 +13,4 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" \ No newline at end of file + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From d4687e99e3a15f922f87377cdb179b8aaf26ffad Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:15:07 -0700 Subject: [PATCH 06/89] batch convert checkpoint --- requirements.txt~ | 1 - scripts/convert_checkpoints.sh | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 requirements.txt~ diff --git a/requirements.txt~ b/requirements.txt~ deleted file mode 100644 index 12c6d5d5e..000000000 --- a/requirements.txt~ +++ 
/dev/null @@ -1 +0,0 @@ -torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index d067b4681..6babf8658 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -13,4 +13,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ + --pip requirements.txt \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From 24ec144881f9990eed74047ac75efa6ed309ef53 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:16:57 -0700 Subject: [PATCH 07/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 6babf8658..6e1992c5b 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,5 +1,4 @@ gantry run \ - --description "Converting ${CHECKPOINT_PATH}" \ --allow-dirty \ --no-python \ --workspace ai2/cheap-decisions \ From 6187889f9caa4b23ecdcf387380afb9e76d093cb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:23:29 -0700 Subject: [PATCH 08/89] batch convert checkpoint --- environments.yml | 8 ++++++++ requirements.txt | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 environments.yml delete mode 100644 requirements.txt diff --git a/environments.yml b/environments.yml new file mode 100644 index 000000000..409aa9047 --- /dev/null +++ b/environments.yml @@ -0,0 +1,8 @@ +name: torch-env +channels: +- pytorch +dependencies: +- python=3.9 +- pytorch +- omegaconf +- tqdm \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c32a09983..000000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -omegaconf -tqdm From fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:25:51 -0700 Subject: [PATCH 09/89] batch convert checkpoint --- environments.yml => environment.yml | 0 scripts/convert_checkpoints.sh | 1 - 2 files changed, 1 deletion(-) rename environments.yml => environment.yml (100%) diff --git a/environments.yml b/environment.yml similarity index 100% rename from environments.yml rename to environment.yml diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 6e1992c5b..965cd647f 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,5 +12,4 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - --pip requirements.txt \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From a862a0b943642629c03289b81bdf9a714472dd7c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:34:36 -0700 Subject: [PATCH 10/89] tinkering --- .gitignore | 3 +++ environment.yml | 8 -------- 2 files changed, 3 insertions(+), 8 
deletions(-) delete mode 100644 environment.yml diff --git a/.gitignore b/.gitignore index 9b1e99785..e0f77ccd8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# beaker yaml +guided-trout-2f805b9.yaml + # build artifacts .eggs/ diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 409aa9047..000000000 --- a/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: torch-env -channels: -- pytorch -dependencies: -- python=3.9 -- pytorch -- omegaconf -- tqdm \ No newline at end of file From 8d79a01fd4987311fb1d75b4859e7d597038eede Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:53:54 -0700 Subject: [PATCH 11/89] testing --- requirements.txt | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..269595af9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,149 @@ +ai2-olmo==0.5.0 +ai2-olmo-core==0.1.0 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +async-timeout==4.0.3 +attrs==24.2.0 +backoff==2.1.2 +backports.tarfile==1.2.0 +beaker-gantry==1.8.3 +beaker-py==1.31.3 +black==23.12.1 +boltons==24.0.0 +boto3==1.35.6 +boto3-extensions==0.23.0 +botocore==1.35.6 +build==1.2.1 +cached_path==1.6.3 +cachetools==5.5.0 +certifi==2024.7.4 +charset-normalizer==3.3.2 +click==8.1.7 +click-aliases==1.0.4 +click-help-colors==0.9.4 +colorama==0.4.6 +datasets==2.7.1 +dateparser==1.2.0 +dill==0.3.6 +docker==7.1.0 +docker-pycreds==0.4.0 +docutils==0.21.2 +exceptiongroup==1.2.2 +face==20.1.1 +filelock==3.13.4 +frozenlist==1.4.1 +fsspec==2024.6.1 +ftfy==6.2.3 +gantry==0.6.14 +gitdb==4.0.11 +GitPython==3.1.43 +glom==23.5.0 +google-api-core==2.19.2 +google-auth==2.34.0 +google-cloud-core==2.4.1 +google-cloud-storage==2.18.2 +google-crc32c==1.6.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.65.0 +halo==0.0.31 +huggingface-hub==0.23.5 +idna==3.8 +importlib_metadata==8.4.0 +importlib_resources==6.4.4 +iniconfig==2.0.0 +isodate==0.6.1 +isort==5.12.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.0.2 +Jinja2==3.1.4 +jmespath==1.0.1 +joblib==1.4.2 +keyring==25.3.0 +lightning-utilities==0.11.7 +log-symbols==0.0.14 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdurl==0.1.2 +more-itertools==10.5.0 +mpmath==1.3.0 +msgspec==0.18.6 +multidict==6.0.5 +multiprocess==0.70.14 +mypy==1.3.0 +mypy-extensions==1.0.0 +necessary==0.4.3 +networkx==3.3 +nh3==0.2.18 +numpy==2.1.0 +omegaconf==2.3.0 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +petname==2.6 +pkginfo==1.10.0 +platformdirs==4.2.2 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==5.28.0 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +Pygments==2.18.0 +pyproject_hooks==1.1.0 +pytest==8.3.2 +pytest-sphinx==0.6.3 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==6.0.2 +readme_renderer==44.0 +regex==2024.7.24 +requests==2.32.3 +requests-toolbelt==1.0.0 +requirements-parser==0.11.0 +responses==0.18.0 +rfc3986==2.0.0 +rich==13.8.0 +rsa==4.9 +ruff==0.6.4 +s3transfer==0.10.2 +safetensors==0.4.5 +scikit-learn==1.5.1 +scipy==1.14.1 +sentry-sdk==2.13.0 +setproctitle==1.3.3 +six==1.16.0 +smart-open==7.0.4 +smashed==0.21.5 +smmap==5.0.1 +spinners==0.0.24 +sympy==1.13.1 +tabulate==0.9.0 +termcolor==2.4.0 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.2.2 +torchmetrics==1.4.1 +tqdm==4.66.5 +transformers==4.44.2 
+trouting==0.3.3 +twine==5.1.1 +typeguard==2.13.3 +types-setuptools==74.1.0.20240906 +typing_extensions==4.12.2 +tzdata==2024.1 +tzlocal==5.2 +urllib3==2.2.2 +wandb==0.17.9 +wcwidth==0.2.13 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.9.4 +zipp==3.20.1 From b4ed78dabe2665537a497c044a0fc70b29f1bb2b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:04:59 -0700 Subject: [PATCH 12/89] testing --- requirements.txt | 150 +-------------------------------- scripts/convert_checkpoints.sh | 1 + 2 files changed, 2 insertions(+), 149 deletions(-) diff --git a/requirements.txt b/requirements.txt index 269595af9..679c6f744 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,149 +1 @@ -ai2-olmo==0.5.0 -ai2-olmo-core==0.1.0 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.5 -aiosignal==1.3.1 -annotated-types==0.7.0 -antlr4-python3-runtime==4.9.3 -async-timeout==4.0.3 -attrs==24.2.0 -backoff==2.1.2 -backports.tarfile==1.2.0 -beaker-gantry==1.8.3 -beaker-py==1.31.3 -black==23.12.1 -boltons==24.0.0 -boto3==1.35.6 -boto3-extensions==0.23.0 -botocore==1.35.6 -build==1.2.1 -cached_path==1.6.3 -cachetools==5.5.0 -certifi==2024.7.4 -charset-normalizer==3.3.2 -click==8.1.7 -click-aliases==1.0.4 -click-help-colors==0.9.4 -colorama==0.4.6 -datasets==2.7.1 -dateparser==1.2.0 -dill==0.3.6 -docker==7.1.0 -docker-pycreds==0.4.0 -docutils==0.21.2 -exceptiongroup==1.2.2 -face==20.1.1 -filelock==3.13.4 -frozenlist==1.4.1 -fsspec==2024.6.1 -ftfy==6.2.3 -gantry==0.6.14 -gitdb==4.0.11 -GitPython==3.1.43 -glom==23.5.0 -google-api-core==2.19.2 -google-auth==2.34.0 -google-cloud-core==2.4.1 -google-cloud-storage==2.18.2 -google-crc32c==1.6.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.65.0 -halo==0.0.31 -huggingface-hub==0.23.5 -idna==3.8 -importlib_metadata==8.4.0 -importlib_resources==6.4.4 -iniconfig==2.0.0 -isodate==0.6.1 -isort==5.12.0 -jaraco.classes==3.4.0 -jaraco.context==6.0.1 -jaraco.functools==4.0.2 -Jinja2==3.1.4 -jmespath==1.0.1 -joblib==1.4.2 -keyring==25.3.0 -lightning-utilities==0.11.7 -log-symbols==0.0.14 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -mdurl==0.1.2 -more-itertools==10.5.0 -mpmath==1.3.0 -msgspec==0.18.6 -multidict==6.0.5 -multiprocess==0.70.14 -mypy==1.3.0 -mypy-extensions==1.0.0 -necessary==0.4.3 -networkx==3.3 -nh3==0.2.18 -numpy==2.1.0 -omegaconf==2.3.0 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -petname==2.6 -pkginfo==1.10.0 -platformdirs==4.2.2 -pluggy==1.5.0 -proto-plus==1.24.0 -protobuf==5.28.0 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -Pygments==2.18.0 -pyproject_hooks==1.1.0 -pytest==8.3.2 -pytest-sphinx==0.6.3 -python-dateutil==2.9.0.post0 -pytz==2024.1 -PyYAML==6.0.2 -readme_renderer==44.0 -regex==2024.7.24 -requests==2.32.3 -requests-toolbelt==1.0.0 -requirements-parser==0.11.0 -responses==0.18.0 -rfc3986==2.0.0 -rich==13.8.0 -rsa==4.9 -ruff==0.6.4 -s3transfer==0.10.2 -safetensors==0.4.5 -scikit-learn==1.5.1 -scipy==1.14.1 -sentry-sdk==2.13.0 -setproctitle==1.3.3 -six==1.16.0 -smart-open==7.0.4 -smashed==0.21.5 -smmap==5.0.1 -spinners==0.0.24 -sympy==1.13.1 -tabulate==0.9.0 -termcolor==2.4.0 -threadpoolctl==3.5.0 -tokenizers==0.19.1 -tomli==2.0.1 -torch==2.2.2 -torchmetrics==1.4.1 -tqdm==4.66.5 -transformers==4.44.2 -trouting==0.3.3 -twine==5.1.1 -typeguard==2.13.3 -types-setuptools==74.1.0.20240906 -typing_extensions==4.12.2 -tzdata==2024.1 -tzlocal==5.2 -urllib3==2.2.2 -wandb==0.17.9 -wcwidth==0.2.13 -wrapt==1.16.0 -xxhash==3.5.0 -yarl==1.9.4 -zipp==3.20.1 +pytorch diff --git 
a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 965cd647f..bd0f6a9ce 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,4 +12,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ + --install install_torch.sh \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From 2f2a764a36a116a77e9288f1338a4a7d55ad9e84 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:05:56 -0700 Subject: [PATCH 13/89] testing --- install_torch.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 install_torch.sh diff --git a/install_torch.sh b/install_torch.sh new file mode 100644 index 000000000..5ac68ad6e --- /dev/null +++ b/install_torch.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pip install torch From b9601c4a81e3f26be17eba804b7266c3e6a5e73b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:11:09 -0700 Subject: [PATCH 14/89] testing --- requirements.txt | 2 +- scripts/convert_checkpoints.sh | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 679c6f744..12c6d5d5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytorch +torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index bd0f6a9ce..55da72f44 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,6 +1,5 @@ gantry run \ --allow-dirty \ - --no-python \ --workspace ai2/cheap-decisions \ --priority normal \ --gpus 0 \ @@ -11,6 +10,8 @@ gantry run \ --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ + --pip requirements.txt \ --yes \ - --install install_torch.sh \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" + +# --install install_torch.sh \ From 8cc86ee9ddbba3d97aee615235892ae7317fa1f8 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:19:25 -0700 Subject: [PATCH 15/89] testing --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 12c6d5d5e..c00bc2475 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ torch +datasets From 50e7090023cd02d02b04d5b2408752e4ef0aa1ff Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:20:03 -0700 Subject: [PATCH 16/89] testing --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt 
b/requirements.txt index c00bc2475..b70cc172e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ torch datasets +rich +botocore +cachedpath From 8aa450f8ce653f3532e80129bdc9da6f2bfe12a9 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:25:49 -0700 Subject: [PATCH 17/89] testing --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b70cc172e..9339fe636 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ torch datasets rich botocore -cachedpath +cached-path From f35732083d50619aa2af7c0492f915e4ee18ce73 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:29:47 -0700 Subject: [PATCH 18/89] testing --- requirements.txt | 1 + scripts/convert_checkpoints.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9339fe636..f439913b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ datasets rich botocore cached-path +transformers diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 55da72f44..7d09df0d8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From 02899a303617fc799fea0226af77ade3f8c08070 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:30:48 -0700 Subject: [PATCH 19/89] testing --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 7d09df0d8..0b921fd9c 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts 
--tokenizer_name_or_path 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From 9ac37396d784ccca3eeab01504cd694008e4ec9d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:31:21 -0700 Subject: [PATCH 20/89] testing --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 0b921fd9c..dcc7e5b02 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From ef0b4034f57add1b1a796283e88c0e5d635ef188 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 11:23:09 -0700 Subject: [PATCH 21/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 73 +++++------- scripts/convert_checkpoints_batch.py | 163 +++++++++++---------------- 2 files changed, 96 insertions(+), 140 deletions(-) diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index 2cbbcd301..6d51b3149 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -11,45 +11,33 @@ SANITY_CHECK = True -def convert_checkpoint(checkpoint_paths): - - for cp in checkpoint_paths: - retain_path_name = cp.replace('s3://', '').strip('/') - load_dir = "/data/input" - weka_loc = f"{load_dir}/{retain_path_name}-hf/" - - cmd = f"gantry run " \ - f"--description 'Converting {cp}' " \ - f"--allow-dirty " \ - f"--no-python " \ - f"--workspace ai2/cheap-decisions " \ - f"--priority normal " \ - f"--gpus 0 " \ - f"--preemptible " \ - f"--cluster 'ai2/jupiter-cirrascale-2' " \ - f"--budget ai2/oe-eval " \ - f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ - f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ - f"--shared-memory 10GiB " \ - f"--weka=oe-eval-default:{load_dir} " \ - f"--yes " \ - f"-- /bin/bash -c python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" - - #f"--mount weka://oe-eval-default={load_dir} " - # FIX THIS - if SANITY_CHECK: - print(cmd) - else: - try: - subprocess.run(cmd, shell=True, check=True) - except subprocess.CalledProcessError as e: - print(e.output) +def convert_checkpoints(args): + cmd = f"gantry run " \ + f"--allow-dirty " \ + f"--workspace ai2/cheap-decisions " \ + f"--priority normal " \ + f"--gpus 0 " \ + f"--preemptible " \ + f"--cluster 'ai2/jupiter-cirrascale-2' " \ + f"--budget ai2/oe-eval " \ + f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ + 
f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ + f"--shared-memory 10GiB " \ + f"--weka=oe-eval-default:{args.weka_load_dir} " \ + f"--yes " + if args.checkpoint_path is not None: + cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path '{args.checkpoint_path}' --weka-load-dir {args.weka_load_dir}" + else: + cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path-file '{args.checkpoint_path_file}' --weka-load-dir {args.weka_load_dir}" -def read_checkpoints(f): - with open(f,'r') as fin: - checkpoints = [line for line in f if line and line != ''] - return checkpoints + if SANITY_CHECK: + print(cmd) + else: + try: + subprocess.run(cmd, shell=True, check=True) + except subprocess.CalledProcessError as e: + print(e.output) def main(): @@ -58,15 +46,12 @@ def main(): ) group_batch = parser.add_mutually_exclusive_group(required=True) - group_batch.add_argument("--checkpoint_path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint_path_file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) + group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) args = parser.parse_args() - - if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path]) - else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file)) + convert_checkpoints(args) if __name__ == "__main__": diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 3692e885c..654504d55 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -8,110 +8,81 @@ """ import argparse -import pathlib -import shutil import subprocess import os +SANITY_CHECK = True + +def convert_checkpoint(cps, load_dir="/data/input"): + cps = expand_paths(cps) + save = {} + + for checkpoint_path in cps: + # Convert to old-style checkpoint. + + retain_path_name = checkpoint_path.replace('s3://', '').strip('/') + weka_loc = f"{load_dir}/{retain_path_name}-hf/" + + # Check if the output location is already there. If not, do the conversion. 
+ if os.path.exists(weka_loc): + conversion = 'existing' + else: + conversion = 'new' + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" + + if SANITY_CHECK: + print(conversion_cmd) + else: + subprocess.run(conversion_cmd, shell=True, check=True) + + save[checkpoint_path] = {'converted_path': weka_loc, 'convertion': conversion} + + print(save) + +def expand_paths(cps): + expanded = [] + for cp in cps: + segs = cp.split('*') + prefix = 's3://ai2-llm/' + cmd = f"aws s3 ls --recursive {segs[0]}" + all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout + relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] + + print(search_segs) + + temp_dirs = relevant_dirs + if len(search_segs) > 0: + for s in search_segs: + temp_dirs = [d for d in temp_dirs if s in d] + + exp = set([f"{prefix}{d}" for d in temp_dirs]) + print(exp) + + expanded += exp + return expanded + + +def read_checkpoints(f): + with open(f, 'r') as fin: + checkpoints = [line for line in fin if line and line != ''] + return checkpoints -def convert_to_hf(args): - # Ensure local directory exists - if not os.path.exists(local_file_dir): - os.makedirs(local_file_dir) - - # Convert to old-style checkpoint. - hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir} --destination-dir {local_file_dir}" - subprocess.run(hf_cmd, shell=True, check=True) - - # Move to Weka - if not os.path.exists(weka_file_dir): - os.makedirs(weka_file_dir) - - - - # Move the HF files from the unsharded dir to their own. - for fname in [ - "config.json", - "pytorch_model.bin", - "special_tokens_map.json", - "tokenizer.json", - "tokenizer_config.json", - ]: - (unsharded_dir / fname).rename(hf_dir / fname) - - # Upload the unsharded and HF files back to S3. - print("Uploading files back to S3.") - if not args.already_unsharded: - upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args) - subprocess.run(upload_unsharded_cmd, shell=True, check=True) - - upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args) - subprocess.run(upload_hf_cmd, shell=True, check=True) - -def make_parser(): - parser = argparse.ArgumentParser( - description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo." 
- ) - parser.add_argument("--sharded_bucket", help="S3 bucket with sharded checkpoint.", type=str) - parser.add_argument( - "--unsharded_bucket", - help="S3 bucket to save the unsharded checkpoint.", - type=str, - ) - parser.add_argument( - "--already_downloaded", - action="store_true", - help="Use this flag if the unsharded S3 checkpoint is already downloaded, but still needs to be unsharded.", - ) - parser.add_argument( - "--already_unsharded", - action="store_true", - help="If given, the checkpoint has already been unsharded; just convert to HF.", - ) - parser.add_argument("--hf_bucket", help="S3 bucket to save the HF-converted checkpoint.", type=str) - parser.add_argument( - "--local_dir", - help="""Directory to store checkpoints locally.""", - type=pathlib.Path, - ) - parser.add_argument( - "--cleanup_local_dir", - action="store_true", - help="If given, remove the local directory if everything runs successfully to free up space on NFS.", - ) - parser.add_argument( - "--checkpoint_style", - default="hf_olmo", - choices=["hf_olmo", "transformers"], - help="""Checkpoint style. The `transformers` style works with HF transformers as-is, while - `hf_olmo` relies on the `hf_olmo` package for conversion. In general, use - `transformers` for external releases and `hf_olmo` for internal model - development.""", - ) - parser.add_argument( - "--hf_olmo", - action="store_true", - help="If given, convert to 'hf-olmo' style checkpoints.", - ) - parser.add_argument( - "--quiet", - action="store_true", - help="If given, don't show progress for AWS commands.", - ) - parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.") - parser.add_argument("--model_only", action="store_true", help="If given, only unshard the model.") - return parser def main(): - parser = make_parser() - args = parser.parse_args() - args.local_dir.mkdir(exist_ok=True, parents=True) + parser = argparse.ArgumentParser() - s3_unshard_to_hf(args) + group_batch = parser.add_mutually_exclusive_group(required=True) + group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) + group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + + args = parser.parse_args() - if args.cleanup_local_dir: - # Clear out temp dir if we got here (everything ran without error). - shutil.rmtree(args.tmp_dir) + if args.checkpoint_path is not None: + convert_checkpoint([args.checkpoint_path], args.weka_load_dir) + else: + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), args.weka_load_dir) if __name__ == "__main__": From c0ff18605fbcdb6fc7dde95a33f848b4b6225a60 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:23:18 -0700 Subject: [PATCH 22/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 654504d55..96087cf4c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -10,12 +10,13 @@ import argparse import subprocess import os +import json SANITY_CHECK = True def convert_checkpoint(cps, load_dir="/data/input"): cps = expand_paths(cps) - save = {} + processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. 
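
The two path rules the batch script settles on at this point stay fixed for the rest of
the series, so a minimal sketch of both may help; the paths below are purely
hypothetical, and the real expand_paths additionally restricts the S3 listing to
directories that contain model.pt:

    # 1) S3 checkpoint -> Weka destination: drop the scheme, append '-hf'.
    cp = "s3://ai2-llm/checkpoints/example-run/step1000-unsharded"  # hypothetical
    weka_loc = "/data/input/" + cp.replace("s3://", "").strip("/") + "-hf/"
    # -> /data/input/ai2-llm/checkpoints/example-run/step1000-unsharded-hf/

    # 2) Wildcard expansion: split on '*'; the first segment seeds the S3 listing,
    #    and every later non-empty segment acts as a substring filter.
    pattern = "s3://ai2-llm/checkpoints/example-run/*step99*"  # hypothetical
    segs = pattern.split("*")
    listing = [
        "checkpoints/example-run/step99000-unsharded",  # stand-ins for the listing
        "checkpoints/example-run/step1000-unsharded",
    ]
    matches = [d for d in listing if all(s in d for s in segs[1:] if s)]
    # -> ['checkpoints/example-run/step99000-unsharded']
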
@@ -35,9 +36,14 @@ def convert_checkpoint(cps, load_dir="/data/input"): else: subprocess.run(conversion_cmd, shell=True, check=True) - save[checkpoint_path] = {'converted_path': weka_loc, 'convertion': conversion} + processed.append({ + 'unproccessed_path': checkpoint_path, + 'converted_path': weka_loc, + 'convertion': conversion}) - print(save) + with open('/data/input/jenah/log.jsonl','a+') as fout: + for p in processed: + fout.write(json.dumps(p)+'\n') def expand_paths(cps): expanded = [] @@ -49,7 +55,7 @@ def expand_paths(cps): relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - print(search_segs) + print(f"search segments: {search_segs}") temp_dirs = relevant_dirs if len(search_segs) > 0: From 15092ae968e1f18512670d47d00fc88b14f61916 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:25:37 -0700 Subject: [PATCH 23/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 2 +- scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index 6d51b3149..415f6e387 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -9,7 +9,7 @@ AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoints(args): cmd = f"gantry run " \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 96087cf4c..3c07de860 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -12,7 +12,7 @@ import os import json -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): cps = expand_paths(cps) From c489f53dba1f250a69ab5940318dd87060abf416 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:45:19 -0700 Subject: [PATCH 24/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 124 ++++++++++++++++++++++++++- scripts/convert_checkpoints_batch.py | 4 +- 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index dcc7e5b02..319375032 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,3 +1,125 @@ +##!/usr/bin/env bash +# +## RUN AT THE TOP OF THE OLMo root +# +#CHECKPOINT_PATH=$1 +#shift +# +#SUFFIX="hf" +#WORKSPACE="" +#BUDGET="" +#PRIORITY="normal" +# +#while getopts "s:w:b:p:t:" opt; do +# case $opt in +# s) +# SUFFIX="$OPTARG" +# ;; +# w) +# WORKSPACE="$OPTARG" +# ;; +# b) +# BUDGET="$OPTARG" +# ;; +# p) +# PRIORITY="$OPTARG" +# ;; +# t) +# CUSTOM_TOKENIZER="--tokenizer $OPTARG" +# ;; +# \?) +# echo "Invalid option: -$OPTARG" >&2 +# exit 1 +# ;; +# esac +#done +# +## Set default values if not specified +#if [ -z "$WORKSPACE" ]; then +# WORKSPACE="ai2/oe-data" +#fi +# +#if [ -z "$BUDGET" ]; then +# BUDGET="$WORKSPACE" +#fi +# +## Verify that a path has been provided +#if [ -z "$CHECKPOINT_PATH" ]; then +# echo "Error: No path provided." +# exit 1 +#fi +# +## Check if CHECKPOINT_PATH is an s3:// path or an absolute path +#if [[ ! "$CHECKPOINT_PATH" =~ ^s3:// ]] && [[ ! "$CHECKPOINT_PATH" =~ ^/ ]]; then +# echo "Error: CHECKPOINT_PATH must be an s3:// path or an absolute path." 
+# exit 1 +#fi +# +# +## Extract weka_mountpoint if checkpoint path starts with specific directories +#CLUSTERS="ai2/*" +#for dir in climate-default mosaic-default nora-default oe-adapt-default oe-data-default oe-eval-default oe-training-default prior-default reviz-default skylight-default; do +# if [[ $CHECKPOINT_PATH == "/$dir"* ]]; then +# WEKA_MOUNTPOINTS=" --weka=${dir}:/${dir}" +# # Override clusters to use only jupiter-cirrascale-2 +# CLUSTERS="ai2/jupiter-cirrascale-2" +# break +# fi +#done +# +# +## Function to check if S3 path exists +#check_s3_path() { +# aws s3 ls "$1" > /dev/null 2>&1 +# return $? +#} +# +## Check if the provided path exists (only for S3 paths) +#if [[ "$CHECKPOINT_PATH" =~ ^s3:// ]]; then +# if check_s3_path "$CHECKPOINT_PATH"; then +# echo "S3 path exists: $CHECKPOINT_PATH" +# else +# echo "Error: S3 path does not exist: $CHECKPOINT_PATH" +# exit 1 +# fi +#else +# echo "Skipping existence check for non-S3 path: $CHECKPOINT_PATH" +#fi +# +#commands=( +# "pip install awscli" +# "git clone https://github.com/allenai/OLMo.git" +# "cd OLMo" +# "pip install -e '.[all]'" +# "if [ ! -d '${CHECKPOINT_PATH}-${SUFFIX}' ]; then python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '$CHECKPOINT_PATH' --destination-dir '${CHECKPOINT_PATH}-${SUFFIX}' --keep-olmo-artifacts ${CUSTOM_TOKENIZER}; else echo 'Destination directory already exists. Skipping conversion.'; fi" +#) +# +# +#for cmd in "${commands[@]}"; do +# if [ -z "$joined_commands" ]; then +# joined_commands="$cmd" +# else +# joined_commands="$joined_commands && $cmd" +# fi +#done +# +#gantry run \ +# --description "Converting ${CHECKPOINT_PATH}" \ +# --allow-dirty \ +# --no-python \ +# --workspace ${WORKSPACE} \ +# --priority ${PRIORITY} \ +# --gpus 0 \ +# --preemptible \ +# --cluster ${CLUSTERS} \ +# --budget ${BUDGET} \ +# --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ +# --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ +# --shared-memory 10GiB \ +# ${WEKA_MOUNTPOINTS} \ +# --yes \ +# -- /bin/bash -c "${joined_commands}" + gantry run \ --allow-dirty \ --workspace ai2/cheap-decisions \ @@ -12,6 +134,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python convert_checkpoints_batch.pyq --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 3c07de860..d1c6678a2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -29,7 +29,9 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'existing' else: conversion = 'new' - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' 
--keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + + if SANITY_CHECK: print(conversion_cmd) From 2ed6a50e60e686a448e3320749b4e97e71428c2d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:23:34 -0700 Subject: [PATCH 25/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 124 +-------------------------- scripts/convert_checkpoints_batch.py | 57 ++++++++---- 2 files changed, 43 insertions(+), 138 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 319375032..3b1d463e2 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,125 +1,3 @@ -##!/usr/bin/env bash -# -## RUN AT THE TOP OF THE OLMo root -# -#CHECKPOINT_PATH=$1 -#shift -# -#SUFFIX="hf" -#WORKSPACE="" -#BUDGET="" -#PRIORITY="normal" -# -#while getopts "s:w:b:p:t:" opt; do -# case $opt in -# s) -# SUFFIX="$OPTARG" -# ;; -# w) -# WORKSPACE="$OPTARG" -# ;; -# b) -# BUDGET="$OPTARG" -# ;; -# p) -# PRIORITY="$OPTARG" -# ;; -# t) -# CUSTOM_TOKENIZER="--tokenizer $OPTARG" -# ;; -# \?) -# echo "Invalid option: -$OPTARG" >&2 -# exit 1 -# ;; -# esac -#done -# -## Set default values if not specified -#if [ -z "$WORKSPACE" ]; then -# WORKSPACE="ai2/oe-data" -#fi -# -#if [ -z "$BUDGET" ]; then -# BUDGET="$WORKSPACE" -#fi -# -## Verify that a path has been provided -#if [ -z "$CHECKPOINT_PATH" ]; then -# echo "Error: No path provided." -# exit 1 -#fi -# -## Check if CHECKPOINT_PATH is an s3:// path or an absolute path -#if [[ ! "$CHECKPOINT_PATH" =~ ^s3:// ]] && [[ ! "$CHECKPOINT_PATH" =~ ^/ ]]; then -# echo "Error: CHECKPOINT_PATH must be an s3:// path or an absolute path." -# exit 1 -#fi -# -# -## Extract weka_mountpoint if checkpoint path starts with specific directories -#CLUSTERS="ai2/*" -#for dir in climate-default mosaic-default nora-default oe-adapt-default oe-data-default oe-eval-default oe-training-default prior-default reviz-default skylight-default; do -# if [[ $CHECKPOINT_PATH == "/$dir"* ]]; then -# WEKA_MOUNTPOINTS=" --weka=${dir}:/${dir}" -# # Override clusters to use only jupiter-cirrascale-2 -# CLUSTERS="ai2/jupiter-cirrascale-2" -# break -# fi -#done -# -# -## Function to check if S3 path exists -#check_s3_path() { -# aws s3 ls "$1" > /dev/null 2>&1 -# return $? -#} -# -## Check if the provided path exists (only for S3 paths) -#if [[ "$CHECKPOINT_PATH" =~ ^s3:// ]]; then -# if check_s3_path "$CHECKPOINT_PATH"; then -# echo "S3 path exists: $CHECKPOINT_PATH" -# else -# echo "Error: S3 path does not exist: $CHECKPOINT_PATH" -# exit 1 -# fi -#else -# echo "Skipping existence check for non-S3 path: $CHECKPOINT_PATH" -#fi -# -#commands=( -# "pip install awscli" -# "git clone https://github.com/allenai/OLMo.git" -# "cd OLMo" -# "pip install -e '.[all]'" -# "if [ ! -d '${CHECKPOINT_PATH}-${SUFFIX}' ]; then python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '$CHECKPOINT_PATH' --destination-dir '${CHECKPOINT_PATH}-${SUFFIX}' --keep-olmo-artifacts ${CUSTOM_TOKENIZER}; else echo 'Destination directory already exists. 
Skipping conversion.'; fi" -#) -# -# -#for cmd in "${commands[@]}"; do -# if [ -z "$joined_commands" ]; then -# joined_commands="$cmd" -# else -# joined_commands="$joined_commands && $cmd" -# fi -#done -# -#gantry run \ -# --description "Converting ${CHECKPOINT_PATH}" \ -# --allow-dirty \ -# --no-python \ -# --workspace ${WORKSPACE} \ -# --priority ${PRIORITY} \ -# --gpus 0 \ -# --preemptible \ -# --cluster ${CLUSTERS} \ -# --budget ${BUDGET} \ -# --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ -# --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ -# --shared-memory 10GiB \ -# ${WEKA_MOUNTPOINTS} \ -# --yes \ -# -- /bin/bash -c "${joined_commands}" - gantry run \ --allow-dirty \ --workspace ai2/cheap-decisions \ @@ -134,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python convert_checkpoints_batch.pyq --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index d1c6678a2..1f27ea5f7 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -10,28 +10,36 @@ import argparse import subprocess import os +import time +import boto3 import json -SANITY_CHECK = False +SANITY_CHECK = True def convert_checkpoint(cps, load_dir="/data/input"): - cps = expand_paths(cps) + s3_client = boto3.client('s3') + s3_resource = boto3.resource('s3') + + cps = expand_paths(cps, s3_client) processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. 
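+        # From this revision on, a converted copy may already exist either on the Weka mount or under an '-hf' prefix on S3; only checkpoints found in neither place are converted.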
if os.path.exists(weka_loc): conversion = 'existing' + converted_path = weka_loc + elif s3_path_exists(checkpoint_path, s3_resource): + conversion = 'existing' + converted_path = checkpoint_path + '-hf' else: conversion = 'new' - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" - + converted_path = weka_loc + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" if SANITY_CHECK: print(conversion_cmd) @@ -40,21 +48,41 @@ def convert_checkpoint(cps, load_dir="/data/input"): processed.append({ 'unproccessed_path': checkpoint_path, - 'converted_path': weka_loc, - 'convertion': conversion}) + 'converted_path': converted_path.replace(load_dir,'/weka'), + 'convertion': conversion, + 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) - with open('/data/input/jenah/log.jsonl','a+') as fout: + print(processed) + with open('log.jsonl','a+') as fout: for p in processed: fout.write(json.dumps(p)+'\n') -def expand_paths(cps): + +def s3_path_exists(cp, s3): + b = cp.split('/')[2] + bucket = s3.Bucket(b) + objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) + return True if (len(objs) > 0) else False + + +def expand_paths(cps, s3): expanded = [] for cp in cps: + bucket = cp.split('/')[2] segs = cp.split('*') - prefix = 's3://ai2-llm/' - cmd = f"aws s3 ls --recursive {segs[0]}" - all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout - relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + + # cmd = f"aws s3 ls --recursive {segs[0]}" + # all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout + # relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + + relevant_dirs = [] + paginator = s3.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket=bucket, Prefix=segs[0].replace('s3://'+bucket+'/', '')) + for page in page_iterator: + for obj in page['Contents']: + if 'model.pt' in obj["Key"]: + relevant_dirs.append(obj["Key"].replace('/model.pt','')) + search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] print(f"search segments: {search_segs}") @@ -64,8 +92,7 @@ def expand_paths(cps): for s in search_segs: temp_dirs = [d for d in temp_dirs if s in d] - exp = set([f"{prefix}{d}" for d in temp_dirs]) - print(exp) + exp = set([f"s3://{bucket}/{d}" for d in temp_dirs]) expanded += exp return expanded From dd3dc185c8872a86db7492d88ed2d5bd1425e6d6 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:29:06 -0700 Subject: [PATCH 26/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 3b1d463e2..2029b2864 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,7 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir 
'/data/input/'" + --result=log.jsonl \ + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" # --install install_torch.sh \ From 9bc11d7de3a2ba6f5e8b512117cacfa079405cfe Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:36:33 -0700 Subject: [PATCH 27/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 1f27ea5f7..8fbb2191c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -39,7 +39,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" if SANITY_CHECK: print(conversion_cmd) @@ -53,9 +53,13 @@ def convert_checkpoint(cps, load_dir="/data/input"): 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) print(processed) - with open('log.jsonl','a+') as fout: + + results = 'results/' + if not os.path.exists(results): + os.mkdir(results) + with open(f'{results}log.jsonl', 'a+') as fout: for p in processed: - fout.write(json.dumps(p)+'\n') + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): From e08895fcc51a785fc8f8a2f001338afd4735352d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:38:14 -0700 Subject: [PATCH 28/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 1 - scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 2029b2864..8f28dccd8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,7 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - --result=log.jsonl \ -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 8fbb2191c..69e7fad14 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -57,7 +57,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): results = 'results/' if not os.path.exists(results): os.mkdir(results) - with open(f'{results}log.jsonl', 'a+') as fout: + with open(f'{results}metrics.jsonl', 'a+') as fout: for p in processed: fout.write(json.dumps(p) + '\n') From 0b82c2bba466bcf46b6fbc74f4b8e43713b3a6fb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:10:39 -0700 Subject: [PATCH 29/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 27 ++++++++++++++++++++++++--- scripts/convert_checkpoints_batch.py | 12 +++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/scripts/convert_checkpoints.sh 
b/scripts/convert_checkpoints.sh index 8f28dccd8..594bb85d8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,4 +1,27 @@ +#!/usr/bin/env bash + +# To be run at the top of the root of OLMo repository. +# Converts s3 checkpoints into WEKA + +# ASSUMPTIONS +# - INPUT must be on s3 +# - OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path +# - Budget for oe-eval +# - Experiments saved to ai2/cheap-decisions + +# NOTES +# - saves metrics.json +# - allows for wildcard (*) + +# TODOs +# - Make consistent with Luca's code +# - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this + +CHECKPOINT_PATH=$1 + + gantry run \ + --description "checkpoint conv; eval for cons ranking" \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ @@ -10,8 +33,6 @@ gantry run \ --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ - --pip requirements.txt \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input/'" -# --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 69e7fad14..538884692 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -14,7 +14,7 @@ import boto3 import json -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): s3_client = boto3.client('s3') @@ -44,6 +44,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: + print('\n--------------------------------------------') + print("\nConverting Checkpoint...") + print(conversion_cmd) + print('\n--------------------------------------------') + subprocess.run(conversion_cmd, shell=True, check=True) processed.append({ @@ -57,7 +62,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): results = 'results/' if not os.path.exists(results): os.mkdir(results) - with open(f'{results}metrics.jsonl', 'a+') as fout: + with open(f'{results}metrics.json', 'w') as fout: for p in processed: fout.write(json.dumps(p) + '\n') @@ -66,6 +71,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) + print(objs) return True if (len(objs) > 0) else False @@ -89,7 +95,7 @@ def expand_paths(cps, s3): search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - print(f"search segments: {search_segs}") + # print(f"search segments: {search_segs}") temp_dirs = relevant_dirs if len(search_segs) > 0: From 482a487d4fd013aa9788f74c4cc22b3707ead250 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:31:44 -0700 Subject: [PATCH 30/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 2 ++ scripts/convert_checkpoints_batch.py | 3 ++- scripts/results/metrics.json | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 scripts/results/metrics.json diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 594bb85d8..cbcf87eea 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -8,6 +8,7 @@ # - 
OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path # - Budget for oe-eval # - Experiments saved to ai2/cheap-decisions +# - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json # NOTES # - saves metrics.json @@ -16,6 +17,7 @@ # TODOs # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this +# CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 538884692..438053362 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -29,6 +29,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. + print('WEKA LOC', weka_loc) if os.path.exists(weka_loc): conversion = 'existing' converted_path = weka_loc @@ -39,7 +40,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer ' allenai/gpt-neox-olmo-dolma-v1_5'" if SANITY_CHECK: print(conversion_cmd) diff --git a/scripts/results/metrics.json b/scripts/results/metrics.json new file mode 100644 index 000000000..d9fd489c9 --- /dev/null +++ b/scripts/results/metrics.json @@ -0,0 +1 @@ +{"unproccessed_path": "s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded", "converted_path": "/weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded-hf/", "convertion": "new", "date_time": "Sep-09-2024_1630"} From 5c015cab34c80b779aa6c776a43a0860e89cb157 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:40:14 -0700 Subject: [PATCH 31/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 438053362..c775e88e6 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -40,7 +40,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer ' allenai/gpt-neox-olmo-dolma-v1_5'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" if SANITY_CHECK: print(conversion_cmd) From 268d74d1e9fa6d60e76c9f91f5b8c0d3804ee163 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:00:49 -0700 Subject: [PATCH 32/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 58 ---------------------------- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 11 +++--- 3 files changed, 7 insertions(+), 64 deletions(-) delete mode 100644 scripts/convert_checkpoints.py diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py deleted file mode 100644 index 415f6e387..000000000 --- a/scripts/convert_checkpoints.py +++ 
/dev/null @@ -1,58 +0,0 @@ -# This script requires to be run at the root level. -# Requires the AWS CLI and Beaker Gantry to be installed and configured. - - -import argparse -import subprocess - -# Beaker secret keys -AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' -AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' - -SANITY_CHECK = False - -def convert_checkpoints(args): - cmd = f"gantry run " \ - f"--allow-dirty " \ - f"--workspace ai2/cheap-decisions " \ - f"--priority normal " \ - f"--gpus 0 " \ - f"--preemptible " \ - f"--cluster 'ai2/jupiter-cirrascale-2' " \ - f"--budget ai2/oe-eval " \ - f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ - f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ - f"--shared-memory 10GiB " \ - f"--weka=oe-eval-default:{args.weka_load_dir} " \ - f"--yes " - - if args.checkpoint_path is not None: - cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path '{args.checkpoint_path}' --weka-load-dir {args.weka_load_dir}" - else: - cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path-file '{args.checkpoint_path_file}' --weka-load-dir {args.weka_load_dir}" - - if SANITY_CHECK: - print(cmd) - else: - try: - subprocess.run(cmd, shell=True, check=True) - except subprocess.CalledProcessError as e: - print(e.output) - - -def main(): - parser = argparse.ArgumentParser( - description="Unshard checkpoint and convert to HF format. Run via Gantry. Invoke this script from the root of the OLMo repo." - ) - - group_batch = parser.add_mutually_exclusive_group(required=True) - group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) - parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) - - args = parser.parse_args() - convert_checkpoints(args) - - -if __name__ == "__main__": - main() diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index cbcf87eea..84091c49e 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -36,5 +36,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c775e88e6..ef78d7f80 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -13,6 +13,7 @@ import time import boto3 import json +import sys SANITY_CHECK = False @@ -45,10 +46,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: - print('\n--------------------------------------------') - print("\nConverting Checkpoint...") - print(conversion_cmd) - print('\n--------------------------------------------') + sys.stdout.write('\n--------------------------------------------') + sys.stdout.write("\nConverting Checkpoint...") + sys.stdout.write(conversion_cmd) + sys.stdout.write('\n--------------------------------------------') subprocess.run(conversion_cmd, shell=True, check=True) @@ -72,7 +73,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = 
list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - print(objs) + sys.stdout.write(f's3 path exists check: {objs}') return True if (len(objs) > 0) else False From 1326da70f0e939a0aae699db62bc367b2abb409d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:07:49 -0700 Subject: [PATCH 33/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 84091c49e..1eabea5ab 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -2,6 +2,16 @@ # To be run at the top of the root of OLMo repository. # Converts s3 checkpoints into WEKA +# +# Example use: +# sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9* +# +# This will convert all models in the directory +# and save them to their respective directories under +# +# /weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9* +# +# It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate what conversions already made) # ASSUMPTIONS # - INPUT must be on s3 @@ -17,7 +27,7 @@ # TODOs # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this -# +# - Output is not saving. But it prints to the log. Fix this. CHECKPOINT_PATH=$1 From ccbeef256dab86ca8af16e2e5f531c9cd7e45c36 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:19:21 -0700 Subject: [PATCH 34/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 1 + scripts/convert_checkpoints_batch.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 1eabea5ab..e76aa908f 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -28,6 +28,7 @@ # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this # - Output is not saving. But it prints to the log. Fix this. 
+# - Make tokenizer updatable CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index ef78d7f80..38a877360 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -15,6 +15,8 @@ import json import sys +from gantry import METRICS_FILE + SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): @@ -61,10 +63,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): print(processed) - results = 'results/' - if not os.path.exists(results): - os.mkdir(results) - with open(f'{results}metrics.json', 'w') as fout: + # results = 'results/' + # if not os.path.exists(results): + # os.mkdir(results) + with open(METRICS_FILE, 'w') as fout: for p in processed: fout.write(json.dumps(p) + '\n') From d83f2ed1dbc54ad172022aa17cc7682cb57291d2 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:38:31 -0700 Subject: [PATCH 35/89] convert checkpoint batch --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f439913b6..1a5847543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ rich botocore cached-path transformers +gantry From a42de22dfcb08f5c9f1bcb7d26be7d1962bd81a7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:42:19 -0700 Subject: [PATCH 36/89] convert checkpoint batch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1a5847543..ae2bf89c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ rich botocore cached-path transformers -gantry +beaker-gantry From e07796b14d96b5a0028fb9cf9ac085c5cc05b3ee Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 09:52:40 -0700 Subject: [PATCH 37/89] error catch --- scripts/convert_checkpoints_batch.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 38a877360..76ee38fdd 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -32,10 +32,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. 
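# Note on the metrics hand-off above: `from gantry import METRICS_FILE` leans
# on the beaker-gantry package exposing the path that the job's metrics are
# collected from, as this series itself assumes. A minimal sketch of the
# JSON-lines logging pattern; the record fields are illustrative only:

import json
from gantry import METRICS_FILE  # provided by beaker-gantry, per this patch

record = {"path": "s3://bucket/ckpt", "status": "new"}  # illustrative fields
with open(METRICS_FILE, "a+") as fout:
    fout.write(json.dumps(record) + "\n")  # one JSON object per line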
+ error = "" print('WEKA LOC', weka_loc) if os.path.exists(weka_loc): conversion = 'existing' - converted_path = weka_loc + converted_path = weka_loc.replace(load_dir,'/weka') elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' @@ -53,13 +54,18 @@ def convert_checkpoint(cps, load_dir="/data/input"): sys.stdout.write(conversion_cmd) sys.stdout.write('\n--------------------------------------------') - subprocess.run(conversion_cmd, shell=True, check=True) + try: + subprocess.run(conversion_cmd, shell=True, check=True) + except subprocess.CalledProcessError as e: + error = e.output processed.append({ 'unproccessed_path': checkpoint_path, - 'converted_path': converted_path.replace(load_dir,'/weka'), + 'converted_path': converted_path, 'convertion': conversion, - 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) + 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), + 'error': error} + ) print(processed) From c6e773d2d1f1dd52d98a4873b193dff081d101bd Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 11:29:43 -0700 Subject: [PATCH 38/89] checking for existing conversions --- scripts/convert_checkpoints_batch.py | 30 ++++++++++++++++++++++------ scripts/results/metrics.json | 1 - 2 files changed, 24 insertions(+), 7 deletions(-) delete mode 100644 scripts/results/metrics.json diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 76ee38fdd..ffa82d67d 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -19,6 +19,15 @@ SANITY_CHECK = False +# possible converted locations. +# "self" is the target location where the converted model would be saved +# key: template, value: description +# template: MUST obey .format(load_dir, retain_path_name) +WEKA_CHECK_LOCATIONS_PREFIXES = { + "{}/{}-hf/": 'self', + "{}/ianm/{}-hf": "ian's" +} + def convert_checkpoint(cps, load_dir="/data/input"): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -30,16 +39,27 @@ def convert_checkpoint(cps, load_dir="/data/input"): # Convert to old-style checkpoint. retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" + check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - # Check if the output location is already there. If not, do the conversion. + sys.stdout.write(f"\n\n=== Processing Checkpoint: {retain_path_name}\n") error = "" - print('WEKA LOC', weka_loc) - if os.path.exists(weka_loc): + + path_found = None + for loc in check_locs: + if os.path.exists(loc): + path_found = loc + break + + # Check if the output location is already there. If not, do the conversion. 
+ # print('WEKA LOC', weka_loc) + if path_found is not None: conversion = 'existing' - converted_path = weka_loc.replace(load_dir,'/weka') + converted_path = path_found.replace(load_dir,'/weka') + sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' + sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") else: conversion = 'new' converted_path = weka_loc @@ -49,8 +69,6 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: - sys.stdout.write('\n--------------------------------------------') - sys.stdout.write("\nConverting Checkpoint...") sys.stdout.write(conversion_cmd) sys.stdout.write('\n--------------------------------------------') diff --git a/scripts/results/metrics.json b/scripts/results/metrics.json deleted file mode 100644 index d9fd489c9..000000000 --- a/scripts/results/metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"unproccessed_path": "s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded", "converted_path": "/weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded-hf/", "convertion": "new", "date_time": "Sep-09-2024_1630"} From 798ded33306204b0bcb21d318d62c261d7edea89 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 12:47:04 -0700 Subject: [PATCH 39/89] minor change --- scripts/convert_checkpoints.sh | 3 ++- scripts/convert_checkpoints_batch.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index e76aa908f..1642fd4a9 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -31,10 +31,11 @@ # - Make tokenizer updatable CHECKPOINT_PATH=$1 +DESCRIPTION=$2 gantry run \ - --description "checkpoint conv; eval for cons ranking" \ + --description $DESCRIPTION \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index ffa82d67d..496c5b538 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -99,7 +99,6 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - sys.stdout.write(f's3 path exists check: {objs}') return True if (len(objs) > 0) else False From 45a93742edcdb68a3f4b5c9db141ae8b08f0672a Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:00:37 -0700 Subject: [PATCH 40/89] minor change --- scripts/convert_checkpoints.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 1642fd4a9..536e0a2eb 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -31,11 +31,9 @@ # - Make tokenizer updatable CHECKPOINT_PATH=$1 -DESCRIPTION=$2 - gantry run \ - --description $DESCRIPTION \ + --description "Converting $CHECKPOINT_PATH" \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ From 26a1e26a55446c442de31a23bce617d14db093d3 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:46:11 -0700 Subject: [PATCH 41/89] adding a cleanup flag for removing local directory at the end of the process --- hf_olmo/convert_olmo_to_hf.py | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/hf_olmo/convert_olmo_to_hf.py b/hf_olmo/convert_olmo_to_hf.py index 2e0a9e074..9e0c7afb6 100644 --- a/hf_olmo/convert_olmo_to_hf.py +++ b/hf_olmo/convert_olmo_to_hf.py @@ -284,6 +284,12 @@ def main(): help="Keep olmo-specific artifacts in the checkpoint.", ) + parser.add_argument( + "--cleanup-local-dir", + action="store_true", + help="Remove local download of the directory." + ) + args = parser.parse_args() args.destination_dir = args.destination_dir or args.checkpoint_dir @@ -308,6 +314,9 @@ def main(): upload_local_checkpoint(local_checkpoint_dir, args.destination_dir) print(f"Converted checkpoint saved to {args.destination_dir}") + if args.cleanup_local_dir: + print(f"Removing temporary local dir: {local_checkpoint_dir}") + shutil.rmtree(local_checkpoint_dir) if __name__ == "__main__": From 30a9cb9aad2f43092789a6e296f3ea6d3dcbb74e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:51:32 -0700 Subject: [PATCH 42/89] fix --- scripts/convert_checkpoints_batch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 496c5b538..d3114ead2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,6 +23,7 @@ # "self" is the target location where the converted model would be saved # key: template, value: description # template: MUST obey .format(load_dir, retain_path_name) + WEKA_CHECK_LOCATIONS_PREFIXES = { "{}/{}-hf/": 'self', "{}/ianm/{}-hf": "ian's" @@ -64,7 +65,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if SANITY_CHECK: print(conversion_cmd) From 083ff3eaf91991a084ee061376fb13e041490766 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:18:55 -0700 Subject: [PATCH 43/89] troubleshooting --- scripts/convert_checkpoints_batch.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index d3114ead2..dc3c84279 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,7 +17,7 @@ from gantry import METRICS_FILE -SANITY_CHECK = False +SANITY_CHECK = True # possible converted locations. # "self" is the target location where the converted model would be saved @@ -34,15 +34,16 @@ def convert_checkpoint(cps, load_dir="/data/input"): s3_resource = boto3.resource('s3') cps = expand_paths(cps, s3_client) - processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. 
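# Note: the --cleanup-local-dir flag introduced above is the usual pairing of
# an argparse store_true switch with shutil.rmtree. A self-contained sketch
# under those assumptions; the temporary directory stands in for the real
# checkpoint download:

import argparse
import shutil
import tempfile

parser = argparse.ArgumentParser()
parser.add_argument("--cleanup-local-dir", action="store_true",
                    help="Remove the local download once conversion succeeds.")
args = parser.parse_args()

local_checkpoint_dir = tempfile.mkdtemp()  # stand-in for the downloaded checkpoint
# ... conversion and upload would happen here ...
if args.cleanup_local_dir:
    shutil.rmtree(local_checkpoint_dir)  # reclaim local/NFS space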
+ processed = [] + retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - sys.stdout.write(f"\n\n=== Processing Checkpoint: {retain_path_name}\n") + sys.stdout.write(f"\n\nProcessing Checkpoint: {retain_path_name}\n") error = "" path_found = None @@ -56,7 +57,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): if path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,'/weka') - sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") + sys.stdout.write(f"Converted Checkpoint Found: {converted_path}\n") elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' @@ -68,10 +69,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if SANITY_CHECK: - print(conversion_cmd) + sys.stdout.write('SANITY CHECK MODE (not running the conversion)') + sys.stdout.write(conversion_cmd + '\n') else: - sys.stdout.write(conversion_cmd) - sys.stdout.write('\n--------------------------------------------') + sys.stdout.write(conversion_cmd + '\n') try: subprocess.run(conversion_cmd, shell=True, check=True) @@ -86,14 +87,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): 'error': error} ) - print(processed) + print(processed) - # results = 'results/' - # if not os.path.exists(results): - # os.mkdir(results) - with open(METRICS_FILE, 'w') as fout: - for p in processed: - fout.write(json.dumps(p) + '\n') + with open(METRICS_FILE, 'a+') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): From 9c6dc75ab49442f54c7ad9c0202de504eedf0810 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:23:57 -0700 Subject: [PATCH 44/89] troubleshooting --- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 536e0a2eb..e85b4d269 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --sanity-check" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index dc3c84279..cb3c1bbf8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,8 +17,6 @@ from gantry import METRICS_FILE -SANITY_CHECK = True - # possible converted locations. 
# "self" is the target location where the converted model would be saved # key: template, value: description @@ -29,7 +27,7 @@ "{}/ianm/{}-hf": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input"): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -68,7 +66,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" - if SANITY_CHECK: + if sanity_check: sys.stdout.write('SANITY CHECK MODE (not running the conversion)') sys.stdout.write(conversion_cmd + '\n') else: @@ -147,13 +145,14 @@ def main(): group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], args.weka_load_dir) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), args.weka_load_dir) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check) if __name__ == "__main__": From 54a0d620b4a65e247664a1640f02e8107a107be2 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:34:07 -0700 Subject: [PATCH 45/89] troubleshooting --- scripts/convert_checkpoints_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index cb3c1bbf8..2b39e516a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,8 +23,8 @@ # template: MUST obey .format(load_dir, retain_path_name) WEKA_CHECK_LOCATIONS_PREFIXES = { - "{}/{}-hf/": 'self', - "{}/ianm/{}-hf": "ian's" + "{}/{}-hf/pytorch_model.bin": 'self', + "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): From 25d5a7dbac5876ed33bffb65e2addba995c85530 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:50:37 -0700 Subject: [PATCH 46/89] troubleshooting --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index e85b4d269..536e0a2eb 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --sanity-check" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" From dd9ae8a1020d96cb2758b8b751372a99ddb3bdf0 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 10:32:03 -0700 Subject: [PATCH 47/89] minor fixes --- scripts/convert_checkpoints_batch.py | 21 
+++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 2b39e516a..7e72e5011 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -41,7 +41,8 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - sys.stdout.write(f"\n\nProcessing Checkpoint: {retain_path_name}\n") + print(f"\n\n------------------------------------------------------------", flush=True) + print(f"\nProcessing Checkpoint: {retain_path_name}\n", flush=True) error = "" path_found = None @@ -55,22 +56,22 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): if path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,'/weka') - sys.stdout.write(f"Converted Checkpoint Found: {converted_path}\n") + print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' - sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") + print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: conversion = 'new' - converted_path = weka_loc + converted_path = weka_loc.replace(load_dir,'/weka') conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: - sys.stdout.write('SANITY CHECK MODE (not running the conversion)') - sys.stdout.write(conversion_cmd + '\n') + print('SANITY CHECK MODE (not running the conversion)') + print(conversion_cmd + '\n') else: - sys.stdout.write(conversion_cmd + '\n') + # sys.stdout.write(conversion_cmd + '\n') try: subprocess.run(conversion_cmd, shell=True, check=True) @@ -78,14 +79,14 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): error = e.output processed.append({ - 'unproccessed_path': checkpoint_path, + 'unprocessed_path': checkpoint_path, 'converted_path': converted_path, - 'convertion': conversion, + 'conversion': conversion, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), 'error': error} ) - print(processed) + #print(processed) with open(METRICS_FILE, 'a+') as fout: for p in processed: From 402f7b720026dc06e7181ee31a8af6f8ee801a9d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 10:42:55 -0700 Subject: [PATCH 48/89] minor fixes --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 7e72e5011..94a49a3d3 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -48,7 +48,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): path_found = None for loc in check_locs: if os.path.exists(loc): - path_found = loc + path_found = loc.replace('/pytorch_model.bin','') break # Check if the output location is already there. If not, do the conversion. 
From f5806dad3027d7191abab9eac47d99104a1dcac4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:02:03 -0700 Subject: [PATCH 49/89] minor fixes --- scripts/convert_checkpoints_batch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 94a49a3d3..24799ff71 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -33,6 +33,8 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): cps = expand_paths(cps, s3_client) + print(f"Total of {len(cps)} paths to process.", flush=True) + for checkpoint_path in cps: # Convert to old-style checkpoint. processed = [] From 7718e9f2cec2ccfa03f9a437ff5b07d3316416d3 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:20:07 -0700 Subject: [PATCH 50/89] minor fixes --- scripts/convert_checkpoints_batch.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 24799ff71..e6974296c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -55,14 +55,17 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): # Check if the output location is already there. If not, do the conversion. # print('WEKA LOC', weka_loc) - if path_found is not None: + s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) + if s3_hf_exists is not None: conversion = 'existing' - converted_path = path_found.replace(load_dir,'/weka') + converted_path = s3_hf_exists # checkpoint_path + '-hf' print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - elif s3_path_exists(checkpoint_path, s3_resource): + + elif path_found is not None: conversion = 'existing' - converted_path = checkpoint_path + '-hf' + converted_path = path_found.replace(load_dir,'/weka') print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + else: conversion = 'new' converted_path = weka_loc.replace(load_dir,'/weka') @@ -98,8 +101,13 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) - objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - return True if (len(objs) > 0) else False + prefix = cp.replace('s3://'+b+'/', '') + objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) + if len(objs) > 0: + return cp + '-hf' + else: + objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) + return cp + '-hf-olmo' if (len(objs2) > 0) else None def expand_paths(cps, s3): From d8436da662d41b86f5d4b3ed9e8e323e606014ac Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:54:10 -0700 Subject: [PATCH 51/89] minor fixes --- scripts/convert_checkpoints_batch.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e6974296c..2553d29fb 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -14,6 +14,7 @@ import boto3 import json import sys +from pathlib import Path from gantry import METRICS_FILE @@ -102,6 +103,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) prefix = cp.replace('s3://'+b+'/', '') + print(bucket, prefix) objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) 
> 0: return cp + '-hf' @@ -112,21 +114,26 @@ def s3_path_exists(cp, s3): def expand_paths(cps, s3): expanded = [] + for cp in cps: bucket = cp.split('/')[2] segs = cp.split('*') - - # cmd = f"aws s3 ls --recursive {segs[0]}" - # all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout - # relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + prefix = segs[0].replace('s3://'+bucket+'/', '') relevant_dirs = [] + paginator = s3.get_paginator('list_objects_v2') - page_iterator = paginator.paginate(Bucket=bucket, Prefix=segs[0].replace('s3://'+bucket+'/', '')) + page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) + for page in page_iterator: for obj in page['Contents']: - if 'model.pt' in obj["Key"]: - relevant_dirs.append(obj["Key"].replace('/model.pt','')) + p = Path(obj["Key"]) + if p.parent.name in ['optim', 'train','model']: + relevant_dirs.append(p.parent.parent) + elif p.name == 'model.pt': + relevant_dirs.append(p.parent) + # if 'model.pt' in obj["Key"]: + # relevant_dirs.append(obj["Key"].replace('/model.pt','')) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] From c7717a19ec40a776cf06534e76330239257ecdd4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 12:40:14 -0700 Subject: [PATCH 52/89] fix --- scripts/convert_checkpoints_batch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 2553d29fb..c7cc42e6a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -129,9 +129,9 @@ def expand_paths(cps, s3): for obj in page['Contents']: p = Path(obj["Key"]) if p.parent.name in ['optim', 'train','model']: - relevant_dirs.append(p.parent.parent) + relevant_dirs.append(str(p.parent.parent)) elif p.name == 'model.pt': - relevant_dirs.append(p.parent) + relevant_dirs.append(str(p.parent)) # if 'model.pt' in obj["Key"]: # relevant_dirs.append(obj["Key"].replace('/model.pt','')) @@ -139,6 +139,7 @@ def expand_paths(cps, s3): # print(f"search segments: {search_segs}") + # subselect the directory with remaining segments (for multiple wildcard *) temp_dirs = relevant_dirs if len(search_segs) > 0: for s in search_segs: From b8445ce9be8c73f94b1e42cc4e35794227fc8b4b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 08:59:14 -0700 Subject: [PATCH 53/89] updates --- scripts/convert_checkpoints_batch.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c7cc42e6a..eb6bd574b 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -34,7 +34,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): cps = expand_paths(cps, s3_client) - print(f"Total of {len(cps)} paths to process.", flush=True) + print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) for checkpoint_path in cps: # Convert to old-style checkpoint. 
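# Note: a condensed, hedged sketch of the wildcard expansion that patches
# 51-53 above converge on. It paginates everything under the prefix before the
# first '*', keeps keys that look like checkpoint directories, then requires
# the remaining '*'-separated segments to appear in each candidate. The
# pattern below is illustrative and AWS credentials are assumed configured;
# note page.get("Contents", []), since 'Contents' is absent on empty pages.

import boto3
from pathlib import Path

def expand_s3_wildcard(pattern: str) -> set:
    bucket = pattern.split("/")[2]
    segs = pattern.split("*")
    prefix = segs[0].replace(f"s3://{bucket}/", "")
    dirs = set()
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            p = Path(obj["Key"])
            if p.name == "model.pt":  # unsharded checkpoint marker
                dirs.add(str(p.parent))
            elif p.parent.name in ("optim", "train", "model"):
                dirs.add(str(p.parent.parent))  # sharded checkpoint root
    rest = [s for s in segs[1:] if s]
    return {f"s3://{bucket}/{d}" for d in dirs if all(s in d for s in rest)}

# expand_s3_wildcard("s3://ai2-llm/checkpoints/cheap_decisions/dolma-*/step9*")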
@@ -94,16 +94,17 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): #print(processed) - with open(METRICS_FILE, 'a+') as fout: - for p in processed: - fout.write(json.dumps(p) + '\n') + if not sanity_check: + with open(METRICS_FILE, 'a+') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) prefix = cp.replace('s3://'+b+'/', '') - print(bucket, prefix) + # print(bucket, prefix) objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) > 0: return cp + '-hf' From 855d66646de75f29a34e496aa4443201c91e0f74 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 15:27:31 -0700 Subject: [PATCH 54/89] handle directories that have unsharded counterparts --- scripts/convert_checkpoints_batch.py | 47 ++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index eb6bd574b..37f11e714 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -122,19 +122,46 @@ def expand_paths(cps, s3): prefix = segs[0].replace('s3://'+bucket+'/', '') relevant_dirs = [] + skip_parent = [] paginator = s3.get_paginator('list_objects_v2') page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) - - for page in page_iterator: - for obj in page['Contents']: - p = Path(obj["Key"]) - if p.parent.name in ['optim', 'train','model']: - relevant_dirs.append(str(p.parent.parent)) - elif p.name == 'model.pt': - relevant_dirs.append(str(p.parent)) - # if 'model.pt' in obj["Key"]: - # relevant_dirs.append(obj["Key"].replace('/model.pt','')) + contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} + paths = set(contents.values()) + # print(contents) + + for path in contents: + p = Path(path) + parent = str(p.parent) + grandpa = str(p.parent.parent) + + if parent in relevant_dirs or parent in skip_parent: + continue + if p.parent.name in ['optim', 'train','model']: + if f"{grandpa}-unsharded" in paths: + # skip condition + skip_parent.append(parent) + continue + else: + relevant_dirs.append(grandpa) + elif p.name == 'model.pt': + relevant_dirs.append(parent) + + + # for page in page_iterator: + # for obj in page['Contents']: + # p = Path(obj["Key"]) + # if p.parent.name in ['optim', 'train','model']: + # grand_parent = str(p.parent.parent) + # if '-unsharded' not in grand_parent: + # objs = list(s3_resource.Bucket(bucket).objects.filter(Prefix=grand_parent + '-unsharded')) + # if len(objs) > 0: + # continue + # relevant_dirs.append(str(p.parent.parent)) + # elif p.name == 'model.pt': + # relevant_dirs.append(str(p.parent)) + # # if 'model.pt' in obj["Key"]: + # # relevant_dirs.append(obj["Key"].replace('/model.pt','')) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] From ed5abb78da4960cc137e8cd46939814cf16db585 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 16:20:31 -0700 Subject: [PATCH 55/89] fixing error catching --- scripts/convert_checkpoints_batch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 37f11e714..cd3cf2da6 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -77,12 +77,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): print('SANITY CHECK MODE (not 
running the conversion)') print(conversion_cmd + '\n') else: - # sys.stdout.write(conversion_cmd + '\n') - try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = e.output + error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) + conversion = 'error' + converted_path = "" processed.append({ 'unprocessed_path': checkpoint_path, From cbcbc86983e4c5e3484e6ce13e4ab370ff6f98b5 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 12:13:20 -0700 Subject: [PATCH 56/89] output log edits --- scripts/convert_checkpoints_batch.py | 50 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index cd3cf2da6..b3c0eee23 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -16,7 +16,7 @@ import sys from pathlib import Path -from gantry import METRICS_FILE +from gantry import RESULTS_DIR # possible converted locations. # "self" is the target location where the converted model would be saved @@ -28,7 +28,7 @@ "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka"): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -36,10 +36,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) + processed = {} + for checkpoint_path in cps: # Convert to old-style checkpoint. - processed = [] - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] @@ -64,12 +64,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): elif path_found is not None: conversion = 'existing' - converted_path = path_found.replace(load_dir,'/weka') + converted_path = path_found.replace(load_dir,weka_prefix) print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: conversion = 'new' - converted_path = weka_loc.replace(load_dir,'/weka') + converted_path = weka_loc conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" @@ -84,20 +84,35 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): conversion = 'error' converted_path = "" - processed.append({ + local_log = { 'unprocessed_path': checkpoint_path, - 'converted_path': converted_path, + 'converted_path': converted_path.replace(load_dir,weka_prefix), 'conversion': conversion, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), - 'error': error} - ) - - #print(processed) + 'error': error + } + + # {"model_name": "name", "checkpoints_location": "weka://path/to/", "revisions": ["step0-unsharded-hf", "step1000-unsharded-hf", etc]} + curr = Path(converted_path) + parent = curr.parent + if parent.name not in processed: + processed[parent.name] = { + 'model_name': parent.name, + 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), + 'revisions': [curr.name] + } + else: + processed[parent.name]['revisions'].append(curr.name) + # LOG if not sanity_check: - with open(METRICS_FILE, 'a+') as fout: - for p in 
processed: - fout.write(json.dumps(p) + '\n') + with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: + fout.write(json.dumps(local_log) + '\n') + + if not sanity_check: + with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): @@ -192,14 +207,15 @@ def main(): group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) if __name__ == "__main__": From d49db7b41a864ed21d423123e42b65168df31240 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:10:41 -0700 Subject: [PATCH 57/89] output log edits --- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 536e0a2eb..2afbd15f8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default'" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index b3c0eee23..afb69f8ad 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -111,7 +111,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: - for p in processed: + for _,p in processed.items(): fout.write(json.dumps(p) + '\n') From cd6a75a386fbf5cbcebf6347156f2873c1bf81da Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:36:07 -0700 Subject: [PATCH 58/89] output log edits --- scripts/convert_checkpoints_batch.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index afb69f8ad..970b60b97 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -54,23 +54,23 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre path_found = 
loc.replace('/pytorch_model.bin','') break - # Check if the output location is already there. If not, do the conversion. - # print('WEKA LOC', weka_loc) + # Check if the output location is already there in s3. If so then skip conversion s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) if s3_hf_exists is not None: conversion = 'existing' converted_path = s3_hf_exists # checkpoint_path + '-hf' print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + # Check if the output location is in weka. If so then skip conversion elif path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,weka_prefix) print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + # Do conversion and save to Weka else: conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: @@ -84,6 +84,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre conversion = 'error' converted_path = "" + # Keep info for log.jsonl local_log = { 'unprocessed_path': checkpoint_path, 'converted_path': converted_path.replace(load_dir,weka_prefix), @@ -92,9 +93,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre 'error': error } - # {"model_name": "name", "checkpoints_location": "weka://path/to/", "revisions": ["step0-unsharded-hf", "step1000-unsharded-hf", etc]} + # output model checkpoint location for eval scripts curr = Path(converted_path) parent = curr.parent + print(parent, flush=True) if parent.name not in processed: processed[parent.name] = { 'model_name': parent.name, @@ -104,11 +106,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre else: processed[parent.name]['revisions'].append(curr.name) - # LOG + # Output Log if not sanity_check: with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: fout.write(json.dumps(local_log) + '\n') + # Output checkpoint location for eval scripts if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: for _,p in processed.items(): From f8e9c96800cc6ed8c0edad1c55e2d9b71545857b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:40:10 -0700 Subject: [PATCH 59/89] output log edits --- scripts/convert_checkpoints_batch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 970b60b97..25a01c668 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -64,7 +64,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Check if the output location is in weka. 
If so then skip conversion
 elif path_found is not None:
     conversion = 'existing'
-    converted_path = path_found.replace(load_dir,weka_prefix)
+    converted_path = path_found
     print(f"Converted Checkpoint Found: {converted_path}\n", flush=True)

     # Do conversion and save to Weka
@@ -96,7 +96,6 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
     # output model checkpoint location for eval scripts
     curr = Path(converted_path)
     parent = curr.parent
-    print(parent, flush=True)
     if parent.name not in processed:
         processed[parent.name] = {
             'model_name': parent.name,

From de8292256450ea71c4acaf73df988cf8691d62c7 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 10:01:20 -0700
Subject: [PATCH 60/89] code cleanup

---
 scripts/convert_checkpoints.sh | 36 ++++----
 scripts/convert_checkpoints_batch.py | 125 ++++++++++++++--------------
 2 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index 2afbd15f8..f6c7f3a84 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -1,33 +1,33 @@
 #!/usr/bin/env bash

+# Converts s3 checkpoints into WEKA
 # To be run from the root of the OLMo repository.
-# Converts s3 checkpoints into WEKA
+# Script requires the use of GANTRY and AWS access to WEKA
 #
 # Example use:
+# Run:
 # sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
-#
-# This will convert all models in the directory
-# and save them to their respective directories under
-#
-# /weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
+# This will convert all models in the directory and save them to:
+# weka://oe-eval-default/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001-hf/step9*
 #
 # It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate conversions already made)
-
+#
 # ASSUMPTIONS
 # - INPUT must be on s3
-# - OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path
-# - Budget for oe-eval
-# - Experiments saved to ai2/cheap-decisions
+# - OUTPUT to weka is saved to the path as found on s3 with "-hf" suffix appended to the path
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
-
-# NOTES
-# - saves metrics.json
-# - allows for wildcard (*)
-
+#
+# OUTPUT logs
+# - saves log.jsonl
+# - saves model_checkpoints.jsonl: this input file is formatted for oe-eval-internal experiments
+#
+# SH run SPECIFICATION DEFAULTS:
+# - Budget for oe-eval (see below)
+# - Loading for weka weka://oe-eval-default/ (see below)
+# - Gantry experiments saved to beaker://ai2/cheap-decisions
+# - Weka prefix is used for model_checkpoints.jsonl
+#
 # TODOs
-# - Make consistent with Luca's code
-# - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this
-# - Output is not saving. But it prints to the log. Fix this.
# - Make tokenizer updatable CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 25a01c668..64425ad19 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -13,7 +13,6 @@ import time import boto3 import json -import sys from pathlib import Path from gantry import RESULTS_DIR @@ -28,7 +27,7 @@ "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka"): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka", save_to_weka=False): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -38,40 +37,61 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre processed = {} + # Convert to old-style checkpoint. for checkpoint_path in cps: - # Convert to old-style checkpoint. - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') - weka_loc = f"{load_dir}/{retain_path_name}-hf/" - check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - print(f"\n\n------------------------------------------------------------", flush=True) - print(f"\nProcessing Checkpoint: {retain_path_name}\n", flush=True) - error = "" + print(f"\nProcessing Checkpoint: {checkpoint_path}\n", flush=True) - path_found = None - for loc in check_locs: + error = "" + converted_path = "" + existing_location = "" + conversion_status = "" + + # sort out paths, bucket names, and so on ... + path_bits = checkpoint_path.strip('/').replace('s3://', '').split('/') + s3_bucket_name = path_bits[0] + s3_prefix = '/'.join(path_bits[1:]) + temp_path = '/'.join(path_bits) #checkpoint_path.replace('s3://', '').strip('/') + local_path = f"{load_dir}/{temp_path}-hf/" + + # the converted model may already exist in local_path or in + path_found = False + potential_existing_locations = [l.format(load_dir,temp_path) for l in WEKA_CHECK_LOCATIONS_PREFIXES] + for loc in potential_existing_locations: if os.path.exists(loc): - path_found = loc.replace('/pytorch_model.bin','') + existing_location = loc.replace('/pytorch_model.bin','') + path_found = True break - # Check if the output location is already there in s3. If so then skip conversion - s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) - if s3_hf_exists is not None: - conversion = 'existing' - converted_path = s3_hf_exists # checkpoint_path + '-hf' + # if one of the potential existing location has converted model in it then use that + if path_found: + # then there is no conversion to do. + conversion_status = 'existing' + converted_path = existing_location print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - - # Check if the output location is in weka. 
If so then skip conversion - elif path_found is not None: - conversion = 'existing' - converted_path = path_found - print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - - # Do conversion and save to Weka else: - conversion = 'new' - converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" + s3_bucket = s3_resource.Bucket(s3_bucket_name) + s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_resource) + + # if s3 already has a location for converted model then use that + if s3_hf_exists is not None: + path_found = True + print(f"Converted Checkpoint Found: {s3_hf_exists}", flush=True) + + # if save to weka flag is passed, then download the s3 converted model to the local path + if save_to_weka: + copy_s3_to_local(s3_hf_exists, local_path, s3_resource, sanity_check) + conversion_status = 'existing-downloaded' + converted_path = local_path + else: + conversion_status = 'existing' + converted_path = s3_hf_exists + + # if no existing conversions are found then process and save to local path + if not path_found: + conversion_status = 'new' + converted_path = local_path + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: print('SANITY CHECK MODE (not running the conversion)') @@ -81,14 +101,14 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) - conversion = 'error' + conversion_status = 'error' converted_path = "" # Keep info for log.jsonl local_log = { 'unprocessed_path': checkpoint_path, 'converted_path': converted_path.replace(load_dir,weka_prefix), - 'conversion': conversion, + 'conversion': conversion_status, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), 'error': error } @@ -113,21 +133,26 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Output checkpoint location for eval scripts if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: - for _,p in processed.items(): + for _, p in processed.items(): fout.write(json.dumps(p) + '\n') -def s3_path_exists(cp, s3): - b = cp.split('/')[2] - bucket = s3.Bucket(b) - prefix = cp.replace('s3://'+b+'/', '') - # print(bucket, prefix) +def s3_path_exists(bucket, prefix, s3_resource): + # look for pytorch_model.bin in directories ending with -hf or -hf-olmo. 
objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) > 0: - return cp + '-hf' + return f"s3://{bucket}/{prefix}-hf" else: objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) - return cp + '-hf-olmo' if (len(objs2) > 0) else None + return f"s3://{bucket}/{prefix}-hf-olmo" if (len(objs2) > 0) else None + + +def copy_s3_to_local(bucket, prefix, local_path, s3_resource, sanity_check): + if not os.path.exists(os.path.dirname(local_path)): + print(f"Downloading checkpoint to weka://{bucket}/{prefix}\n", flush=True) + if not sanity_check: + os.makedirs(local_path) + bucket.download_file(prefix, local_path) # save to same path def expand_paths(cps, s3): @@ -145,7 +170,6 @@ def expand_paths(cps, s3): page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} paths = set(contents.values()) - # print(contents) for path in contents: p = Path(path) @@ -164,26 +188,8 @@ def expand_paths(cps, s3): elif p.name == 'model.pt': relevant_dirs.append(parent) - - # for page in page_iterator: - # for obj in page['Contents']: - # p = Path(obj["Key"]) - # if p.parent.name in ['optim', 'train','model']: - # grand_parent = str(p.parent.parent) - # if '-unsharded' not in grand_parent: - # objs = list(s3_resource.Bucket(bucket).objects.filter(Prefix=grand_parent + '-unsharded')) - # if len(objs) > 0: - # continue - # relevant_dirs.append(str(p.parent.parent)) - # elif p.name == 'model.pt': - # relevant_dirs.append(str(p.parent)) - # # if 'model.pt' in obj["Key"]: - # # relevant_dirs.append(obj["Key"].replace('/model.pt','')) - search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - # print(f"search segments: {search_segs}") - # subselect the directory with remaining segments (for multiple wildcard *) temp_dirs = relevant_dirs if len(search_segs) > 0: @@ -211,13 +217,14 @@ def main(): parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') + parser.add_argument("--save-to-weka", help='if checkpoints are found on s3, save them to loaded weka dir', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) if __name__ == "__main__": From d3e16d72c47cc395c4f9adac7d3d6eb2801a1736 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 4 Oct 2024 12:57:29 -0700 Subject: [PATCH 61/89] code cleanup --- scripts/convert_checkpoints_batch.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py 
b/scripts/convert_checkpoints_batch.py
index 64425ad19..20fef2299 100644
--- a/scripts/convert_checkpoints_batch.py
+++ b/scripts/convert_checkpoints_batch.py
@@ -71,7 +71,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
         print(f"Converted Checkpoint Found: {converted_path}\n", flush=True)
     else:
         s3_bucket = s3_resource.Bucket(s3_bucket_name)
-        s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_resource)
+        s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_bucket_name)
@@ -80,7 +80,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
         # if save to weka flag is passed, then download the s3 converted model to the local path
         if save_to_weka:
-            copy_s3_to_local(s3_hf_exists, local_path, s3_resource, sanity_check)
+            copy_s3_to_local(s3_bucket, s3_prefix, local_path, local_path.replace(load_dir,weka_prefix), sanity_check)
             conversion_status = 'existing-downloaded'
             converted_path = local_path
         else:
@@ -137,19 +137,19 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
             fout.write(json.dumps(p) + '\n')

-def s3_path_exists(bucket, prefix, s3_resource):
+def s3_path_exists(bucket, prefix, bucket_name):
     # look for pytorch_model.bin in directories ending with -hf or -hf-olmo.
     objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin'))
     if len(objs) > 0:
-        return f"s3://{bucket}/{prefix}-hf"
+        return f"s3://{bucket_name}/{prefix}-hf"
     else:
         objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin'))
-        return f"s3://{bucket}/{prefix}-hf-olmo" if (len(objs2) > 0) else None
+        return f"s3://{bucket_name}/{prefix}-hf-olmo" if (len(objs2) > 0) else None

-def copy_s3_to_local(bucket, prefix, local_path, s3_resource, sanity_check):
+def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check):
     if not os.path.exists(os.path.dirname(local_path)):
-        print(f"Downloading checkpoint to weka://{bucket}/{prefix}\n", flush=True)
+        print(f"Downloading checkpoint to {display_name}\n", flush=True)
         if not sanity_check:
             os.makedirs(local_path)
             bucket.download_file(prefix, local_path) # save to same path

From ef8ffbdd3ec618de62ba60f8b7fb39f8d173c129 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:05:33 -0700
Subject: [PATCH 62/89] testing

---
 scripts/convert_checkpoints.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index f6c7f3a84..b0d592d55 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -13,7 +13,7 @@
 # It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate conversions already made)
 #
 # ASSUMPTIONS
-# - INPUT must be on s3
+# - INPUT must be on s3.
Multiple wildcards allowed
 # - OUTPUT to weka is saved to the path as found on s3 with "-hf" suffix appended to the path
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
 #
@@ -46,5 +46,5 @@ gantry run \
   --shared-memory 10GiB \
   --weka=oe-eval-default:/data/input \
   --yes \
-  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default'"
+  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save_to_weka"

From fac649d904ee626812c5f232a094c1544a96ac07 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:09:24 -0700
Subject: [PATCH 63/89] testing

---
 scripts/convert_checkpoints.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index b0d592d55..934cfb452 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -46,5 +46,5 @@ gantry run \
   --shared-memory 10GiB \
   --weka=oe-eval-default:/data/input \
   --yes \
-  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save_to_weka"
+  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save-to-weka"

From 759753469de49e7c967ede78ff9ae7a791231f2a Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:52:16 -0700
Subject: [PATCH 64/89] testing

---
 scripts/convert_checkpoints.sh | 49 ++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index 934cfb452..311c0b420 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -4,6 +4,17 @@
 # To be run from the root of the OLMo repository.
 # Script requires the use of GANTRY and AWS access to WEKA
 #
+# Usage: scripts/convert_checkpoints.sh CHECKPOINT_PATH [-s] [-c]
+#    -s if converted checkpoint is found in s3, then save to weka
+#    -c sanity check; don't actually run the conversion, just go through the motions and print what would be run
+#
+# calls: convert_checkpoints_batch.py
+# usage: convert_checkpoints_batch.py [-h]
+#        (--checkpoint-path CHECKPOINT_PATH | --checkpoint-path-file CHECKPOINT_PATH_FILE)
+#        [--weka-load-dir WEKA_LOAD_DIR]
+#        [--weka-prefix WEKA_PREFIX]
+#        [--sanity-check] [--save-to-weka]
+#
 # Example use:
 # Run:
 # sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
@@ -18,8 +29,21 @@
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
 #
 # OUTPUT logs
-# - saves log.jsonl.
For every checkpoint found in the given input:
+#    - "unprocessed_path" := checkpoint path to convert
+#    - "converted_path" := checkpoint converted path
+#    - "conversion" := [new | existing (already in weka) | existing-downloaded (from s3) ]
+#    - "date_time" := datestamp
+#    - "error" := error if any conversions didn't pan out for any reason
 # - saves model_checkpoints.jsonl: this input file is formatted for oe-eval-internal experiments
+#    - example log files for the following run:
+#      > sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step91*6-unsharded
+#      log.jsonl:
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9176-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9176-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9166-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9166-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9186-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9186-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#      model_checkpoints.jsonl:
+#        {"model_name": "baseline-300M-1xC", "checkpoints_location": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC", "revisions": ["step9176-unsharded-hf", "step9166-unsharded-hf", "step9186-unsharded-hf"]}
 #
 # SH run SPECIFICATION DEFAULTS:
 # - Budget for oe-eval (see below)
@@ -31,6 +55,27 @@
 # - Make tokenizer updatable

 CHECKPOINT_PATH=$1
+SAVE_TO_WEKA=""
+SANITY_CHECK=""
+shift
+
+usage() {
+    echo "Usage: $0 CHECKPOINT_PATH [-s] [-c]"
+    echo "    -s --save-to-weka"
+    echo "    -c --sanity-check"
+    exit 1;
+}
+
+while getopts "sc" opt;
+do
+    case $opt in
+        s) SAVE_TO_WEKA="--save-to-weka" ;;
+        c) SANITY_CHECK="--sanity-check" ;; # mostly useful for local test runs - it prevents any copying or conversions from actually running.
+ *) usage ;; + esac +done + +#echo "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' $SAVE_TO_WEKA $SANITY_CHECK" gantry run \ --description "Converting $CHECKPOINT_PATH" \ @@ -46,5 +91,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save-to-weka" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' $SAVE_TO_WEKA $SANITY_CHECK" From 94fa6daa46db9b92b90259ced24b0750bab7f520 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 8 Oct 2024 10:01:04 -0700 Subject: [PATCH 65/89] downloading fix --- scripts/convert_checkpoints_batch.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 20fef2299..c5a8eb1e1 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -148,11 +148,16 @@ def s3_path_exists(bucket, prefix, bucket_name): def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): - if not os.path.exists(os.path.dirname(local_path)): - print(f"Downloading checkpoint to {display_name}\n", flush=True) - if not sanity_check: + # if not os.path.exists(os.path.dirname(local_path)): + print(f"Downloading checkpoint to {display_name}\n", flush=True) + if not sanity_check: + try: os.makedirs(local_path) - bucket.download_file(prefix, local_path) # save to same path + except: + pass + print(prefix) + print(local_path) + bucket.download_file(bucket, prefix, local_path) # save to same path def expand_paths(cps, s3): From 5e46840ace6fc8c657291f77dc4bcd01d5e357bd Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 8 Oct 2024 10:26:19 -0700 Subject: [PATCH 66/89] downloading fix --- scripts/convert_checkpoints_batch.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c5a8eb1e1..9b7bd4ab8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -151,13 +151,13 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): # if not os.path.exists(os.path.dirname(local_path)): print(f"Downloading checkpoint to {display_name}\n", flush=True) if not sanity_check: - try: - os.makedirs(local_path) - except: - pass - print(prefix) - print(local_path) - bucket.download_file(bucket, prefix, local_path) # save to same path + for obj in bucket.objects.filter(Prefix=prefix): + target = os.path.join(local_path, os.path.relpath(obj.key, os.path.dirname(prefix))) + if not os.path.exists(os.path.dirname(target)): + os.makedirs(os.path.dirname(target)) + if obj.key[-1] == '/': + continue + bucket.download_file(obj.key, target) def expand_paths(cps, s3): @@ -227,9 +227,9 @@ def main(): args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, 
save_to_weka=args.save_to_weka) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) if __name__ == "__main__": From da90ae2b43d101b40c4f77e867f7bcc37c988c2b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 14:52:03 -0700 Subject: [PATCH 67/89] . --- scripts/convert_checkpoints_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 9b7bd4ab8..48816c99e 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -39,7 +39,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Convert to old-style checkpoint. for checkpoint_path in cps: - print(f"\n\n------------------------------------------------------------", flush=True) + print("\n\n------------------------------------------------------------", flush=True) print(f"\nProcessing Checkpoint: {checkpoint_path}\n", flush=True) error = "" @@ -56,7 +56,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # the converted model may already exist in local_path or in path_found = False - potential_existing_locations = [l.format(load_dir,temp_path) for l in WEKA_CHECK_LOCATIONS_PREFIXES] + potential_existing_locations = [candidate_loc.format(load_dir,temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES] for loc in potential_existing_locations: if os.path.exists(loc): existing_location = loc.replace('/pytorch_model.bin','') From 447de12ab2e540ea00cf6c93ea9f5daa82428983 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 15:22:52 -0700 Subject: [PATCH 68/89] . --- scripts/convert_checkpoints_batch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 48816c99e..0385ae7ee 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -8,14 +8,16 @@ """ import argparse -import subprocess +import json import os +import subprocess import time + import boto3 -import json -from pathlib import Path from gantry import RESULTS_DIR +from pathlib import Path +from typing import Dict # possible converted locations. 
# "self" is the target location where the converted model would be saved @@ -117,7 +119,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name] = { + processed[parent.name]: Dict = { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] From 3920f2ebe0aabd1c2d53c25275104e5234725ddb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 15:27:45 -0700 Subject: [PATCH 69/89] addressing errors --- scripts/convert_checkpoints_batch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 0385ae7ee..593fe99b8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,7 +17,6 @@ from gantry import RESULTS_DIR from pathlib import Path -from typing import Dict # possible converted locations. # "self" is the target location where the converted model would be saved @@ -119,11 +118,13 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name]: Dict = { + processed[parent.name] = { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] } + elif 'revisions' not in processed[parent.name]: # not sure if this would ever occur, but trying to get the error check happy + processed[parent.name]['revisions'] = [curr.name] else: processed[parent.name]['revisions'].append(curr.name) From acaccddb6a22d632f38811844921388a0a7e0962 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:01:47 -0700 Subject: [PATCH 70/89] error fixes for pr --- CHANGELOG.md | 1 + scripts/convert_checkpoints_batch.py | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9752a733..32cc5b415 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `scripts/convert_checkpoints_batch.py` and `scripts/convert_checkpoints.sh` for processing many intermediate checkpoints in batches for offline evals. - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`. - Added support for flash attention and gradient checkpointing to `hf_olmo`. diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 593fe99b8..846bce14a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,6 +17,7 @@ from gantry import RESULTS_DIR from pathlib import Path +from typing import List, Dict, Union # possible converted locations. # "self" is the target location where the converted model would be saved @@ -36,7 +37,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) - processed = {} + processed: Dict = {} # Convert to old-style checkpoint. 
for checkpoint_path in cps: @@ -118,13 +119,11 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name] = { + processed[parent.name]= { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] } - elif 'revisions' not in processed[parent.name]: # not sure if this would ever occur, but trying to get the error check happy - processed[parent.name]['revisions'] = [curr.name] else: processed[parent.name]['revisions'].append(curr.name) @@ -164,7 +163,7 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): def expand_paths(cps, s3): - expanded = [] + expanded: List[str] = [] for cp in cps: bucket = cp.split('/')[2] From a4a40e172d9abc66b2ec7dee7dc27d5dacfa3fd9 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:07:04 -0700 Subject: [PATCH 71/89] fixing errors for pr --- scripts/convert_checkpoints_batch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 846bce14a..efedf70c2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -12,12 +12,11 @@ import os import subprocess import time +from pathlib import Path +from typing import Dict, List import boto3 - from gantry import RESULTS_DIR -from pathlib import Path -from typing import List, Dict, Union # possible converted locations. # "self" is the target location where the converted model would be saved From 3d2bd3224a0469f3c8fd1cac304cb1ad2ecf5c0f Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:33:39 -0700 Subject: [PATCH 72/89] fixing errors for pr --- scripts/convert_checkpoints_batch.py | 128 ++++++++++++++++----------- 1 file changed, 75 insertions(+), 53 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index efedf70c2..0075211bc 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,14 +23,12 @@ # key: template, value: description # template: MUST obey .format(load_dir, retain_path_name) -WEKA_CHECK_LOCATIONS_PREFIXES = { - "{}/{}-hf/pytorch_model.bin": 'self', - "{}/ianm/{}-hf/pytorch_model.bin": "ian's" -} +WEKA_CHECK_LOCATIONS_PREFIXES = {"{}/{}-hf/pytorch_model.bin": "self", "{}/ianm/{}-hf/pytorch_model.bin": "ian's"} + def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka", save_to_weka=False): - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3') + s3_client = boto3.client("s3") + s3_resource = boto3.resource("s3") cps = expand_paths(cps, s3_client) @@ -49,25 +47,27 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre conversion_status = "" # sort out paths, bucket names, and so on ... 
- path_bits = checkpoint_path.strip('/').replace('s3://', '').split('/') + path_bits = checkpoint_path.strip("/").replace("s3://", "").split("/") s3_bucket_name = path_bits[0] - s3_prefix = '/'.join(path_bits[1:]) - temp_path = '/'.join(path_bits) #checkpoint_path.replace('s3://', '').strip('/') + s3_prefix = "/".join(path_bits[1:]) + temp_path = "/".join(path_bits) # checkpoint_path.replace('s3://', '').strip('/') local_path = f"{load_dir}/{temp_path}-hf/" # the converted model may already exist in local_path or in path_found = False - potential_existing_locations = [candidate_loc.format(load_dir,temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES] + potential_existing_locations = [ + candidate_loc.format(load_dir, temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES + ] for loc in potential_existing_locations: if os.path.exists(loc): - existing_location = loc.replace('/pytorch_model.bin','') + existing_location = loc.replace("/pytorch_model.bin", "") path_found = True break # if one of the potential existing location has converted model in it then use that if path_found: # then there is no conversion to do. - conversion_status = 'existing' + conversion_status = "existing" converted_path = existing_location print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: @@ -81,70 +81,74 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # if save to weka flag is passed, then download the s3 converted model to the local path if save_to_weka: - copy_s3_to_local(s3_bucket, s3_prefix, local_path, local_path.replace(load_dir,weka_prefix), sanity_check) - conversion_status = 'existing-downloaded' + copy_s3_to_local( + s3_bucket, s3_prefix, local_path, local_path.replace(load_dir, weka_prefix), sanity_check + ) + conversion_status = "existing-downloaded" converted_path = local_path else: - conversion_status = 'existing' + conversion_status = "existing" converted_path = s3_hf_exists # if no existing conversions are found then process and save to local path if not path_found: - conversion_status = 'new' + conversion_status = "new" converted_path = local_path conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: - print('SANITY CHECK MODE (not running the conversion)') - print(conversion_cmd + '\n') + print("SANITY CHECK MODE (not running the conversion)") + print(conversion_cmd + "\n") else: try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) - conversion_status = 'error' + error = ( + e.output + ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) + conversion_status = "error" converted_path = "" # Keep info for log.jsonl local_log = { - 'unprocessed_path': checkpoint_path, - 'converted_path': converted_path.replace(load_dir,weka_prefix), - 'conversion': conversion_status, - 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), - 'error': error + "unprocessed_path": checkpoint_path, + "converted_path": converted_path.replace(load_dir, weka_prefix), + "conversion": conversion_status, + "date_time": time.strftime("%b-%d-%Y_%H%M", time.localtime()), + "error": error, } # output model checkpoint location for eval scripts curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name]= { - 'model_name': parent.name, - 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), - 'revisions': [curr.name] + processed[parent.name] = { + "model_name": parent.name, + "checkpoints_location": str(parent).replace(load_dir, weka_prefix), + "revisions": [curr.name], } else: - processed[parent.name]['revisions'].append(curr.name) + processed[parent.name]["revisions"].append(curr.name) # Output Log if not sanity_check: - with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: - fout.write(json.dumps(local_log) + '\n') + with open(os.path.join(RESULTS_DIR, "log.jsonl"), "a+") as fout: + fout.write(json.dumps(local_log) + "\n") # Output checkpoint location for eval scripts if not sanity_check: - with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: + with open(os.path.join(RESULTS_DIR, "model_checkpoints.jsonl"), "w") as fout: for _, p in processed.items(): - fout.write(json.dumps(p) + '\n') + fout.write(json.dumps(p) + "\n") def s3_path_exists(bucket, prefix, bucket_name): # look for pytorch_model.bin in directories ending with -hf or -hf-olmo. 
- objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) + objs = list(bucket.objects.filter(Prefix=prefix + "-hf/pytorch_model.bin")) if len(objs) > 0: return f"s3://{bucket_name}/{prefix}-hf" else: - objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) + objs2 = list(bucket.objects.filter(Prefix=prefix + "-hf-olmo/pytorch_model.bin")) return f"s3://{bucket_name}/{prefix}-hf-olmo" if (len(objs2) > 0) else None @@ -156,7 +160,7 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): target = os.path.join(local_path, os.path.relpath(obj.key, os.path.dirname(prefix))) if not os.path.exists(os.path.dirname(target)): os.makedirs(os.path.dirname(target)) - if obj.key[-1] == '/': + if obj.key[-1] == "/": continue bucket.download_file(obj.key, target) @@ -165,16 +169,16 @@ def expand_paths(cps, s3): expanded: List[str] = [] for cp in cps: - bucket = cp.split('/')[2] - segs = cp.split('*') - prefix = segs[0].replace('s3://'+bucket+'/', '') + bucket = cp.split("/")[2] + segs = cp.split("*") + prefix = segs[0].replace("s3://" + bucket + "/", "") relevant_dirs = [] skip_parent = [] - paginator = s3.get_paginator('list_objects_v2') + paginator = s3.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) - contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} + contents = {obj["Key"]: str(Path(obj["Key"]).parent) for page in page_iterator for obj in page["Contents"]} paths = set(contents.values()) for path in contents: @@ -184,14 +188,14 @@ def expand_paths(cps, s3): if parent in relevant_dirs or parent in skip_parent: continue - if p.parent.name in ['optim', 'train','model']: + if p.parent.name in ["optim", "train", "model"]: if f"{grandpa}-unsharded" in paths: # skip condition skip_parent.append(parent) continue else: relevant_dirs.append(grandpa) - elif p.name == 'model.pt': + elif p.name == "model.pt": relevant_dirs.append(parent) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] @@ -209,8 +213,8 @@ def expand_paths(cps, s3): def read_checkpoints(f): - with open(f, 'r') as fin: - checkpoints = [line for line in fin if line and line != ''] + with open(f, "r") as fin: + checkpoints = [line for line in fin if line and line != ""] return checkpoints @@ -219,18 +223,36 @@ def main(): group_batch = parser.add_mutually_exclusive_group(required=True) group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) - parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) - parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) - parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') - parser.add_argument("--save-to-weka", help='if checkpoints are found on s3, save them to loaded weka dir', action='store_true') + group_batch.add_argument( + "--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str + ) + parser.add_argument("--weka-load-dir", help="mounted location of weka bucket", default="/data/input", type=str) + parser.add_argument("--weka-prefix", help="weka directory prefix for output", default="/weka", type=str) + parser.add_argument( + 
"--sanity-check", help="print what would be run; do not actually run conversion", action="store_true" + ) + parser.add_argument( + "--save-to-weka", help="if checkpoints are found on s3, save them to loaded weka dir", action="store_true" + ) args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint( + [args.checkpoint_path], + load_dir=args.weka_load_dir.rstrip("/"), + sanity_check=args.sanity_check, + weka_prefix=args.weka_prefix, + save_to_weka=args.save_to_weka, + ) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint( + read_checkpoints(args.checkpoint_path_file), + load_dir=args.weka_load_dir.rstrip("/"), + sanity_check=args.sanity_check, + weka_prefix=args.weka_prefix, + save_to_weka=args.save_to_weka, + ) if __name__ == "__main__": From d529f5a60dafadc202d2403bad0baec310aa70d7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:34:48 -0700 Subject: [PATCH 73/89] fixing errors for pr --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cc5b415..e9752a733 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added `scripts/convert_checkpoints_batch.py` and `scripts/convert_checkpoints.sh` for processing many intermediate checkpoints in batches for offline evals. - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`. - Added support for flash attention and gradient checkpointing to `hf_olmo`. 
From 4a32be0621a4bee4527b99e791fed12e6549194a Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 25 Oct 2024 17:57:24 -0700 Subject: [PATCH 74/89] removing temp outputs --- .gitignore | 3 --- guided-trout-2f805b9.yaml | 39 +++++++++++++++++++++++++++++++++++++++ log.txt | 10 ---------- 3 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 guided-trout-2f805b9.yaml delete mode 100644 log.txt diff --git a/.gitignore b/.gitignore index e0f77ccd8..9b1e99785 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ -# beaker yaml -guided-trout-2f805b9.yaml - # build artifacts .eggs/ diff --git a/guided-trout-2f805b9.yaml b/guided-trout-2f805b9.yaml new file mode 100644 index 000000000..7607d5d52 --- /dev/null +++ b/guided-trout-2f805b9.yaml @@ -0,0 +1,39 @@ +version: v2 +tasks: + - name: main + image: + beaker: ai2/conda + command: [bash, /gantry/entrypoint.sh] + arguments: [/bin/bash, -c, 'python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir ''s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --destination-dir ''/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --keep-olmo-artifacts'] + envVars: + - name: GANTRY_VERSION + value: 1.8.3 + - name: GITHUB_REPO + value: allenai/OLMo + - name: GIT_REF + value: fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c + - name: GANTRY_TASK_NAME + value: main + - name: AWS_ACCESS_KEY_ID + secret: JENA_AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + secret: JENA_AWS_SECRET_ACCESS_KEY + - name: NO_PYTHON + value: "1" + datasets: + - mountPath: /gantry + source: + beaker: 01J721NEMRKZ4HBGNS5KV7873R + - mountPath: /data/input + source: + weka: oe-eval-default + result: + path: /results + resources: + sharedMemory: 10 GiB + context: + priority: normal + preemptible: true + constraints: + cluster: + - ai2/jupiter-cirrascale-2 diff --git a/log.txt b/log.txt deleted file mode 100644 index 16a1a62e7..000000000 --- a/log.txt +++ /dev/null @@ -1,10 +0,0 @@ - - o=======[] - __ _ _ _ |_ [] - / _` | __ _ _ _ | |_ _ _ | || | [] - \__, | / _` | | ' \ | _| | '_| \_, | _/ ]_ - |___/ \__,_| |_||_| _\__| _|_|_ _|__/ |_____| -_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| - `---------------------------------------------' - -Experiment submitted, see progress at https://beaker.org/ex/01J7446KB7EXZ35D8NST0JTNTY From 2dc26a90b7b14e10b6fa07e075d3538ff5938017 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 25 Oct 2024 18:27:08 -0700 Subject: [PATCH 75/89] fixes and cleanups --- guided-trout-2f805b9.yaml | 39 ----------------------------------- hf_olmo/convert_olmo_to_hf.py | 13 ++++-------- requirements.txt | 7 ------- 3 files changed, 4 insertions(+), 55 deletions(-) delete mode 100644 guided-trout-2f805b9.yaml delete mode 100644 requirements.txt diff --git a/guided-trout-2f805b9.yaml b/guided-trout-2f805b9.yaml deleted file mode 100644 index 7607d5d52..000000000 --- a/guided-trout-2f805b9.yaml +++ /dev/null @@ -1,39 +0,0 @@ -version: v2 -tasks: - - name: main - image: - beaker: ai2/conda - command: [bash, /gantry/entrypoint.sh] - arguments: [/bin/bash, -c, 'python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir ''s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --destination-dir ''/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' 
--keep-olmo-artifacts'] - envVars: - - name: GANTRY_VERSION - value: 1.8.3 - - name: GITHUB_REPO - value: allenai/OLMo - - name: GIT_REF - value: fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c - - name: GANTRY_TASK_NAME - value: main - - name: AWS_ACCESS_KEY_ID - secret: JENA_AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - secret: JENA_AWS_SECRET_ACCESS_KEY - - name: NO_PYTHON - value: "1" - datasets: - - mountPath: /gantry - source: - beaker: 01J721NEMRKZ4HBGNS5KV7873R - - mountPath: /data/input - source: - weka: oe-eval-default - result: - path: /results - resources: - sharedMemory: 10 GiB - context: - priority: normal - preemptible: true - constraints: - cluster: - - ai2/jupiter-cirrascale-2 diff --git a/hf_olmo/convert_olmo_to_hf.py b/hf_olmo/convert_olmo_to_hf.py index 9e0c7afb6..731488e9e 100644 --- a/hf_olmo/convert_olmo_to_hf.py +++ b/hf_olmo/convert_olmo_to_hf.py @@ -284,12 +284,6 @@ def main(): help="Keep olmo-specific artifacts in the checkpoint.", ) - parser.add_argument( - "--cleanup-local-dir", - action="store_true", - help="Remove local download of the directory." - ) - args = parser.parse_args() args.destination_dir = args.destination_dir or args.checkpoint_dir @@ -314,9 +308,10 @@ def main(): upload_local_checkpoint(local_checkpoint_dir, args.destination_dir) print(f"Converted checkpoint saved to {args.destination_dir}") - if args.cleanup_local_dir: - print(f"Removing temporary local dir: {local_checkpoint_dir}") - shutil.rmtree(local_checkpoint_dir) + + # remove local dir copy + print(f"Removing temporary local dir: {local_checkpoint_dir}") + shutil.rmtree(local_checkpoint_dir) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ae2bf89c5..000000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch -datasets -rich -botocore -cached-path -transformers -beaker-gantry From 378aafeca14e32c694c57db26b1f290d8f19f058 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 12:57:30 -0800 Subject: [PATCH 76/89] adding beaker-gantry to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 87bd75591..7e80af66b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", + "baker-gantry" ] [project.optional-dependencies] From 69d12f346199c1d6cee680745127826b02527b0d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:08:40 -0800 Subject: [PATCH 77/89] adding beaker-gantry to dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7e80af66b..10bfa8927 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", - "baker-gantry" + "beaker-gantry" ] [project.optional-dependencies] From 579d61217c0c3268b92573a671a19daf8eddb74e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:16:13 -0800 Subject: [PATCH 78/89] adding beaker-gantry to dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10bfa8927..3ccd96ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,8 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", - "beaker-gantry" + "beaker-gantry", + "datasets" ] [project.optional-dependencies] From 3b9563dc355c697ff52dcd98f2959fd5be7b8763 Mon Sep 17 
00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:30:01 -0800 Subject: [PATCH 79/89] python version apparently has to be 3.10 above for olmo/util.py to run --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3ccd96ab2..4a6e6113b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "Open Language Model (OLMo)" authors = [ { name = "Allen Institute for Artificial Intelligence", email = "olmo@allenai.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.10" license = { file = "LICENSE" } dependencies = [ "numpy<2", From 4a882a377545ea9990e6aa165fbeb31482bd24b7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:50:04 -0800 Subject: [PATCH 80/89] err... no --- olmo/util.py | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/olmo/util.py b/olmo/util.py index aad77eb1c..9d62a2b32 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -30,6 +30,8 @@ from olmo_data.data import get_data_path +from __future__ import annotations ### TO BE REMOVED -- hack -- + from .aliases import PathOrStr from .exceptions import ( OLMoCliError, diff --git a/pyproject.toml b/pyproject.toml index 4a6e6113b..3ccd96ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "Open Language Model (OLMo)" authors = [ { name = "Allen Institute for Artificial Intelligence", email = "olmo@allenai.org" } ] -requires-python = ">=3.10" +requires-python = ">=3.8" license = { file = "LICENSE" } dependencies = [ "numpy<2", From 6acbfccacaad88044bd08bdd2f8b617ba3ce44d4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:54:08 -0800 Subject: [PATCH 81/89] err... no --- olmo/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/olmo/util.py b/olmo/util.py index 9d62a2b32..66af8b1f0 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -1,3 +1,5 @@ +from __future__ import annotations ### TO BE REMOVED + import gzip import io import json @@ -30,7 +32,6 @@ from olmo_data.data import get_data_path -from __future__ import annotations ### TO BE REMOVED -- hack -- from .aliases import PathOrStr from .exceptions import ( From 6e67a9c96db092f682ecb38f0d504fb2f9f9941e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:58:34 -0800 Subject: [PATCH 82/89] tinkering --- scripts/convert_olmo_to_hf_new.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py index 0f4ebe9f0..afd74b1ee 100644 --- a/scripts/convert_olmo_to_hf_new.py +++ b/scripts/convert_olmo_to_hf_new.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from __future__ import annotations ### TO BE REMOVED + import argparse import gc import json From 21193cba6a0daa052c4a29e6aface1ce56e68c12 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 08:01:44 -0800 Subject: [PATCH 83/89] undoing changes --- olmo/util.py | 2 -- scripts/convert_olmo_to_hf_new.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/olmo/util.py b/olmo/util.py index 66af8b1f0..3f4093c7c 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -1,5 +1,3 @@ -from __future__ import annotations ### TO BE REMOVED - import gzip import io import json diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py index afd74b1ee..b0752e651 100644 --- a/scripts/convert_olmo_to_hf_new.py +++ b/scripts/convert_olmo_to_hf_new.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations ### TO BE REMOVED - import argparse import gc import json From 1b4da65e37dba14c87562a3b61c3f28e1522fb6c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 08:42:21 -0800 Subject: [PATCH 84/89] fix --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 0075211bc..1ea5fa688 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -94,7 +94,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre if not path_found: conversion_status = "new" converted_path = local_path - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" if sanity_check: print("SANITY CHECK MODE (not running the conversion)") From 7b6e37f2713e956a18e34257ccc215bb2d603483 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 10:33:41 -0800 Subject: [PATCH 85/89] error code updated --- scripts/convert_checkpoints_batch.py | 39 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 1ea5fa688..e935acb01 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -35,6 +35,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) processed: Dict = {} + errored: Dict = {} # Convert to old-style checkpoint. for checkpoint_path in cps: @@ -103,32 +104,40 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = ( - e.output - ) ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) + print(f"Error during checkpoint conversion: {checkpoint_path}") + error = ( e.return_code, e.stderr ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) conversion_status = "error" converted_path = "" + timestamp = time.strftime("%b-%d-%Y_%H%M", time.localtime()) + # Keep info for log.jsonl local_log = { "unprocessed_path": checkpoint_path, "converted_path": converted_path.replace(load_dir, weka_prefix), "conversion": conversion_status, - "date_time": time.strftime("%b-%d-%Y_%H%M", time.localtime()), + "date_time": timestamp, "error": error, } - # output model checkpoint location for eval scripts - curr = Path(converted_path) - parent = curr.parent - if parent.name not in processed: - processed[parent.name] = { - "model_name": parent.name, - "checkpoints_location": str(parent).replace(load_dir, weka_prefix), - "revisions": [curr.name], + if conversion_status == 'error': + errored[checkpoint_path] = { + "unprocessed_path": checkpoint_path, + "date_time": timestamp, + "error": error } else: - processed[parent.name]["revisions"].append(curr.name) + # output model checkpoint location for eval scripts + curr = Path(converted_path) + parent = curr.parent + if parent.name not in processed: + processed[parent.name] = { + "model_name": parent.name, + "checkpoints_location": str(parent).replace(load_dir, weka_prefix), + "revisions": [curr.name], + } + else: + processed[parent.name]["revisions"].append(curr.name) # Output Log if not sanity_check: @@ -140,6 +149,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre with open(os.path.join(RESULTS_DIR, "model_checkpoints.jsonl"), "w") as fout: for _, p in processed.items(): fout.write(json.dumps(p) + "\n") + if len(errored) > 0: + with open(os.path.join(RESULTS_DIR, "errors.jsonl"), "w") as fout: + for _, p in errored.items(): + fout.write(json.dumps(p) + "\n") def s3_path_exists(bucket, prefix, bucket_name): From 264ce05b52280792f6dac14403a34d8bb1fe4f0e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 12:56:06 -0800 Subject: [PATCH 86/89] minor change to the error log --- scripts/convert_checkpoints_batch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e935acb01..e1ea66194 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -105,7 +105,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: print(f"Error during checkpoint conversion: {checkpoint_path}") - error = ( e.return_code, e.stderr ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) + error = { + 'error_code': e.return_code, + 'error_stderr': e.stderr + } conversion_status = "error" converted_path = "" From 2154ea80125b601972a69be758cd4f583b582d25 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:41:18 -0800 Subject: [PATCH 87/89] fixed error --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e1ea66194..96e44da7e 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -106,7 +106,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre except subprocess.CalledProcessError as e: print(f"Error during checkpoint conversion: {checkpoint_path}") error = { - 'error_code': e.return_code, + 'error_code': e.returncode, 'error_stderr': e.stderr } conversion_status = "error" From d4e1f42fb506443ea506ff3612ab35f7c522d7fe Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:49:46 -0800 Subject: [PATCH 88/89] edited conversion error output --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 96e44da7e..9f03b90e4 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -107,7 +107,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f"Error during checkpoint conversion: {checkpoint_path}") error = { 'error_code': e.returncode, - 'error_stderr': e.stderr + 'error_stderr': e.stdout } conversion_status = "error" converted_path = "" From f39a522f90fd61a6bf3602d8598be2d5ecea965b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:56:26 -0800 Subject: [PATCH 89/89] edited conversion error output --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 9f03b90e4..d6a3bfee1 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -107,7 +107,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f"Error during checkpoint conversion: {checkpoint_path}") error = { 'error_code': e.returncode, - 'error_stderr': e.stdout + 'error_stdout': e.stdout } conversion_status = "error" converted_path = ""
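Note on the error capture that PATCH 85 through PATCH 89 iterate on: subprocess.run(cmd, shell=True, check=True) does not capture the child's output, so CalledProcessError.stdout and CalledProcessError.stderr remain None unless pipes are requested — which is why the "NOT ACTUALLY WORKING CORRECTLY" comment persists across these commits. A minimal sketch of how the conversion call could actually populate the error log, assuming capturing output is acceptable here (illustrative only; run_conversion is a hypothetical helper, not code from this series):

    import subprocess

    def run_conversion(conversion_cmd: str):
        # capture_output=True wires up stdout/stderr pipes, so on failure
        # CalledProcessError.stdout / .stderr carry the child's output
        try:
            subprocess.run(conversion_cmd, shell=True, check=True, capture_output=True, text=True)
            return None
        except subprocess.CalledProcessError as e:
            return {"error_code": e.returncode, "error_stdout": e.stdout, "error_stderr": e.stderr}

In the script itself, the equivalent keyword arguments would go on the existing subprocess.run call inside convert_checkpoint.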