From ef7a31cc339ad28f432b0d72ad109717c56346b2 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 6 Sep 2024 09:55:02 -0700
Subject: [PATCH 01/89] batch convert checkpoint

---
 log.txt                              |  10 +++
 scripts/convert_checkpoints.py       |  75 +++++++++++++++
 scripts/convert_checkpoints_batch.py | 118 +++++++++++++++++++++++++++
 3 files changed, 203 insertions(+)
 create mode 100644 log.txt
 create mode 100644 scripts/convert_checkpoints.py
 create mode 100644 scripts/convert_checkpoints_batch.py

diff --git a/log.txt b/log.txt
new file mode 100644
index 000000000..a26fb9f41
--- /dev/null
+++ b/log.txt
@@ -0,0 +1,10 @@
+
+ o=======[]
+ __ _ _ _ |_ []
+ / _` | __ _ _ _ | |_ _ _ | || | []
+ \__, | / _` | | ' \ | _| | '_| \_, | _/ ]_
+ |___/ \__,_| |_||_| _\__| _|_|_ _|__/ |_____|
+_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"|
+ `---------------------------------------------'
+
+Experiment submitted, see progress at https://beaker.org/ex/01J743RVEV62XWWKXSVSFQHXH6
diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py
new file mode 100644
index 000000000..c9c9c4a3e
--- /dev/null
+++ b/scripts/convert_checkpoints.py
@@ -0,0 +1,75 @@
+# This script must be run from the root of the OLMo repo.
+# Requires the AWS CLI and Beaker Gantry to be installed and configured.
+
+
+import argparse
+import subprocess
+
+# Beaker secret keys
+AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID'
+AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY'
+
+SANITY_CHECK = False
+
+def convert_checkpoint(checkpoint_paths):
+
+    for cp in checkpoint_paths:
+        retain_path_name = cp.replace('s3://', '').strip('/')
+        load_dir = "/data/input"
+        weka_loc = f"{load_dir}/{retain_path_name}-hf/"
+        log_file = "log.txt"
+
+        cmd = f"gantry run " \
+              f"--description 'Converting {cp}' " \
+              f"--allow-dirty " \
+              f"--no-python " \
+              f"--workspace ai2/cheap-decisions " \
+              f"--priority normal " \
+              f"--gpus 0 " \
+              f"--preemptible " \
+              f"--cluster 'ai2/jupiter-cirrascale-2' " \
+              f"--budget ai2/oe-eval " \
+              f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \
+              f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \
+              f"--shared-memory 10GiB " \
+              f"--weka=oe-eval-default:{load_dir} " \
+              f"--yes " \
+              f"-- /bin/bash -c python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts"
+
+        #f"--mount weka://oe-eval-default={load_dir} "
+        # FIX THIS
+        if SANITY_CHECK:
+            print(cmd)
+        else:
+            try:
+                with open(log_file,'w') as fout:
+                    subprocess.run(cmd, shell=True, check=True, stdout=fout, stderr=subprocess.STDOUT)
+            except subprocess.CalledProcessError as e:
+                print(e.output)
+
+
+def read_checkpoints(f):
+    with open(f,'r') as fin:
+        checkpoints = [line for line in fin if line and line != '']
+    return checkpoints
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Unshard checkpoint and convert to HF format. Run via Gantry. Invoke this script from the root of the OLMo repo."
+    )
+
+    group_batch = parser.add_mutually_exclusive_group(required=True)
+    group_batch.add_argument("--checkpoint_path", help="path to sharded checkpoint", type=str)
+    group_batch.add_argument("--checkpoint_path_file", help="file that lists sharded checkpoint paths (batch run option)", type=str)
+
+    args = parser.parse_args()
+
+    if args.checkpoint_path is not None:
+        convert_checkpoint([args.checkpoint_path])
+    else:
+        convert_checkpoint(read_checkpoints(args.checkpoint_path_file))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py
new file mode 100644
index 000000000..3692e885c
--- /dev/null
+++ b/scripts/convert_checkpoints_batch.py
@@ -0,0 +1,118 @@
+"""
+Modification of s3_unshard_to_hf.py
+Wrapper for hf_olmo/convert_olmo_to_hf.py
+
+Takes a model checkpoint stored on S3, unshards it, and converts it to HF format.
+Saves the converted checkpoints to Weka.
+Requires the AWS CLI to be installed and configured.
+"""
+
+import argparse
+import pathlib
+import shutil
+import subprocess
+import os
+
+
+def convert_to_hf(args):
+    # Ensure local directory exists
+    if not os.path.exists(local_file_dir):
+        os.makedirs(local_file_dir)
+
+    # Convert to old-style checkpoint.
+    hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir} --destination-dir {local_file_dir}"
+    subprocess.run(hf_cmd, shell=True, check=True)
+
+    # Move to Weka
+    if not os.path.exists(weka_file_dir):
+        os.makedirs(weka_file_dir)
+
+
+
+    # Move the HF files from the unsharded dir to their own.
+    for fname in [
+        "config.json",
+        "pytorch_model.bin",
+        "special_tokens_map.json",
+        "tokenizer.json",
+        "tokenizer_config.json",
+    ]:
+        (unsharded_dir / fname).rename(hf_dir / fname)
+
+    # Upload the unsharded and HF files back to S3.
+    print("Uploading files back to S3.")
+    if not args.already_unsharded:
+        upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args)
+        subprocess.run(upload_unsharded_cmd, shell=True, check=True)
+
+    upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args)
+    subprocess.run(upload_hf_cmd, shell=True, check=True)
+
+def make_parser():
+    parser = argparse.ArgumentParser(
+        description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo."
+    )
+    parser.add_argument("--sharded_bucket", help="S3 bucket with sharded checkpoint.", type=str)
+    parser.add_argument(
+        "--unsharded_bucket",
+        help="S3 bucket to save the unsharded checkpoint.",
+        type=str,
+    )
+    parser.add_argument(
+        "--already_downloaded",
+        action="store_true",
+        help="Use this flag if the unsharded S3 checkpoint is already downloaded, but still needs to be unsharded.",
+    )
+    parser.add_argument(
+        "--already_unsharded",
+        action="store_true",
+        help="If given, the checkpoint has already been unsharded; just convert to HF.",
+    )
+    parser.add_argument("--hf_bucket", help="S3 bucket to save the HF-converted checkpoint.", type=str)
+    parser.add_argument(
+        "--local_dir",
+        help="""Directory to store checkpoints locally.""",
+        type=pathlib.Path,
+    )
+    parser.add_argument(
+        "--cleanup_local_dir",
+        action="store_true",
+        help="If given, remove the local directory if everything runs successfully to free up space on NFS.",
+    )
+    parser.add_argument(
+        "--checkpoint_style",
+        default="hf_olmo",
+        choices=["hf_olmo", "transformers"],
+        help="""Checkpoint style. The `transformers` style works with HF transformers as-is, while
+        `hf_olmo` relies on the `hf_olmo` package for conversion.
In general, use + `transformers` for external releases and `hf_olmo` for internal model + development.""", + ) + parser.add_argument( + "--hf_olmo", + action="store_true", + help="If given, convert to 'hf-olmo' style checkpoints.", + ) + parser.add_argument( + "--quiet", + action="store_true", + help="If given, don't show progress for AWS commands.", + ) + parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.") + parser.add_argument("--model_only", action="store_true", help="If given, only unshard the model.") + return parser + +def main(): + parser = make_parser() + args = parser.parse_args() + args.local_dir.mkdir(exist_ok=True, parents=True) + + s3_unshard_to_hf(args) + + if args.cleanup_local_dir: + # Clear out temp dir if we got here (everything ran without error). + shutil.rmtree(args.tmp_dir) + + +if __name__ == "__main__": + main() From 62d9a1de78670db1c0492d50b5ebfd2955c145d8 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:11:22 -0700 Subject: [PATCH 02/89] batch convert checkpoint --- log.txt | 2 +- scripts/convert_checkpoints.py | 6 ++---- scripts/convert_checkpoints.sh | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) create mode 100644 scripts/convert_checkpoints.sh diff --git a/log.txt b/log.txt index a26fb9f41..16a1a62e7 100644 --- a/log.txt +++ b/log.txt @@ -7,4 +7,4 @@ _|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| `---------------------------------------------' -Experiment submitted, see progress at https://beaker.org/ex/01J743RVEV62XWWKXSVSFQHXH6 +Experiment submitted, see progress at https://beaker.org/ex/01J7446KB7EXZ35D8NST0JTNTY diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index c9c9c4a3e..2cbbcd301 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -9,7 +9,7 @@ AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' -SANITY_CHECK = False +SANITY_CHECK = True def convert_checkpoint(checkpoint_paths): @@ -17,7 +17,6 @@ def convert_checkpoint(checkpoint_paths): retain_path_name = cp.replace('s3://', '').strip('/') load_dir = "/data/input" weka_loc = f"{load_dir}/{retain_path_name}-hf/" - log_file = "log.txt" cmd = f"gantry run " \ f"--description 'Converting {cp}' " \ @@ -42,8 +41,7 @@ def convert_checkpoint(checkpoint_paths): print(cmd) else: try: - with open(log_file,'w') as fout: - subprocess.run(cmd, shell=True, check=True, stdout=fout, stderr=subprocess.STDOUT) + subprocess.run(cmd, shell=True, check=True) except subprocess.CalledProcessError as e: print(e.output) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh new file mode 100644 index 000000000..8caffa73d --- /dev/null +++ b/scripts/convert_checkpoints.sh @@ -0,0 +1,16 @@ +gantry run \ + --description "Converting ${CHECKPOINT_PATH}" \ + --allow-dirty \ + --no-python \ + --workspace ai2/cheap-decisions \ + --priority normal \ + --gpus 0 \ + --preemptible \ + --cluster ai2/jupiter-cirrascale-2 \ + --budget ai2/oe-eval \ + --env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} \ + --env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} \ + --shared-memory 10GiB \ + --weka=oe-eval-default:/data/input \ + --yes \ + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" \ No newline at end of file From dba011b2e435df2167e8639259e04fb617befb68 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:12:34 -0700 
Subject: [PATCH 03/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 8caffa73d..3e822ee85 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -8,8 +8,8 @@ gantry run \ --preemptible \ --cluster ai2/jupiter-cirrascale-2 \ --budget ai2/oe-eval \ - --env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} \ - --env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} \ + --env-secret AWS_ACCESS_KEY_ID=JENA_AWS_ACCESS_KEY_ID \ + --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KE \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ From 84a4f1b7e9353f4586ecd20e3489a1244c625603 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 10:13:04 -0700 Subject: [PATCH 04/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 3e822ee85..12c9e36e6 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -9,7 +9,7 @@ gantry run \ --cluster ai2/jupiter-cirrascale-2 \ --budget ai2/oe-eval \ --env-secret AWS_ACCESS_KEY_ID=JENA_AWS_ACCESS_KEY_ID \ - --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KE \ + --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ From 9a7b03b6df674973026e1be45338b30c0edd6bdf Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:09:52 -0700 Subject: [PATCH 05/89] batch convert checkpoint --- requirements.txt | 3 +++ requirements.txt~ | 1 + scripts/convert_checkpoints.sh | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 requirements.txt create mode 100644 requirements.txt~ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..c32a09983 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +torch +omegaconf +tqdm diff --git a/requirements.txt~ b/requirements.txt~ new file mode 100644 index 000000000..12c6d5d5e --- /dev/null +++ b/requirements.txt~ @@ -0,0 +1 @@ +torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 12c9e36e6..d067b4681 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -13,4 +13,4 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" \ No newline at end of file + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From d4687e99e3a15f922f87377cdb179b8aaf26ffad Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:15:07 -0700 Subject: [PATCH 06/89] batch convert checkpoint --- requirements.txt~ | 1 - scripts/convert_checkpoints.sh | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 requirements.txt~ diff --git a/requirements.txt~ b/requirements.txt~ deleted file mode 100644 index 12c6d5d5e..000000000 --- a/requirements.txt~ +++ 
/dev/null @@ -1 +0,0 @@ -torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index d067b4681..6babf8658 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -13,4 +13,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ + --pip requirements.txt \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From 24ec144881f9990eed74047ac75efa6ed309ef53 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:16:57 -0700 Subject: [PATCH 07/89] batch convert checkpoint --- scripts/convert_checkpoints.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 6babf8658..6e1992c5b 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,5 +1,4 @@ gantry run \ - --description "Converting ${CHECKPOINT_PATH}" \ --allow-dirty \ --no-python \ --workspace ai2/cheap-decisions \ From 6187889f9caa4b23ecdcf387380afb9e76d093cb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:23:29 -0700 Subject: [PATCH 08/89] batch convert checkpoint --- environments.yml | 8 ++++++++ requirements.txt | 3 --- 2 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 environments.yml delete mode 100644 requirements.txt diff --git a/environments.yml b/environments.yml new file mode 100644 index 000000000..409aa9047 --- /dev/null +++ b/environments.yml @@ -0,0 +1,8 @@ +name: torch-env +channels: +- pytorch +dependencies: +- python=3.9 +- pytorch +- omegaconf +- tqdm \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c32a09983..000000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -omegaconf -tqdm From fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:25:51 -0700 Subject: [PATCH 09/89] batch convert checkpoint --- environments.yml => environment.yml | 0 scripts/convert_checkpoints.sh | 1 - 2 files changed, 1 deletion(-) rename environments.yml => environment.yml (100%) diff --git a/environments.yml b/environment.yml similarity index 100% rename from environments.yml rename to environment.yml diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 6e1992c5b..965cd647f 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,5 +12,4 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - --pip requirements.txt \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From a862a0b943642629c03289b81bdf9a714472dd7c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:34:36 -0700 Subject: [PATCH 10/89] tinkering --- .gitignore | 3 +++ environment.yml | 8 -------- 2 files changed, 3 insertions(+), 8 
deletions(-) delete mode 100644 environment.yml diff --git a/.gitignore b/.gitignore index 9b1e99785..e0f77ccd8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# beaker yaml +guided-trout-2f805b9.yaml + # build artifacts .eggs/ diff --git a/environment.yml b/environment.yml deleted file mode 100644 index 409aa9047..000000000 --- a/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: torch-env -channels: -- pytorch -dependencies: -- python=3.9 -- pytorch -- omegaconf -- tqdm \ No newline at end of file From 8d79a01fd4987311fb1d75b4859e7d597038eede Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 12:53:54 -0700 Subject: [PATCH 11/89] testing --- requirements.txt | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..269595af9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,149 @@ +ai2-olmo==0.5.0 +ai2-olmo-core==0.1.0 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +async-timeout==4.0.3 +attrs==24.2.0 +backoff==2.1.2 +backports.tarfile==1.2.0 +beaker-gantry==1.8.3 +beaker-py==1.31.3 +black==23.12.1 +boltons==24.0.0 +boto3==1.35.6 +boto3-extensions==0.23.0 +botocore==1.35.6 +build==1.2.1 +cached_path==1.6.3 +cachetools==5.5.0 +certifi==2024.7.4 +charset-normalizer==3.3.2 +click==8.1.7 +click-aliases==1.0.4 +click-help-colors==0.9.4 +colorama==0.4.6 +datasets==2.7.1 +dateparser==1.2.0 +dill==0.3.6 +docker==7.1.0 +docker-pycreds==0.4.0 +docutils==0.21.2 +exceptiongroup==1.2.2 +face==20.1.1 +filelock==3.13.4 +frozenlist==1.4.1 +fsspec==2024.6.1 +ftfy==6.2.3 +gantry==0.6.14 +gitdb==4.0.11 +GitPython==3.1.43 +glom==23.5.0 +google-api-core==2.19.2 +google-auth==2.34.0 +google-cloud-core==2.4.1 +google-cloud-storage==2.18.2 +google-crc32c==1.6.0 +google-resumable-media==2.7.2 +googleapis-common-protos==1.65.0 +halo==0.0.31 +huggingface-hub==0.23.5 +idna==3.8 +importlib_metadata==8.4.0 +importlib_resources==6.4.4 +iniconfig==2.0.0 +isodate==0.6.1 +isort==5.12.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.0.2 +Jinja2==3.1.4 +jmespath==1.0.1 +joblib==1.4.2 +keyring==25.3.0 +lightning-utilities==0.11.7 +log-symbols==0.0.14 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdurl==0.1.2 +more-itertools==10.5.0 +mpmath==1.3.0 +msgspec==0.18.6 +multidict==6.0.5 +multiprocess==0.70.14 +mypy==1.3.0 +mypy-extensions==1.0.0 +necessary==0.4.3 +networkx==3.3 +nh3==0.2.18 +numpy==2.1.0 +omegaconf==2.3.0 +packaging==24.1 +pandas==2.2.2 +pathspec==0.12.1 +petname==2.6 +pkginfo==1.10.0 +platformdirs==4.2.2 +pluggy==1.5.0 +proto-plus==1.24.0 +protobuf==5.28.0 +psutil==6.0.0 +pyarrow==17.0.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pydantic==2.8.2 +pydantic_core==2.20.1 +Pygments==2.18.0 +pyproject_hooks==1.1.0 +pytest==8.3.2 +pytest-sphinx==0.6.3 +python-dateutil==2.9.0.post0 +pytz==2024.1 +PyYAML==6.0.2 +readme_renderer==44.0 +regex==2024.7.24 +requests==2.32.3 +requests-toolbelt==1.0.0 +requirements-parser==0.11.0 +responses==0.18.0 +rfc3986==2.0.0 +rich==13.8.0 +rsa==4.9 +ruff==0.6.4 +s3transfer==0.10.2 +safetensors==0.4.5 +scikit-learn==1.5.1 +scipy==1.14.1 +sentry-sdk==2.13.0 +setproctitle==1.3.3 +six==1.16.0 +smart-open==7.0.4 +smashed==0.21.5 +smmap==5.0.1 +spinners==0.0.24 +sympy==1.13.1 +tabulate==0.9.0 +termcolor==2.4.0 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +tomli==2.0.1 +torch==2.2.2 +torchmetrics==1.4.1 +tqdm==4.66.5 +transformers==4.44.2 
+trouting==0.3.3 +twine==5.1.1 +typeguard==2.13.3 +types-setuptools==74.1.0.20240906 +typing_extensions==4.12.2 +tzdata==2024.1 +tzlocal==5.2 +urllib3==2.2.2 +wandb==0.17.9 +wcwidth==0.2.13 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.9.4 +zipp==3.20.1 From b4ed78dabe2665537a497c044a0fc70b29f1bb2b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:04:59 -0700 Subject: [PATCH 12/89] testing --- requirements.txt | 150 +-------------------------------- scripts/convert_checkpoints.sh | 1 + 2 files changed, 2 insertions(+), 149 deletions(-) diff --git a/requirements.txt b/requirements.txt index 269595af9..679c6f744 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,149 +1 @@ -ai2-olmo==0.5.0 -ai2-olmo-core==0.1.0 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.5 -aiosignal==1.3.1 -annotated-types==0.7.0 -antlr4-python3-runtime==4.9.3 -async-timeout==4.0.3 -attrs==24.2.0 -backoff==2.1.2 -backports.tarfile==1.2.0 -beaker-gantry==1.8.3 -beaker-py==1.31.3 -black==23.12.1 -boltons==24.0.0 -boto3==1.35.6 -boto3-extensions==0.23.0 -botocore==1.35.6 -build==1.2.1 -cached_path==1.6.3 -cachetools==5.5.0 -certifi==2024.7.4 -charset-normalizer==3.3.2 -click==8.1.7 -click-aliases==1.0.4 -click-help-colors==0.9.4 -colorama==0.4.6 -datasets==2.7.1 -dateparser==1.2.0 -dill==0.3.6 -docker==7.1.0 -docker-pycreds==0.4.0 -docutils==0.21.2 -exceptiongroup==1.2.2 -face==20.1.1 -filelock==3.13.4 -frozenlist==1.4.1 -fsspec==2024.6.1 -ftfy==6.2.3 -gantry==0.6.14 -gitdb==4.0.11 -GitPython==3.1.43 -glom==23.5.0 -google-api-core==2.19.2 -google-auth==2.34.0 -google-cloud-core==2.4.1 -google-cloud-storage==2.18.2 -google-crc32c==1.6.0 -google-resumable-media==2.7.2 -googleapis-common-protos==1.65.0 -halo==0.0.31 -huggingface-hub==0.23.5 -idna==3.8 -importlib_metadata==8.4.0 -importlib_resources==6.4.4 -iniconfig==2.0.0 -isodate==0.6.1 -isort==5.12.0 -jaraco.classes==3.4.0 -jaraco.context==6.0.1 -jaraco.functools==4.0.2 -Jinja2==3.1.4 -jmespath==1.0.1 -joblib==1.4.2 -keyring==25.3.0 -lightning-utilities==0.11.7 -log-symbols==0.0.14 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -mdurl==0.1.2 -more-itertools==10.5.0 -mpmath==1.3.0 -msgspec==0.18.6 -multidict==6.0.5 -multiprocess==0.70.14 -mypy==1.3.0 -mypy-extensions==1.0.0 -necessary==0.4.3 -networkx==3.3 -nh3==0.2.18 -numpy==2.1.0 -omegaconf==2.3.0 -packaging==24.1 -pandas==2.2.2 -pathspec==0.12.1 -petname==2.6 -pkginfo==1.10.0 -platformdirs==4.2.2 -pluggy==1.5.0 -proto-plus==1.24.0 -protobuf==5.28.0 -psutil==6.0.0 -pyarrow==17.0.0 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pydantic==2.8.2 -pydantic_core==2.20.1 -Pygments==2.18.0 -pyproject_hooks==1.1.0 -pytest==8.3.2 -pytest-sphinx==0.6.3 -python-dateutil==2.9.0.post0 -pytz==2024.1 -PyYAML==6.0.2 -readme_renderer==44.0 -regex==2024.7.24 -requests==2.32.3 -requests-toolbelt==1.0.0 -requirements-parser==0.11.0 -responses==0.18.0 -rfc3986==2.0.0 -rich==13.8.0 -rsa==4.9 -ruff==0.6.4 -s3transfer==0.10.2 -safetensors==0.4.5 -scikit-learn==1.5.1 -scipy==1.14.1 -sentry-sdk==2.13.0 -setproctitle==1.3.3 -six==1.16.0 -smart-open==7.0.4 -smashed==0.21.5 -smmap==5.0.1 -spinners==0.0.24 -sympy==1.13.1 -tabulate==0.9.0 -termcolor==2.4.0 -threadpoolctl==3.5.0 -tokenizers==0.19.1 -tomli==2.0.1 -torch==2.2.2 -torchmetrics==1.4.1 -tqdm==4.66.5 -transformers==4.44.2 -trouting==0.3.3 -twine==5.1.1 -typeguard==2.13.3 -types-setuptools==74.1.0.20240906 -typing_extensions==4.12.2 -tzdata==2024.1 -tzlocal==5.2 -urllib3==2.2.2 -wandb==0.17.9 -wcwidth==0.2.13 -wrapt==1.16.0 -xxhash==3.5.0 -yarl==1.9.4 -zipp==3.20.1 +pytorch diff --git 
a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 965cd647f..bd0f6a9ce 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,4 +12,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ + --install install_torch.sh \ -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file From 2f2a764a36a116a77e9288f1338a4a7d55ad9e84 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:05:56 -0700 Subject: [PATCH 13/89] testing --- install_torch.sh | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 install_torch.sh diff --git a/install_torch.sh b/install_torch.sh new file mode 100644 index 000000000..5ac68ad6e --- /dev/null +++ b/install_torch.sh @@ -0,0 +1,2 @@ +#!/bin/bash +pip install torch From b9601c4a81e3f26be17eba804b7266c3e6a5e73b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:11:09 -0700 Subject: [PATCH 14/89] testing --- requirements.txt | 2 +- scripts/convert_checkpoints.sh | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 679c6f744..12c6d5d5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytorch +torch diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index bd0f6a9ce..55da72f44 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,6 +1,5 @@ gantry run \ --allow-dirty \ - --no-python \ --workspace ai2/cheap-decisions \ --priority normal \ --gpus 0 \ @@ -11,6 +10,8 @@ gantry run \ --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ + --pip requirements.txt \ --yes \ - --install install_torch.sh \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" \ No newline at end of file + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" + +# --install install_torch.sh \ From 8cc86ee9ddbba3d97aee615235892ae7317fa1f8 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:19:25 -0700 Subject: [PATCH 15/89] testing --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 12c6d5d5e..c00bc2475 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ torch +datasets From 50e7090023cd02d02b04d5b2408752e4ef0aa1ff Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:20:03 -0700 Subject: [PATCH 16/89] testing --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt 
b/requirements.txt index c00bc2475..b70cc172e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,5 @@ torch datasets +rich +botocore +cachedpath From 8aa450f8ce653f3532e80129bdc9da6f2bfe12a9 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 13:25:49 -0700 Subject: [PATCH 17/89] testing --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b70cc172e..9339fe636 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ torch datasets rich botocore -cachedpath +cached-path From f35732083d50619aa2af7c0492f915e4ee18ce73 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:29:47 -0700 Subject: [PATCH 18/89] testing --- requirements.txt | 1 + scripts/convert_checkpoints.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9339fe636..f439913b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ datasets rich botocore cached-path +transformers diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 55da72f44..7d09df0d8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From 02899a303617fc799fea0226af77ade3f8c08070 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:30:48 -0700 Subject: [PATCH 19/89] testing --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 7d09df0d8..0b921fd9c 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts 
--tokenizer_name_or_path 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From 9ac37396d784ccca3eeab01504cd694008e4ec9d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 6 Sep 2024 14:31:21 -0700 Subject: [PATCH 20/89] testing --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 0b921fd9c..dcc7e5b02 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer_name_or_path 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" # --install install_torch.sh \ From ef0b4034f57add1b1a796283e88c0e5d635ef188 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 11:23:09 -0700 Subject: [PATCH 21/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 73 +++++------- scripts/convert_checkpoints_batch.py | 163 +++++++++++---------------- 2 files changed, 96 insertions(+), 140 deletions(-) diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index 2cbbcd301..6d51b3149 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -11,45 +11,33 @@ SANITY_CHECK = True -def convert_checkpoint(checkpoint_paths): - - for cp in checkpoint_paths: - retain_path_name = cp.replace('s3://', '').strip('/') - load_dir = "/data/input" - weka_loc = f"{load_dir}/{retain_path_name}-hf/" - - cmd = f"gantry run " \ - f"--description 'Converting {cp}' " \ - f"--allow-dirty " \ - f"--no-python " \ - f"--workspace ai2/cheap-decisions " \ - f"--priority normal " \ - f"--gpus 0 " \ - f"--preemptible " \ - f"--cluster 'ai2/jupiter-cirrascale-2' " \ - f"--budget ai2/oe-eval " \ - f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ - f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ - f"--shared-memory 10GiB " \ - f"--weka=oe-eval-default:{load_dir} " \ - f"--yes " \ - f"-- /bin/bash -c python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{cp}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" - - #f"--mount weka://oe-eval-default={load_dir} " - # FIX THIS - if SANITY_CHECK: - print(cmd) - else: - try: - subprocess.run(cmd, shell=True, check=True) - except subprocess.CalledProcessError as e: - print(e.output) +def convert_checkpoints(args): + cmd = f"gantry run " \ + f"--allow-dirty " \ + f"--workspace ai2/cheap-decisions " \ + f"--priority normal " \ + f"--gpus 0 " \ + f"--preemptible " \ + f"--cluster 'ai2/jupiter-cirrascale-2' " \ + f"--budget ai2/oe-eval " \ + f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ + 
f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ + f"--shared-memory 10GiB " \ + f"--weka=oe-eval-default:{args.weka_load_dir} " \ + f"--yes " + if args.checkpoint_path is not None: + cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path '{args.checkpoint_path}' --weka-load-dir {args.weka_load_dir}" + else: + cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path-file '{args.checkpoint_path_file}' --weka-load-dir {args.weka_load_dir}" -def read_checkpoints(f): - with open(f,'r') as fin: - checkpoints = [line for line in f if line and line != ''] - return checkpoints + if SANITY_CHECK: + print(cmd) + else: + try: + subprocess.run(cmd, shell=True, check=True) + except subprocess.CalledProcessError as e: + print(e.output) def main(): @@ -58,15 +46,12 @@ def main(): ) group_batch = parser.add_mutually_exclusive_group(required=True) - group_batch.add_argument("--checkpoint_path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint_path_file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) + group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) args = parser.parse_args() - - if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path]) - else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file)) + convert_checkpoints(args) if __name__ == "__main__": diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 3692e885c..654504d55 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -8,110 +8,81 @@ """ import argparse -import pathlib -import shutil import subprocess import os +SANITY_CHECK = True + +def convert_checkpoint(cps, load_dir="/data/input"): + cps = expand_paths(cps) + save = {} + + for checkpoint_path in cps: + # Convert to old-style checkpoint. + + retain_path_name = checkpoint_path.replace('s3://', '').strip('/') + weka_loc = f"{load_dir}/{retain_path_name}-hf/" + + # Check if the output location is already there. If not, do the conversion. 
+ if os.path.exists(weka_loc): + conversion = 'existing' + else: + conversion = 'new' + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" + + if SANITY_CHECK: + print(conversion_cmd) + else: + subprocess.run(conversion_cmd, shell=True, check=True) + + save[checkpoint_path] = {'converted_path': weka_loc, 'convertion': conversion} + + print(save) + +def expand_paths(cps): + expanded = [] + for cp in cps: + segs = cp.split('*') + prefix = 's3://ai2-llm/' + cmd = f"aws s3 ls --recursive {segs[0]}" + all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout + relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] + + print(search_segs) + + temp_dirs = relevant_dirs + if len(search_segs) > 0: + for s in search_segs: + temp_dirs = [d for d in temp_dirs if s in d] + + exp = set([f"{prefix}{d}" for d in temp_dirs]) + print(exp) + + expanded += exp + return expanded + + +def read_checkpoints(f): + with open(f, 'r') as fin: + checkpoints = [line for line in fin if line and line != ''] + return checkpoints -def convert_to_hf(args): - # Ensure local directory exists - if not os.path.exists(local_file_dir): - os.makedirs(local_file_dir) - - # Convert to old-style checkpoint. - hf_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir {unsharded_dir} --destination-dir {local_file_dir}" - subprocess.run(hf_cmd, shell=True, check=True) - - # Move to Weka - if not os.path.exists(weka_file_dir): - os.makedirs(weka_file_dir) - - - - # Move the HF files from the unsharded dir to their own. - for fname in [ - "config.json", - "pytorch_model.bin", - "special_tokens_map.json", - "tokenizer.json", - "tokenizer_config.json", - ]: - (unsharded_dir / fname).rename(hf_dir / fname) - - # Upload the unsharded and HF files back to S3. - print("Uploading files back to S3.") - if not args.already_unsharded: - upload_unsharded_cmd = aws_copy(unsharded_dir, args.unsharded_bucket, args) - subprocess.run(upload_unsharded_cmd, shell=True, check=True) - - upload_hf_cmd = aws_copy(hf_dir, args.hf_bucket, args) - subprocess.run(upload_hf_cmd, shell=True, check=True) - -def make_parser(): - parser = argparse.ArgumentParser( - description="Unshard S3 checkpoint and convert to HF format. Invoke this script from the root of the OLMo repo." 
- ) - parser.add_argument("--sharded_bucket", help="S3 bucket with sharded checkpoint.", type=str) - parser.add_argument( - "--unsharded_bucket", - help="S3 bucket to save the unsharded checkpoint.", - type=str, - ) - parser.add_argument( - "--already_downloaded", - action="store_true", - help="Use this flag if the unsharded S3 checkpoint is already downloaded, but still needs to be unsharded.", - ) - parser.add_argument( - "--already_unsharded", - action="store_true", - help="If given, the checkpoint has already been unsharded; just convert to HF.", - ) - parser.add_argument("--hf_bucket", help="S3 bucket to save the HF-converted checkpoint.", type=str) - parser.add_argument( - "--local_dir", - help="""Directory to store checkpoints locally.""", - type=pathlib.Path, - ) - parser.add_argument( - "--cleanup_local_dir", - action="store_true", - help="If given, remove the local directory if everything runs successfully to free up space on NFS.", - ) - parser.add_argument( - "--checkpoint_style", - default="hf_olmo", - choices=["hf_olmo", "transformers"], - help="""Checkpoint style. The `transformers` style works with HF transformers as-is, while - `hf_olmo` relies on the `hf_olmo` package for conversion. In general, use - `transformers` for external releases and `hf_olmo` for internal model - development.""", - ) - parser.add_argument( - "--hf_olmo", - action="store_true", - help="If given, convert to 'hf-olmo' style checkpoints.", - ) - parser.add_argument( - "--quiet", - action="store_true", - help="If given, don't show progress for AWS commands.", - ) - parser.add_argument("--type", default=None, help="If given, pass this argument on to `unshard.py`.") - parser.add_argument("--model_only", action="store_true", help="If given, only unshard the model.") - return parser def main(): - parser = make_parser() - args = parser.parse_args() - args.local_dir.mkdir(exist_ok=True, parents=True) + parser = argparse.ArgumentParser() - s3_unshard_to_hf(args) + group_batch = parser.add_mutually_exclusive_group(required=True) + group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) + group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) + parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + + args = parser.parse_args() - if args.cleanup_local_dir: - # Clear out temp dir if we got here (everything ran without error). - shutil.rmtree(args.tmp_dir) + if args.checkpoint_path is not None: + convert_checkpoint([args.checkpoint_path], args.weka_load_dir) + else: + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), args.weka_load_dir) if __name__ == "__main__": From c0ff18605fbcdb6fc7dde95a33f848b4b6225a60 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:23:18 -0700 Subject: [PATCH 22/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 654504d55..96087cf4c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -10,12 +10,13 @@ import argparse import subprocess import os +import json SANITY_CHECK = True def convert_checkpoint(cps, load_dir="/data/input"): cps = expand_paths(cps) - save = {} + processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. 
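
The two path rules the batch script settles on at this point stay fixed for the rest of
the series, so a minimal sketch of both may help; the paths below are purely
hypothetical, and the real expand_paths additionally restricts the S3 listing to
directories that contain model.pt:

    # 1) S3 checkpoint -> Weka destination: drop the scheme, append '-hf'.
    cp = "s3://ai2-llm/checkpoints/example-run/step1000-unsharded"  # hypothetical
    weka_loc = "/data/input/" + cp.replace("s3://", "").strip("/") + "-hf/"
    # -> /data/input/ai2-llm/checkpoints/example-run/step1000-unsharded-hf/

    # 2) Wildcard expansion: split on '*'; the first segment seeds the S3 listing,
    #    and every later non-empty segment acts as a substring filter.
    pattern = "s3://ai2-llm/checkpoints/example-run/*step99*"  # hypothetical
    segs = pattern.split("*")
    listing = [
        "checkpoints/example-run/step99000-unsharded",  # stand-ins for the listing
        "checkpoints/example-run/step1000-unsharded",
    ]
    matches = [d for d in listing if all(s in d for s in segs[1:] if s)]
    # -> ['checkpoints/example-run/step99000-unsharded']
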
@@ -35,9 +36,14 @@ def convert_checkpoint(cps, load_dir="/data/input"): else: subprocess.run(conversion_cmd, shell=True, check=True) - save[checkpoint_path] = {'converted_path': weka_loc, 'convertion': conversion} + processed.append({ + 'unproccessed_path': checkpoint_path, + 'converted_path': weka_loc, + 'convertion': conversion}) - print(save) + with open('/data/input/jenah/log.jsonl','a+') as fout: + for p in processed: + fout.write(json.dumps(p)+'\n') def expand_paths(cps): expanded = [] @@ -49,7 +55,7 @@ def expand_paths(cps): relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - print(search_segs) + print(f"search segments: {search_segs}") temp_dirs = relevant_dirs if len(search_segs) > 0: From 15092ae968e1f18512670d47d00fc88b14f61916 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:25:37 -0700 Subject: [PATCH 23/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 2 +- scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py index 6d51b3149..415f6e387 100644 --- a/scripts/convert_checkpoints.py +++ b/scripts/convert_checkpoints.py @@ -9,7 +9,7 @@ AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoints(args): cmd = f"gantry run " \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 96087cf4c..3c07de860 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -12,7 +12,7 @@ import os import json -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): cps = expand_paths(cps) From c489f53dba1f250a69ab5940318dd87060abf416 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 12:45:19 -0700 Subject: [PATCH 24/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 124 ++++++++++++++++++++++++++- scripts/convert_checkpoints_batch.py | 4 +- 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index dcc7e5b02..319375032 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,3 +1,125 @@ +##!/usr/bin/env bash +# +## RUN AT THE TOP OF THE OLMo root +# +#CHECKPOINT_PATH=$1 +#shift +# +#SUFFIX="hf" +#WORKSPACE="" +#BUDGET="" +#PRIORITY="normal" +# +#while getopts "s:w:b:p:t:" opt; do +# case $opt in +# s) +# SUFFIX="$OPTARG" +# ;; +# w) +# WORKSPACE="$OPTARG" +# ;; +# b) +# BUDGET="$OPTARG" +# ;; +# p) +# PRIORITY="$OPTARG" +# ;; +# t) +# CUSTOM_TOKENIZER="--tokenizer $OPTARG" +# ;; +# \?) +# echo "Invalid option: -$OPTARG" >&2 +# exit 1 +# ;; +# esac +#done +# +## Set default values if not specified +#if [ -z "$WORKSPACE" ]; then +# WORKSPACE="ai2/oe-data" +#fi +# +#if [ -z "$BUDGET" ]; then +# BUDGET="$WORKSPACE" +#fi +# +## Verify that a path has been provided +#if [ -z "$CHECKPOINT_PATH" ]; then +# echo "Error: No path provided." +# exit 1 +#fi +# +## Check if CHECKPOINT_PATH is an s3:// path or an absolute path +#if [[ ! "$CHECKPOINT_PATH" =~ ^s3:// ]] && [[ ! "$CHECKPOINT_PATH" =~ ^/ ]]; then +# echo "Error: CHECKPOINT_PATH must be an s3:// path or an absolute path." 
+# exit 1 +#fi +# +# +## Extract weka_mountpoint if checkpoint path starts with specific directories +#CLUSTERS="ai2/*" +#for dir in climate-default mosaic-default nora-default oe-adapt-default oe-data-default oe-eval-default oe-training-default prior-default reviz-default skylight-default; do +# if [[ $CHECKPOINT_PATH == "/$dir"* ]]; then +# WEKA_MOUNTPOINTS=" --weka=${dir}:/${dir}" +# # Override clusters to use only jupiter-cirrascale-2 +# CLUSTERS="ai2/jupiter-cirrascale-2" +# break +# fi +#done +# +# +## Function to check if S3 path exists +#check_s3_path() { +# aws s3 ls "$1" > /dev/null 2>&1 +# return $? +#} +# +## Check if the provided path exists (only for S3 paths) +#if [[ "$CHECKPOINT_PATH" =~ ^s3:// ]]; then +# if check_s3_path "$CHECKPOINT_PATH"; then +# echo "S3 path exists: $CHECKPOINT_PATH" +# else +# echo "Error: S3 path does not exist: $CHECKPOINT_PATH" +# exit 1 +# fi +#else +# echo "Skipping existence check for non-S3 path: $CHECKPOINT_PATH" +#fi +# +#commands=( +# "pip install awscli" +# "git clone https://github.com/allenai/OLMo.git" +# "cd OLMo" +# "pip install -e '.[all]'" +# "if [ ! -d '${CHECKPOINT_PATH}-${SUFFIX}' ]; then python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '$CHECKPOINT_PATH' --destination-dir '${CHECKPOINT_PATH}-${SUFFIX}' --keep-olmo-artifacts ${CUSTOM_TOKENIZER}; else echo 'Destination directory already exists. Skipping conversion.'; fi" +#) +# +# +#for cmd in "${commands[@]}"; do +# if [ -z "$joined_commands" ]; then +# joined_commands="$cmd" +# else +# joined_commands="$joined_commands && $cmd" +# fi +#done +# +#gantry run \ +# --description "Converting ${CHECKPOINT_PATH}" \ +# --allow-dirty \ +# --no-python \ +# --workspace ${WORKSPACE} \ +# --priority ${PRIORITY} \ +# --gpus 0 \ +# --preemptible \ +# --cluster ${CLUSTERS} \ +# --budget ${BUDGET} \ +# --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ +# --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ +# --shared-memory 10GiB \ +# ${WEKA_MOUNTPOINTS} \ +# --yes \ +# -- /bin/bash -c "${joined_commands}" + gantry run \ --allow-dirty \ --workspace ai2/cheap-decisions \ @@ -12,6 +134,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --destination-dir '/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded' --keep-olmo-artifacts --tokenizer 'olmo_data/tokenizers/allenai_gpt-neox-olmo-dolma-v1_5.json'" + -- /bin/bash -c "python convert_checkpoints_batch.pyq --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 3c07de860..d1c6678a2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -29,7 +29,9 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'existing' else: conversion = 'new' - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' 
--keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + + if SANITY_CHECK: print(conversion_cmd) From 2ed6a50e60e686a448e3320749b4e97e71428c2d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:23:34 -0700 Subject: [PATCH 25/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 124 +-------------------------- scripts/convert_checkpoints_batch.py | 57 ++++++++---- 2 files changed, 43 insertions(+), 138 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 319375032..3b1d463e2 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,125 +1,3 @@ -##!/usr/bin/env bash -# -## RUN AT THE TOP OF THE OLMo root -# -#CHECKPOINT_PATH=$1 -#shift -# -#SUFFIX="hf" -#WORKSPACE="" -#BUDGET="" -#PRIORITY="normal" -# -#while getopts "s:w:b:p:t:" opt; do -# case $opt in -# s) -# SUFFIX="$OPTARG" -# ;; -# w) -# WORKSPACE="$OPTARG" -# ;; -# b) -# BUDGET="$OPTARG" -# ;; -# p) -# PRIORITY="$OPTARG" -# ;; -# t) -# CUSTOM_TOKENIZER="--tokenizer $OPTARG" -# ;; -# \?) -# echo "Invalid option: -$OPTARG" >&2 -# exit 1 -# ;; -# esac -#done -# -## Set default values if not specified -#if [ -z "$WORKSPACE" ]; then -# WORKSPACE="ai2/oe-data" -#fi -# -#if [ -z "$BUDGET" ]; then -# BUDGET="$WORKSPACE" -#fi -# -## Verify that a path has been provided -#if [ -z "$CHECKPOINT_PATH" ]; then -# echo "Error: No path provided." -# exit 1 -#fi -# -## Check if CHECKPOINT_PATH is an s3:// path or an absolute path -#if [[ ! "$CHECKPOINT_PATH" =~ ^s3:// ]] && [[ ! "$CHECKPOINT_PATH" =~ ^/ ]]; then -# echo "Error: CHECKPOINT_PATH must be an s3:// path or an absolute path." -# exit 1 -#fi -# -# -## Extract weka_mountpoint if checkpoint path starts with specific directories -#CLUSTERS="ai2/*" -#for dir in climate-default mosaic-default nora-default oe-adapt-default oe-data-default oe-eval-default oe-training-default prior-default reviz-default skylight-default; do -# if [[ $CHECKPOINT_PATH == "/$dir"* ]]; then -# WEKA_MOUNTPOINTS=" --weka=${dir}:/${dir}" -# # Override clusters to use only jupiter-cirrascale-2 -# CLUSTERS="ai2/jupiter-cirrascale-2" -# break -# fi -#done -# -# -## Function to check if S3 path exists -#check_s3_path() { -# aws s3 ls "$1" > /dev/null 2>&1 -# return $? -#} -# -## Check if the provided path exists (only for S3 paths) -#if [[ "$CHECKPOINT_PATH" =~ ^s3:// ]]; then -# if check_s3_path "$CHECKPOINT_PATH"; then -# echo "S3 path exists: $CHECKPOINT_PATH" -# else -# echo "Error: S3 path does not exist: $CHECKPOINT_PATH" -# exit 1 -# fi -#else -# echo "Skipping existence check for non-S3 path: $CHECKPOINT_PATH" -#fi -# -#commands=( -# "pip install awscli" -# "git clone https://github.com/allenai/OLMo.git" -# "cd OLMo" -# "pip install -e '.[all]'" -# "if [ ! -d '${CHECKPOINT_PATH}-${SUFFIX}' ]; then python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '$CHECKPOINT_PATH' --destination-dir '${CHECKPOINT_PATH}-${SUFFIX}' --keep-olmo-artifacts ${CUSTOM_TOKENIZER}; else echo 'Destination directory already exists. 
Skipping conversion.'; fi" -#) -# -# -#for cmd in "${commands[@]}"; do -# if [ -z "$joined_commands" ]; then -# joined_commands="$cmd" -# else -# joined_commands="$joined_commands && $cmd" -# fi -#done -# -#gantry run \ -# --description "Converting ${CHECKPOINT_PATH}" \ -# --allow-dirty \ -# --no-python \ -# --workspace ${WORKSPACE} \ -# --priority ${PRIORITY} \ -# --gpus 0 \ -# --preemptible \ -# --cluster ${CLUSTERS} \ -# --budget ${BUDGET} \ -# --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \ -# --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \ -# --shared-memory 10GiB \ -# ${WEKA_MOUNTPOINTS} \ -# --yes \ -# -- /bin/bash -c "${joined_commands}" - gantry run \ --allow-dirty \ --workspace ai2/cheap-decisions \ @@ -134,6 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python convert_checkpoints_batch.pyq --checkpoint-dir 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index d1c6678a2..1f27ea5f7 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -10,28 +10,36 @@ import argparse import subprocess import os +import time +import boto3 import json -SANITY_CHECK = False +SANITY_CHECK = True def convert_checkpoint(cps, load_dir="/data/input"): - cps = expand_paths(cps) + s3_client = boto3.client('s3') + s3_resource = boto3.resource('s3') + + cps = expand_paths(cps, s3_client) processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. 
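+        # From this revision on, a converted copy may already exist either on the Weka mount or under an '-hf' prefix on S3; only checkpoints found in neither place are converted.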
if os.path.exists(weka_loc): conversion = 'existing' + converted_path = weka_loc + elif s3_path_exists(checkpoint_path, s3_resource): + conversion = 'existing' + converted_path = checkpoint_path + '-hf' else: conversion = 'new' - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" - + converted_path = weka_loc + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" if SANITY_CHECK: print(conversion_cmd) @@ -40,21 +48,41 @@ def convert_checkpoint(cps, load_dir="/data/input"): processed.append({ 'unproccessed_path': checkpoint_path, - 'converted_path': weka_loc, - 'convertion': conversion}) + 'converted_path': converted_path.replace(load_dir,'/weka'), + 'convertion': conversion, + 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) - with open('/data/input/jenah/log.jsonl','a+') as fout: + print(processed) + with open('log.jsonl','a+') as fout: for p in processed: fout.write(json.dumps(p)+'\n') -def expand_paths(cps): + +def s3_path_exists(cp, s3): + b = cp.split('/')[2] + bucket = s3.Bucket(b) + objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) + return True if (len(objs) > 0) else False + + +def expand_paths(cps, s3): expanded = [] for cp in cps: + bucket = cp.split('/')[2] segs = cp.split('*') - prefix = 's3://ai2-llm/' - cmd = f"aws s3 ls --recursive {segs[0]}" - all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout - relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + + # cmd = f"aws s3 ls --recursive {segs[0]}" + # all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout + # relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + + relevant_dirs = [] + paginator = s3.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket=bucket, Prefix=segs[0].replace('s3://'+bucket+'/', '')) + for page in page_iterator: + for obj in page['Contents']: + if 'model.pt' in obj["Key"]: + relevant_dirs.append(obj["Key"].replace('/model.pt','')) + search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] print(f"search segments: {search_segs}") @@ -64,8 +92,7 @@ def expand_paths(cps): for s in search_segs: temp_dirs = [d for d in temp_dirs if s in d] - exp = set([f"{prefix}{d}" for d in temp_dirs]) - print(exp) + exp = set([f"s3://{bucket}/{d}" for d in temp_dirs]) expanded += exp return expanded From dd3dc185c8872a86db7492d88ed2d5bd1425e6d6 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:29:06 -0700 Subject: [PATCH 26/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 3b1d463e2..2029b2864 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,6 +12,7 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step351000-unsharded' --weka-load-dir 
'/data/input/'" + --result=log.jsonl \ + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" # --install install_torch.sh \ From 9bc11d7de3a2ba6f5e8b512117cacfa079405cfe Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:36:33 -0700 Subject: [PATCH 27/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 1f27ea5f7..8fbb2191c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -39,7 +39,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --keep-olmo-artifacts --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" if SANITY_CHECK: print(conversion_cmd) @@ -53,9 +53,13 @@ def convert_checkpoint(cps, load_dir="/data/input"): 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) print(processed) - with open('log.jsonl','a+') as fout: + + results = 'results/' + if not os.path.exists(results): + os.mkdir(results) + with open(f'{results}log.jsonl', 'a+') as fout: for p in processed: - fout.write(json.dumps(p)+'\n') + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): From e08895fcc51a785fc8f8a2f001338afd4735352d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 15:38:14 -0700 Subject: [PATCH 28/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 1 - scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 2029b2864..8f28dccd8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -12,7 +12,6 @@ gantry run \ --weka=oe-eval-default:/data/input \ --pip requirements.txt \ --yes \ - --result=log.jsonl \ -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" # --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 8fbb2191c..69e7fad14 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -57,7 +57,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): results = 'results/' if not os.path.exists(results): os.mkdir(results) - with open(f'{results}log.jsonl', 'a+') as fout: + with open(f'{results}metrics.jsonl', 'a+') as fout: for p in processed: fout.write(json.dumps(p) + '\n') From 0b82c2bba466bcf46b6fbc74f4b8e43713b3a6fb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:10:39 -0700 Subject: [PATCH 29/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 27 ++++++++++++++++++++++++--- scripts/convert_checkpoints_batch.py | 12 +++++++++--- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/scripts/convert_checkpoints.sh 
b/scripts/convert_checkpoints.sh index 8f28dccd8..594bb85d8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -1,4 +1,27 @@ +#!/usr/bin/env bash + +# To be run at the top of the root of OLMo repository. +# Converts s3 checkpoints into WEKA + +# ASSUMPTIONS +# - INPUT must be on s3 +# - OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path +# - Budget for oe-eval +# - Experiments saved to ai2/cheap-decisions + +# NOTES +# - saves metrics.json +# - allows for wildcard (*) + +# TODOs +# - Make consistent with Luca's code +# - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this + +CHECKPOINT_PATH=$1 + + gantry run \ + --description "checkpoint conv; eval for cons ranking" \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ @@ -10,8 +33,6 @@ gantry run \ --env-secret AWS_SECRET_ACCESS_KEY=JENA_AWS_SECRET_ACCESS_KEY \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ - --pip requirements.txt \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path 's3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99*' --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input/'" -# --install install_torch.sh \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 69e7fad14..538884692 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -14,7 +14,7 @@ import boto3 import json -SANITY_CHECK = True +SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): s3_client = boto3.client('s3') @@ -44,6 +44,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: + print('\n--------------------------------------------') + print("\nConverting Checkpoint...") + print(conversion_cmd) + print('\n--------------------------------------------') + subprocess.run(conversion_cmd, shell=True, check=True) processed.append({ @@ -57,7 +62,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): results = 'results/' if not os.path.exists(results): os.mkdir(results) - with open(f'{results}metrics.jsonl', 'a+') as fout: + with open(f'{results}metrics.json', 'w') as fout: for p in processed: fout.write(json.dumps(p) + '\n') @@ -66,6 +71,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) + print(objs) return True if (len(objs) > 0) else False @@ -89,7 +95,7 @@ def expand_paths(cps, s3): search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - print(f"search segments: {search_segs}") + # print(f"search segments: {search_segs}") temp_dirs = relevant_dirs if len(search_segs) > 0: From 482a487d4fd013aa9788f74c4cc22b3707ead250 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:31:44 -0700 Subject: [PATCH 30/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 2 ++ scripts/convert_checkpoints_batch.py | 3 ++- scripts/results/metrics.json | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 scripts/results/metrics.json diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 594bb85d8..cbcf87eea 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -8,6 +8,7 @@ # - 
OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path # - Budget for oe-eval # - Experiments saved to ai2/cheap-decisions +# - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json # NOTES # - saves metrics.json @@ -16,6 +17,7 @@ # TODOs # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this +# CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 538884692..438053362 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -29,6 +29,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. + print('WEKA LOC', weka_loc) if os.path.exists(weka_loc): conversion = 'existing' converted_path = weka_loc @@ -39,7 +40,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5.json'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer ' allenai/gpt-neox-olmo-dolma-v1_5'" if SANITY_CHECK: print(conversion_cmd) diff --git a/scripts/results/metrics.json b/scripts/results/metrics.json new file mode 100644 index 000000000..d9fd489c9 --- /dev/null +++ b/scripts/results/metrics.json @@ -0,0 +1 @@ +{"unproccessed_path": "s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded", "converted_path": "/weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded-hf/", "convertion": "new", "date_time": "Sep-09-2024_1630"} From 5c015cab34c80b779aa6c776a43a0860e89cb157 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 9 Sep 2024 16:40:14 -0700 Subject: [PATCH 31/89] convert checkpoint batch --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 438053362..c775e88e6 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -40,7 +40,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer ' allenai/gpt-neox-olmo-dolma-v1_5'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" if SANITY_CHECK: print(conversion_cmd) From 268d74d1e9fa6d60e76c9f91f5b8c0d3804ee163 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:00:49 -0700 Subject: [PATCH 32/89] convert checkpoint batch --- scripts/convert_checkpoints.py | 58 ---------------------------- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 11 +++--- 3 files changed, 7 insertions(+), 64 deletions(-) delete mode 100644 scripts/convert_checkpoints.py diff --git a/scripts/convert_checkpoints.py b/scripts/convert_checkpoints.py deleted file mode 100644 index 415f6e387..000000000 --- a/scripts/convert_checkpoints.py +++ 
/dev/null @@ -1,58 +0,0 @@ -# This script requires to be run at the root level. -# Requires the AWS CLI and Beaker Gantry to be installed and configured. - - -import argparse -import subprocess - -# Beaker secret keys -AWS_ACCESS_KEY_ID = 'JENA_AWS_ACCESS_KEY_ID' -AWS_SECRET_ACCESS_KEY = 'JENA_AWS_SECRET_ACCESS_KEY' - -SANITY_CHECK = False - -def convert_checkpoints(args): - cmd = f"gantry run " \ - f"--allow-dirty " \ - f"--workspace ai2/cheap-decisions " \ - f"--priority normal " \ - f"--gpus 0 " \ - f"--preemptible " \ - f"--cluster 'ai2/jupiter-cirrascale-2' " \ - f"--budget ai2/oe-eval " \ - f"--env-secret AWS_ACCESS_KEY_ID={AWS_ACCESS_KEY_ID} " \ - f"--env-secret AWS_SECRET_ACCESS_KEY={AWS_SECRET_ACCESS_KEY} " \ - f"--shared-memory 10GiB " \ - f"--weka=oe-eval-default:{args.weka_load_dir} " \ - f"--yes " - - if args.checkpoint_path is not None: - cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path '{args.checkpoint_path}' --weka-load-dir {args.weka_load_dir}" - else: - cmd += f"-- /bin/bash -c python convert_checkpoints_batch.py --checkpoint-path-file '{args.checkpoint_path_file}' --weka-load-dir {args.weka_load_dir}" - - if SANITY_CHECK: - print(cmd) - else: - try: - subprocess.run(cmd, shell=True, check=True) - except subprocess.CalledProcessError as e: - print(e.output) - - -def main(): - parser = argparse.ArgumentParser( - description="Unshard checkpoint and convert to HF format. Run via Gantry. Invoke this script from the root of the OLMo repo." - ) - - group_batch = parser.add_mutually_exclusive_group(required=True) - group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) - parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) - - args = parser.parse_args() - convert_checkpoints(args) - - -if __name__ == "__main__": - main() diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index cbcf87eea..84091c49e 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -36,5 +36,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input/'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c775e88e6..ef78d7f80 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -13,6 +13,7 @@ import time import boto3 import json +import sys SANITY_CHECK = False @@ -45,10 +46,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: - print('\n--------------------------------------------') - print("\nConverting Checkpoint...") - print(conversion_cmd) - print('\n--------------------------------------------') + sys.stdout.write('\n--------------------------------------------') + sys.stdout.write("\nConverting Checkpoint...") + sys.stdout.write(conversion_cmd) + sys.stdout.write('\n--------------------------------------------') subprocess.run(conversion_cmd, shell=True, check=True) @@ -72,7 +73,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = 
list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - print(objs) + sys.stdout.write(f's3 path exists check: {objs}') return True if (len(objs) > 0) else False From 1326da70f0e939a0aae699db62bc367b2abb409d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:07:49 -0700 Subject: [PATCH 33/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 84091c49e..1eabea5ab 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -2,6 +2,16 @@ # To be run at the top of the root of OLMo repository. # Converts s3 checkpoints into WEKA +# +# Example use: +# sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9* +# +# This will convert all models in the directory +# and save them to their respective directories under +# +# /weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9* +# +# It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate what conversions already made) # ASSUMPTIONS # - INPUT must be on s3 @@ -17,7 +27,7 @@ # TODOs # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this -# +# - Output is not saving. But it prints to the log. Fix this. CHECKPOINT_PATH=$1 From ccbeef256dab86ca8af16e2e5f531c9cd7e45c36 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:19:21 -0700 Subject: [PATCH 34/89] convert checkpoint batch --- scripts/convert_checkpoints.sh | 1 + scripts/convert_checkpoints_batch.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 1eabea5ab..e76aa908f 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -28,6 +28,7 @@ # - Make consistent with Luca's code # - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this # - Output is not saving. But it prints to the log. Fix this. 
+# - Make tokenizer updatable CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index ef78d7f80..38a877360 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -15,6 +15,8 @@ import json import sys +from gantry import METRICS_FILE + SANITY_CHECK = False def convert_checkpoint(cps, load_dir="/data/input"): @@ -61,10 +63,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): print(processed) - results = 'results/' - if not os.path.exists(results): - os.mkdir(results) - with open(f'{results}metrics.json', 'w') as fout: + # results = 'results/' + # if not os.path.exists(results): + # os.mkdir(results) + with open(METRICS_FILE, 'w') as fout: for p in processed: fout.write(json.dumps(p) + '\n') From d83f2ed1dbc54ad172022aa17cc7682cb57291d2 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:38:31 -0700 Subject: [PATCH 35/89] convert checkpoint batch --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f439913b6..1a5847543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ rich botocore cached-path transformers +gantry From a42de22dfcb08f5c9f1bcb7d26be7d1962bd81a7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 10 Sep 2024 09:42:19 -0700 Subject: [PATCH 36/89] convert checkpoint batch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1a5847543..ae2bf89c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ rich botocore cached-path transformers -gantry +beaker-gantry From e07796b14d96b5a0028fb9cf9ac085c5cc05b3ee Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 09:52:40 -0700 Subject: [PATCH 37/89] error catch --- scripts/convert_checkpoints_batch.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 38a877360..76ee38fdd 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -32,10 +32,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): weka_loc = f"{load_dir}/{retain_path_name}-hf/" # Check if the output location is already there. If not, do the conversion. 
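# Note on the metrics hand-off above: `from gantry import METRICS_FILE` leans
# on the beaker-gantry package exposing the path that the job's metrics are
# collected from, as this series itself assumes. A minimal sketch of the
# JSON-lines logging pattern; the record fields are illustrative only:

import json
from gantry import METRICS_FILE  # provided by beaker-gantry, per this patch

record = {"path": "s3://bucket/ckpt", "status": "new"}  # illustrative fields
with open(METRICS_FILE, "a+") as fout:
    fout.write(json.dumps(record) + "\n")  # one JSON object per line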
+ error = "" print('WEKA LOC', weka_loc) if os.path.exists(weka_loc): conversion = 'existing' - converted_path = weka_loc + converted_path = weka_loc.replace(load_dir,'/weka') elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' @@ -53,13 +54,18 @@ def convert_checkpoint(cps, load_dir="/data/input"): sys.stdout.write(conversion_cmd) sys.stdout.write('\n--------------------------------------------') - subprocess.run(conversion_cmd, shell=True, check=True) + try: + subprocess.run(conversion_cmd, shell=True, check=True) + except subprocess.CalledProcessError as e: + error = e.output processed.append({ 'unproccessed_path': checkpoint_path, - 'converted_path': converted_path.replace(load_dir,'/weka'), + 'converted_path': converted_path, 'convertion': conversion, - 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime())}) + 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), + 'error': error} + ) print(processed) From c6e773d2d1f1dd52d98a4873b193dff081d101bd Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 11:29:43 -0700 Subject: [PATCH 38/89] checking for existing conversions --- scripts/convert_checkpoints_batch.py | 30 ++++++++++++++++++++++------ scripts/results/metrics.json | 1 - 2 files changed, 24 insertions(+), 7 deletions(-) delete mode 100644 scripts/results/metrics.json diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 76ee38fdd..ffa82d67d 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -19,6 +19,15 @@ SANITY_CHECK = False +# possible converted locations. +# "self" is the target location where the converted model would be saved +# key: template, value: description +# template: MUST obey .format(load_dir, retain_path_name) +WEKA_CHECK_LOCATIONS_PREFIXES = { + "{}/{}-hf/": 'self', + "{}/ianm/{}-hf": "ian's" +} + def convert_checkpoint(cps, load_dir="/data/input"): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -30,16 +39,27 @@ def convert_checkpoint(cps, load_dir="/data/input"): # Convert to old-style checkpoint. retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" + check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - # Check if the output location is already there. If not, do the conversion. + sys.stdout.write(f"\n\n=== Processing Checkpoint: {retain_path_name}\n") error = "" - print('WEKA LOC', weka_loc) - if os.path.exists(weka_loc): + + path_found = None + for loc in check_locs: + if os.path.exists(loc): + path_found = loc + break + + # Check if the output location is already there. If not, do the conversion. 
+ # print('WEKA LOC', weka_loc) + if path_found is not None: conversion = 'existing' - converted_path = weka_loc.replace(load_dir,'/weka') + converted_path = path_found.replace(load_dir,'/weka') + sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' + sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") else: conversion = 'new' converted_path = weka_loc @@ -49,8 +69,6 @@ def convert_checkpoint(cps, load_dir="/data/input"): if SANITY_CHECK: print(conversion_cmd) else: - sys.stdout.write('\n--------------------------------------------') - sys.stdout.write("\nConverting Checkpoint...") sys.stdout.write(conversion_cmd) sys.stdout.write('\n--------------------------------------------') diff --git a/scripts/results/metrics.json b/scripts/results/metrics.json deleted file mode 100644 index d9fd489c9..000000000 --- a/scripts/results/metrics.json +++ /dev/null @@ -1 +0,0 @@ -{"unproccessed_path": "s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded", "converted_path": "/weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded-hf/", "convertion": "new", "date_time": "Sep-09-2024_1630"} From 798ded33306204b0bcb21d318d62c261d7edea89 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 12:47:04 -0700 Subject: [PATCH 39/89] minor change --- scripts/convert_checkpoints.sh | 3 ++- scripts/convert_checkpoints_batch.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index e76aa908f..1642fd4a9 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -31,10 +31,11 @@ # - Make tokenizer updatable CHECKPOINT_PATH=$1 +DESCRIPTION=$2 gantry run \ - --description "checkpoint conv; eval for cons ranking" \ + --description $DESCRIPTION \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index ffa82d67d..496c5b538 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -99,7 +99,6 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - sys.stdout.write(f's3 path exists check: {objs}') return True if (len(objs) > 0) else False From 45a93742edcdb68a3f4b5c9db141ae8b08f0672a Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:00:37 -0700 Subject: [PATCH 40/89] minor change --- scripts/convert_checkpoints.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 1642fd4a9..536e0a2eb 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -31,11 +31,9 @@ # - Make tokenizer updatable CHECKPOINT_PATH=$1 -DESCRIPTION=$2 - gantry run \ - --description $DESCRIPTION \ + --description "Converting $CHECKPOINT_PATH" \ --allow-dirty \ --workspace ai2/cheap-decisions \ --priority normal \ From 26a1e26a55446c442de31a23bce617d14db093d3 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:46:11 -0700 Subject: [PATCH 41/89] adding a cleanup flag for removing local directory at the end of the process --- hf_olmo/convert_olmo_to_hf.py | 9 
+++++++++ 1 file changed, 9 insertions(+) diff --git a/hf_olmo/convert_olmo_to_hf.py b/hf_olmo/convert_olmo_to_hf.py index 2e0a9e074..9e0c7afb6 100644 --- a/hf_olmo/convert_olmo_to_hf.py +++ b/hf_olmo/convert_olmo_to_hf.py @@ -284,6 +284,12 @@ def main(): help="Keep olmo-specific artifacts in the checkpoint.", ) + parser.add_argument( + "--cleanup-local-dir", + action="store_true", + help="Remove local download of the directory." + ) + args = parser.parse_args() args.destination_dir = args.destination_dir or args.checkpoint_dir @@ -308,6 +314,9 @@ def main(): upload_local_checkpoint(local_checkpoint_dir, args.destination_dir) print(f"Converted checkpoint saved to {args.destination_dir}") + if args.cleanup_local_dir: + print(f"Removing temporary local dir: {local_checkpoint_dir}") + shutil.rmtree(local_checkpoint_dir) if __name__ == "__main__": From 30a9cb9aad2f43092789a6e296f3ea6d3dcbb74e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 13:51:32 -0700 Subject: [PATCH 42/89] fix --- scripts/convert_checkpoints_batch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 496c5b538..d3114ead2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,6 +23,7 @@ # "self" is the target location where the converted model would be saved # key: template, value: description # template: MUST obey .format(load_dir, retain_path_name) + WEKA_CHECK_LOCATIONS_PREFIXES = { "{}/{}-hf/": 'self', "{}/ianm/{}-hf": "ian's" @@ -64,7 +65,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if SANITY_CHECK: print(conversion_cmd) From 083ff3eaf91991a084ee061376fb13e041490766 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:18:55 -0700 Subject: [PATCH 43/89] troubleshooting --- scripts/convert_checkpoints_batch.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index d3114ead2..dc3c84279 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,7 +17,7 @@ from gantry import METRICS_FILE -SANITY_CHECK = False +SANITY_CHECK = True # possible converted locations. # "self" is the target location where the converted model would be saved @@ -34,15 +34,16 @@ def convert_checkpoint(cps, load_dir="/data/input"): s3_resource = boto3.resource('s3') cps = expand_paths(cps, s3_client) - processed = [] for checkpoint_path in cps: # Convert to old-style checkpoint. 
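# Note: the --cleanup-local-dir flag introduced above is the usual pairing of
# an argparse store_true switch with shutil.rmtree. A self-contained sketch
# under those assumptions; the temporary directory stands in for the real
# checkpoint download:

import argparse
import shutil
import tempfile

parser = argparse.ArgumentParser()
parser.add_argument("--cleanup-local-dir", action="store_true",
                    help="Remove the local download once conversion succeeds.")
args = parser.parse_args()

local_checkpoint_dir = tempfile.mkdtemp()  # stand-in for the downloaded checkpoint
# ... conversion and upload would happen here ...
if args.cleanup_local_dir:
    shutil.rmtree(local_checkpoint_dir)  # reclaim local/NFS space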
+ processed = [] + retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - sys.stdout.write(f"\n\n=== Processing Checkpoint: {retain_path_name}\n") + sys.stdout.write(f"\n\nProcessing Checkpoint: {retain_path_name}\n") error = "" path_found = None @@ -56,7 +57,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): if path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,'/weka') - sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") + sys.stdout.write(f"Converted Checkpoint Found: {converted_path}\n") elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' @@ -68,10 +69,10 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if SANITY_CHECK: - print(conversion_cmd) + sys.stdout.write('SANITY CHECK MODE (not running the conversion)') + sys.stdout.write(conversion_cmd + '\n') else: - sys.stdout.write(conversion_cmd) - sys.stdout.write('\n--------------------------------------------') + sys.stdout.write(conversion_cmd + '\n') try: subprocess.run(conversion_cmd, shell=True, check=True) @@ -86,14 +87,11 @@ def convert_checkpoint(cps, load_dir="/data/input"): 'error': error} ) - print(processed) + print(processed) - # results = 'results/' - # if not os.path.exists(results): - # os.mkdir(results) - with open(METRICS_FILE, 'w') as fout: - for p in processed: - fout.write(json.dumps(p) + '\n') + with open(METRICS_FILE, 'a+') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): From 9c6dc75ab49442f54c7ad9c0202de504eedf0810 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:23:57 -0700 Subject: [PATCH 44/89] troubleshooting --- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 536e0a2eb..e85b4d269 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --sanity-check" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index dc3c84279..cb3c1bbf8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,8 +17,6 @@ from gantry import METRICS_FILE -SANITY_CHECK = True - # possible converted locations. 
# "self" is the target location where the converted model would be saved # key: template, value: description @@ -29,7 +27,7 @@ "{}/ianm/{}-hf": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input"): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -68,7 +66,7 @@ def convert_checkpoint(cps, load_dir="/data/input"): conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" - if SANITY_CHECK: + if sanity_check: sys.stdout.write('SANITY CHECK MODE (not running the conversion)') sys.stdout.write(conversion_cmd + '\n') else: @@ -147,13 +145,14 @@ def main(): group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], args.weka_load_dir) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), args.weka_load_dir) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check) if __name__ == "__main__": From 54a0d620b4a65e247664a1640f02e8107a107be2 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:34:07 -0700 Subject: [PATCH 45/89] troubleshooting --- scripts/convert_checkpoints_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index cb3c1bbf8..2b39e516a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,8 +23,8 @@ # template: MUST obey .format(load_dir, retain_path_name) WEKA_CHECK_LOCATIONS_PREFIXES = { - "{}/{}-hf/": 'self', - "{}/ianm/{}-hf": "ian's" + "{}/{}-hf/pytorch_model.bin": 'self', + "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): From 25d5a7dbac5876ed33bffb65e2addba995c85530 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 11 Sep 2024 14:50:37 -0700 Subject: [PATCH 46/89] troubleshooting --- scripts/convert_checkpoints.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index e85b4d269..536e0a2eb 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --sanity-check" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" From dd9ae8a1020d96cb2758b8b751372a99ddb3bdf0 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 10:32:03 -0700 Subject: [PATCH 47/89] minor fixes --- scripts/convert_checkpoints_batch.py | 21 
+++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 2b39e516a..7e72e5011 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -41,7 +41,8 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - sys.stdout.write(f"\n\nProcessing Checkpoint: {retain_path_name}\n") + print(f"\n\n------------------------------------------------------------", flush=True) + print(f"\nProcessing Checkpoint: {retain_path_name}\n", flush=True) error = "" path_found = None @@ -55,22 +56,22 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): if path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,'/weka') - sys.stdout.write(f"Converted Checkpoint Found: {converted_path}\n") + print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) elif s3_path_exists(checkpoint_path, s3_resource): conversion = 'existing' converted_path = checkpoint_path + '-hf' - sys.stdout.write(f" -- Converted Checkpoint Found: {converted_path}\n") + print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: conversion = 'new' - converted_path = weka_loc + converted_path = weka_loc.replace(load_dir,'/weka') conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: - sys.stdout.write('SANITY CHECK MODE (not running the conversion)') - sys.stdout.write(conversion_cmd + '\n') + print('SANITY CHECK MODE (not running the conversion)') + print(conversion_cmd + '\n') else: - sys.stdout.write(conversion_cmd + '\n') + # sys.stdout.write(conversion_cmd + '\n') try: subprocess.run(conversion_cmd, shell=True, check=True) @@ -78,14 +79,14 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): error = e.output processed.append({ - 'unproccessed_path': checkpoint_path, + 'unprocessed_path': checkpoint_path, 'converted_path': converted_path, - 'convertion': conversion, + 'conversion': conversion, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), 'error': error} ) - print(processed) + #print(processed) with open(METRICS_FILE, 'a+') as fout: for p in processed: From 402f7b720026dc06e7181ee31a8af6f8ee801a9d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 10:42:55 -0700 Subject: [PATCH 48/89] minor fixes --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 7e72e5011..94a49a3d3 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -48,7 +48,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): path_found = None for loc in check_locs: if os.path.exists(loc): - path_found = loc + path_found = loc.replace('/pytorch_model.bin','') break # Check if the output location is already there. If not, do the conversion. 
From f5806dad3027d7191abab9eac47d99104a1dcac4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:02:03 -0700 Subject: [PATCH 49/89] minor fixes --- scripts/convert_checkpoints_batch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 94a49a3d3..24799ff71 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -33,6 +33,8 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): cps = expand_paths(cps, s3_client) + print(f"Total of {len(cps)} paths to process.", flush=True) + for checkpoint_path in cps: # Convert to old-style checkpoint. processed = [] From 7718e9f2cec2ccfa03f9a437ff5b07d3316416d3 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:20:07 -0700 Subject: [PATCH 50/89] minor fixes --- scripts/convert_checkpoints_batch.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 24799ff71..e6974296c 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -55,14 +55,17 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): # Check if the output location is already there. If not, do the conversion. # print('WEKA LOC', weka_loc) - if path_found is not None: + s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) + if s3_hf_exists is not None: conversion = 'existing' - converted_path = path_found.replace(load_dir,'/weka') + converted_path = s3_hf_exists # checkpoint_path + '-hf' print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - elif s3_path_exists(checkpoint_path, s3_resource): + + elif path_found is not None: conversion = 'existing' - converted_path = checkpoint_path + '-hf' + converted_path = path_found.replace(load_dir,'/weka') print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + else: conversion = 'new' converted_path = weka_loc.replace(load_dir,'/weka') @@ -98,8 +101,13 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) - objs = list(bucket.objects.filter(Prefix=cp.replace('s3://'+b+'/', '') + '-hf')) - return True if (len(objs) > 0) else False + prefix = cp.replace('s3://'+b+'/', '') + objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) + if len(objs) > 0: + return cp + '-hf' + else: + objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) + return cp + '-hf-olmo' if (len(objs2) > 0) else None def expand_paths(cps, s3): From d8436da662d41b86f5d4b3ed9e8e323e606014ac Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 11:54:10 -0700 Subject: [PATCH 51/89] minor fixes --- scripts/convert_checkpoints_batch.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e6974296c..2553d29fb 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -14,6 +14,7 @@ import boto3 import json import sys +from pathlib import Path from gantry import METRICS_FILE @@ -102,6 +103,7 @@ def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) prefix = cp.replace('s3://'+b+'/', '') + print(bucket, prefix) objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) 
> 0: return cp + '-hf' @@ -112,21 +114,26 @@ def s3_path_exists(cp, s3): def expand_paths(cps, s3): expanded = [] + for cp in cps: bucket = cp.split('/')[2] segs = cp.split('*') - - # cmd = f"aws s3 ls --recursive {segs[0]}" - # all_dirs = subprocess.run(cmd, shell=True, check=True, capture_output=True, text = True).stdout - # relevant_dirs = ['/'.join(d.split()[-1].split('/')[:-1]) for d in all_dirs.split() if 'model.pt' in d] + prefix = segs[0].replace('s3://'+bucket+'/', '') relevant_dirs = [] + paginator = s3.get_paginator('list_objects_v2') - page_iterator = paginator.paginate(Bucket=bucket, Prefix=segs[0].replace('s3://'+bucket+'/', '')) + page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) + for page in page_iterator: for obj in page['Contents']: - if 'model.pt' in obj["Key"]: - relevant_dirs.append(obj["Key"].replace('/model.pt','')) + p = Path(obj["Key"]) + if p.parent.name in ['optim', 'train','model']: + relevant_dirs.append(p.parent.parent) + elif p.name == 'model.pt': + relevant_dirs.append(p.parent) + # if 'model.pt' in obj["Key"]: + # relevant_dirs.append(obj["Key"].replace('/model.pt','')) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] From c7717a19ec40a776cf06534e76330239257ecdd4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Thu, 12 Sep 2024 12:40:14 -0700 Subject: [PATCH 52/89] fix --- scripts/convert_checkpoints_batch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 2553d29fb..c7cc42e6a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -129,9 +129,9 @@ def expand_paths(cps, s3): for obj in page['Contents']: p = Path(obj["Key"]) if p.parent.name in ['optim', 'train','model']: - relevant_dirs.append(p.parent.parent) + relevant_dirs.append(str(p.parent.parent)) elif p.name == 'model.pt': - relevant_dirs.append(p.parent) + relevant_dirs.append(str(p.parent)) # if 'model.pt' in obj["Key"]: # relevant_dirs.append(obj["Key"].replace('/model.pt','')) @@ -139,6 +139,7 @@ def expand_paths(cps, s3): # print(f"search segments: {search_segs}") + # subselect the directory with remaining segments (for multiple wildcard *) temp_dirs = relevant_dirs if len(search_segs) > 0: for s in search_segs: From b8445ce9be8c73f94b1e42cc4e35794227fc8b4b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 08:59:14 -0700 Subject: [PATCH 53/89] updates --- scripts/convert_checkpoints_batch.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c7cc42e6a..eb6bd574b 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -34,7 +34,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): cps = expand_paths(cps, s3_client) - print(f"Total of {len(cps)} paths to process.", flush=True) + print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) for checkpoint_path in cps: # Convert to old-style checkpoint. 
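# Note: a condensed, hedged sketch of the wildcard expansion that patches
# 51-53 above converge on. It paginates everything under the prefix before the
# first '*', keeps keys that look like checkpoint directories, then requires
# the remaining '*'-separated segments to appear in each candidate. The
# pattern below is illustrative and AWS credentials are assumed configured;
# note page.get("Contents", []), since 'Contents' is absent on empty pages.

import boto3
from pathlib import Path

def expand_s3_wildcard(pattern: str) -> set:
    bucket = pattern.split("/")[2]
    segs = pattern.split("*")
    prefix = segs[0].replace(f"s3://{bucket}/", "")
    dirs = set()
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            p = Path(obj["Key"])
            if p.name == "model.pt":  # unsharded checkpoint marker
                dirs.add(str(p.parent))
            elif p.parent.name in ("optim", "train", "model"):
                dirs.add(str(p.parent.parent))  # sharded checkpoint root
    rest = [s for s in segs[1:] if s]
    return {f"s3://{bucket}/{d}" for d in dirs if all(s in d for s in rest)}

# expand_s3_wildcard("s3://ai2-llm/checkpoints/cheap_decisions/dolma-*/step9*")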
@@ -94,16 +94,17 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): #print(processed) - with open(METRICS_FILE, 'a+') as fout: - for p in processed: - fout.write(json.dumps(p) + '\n') + if not sanity_check: + with open(METRICS_FILE, 'a+') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): b = cp.split('/')[2] bucket = s3.Bucket(b) prefix = cp.replace('s3://'+b+'/', '') - print(bucket, prefix) + # print(bucket, prefix) objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) > 0: return cp + '-hf' From 855d66646de75f29a34e496aa4443201c91e0f74 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 15:27:31 -0700 Subject: [PATCH 54/89] handle directories that have unsharded counterparts --- scripts/convert_checkpoints_batch.py | 47 ++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index eb6bd574b..37f11e714 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -122,19 +122,46 @@ def expand_paths(cps, s3): prefix = segs[0].replace('s3://'+bucket+'/', '') relevant_dirs = [] + skip_parent = [] paginator = s3.get_paginator('list_objects_v2') page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) - - for page in page_iterator: - for obj in page['Contents']: - p = Path(obj["Key"]) - if p.parent.name in ['optim', 'train','model']: - relevant_dirs.append(str(p.parent.parent)) - elif p.name == 'model.pt': - relevant_dirs.append(str(p.parent)) - # if 'model.pt' in obj["Key"]: - # relevant_dirs.append(obj["Key"].replace('/model.pt','')) + contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} + paths = set(contents.values()) + # print(contents) + + for path in contents: + p = Path(path) + parent = str(p.parent) + grandpa = str(p.parent.parent) + + if parent in relevant_dirs or parent in skip_parent: + continue + if p.parent.name in ['optim', 'train','model']: + if f"{grandpa}-unsharded" in paths: + # skip condition + skip_parent.append(parent) + continue + else: + relevant_dirs.append(grandpa) + elif p.name == 'model.pt': + relevant_dirs.append(parent) + + + # for page in page_iterator: + # for obj in page['Contents']: + # p = Path(obj["Key"]) + # if p.parent.name in ['optim', 'train','model']: + # grand_parent = str(p.parent.parent) + # if '-unsharded' not in grand_parent: + # objs = list(s3_resource.Bucket(bucket).objects.filter(Prefix=grand_parent + '-unsharded')) + # if len(objs) > 0: + # continue + # relevant_dirs.append(str(p.parent.parent)) + # elif p.name == 'model.pt': + # relevant_dirs.append(str(p.parent)) + # # if 'model.pt' in obj["Key"]: + # # relevant_dirs.append(obj["Key"].replace('/model.pt','')) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] From ed5abb78da4960cc137e8cd46939814cf16db585 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Sep 2024 16:20:31 -0700 Subject: [PATCH 55/89] fixing error catching --- scripts/convert_checkpoints_batch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 37f11e714..cd3cf2da6 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -77,12 +77,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): print('SANITY CHECK MODE (not 
running the conversion)') print(conversion_cmd + '\n') else: - # sys.stdout.write(conversion_cmd + '\n') - try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = e.output + error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) + conversion = 'error' + converted_path = "" processed.append({ 'unprocessed_path': checkpoint_path, From cbcbc86983e4c5e3484e6ce13e4ab370ff6f98b5 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 12:13:20 -0700 Subject: [PATCH 56/89] output log edits --- scripts/convert_checkpoints_batch.py | 50 ++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index cd3cf2da6..b3c0eee23 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -16,7 +16,7 @@ import sys from pathlib import Path -from gantry import METRICS_FILE +from gantry import RESULTS_DIR # possible converted locations. # "self" is the target location where the converted model would be saved @@ -28,7 +28,7 @@ "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka"): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -36,10 +36,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) + processed = {} + for checkpoint_path in cps: # Convert to old-style checkpoint. - processed = [] - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') weka_loc = f"{load_dir}/{retain_path_name}-hf/" check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] @@ -64,12 +64,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): elif path_found is not None: conversion = 'existing' - converted_path = path_found.replace(load_dir,'/weka') + converted_path = path_found.replace(load_dir,weka_prefix) print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: conversion = 'new' - converted_path = weka_loc.replace(load_dir,'/weka') + converted_path = weka_loc conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" @@ -84,20 +84,35 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False): conversion = 'error' converted_path = "" - processed.append({ + local_log = { 'unprocessed_path': checkpoint_path, - 'converted_path': converted_path, + 'converted_path': converted_path.replace(load_dir,weka_prefix), 'conversion': conversion, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), - 'error': error} - ) - - #print(processed) + 'error': error + } + + # {"model_name": "name", "checkpoints_location": "weka://path/to/", "revisions": ["step0-unsharded-hf", "step1000-unsharded-hf", etc]} + curr = Path(converted_path) + parent = curr.parent + if parent.name not in processed: + processed[parent.name] = { + 'model_name': parent.name, + 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), + 'revisions': [curr.name] + } + else: + processed[parent.name]['revisions'].append(curr.name) + # LOG if not sanity_check: - with open(METRICS_FILE, 'a+') as fout: - for p in 
processed: - fout.write(json.dumps(p) + '\n') + with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: + fout.write(json.dumps(local_log) + '\n') + + if not sanity_check: + with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: + for p in processed: + fout.write(json.dumps(p) + '\n') def s3_path_exists(cp, s3): @@ -192,14 +207,15 @@ def main(): group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) + parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) if __name__ == "__main__": From d49db7b41a864ed21d423123e42b65168df31240 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:10:41 -0700 Subject: [PATCH 57/89] output log edits --- scripts/convert_checkpoints.sh | 2 +- scripts/convert_checkpoints_batch.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh index 536e0a2eb..2afbd15f8 100644 --- a/scripts/convert_checkpoints.sh +++ b/scripts/convert_checkpoints.sh @@ -46,5 +46,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input'" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default'" diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index b3c0eee23..afb69f8ad 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -111,7 +111,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: - for p in processed: + for _,p in processed.items(): fout.write(json.dumps(p) + '\n') From cd6a75a386fbf5cbcebf6347156f2873c1bf81da Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:36:07 -0700 Subject: [PATCH 58/89] output log edits --- scripts/convert_checkpoints_batch.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index afb69f8ad..970b60b97 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -54,23 +54,23 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre path_found = 
loc.replace('/pytorch_model.bin','') break - # Check if the output location is already there. If not, do the conversion. - # print('WEKA LOC', weka_loc) + # Check if the output location is already there in s3. If so then skip conversion s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) if s3_hf_exists is not None: conversion = 'existing' converted_path = s3_hf_exists # checkpoint_path + '-hf' print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + # Check if the output location is in weka. If so then skip conversion elif path_found is not None: conversion = 'existing' converted_path = path_found.replace(load_dir,weka_prefix) print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) + # Do conversion and save to Weka else: conversion = 'new' converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: @@ -84,6 +84,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre conversion = 'error' converted_path = "" + # Keep info for log.jsonl local_log = { 'unprocessed_path': checkpoint_path, 'converted_path': converted_path.replace(load_dir,weka_prefix), @@ -92,9 +93,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre 'error': error } - # {"model_name": "name", "checkpoints_location": "weka://path/to/", "revisions": ["step0-unsharded-hf", "step1000-unsharded-hf", etc]} + # output model checkpoint location for eval scripts curr = Path(converted_path) parent = curr.parent + print(parent, flush=True) if parent.name not in processed: processed[parent.name] = { 'model_name': parent.name, @@ -104,11 +106,12 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre else: processed[parent.name]['revisions'].append(curr.name) - # LOG + # Output Log if not sanity_check: with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: fout.write(json.dumps(local_log) + '\n') + # Output checkpoint location for eval scripts if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: for _,p in processed.items(): From f8e9c96800cc6ed8c0edad1c55e2d9b71545857b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Wed, 18 Sep 2024 15:40:10 -0700 Subject: [PATCH 59/89] output log edits --- scripts/convert_checkpoints_batch.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 970b60b97..25a01c668 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -64,7 +64,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Check if the output location is in weka. 
If so then skip conversion
 elif path_found is not None:
     conversion = 'existing'
-    converted_path = path_found.replace(load_dir,weka_prefix)
+    converted_path = path_found
     print(f"Converted Checkpoint Found: {converted_path}\n", flush=True)

     # Do conversion and save to Weka
@@ -96,7 +96,6 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
     # output model checkpoint location for eval scripts
     curr = Path(converted_path)
     parent = curr.parent
-    print(parent, flush=True)
     if parent.name not in processed:
         processed[parent.name] = {
             'model_name': parent.name,

From de8292256450ea71c4acaf73df988cf8691d62c7 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 10:01:20 -0700
Subject: [PATCH 60/89] code cleanup

---
 scripts/convert_checkpoints.sh | 36 ++++----
 scripts/convert_checkpoints_batch.py | 125 ++++++++++++++--------------
 2 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index 2afbd15f8..f6c7f3a84 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -1,33 +1,33 @@
 #!/usr/bin/env bash

+# Converts s3 checkpoints into WEKA
 # To be run from the root of the OLMo repository.
-# Converts s3 checkpoints into WEKA
+# Script requires the use of GANTRY and AWS access to WEKA
 #
 # Example use:
+# Run:
 # sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
-#
-# This will convert all models in the directory
-# and save them to their respective directories under
-#
-# /weka/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
+# This will convert all models in the directory and save them to:
+# weka://oe-eval-default/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001-hf/step9*
 #
 # It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate conversions already made)
-
+#
 # ASSUMPTIONS
 # - INPUT must be on s3
-# - OUTPUT is weka with the same path name as s3 + "-hf" suffix appended to the path
-# - Budget for oe-eval
-# - Experiments saved to ai2/cheap-decisions
+# - OUTPUT to weka is saved to the path as found on s3 with "-hf" suffix appended to the path
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
-
-# NOTES
-# - saves metrics.json
-# - allows for wildcard (*)
-
+#
+# OUTPUT logs
+# - saves log.jsonl
+# - saves model_checkpoints.jsonl: this input file is formatted for oe-eval-internal experiments
+#
+# SH run SPECIFICATION DEFAULTS:
+# - Budget for oe-eval (see below)
+# - Loading for weka weka://oe-eval-default/ (see below)
+# - Gantry experiments saved to beaker://ai2/cheap-decisions
+# - Weka prefix is used for model_checkpoints.jsonl
+#
 # TODOs
-# - Make consistent with Luca's code
-# - Code allows for a txt file with a list of checkpoint paths, sh needs to allow this
-# - Output is not saving. But it prints to the log. Fix this.
# - Make tokenizer updatable CHECKPOINT_PATH=$1 diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 25a01c668..64425ad19 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -13,7 +13,6 @@ import time import boto3 import json -import sys from pathlib import Path from gantry import RESULTS_DIR @@ -28,7 +27,7 @@ "{}/ianm/{}-hf/pytorch_model.bin": "ian's" } -def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka"): +def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka", save_to_weka=False): s3_client = boto3.client('s3') s3_resource = boto3.resource('s3') @@ -38,40 +37,61 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre processed = {} + # Convert to old-style checkpoint. for checkpoint_path in cps: - # Convert to old-style checkpoint. - retain_path_name = checkpoint_path.replace('s3://', '').strip('/') - weka_loc = f"{load_dir}/{retain_path_name}-hf/" - check_locs = [l.format(load_dir,retain_path_name) for l in WEKA_CHECK_LOCATIONS_PREFIXES] - print(f"\n\n------------------------------------------------------------", flush=True) - print(f"\nProcessing Checkpoint: {retain_path_name}\n", flush=True) - error = "" + print(f"\nProcessing Checkpoint: {checkpoint_path}\n", flush=True) - path_found = None - for loc in check_locs: + error = "" + converted_path = "" + existing_location = "" + conversion_status = "" + + # sort out paths, bucket names, and so on ... + path_bits = checkpoint_path.strip('/').replace('s3://', '').split('/') + s3_bucket_name = path_bits[0] + s3_prefix = '/'.join(path_bits[1:]) + temp_path = '/'.join(path_bits) #checkpoint_path.replace('s3://', '').strip('/') + local_path = f"{load_dir}/{temp_path}-hf/" + + # the converted model may already exist in local_path or in + path_found = False + potential_existing_locations = [l.format(load_dir,temp_path) for l in WEKA_CHECK_LOCATIONS_PREFIXES] + for loc in potential_existing_locations: if os.path.exists(loc): - path_found = loc.replace('/pytorch_model.bin','') + existing_location = loc.replace('/pytorch_model.bin','') + path_found = True break - # Check if the output location is already there in s3. If so then skip conversion - s3_hf_exists = s3_path_exists(checkpoint_path, s3_resource) - if s3_hf_exists is not None: - conversion = 'existing' - converted_path = s3_hf_exists # checkpoint_path + '-hf' + # if one of the potential existing location has converted model in it then use that + if path_found: + # then there is no conversion to do. + conversion_status = 'existing' + converted_path = existing_location print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - - # Check if the output location is in weka. 
If so then skip conversion - elif path_found is not None: - conversion = 'existing' - converted_path = path_found - print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) - - # Do conversion and save to Weka else: - conversion = 'new' - converted_path = weka_loc - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{weka_loc}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" + s3_bucket = s3_resource.Bucket(s3_bucket_name) + s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_resource) + + # if s3 already has a location for converted model then use that + if s3_hf_exists is not None: + path_found = True + print(f"Converted Checkpoint Found: {s3_hf_exists}", flush=True) + + # if save to weka flag is passed, then download the s3 converted model to the local path + if save_to_weka: + copy_s3_to_local(s3_hf_exists, local_path, s3_resource, sanity_check) + conversion_status = 'existing-downloaded' + converted_path = local_path + else: + conversion_status = 'existing' + converted_path = s3_hf_exists + + # if no existing conversions are found then process and save to local path + if not path_found: + conversion_status = 'new' + converted_path = local_path + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: print('SANITY CHECK MODE (not running the conversion)') @@ -81,14 +101,14 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) - conversion = 'error' + conversion_status = 'error' converted_path = "" # Keep info for log.jsonl local_log = { 'unprocessed_path': checkpoint_path, 'converted_path': converted_path.replace(load_dir,weka_prefix), - 'conversion': conversion, + 'conversion': conversion_status, 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), 'error': error } @@ -113,21 +133,26 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Output checkpoint location for eval scripts if not sanity_check: with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: - for _,p in processed.items(): + for _, p in processed.items(): fout.write(json.dumps(p) + '\n') -def s3_path_exists(cp, s3): - b = cp.split('/')[2] - bucket = s3.Bucket(b) - prefix = cp.replace('s3://'+b+'/', '') - # print(bucket, prefix) +def s3_path_exists(bucket, prefix, s3_resource): + # look for pytorch_model.bin in directories ending with -hf or -hf-olmo. 
objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) if len(objs) > 0: - return cp + '-hf' + return f"s3://{bucket}/{prefix}-hf" else: objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) - return cp + '-hf-olmo' if (len(objs2) > 0) else None + return f"s3://{bucket}/{prefix}-hf-olmo" if (len(objs2) > 0) else None + + +def copy_s3_to_local(bucket, prefix, local_path, s3_resource, sanity_check): + if not os.path.exists(os.path.dirname(local_path)): + print(f"Downloading checkpoint to weka://{bucket}/{prefix}\n", flush=True) + if not sanity_check: + os.makedirs(local_path) + bucket.download_file(prefix, local_path) # save to same path def expand_paths(cps, s3): @@ -145,7 +170,6 @@ def expand_paths(cps, s3): page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} paths = set(contents.values()) - # print(contents) for path in contents: p = Path(path) @@ -164,26 +188,8 @@ def expand_paths(cps, s3): elif p.name == 'model.pt': relevant_dirs.append(parent) - - # for page in page_iterator: - # for obj in page['Contents']: - # p = Path(obj["Key"]) - # if p.parent.name in ['optim', 'train','model']: - # grand_parent = str(p.parent.parent) - # if '-unsharded' not in grand_parent: - # objs = list(s3_resource.Bucket(bucket).objects.filter(Prefix=grand_parent + '-unsharded')) - # if len(objs) > 0: - # continue - # relevant_dirs.append(str(p.parent.parent)) - # elif p.name == 'model.pt': - # relevant_dirs.append(str(p.parent)) - # # if 'model.pt' in obj["Key"]: - # # relevant_dirs.append(obj["Key"].replace('/model.pt','')) - search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] - # print(f"search segments: {search_segs}") - # subselect the directory with remaining segments (for multiple wildcard *) temp_dirs = relevant_dirs if len(search_segs) > 0: @@ -211,13 +217,14 @@ def main(): parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') + parser.add_argument("--save-to-weka", help='if checkpoints are found on s3, save them to loaded weka dir', action='store_true') args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) if __name__ == "__main__": From d3e16d72c47cc395c4f9adac7d3d6eb2801a1736 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 4 Oct 2024 12:57:29 -0700 Subject: [PATCH 61/89] code cleanup --- scripts/convert_checkpoints_batch.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py 
b/scripts/convert_checkpoints_batch.py
index 64425ad19..20fef2299 100644
--- a/scripts/convert_checkpoints_batch.py
+++ b/scripts/convert_checkpoints_batch.py
@@ -71,7 +71,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
         print(f"Converted Checkpoint Found: {converted_path}\n", flush=True)
     else:
         s3_bucket = s3_resource.Bucket(s3_bucket_name)
-        s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_resource)
+        s3_hf_exists = s3_path_exists(s3_bucket, s3_prefix, s3_bucket_name)
@@ -80,7 +80,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
         # if save to weka flag is passed, then download the s3 converted model to the local path
         if save_to_weka:
-            copy_s3_to_local(s3_hf_exists, local_path, s3_resource, sanity_check)
+            copy_s3_to_local(s3_bucket, s3_prefix, local_path, local_path.replace(load_dir,weka_prefix), sanity_check)
             conversion_status = 'existing-downloaded'
             converted_path = local_path
         else:
@@ -137,19 +137,19 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre
             fout.write(json.dumps(p) + '\n')

-def s3_path_exists(bucket, prefix, s3_resource):
+def s3_path_exists(bucket, prefix, bucket_name):
     # look for pytorch_model.bin in directories ending with -hf or -hf-olmo.
     objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin'))
     if len(objs) > 0:
-        return f"s3://{bucket}/{prefix}-hf"
+        return f"s3://{bucket_name}/{prefix}-hf"
     else:
         objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin'))
-        return f"s3://{bucket}/{prefix}-hf-olmo" if (len(objs2) > 0) else None
+        return f"s3://{bucket_name}/{prefix}-hf-olmo" if (len(objs2) > 0) else None

-def copy_s3_to_local(bucket, prefix, local_path, s3_resource, sanity_check):
+def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check):
     if not os.path.exists(os.path.dirname(local_path)):
-        print(f"Downloading checkpoint to weka://{bucket}/{prefix}\n", flush=True)
+        print(f"Downloading checkpoint to {display_name}\n", flush=True)
         if not sanity_check:
             os.makedirs(local_path)
             bucket.download_file(prefix, local_path) # save to same path

From ef8ffbdd3ec618de62ba60f8b7fb39f8d173c129 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:05:33 -0700
Subject: [PATCH 62/89] testing

---
 scripts/convert_checkpoints.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index f6c7f3a84..b0d592d55 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -13,7 +13,7 @@
 # It will first, though, check that the weka directory doesn't exist AND that s3 doesn't have a corresponding directory (so as not to replicate conversions already made)
 #
 # ASSUMPTIONS
-# - INPUT must be on s3
+# - INPUT must be on s3.
Multiple wildcards allowed
 # - OUTPUT to weka is saved to the path as found on s3 with "-hf" suffix appended to the path
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
 #
@@ -46,5 +46,5 @@ gantry run \
   --shared-memory 10GiB \
   --weka=oe-eval-default:/data/input \
   --yes \
-  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default'"
+  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save_to_weka"

From fac649d904ee626812c5f232a094c1544a96ac07 Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:09:24 -0700
Subject: [PATCH 63/89] testing

---
 scripts/convert_checkpoints.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index b0d592d55..934cfb452 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -46,5 +46,5 @@ gantry run \
   --shared-memory 10GiB \
   --weka=oe-eval-default:/data/input \
   --yes \
-  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save_to_weka"
+  -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save-to-weka"

From 759753469de49e7c967ede78ff9ae7a791231f2a Mon Sep 17 00:00:00 2001
From: Jena Hwang
Date: Fri, 4 Oct 2024 13:52:16 -0700
Subject: [PATCH 64/89] testing

---
 scripts/convert_checkpoints.sh | 49 ++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/scripts/convert_checkpoints.sh b/scripts/convert_checkpoints.sh
index 934cfb452..311c0b420 100644
--- a/scripts/convert_checkpoints.sh
+++ b/scripts/convert_checkpoints.sh
@@ -4,6 +4,17 @@
 # To be run from the root of the OLMo repository.
 # Script requires the use of GANTRY and AWS access to WEKA
 #
+# Usage: scripts/convert_checkpoints.sh CHECKPOINT_PATH [-s] [-c]
+#    -s if converted checkpoint is found in s3, then save to weka
+#    -c sanity check; don't actually run the conversion, just go through the motions and print what would be run
+#
+# calls: convert_checkpoints_batch.py
+# usage: convert_checkpoints_batch.py [-h]
+#        (--checkpoint-path CHECKPOINT_PATH | --checkpoint-path-file CHECKPOINT_PATH_FILE)
+#        [--weka-load-dir WEKA_LOAD_DIR]
+#        [--weka-prefix WEKA_PREFIX]
+#        [--sanity-check] [--save-to-weka]
+#
 # Example use:
 # Run:
 # sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step9*
@@ -18,8 +29,21 @@
 # - Assumes tokenizer allenai/gpt-neox-olmo-dolma-v1_5.json
 #
 # OUTPUT logs
-# - saves log.jsonl.
For every checkpoint found in the given input:
+#    - "unprocessed_path" := checkpoint path to convert
+#    - "converted_path" := checkpoint converted path
+#    - "conversion" := [new | existing (already in weka) | existing-downloaded (from s3) ]
+#    - "date_time" := datestamp
+#    - "error" := error if any conversions didn't pan out for any reason
 # - saves model_checkpoints.jsonl: this input file is formatted for oe-eval-internal experiments
+#    - example log files for the following run:
+#      > sh scripts/convert_checkpoints.sh s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step91*6-unsharded
+#      log.jsonl:
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9176-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9176-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9166-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9166-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#        {"unprocessed_path": "s3://ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9186-unsharded", "converted_path": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC/step9186-unsharded-hf", "conversion": "existing", "date_time": "Oct-04-2024_2012", "error": ""}
+#      model_checkpoints.jsonl:
+#        {"model_name": "baseline-300M-1xC", "checkpoints_location": "weka://oe-eval-default/ianm/ai2-llm/checkpoints/OLMo-ladder/baseline-300M-1xC", "revisions": ["step9176-unsharded-hf", "step9166-unsharded-hf", "step9186-unsharded-hf"]}
 #
 # SH run SPECIFICATION DEFAULTS:
 # - Budget for oe-eval (see below)
@@ -31,6 +55,27 @@
 # - Make tokenizer updatable

 CHECKPOINT_PATH=$1
+SAVE_TO_WEKA=""
+SANITY_CHECK=""
+shift
+
+usage() {
+    echo "Usage: $0 CHECKPOINT_PATH [-s] [-c]"
+    echo "    -s --save-to-weka"
+    echo "    -c --sanity-check"
+    exit 1;
+}
+
+while getopts "sc" opt;
+do
+    case $opt in
+        s) SAVE_TO_WEKA="--save-to-weka" ;;
+        c) SANITY_CHECK="--sanity-check" ;; # mostly useful for local test runs - it prevents any copying or conversions from actually running.
+ *) usage ;; + esac +done + +#echo "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' $SAVE_TO_WEKA $SANITY_CHECK" gantry run \ --description "Converting $CHECKPOINT_PATH" \ @@ -46,5 +91,5 @@ gantry run \ --shared-memory 10GiB \ --weka=oe-eval-default:/data/input \ --yes \ - -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' --save-to-weka" + -- /bin/bash -c "python scripts/convert_checkpoints_batch.py --checkpoint-path $CHECKPOINT_PATH --weka-load-dir '/data/input' --weka-prefix 'weka://oe-eval-default' $SAVE_TO_WEKA $SANITY_CHECK" From 94fa6daa46db9b92b90259ced24b0750bab7f520 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 8 Oct 2024 10:01:04 -0700 Subject: [PATCH 65/89] downloading fix --- scripts/convert_checkpoints_batch.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 20fef2299..c5a8eb1e1 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -148,11 +148,16 @@ def s3_path_exists(bucket, prefix, bucket_name): def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): - if not os.path.exists(os.path.dirname(local_path)): - print(f"Downloading checkpoint to {display_name}\n", flush=True) - if not sanity_check: + # if not os.path.exists(os.path.dirname(local_path)): + print(f"Downloading checkpoint to {display_name}\n", flush=True) + if not sanity_check: + try: os.makedirs(local_path) - bucket.download_file(prefix, local_path) # save to same path + except: + pass + print(prefix) + print(local_path) + bucket.download_file(bucket, prefix, local_path) # save to same path def expand_paths(cps, s3): From 5e46840ace6fc8c657291f77dc4bcd01d5e357bd Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 8 Oct 2024 10:26:19 -0700 Subject: [PATCH 66/89] downloading fix --- scripts/convert_checkpoints_batch.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index c5a8eb1e1..9b7bd4ab8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -151,13 +151,13 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): # if not os.path.exists(os.path.dirname(local_path)): print(f"Downloading checkpoint to {display_name}\n", flush=True) if not sanity_check: - try: - os.makedirs(local_path) - except: - pass - print(prefix) - print(local_path) - bucket.download_file(bucket, prefix, local_path) # save to same path + for obj in bucket.objects.filter(Prefix=prefix): + target = os.path.join(local_path, os.path.relpath(obj.key, os.path.dirname(prefix))) + if not os.path.exists(os.path.dirname(target)): + os.makedirs(os.path.dirname(target)) + if obj.key[-1] == '/': + continue + bucket.download_file(obj.key, target) def expand_paths(cps, s3): @@ -227,9 +227,9 @@ def main(): args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, 
save_to_weka=args.save_to_weka) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir, sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) if __name__ == "__main__": From da90ae2b43d101b40c4f77e867f7bcc37c988c2b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 14:52:03 -0700 Subject: [PATCH 67/89] . --- scripts/convert_checkpoints_batch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 9b7bd4ab8..48816c99e 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -39,7 +39,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # Convert to old-style checkpoint. for checkpoint_path in cps: - print(f"\n\n------------------------------------------------------------", flush=True) + print("\n\n------------------------------------------------------------", flush=True) print(f"\nProcessing Checkpoint: {checkpoint_path}\n", flush=True) error = "" @@ -56,7 +56,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # the converted model may already exist in local_path or in path_found = False - potential_existing_locations = [l.format(load_dir,temp_path) for l in WEKA_CHECK_LOCATIONS_PREFIXES] + potential_existing_locations = [candidate_loc.format(load_dir,temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES] for loc in potential_existing_locations: if os.path.exists(loc): existing_location = loc.replace('/pytorch_model.bin','') From 447de12ab2e540ea00cf6c93ea9f5daa82428983 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 15:22:52 -0700 Subject: [PATCH 68/89] . --- scripts/convert_checkpoints_batch.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 48816c99e..0385ae7ee 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -8,14 +8,16 @@ """ import argparse -import subprocess +import json import os +import subprocess import time + import boto3 -import json -from pathlib import Path from gantry import RESULTS_DIR +from pathlib import Path +from typing import Dict # possible converted locations. 
# "self" is the target location where the converted model would be saved @@ -117,7 +119,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name] = { + processed[parent.name]: Dict = { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] From 3920f2ebe0aabd1c2d53c25275104e5234725ddb Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 15:27:45 -0700 Subject: [PATCH 69/89] addressing errors --- scripts/convert_checkpoints_batch.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 0385ae7ee..593fe99b8 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,7 +17,6 @@ from gantry import RESULTS_DIR from pathlib import Path -from typing import Dict # possible converted locations. # "self" is the target location where the converted model would be saved @@ -119,11 +118,13 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name]: Dict = { + processed[parent.name] = { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] } + elif 'revisions' not in processed[parent.name]: # not sure if this would ever occur, but trying to get the error check happy + processed[parent.name]['revisions'] = [curr.name] else: processed[parent.name]['revisions'].append(curr.name) From acaccddb6a22d632f38811844921388a0a7e0962 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:01:47 -0700 Subject: [PATCH 70/89] error fixes for pr --- CHANGELOG.md | 1 + scripts/convert_checkpoints_batch.py | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9752a733..32cc5b415 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- Added `scripts/convert_checkpoints_batch.py` and `scripts/convert_checkpoints.sh` for processing many intermediate checkpoints in batches for offline evals. - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`. - Added support for flash attention and gradient checkpointing to `hf_olmo`. diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 593fe99b8..846bce14a 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -17,6 +17,7 @@ from gantry import RESULTS_DIR from pathlib import Path +from typing import List, Dict, Union # possible converted locations. # "self" is the target location where the converted model would be saved @@ -36,7 +37,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) - processed = {} + processed: Dict = {} # Convert to old-style checkpoint. 
for checkpoint_path in cps: @@ -118,13 +119,11 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name] = { + processed[parent.name]= { 'model_name': parent.name, 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), 'revisions': [curr.name] } - elif 'revisions' not in processed[parent.name]: # not sure if this would ever occur, but trying to get the error check happy - processed[parent.name]['revisions'] = [curr.name] else: processed[parent.name]['revisions'].append(curr.name) @@ -164,7 +163,7 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): def expand_paths(cps, s3): - expanded = [] + expanded: List[str] = [] for cp in cps: bucket = cp.split('/')[2] From a4a40e172d9abc66b2ec7dee7dc27d5dacfa3fd9 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:07:04 -0700 Subject: [PATCH 71/89] fixing errors for pr --- scripts/convert_checkpoints_batch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 846bce14a..efedf70c2 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -12,12 +12,11 @@ import os import subprocess import time +from pathlib import Path +from typing import Dict, List import boto3 - from gantry import RESULTS_DIR -from pathlib import Path -from typing import List, Dict, Union # possible converted locations. # "self" is the target location where the converted model would be saved From 3d2bd3224a0469f3c8fd1cac304cb1ad2ecf5c0f Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:33:39 -0700 Subject: [PATCH 72/89] fixing errors for pr --- scripts/convert_checkpoints_batch.py | 128 ++++++++++++++++----------- 1 file changed, 75 insertions(+), 53 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index efedf70c2..0075211bc 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -23,14 +23,12 @@ # key: template, value: description # template: MUST obey .format(load_dir, retain_path_name) -WEKA_CHECK_LOCATIONS_PREFIXES = { - "{}/{}-hf/pytorch_model.bin": 'self', - "{}/ianm/{}-hf/pytorch_model.bin": "ian's" -} +WEKA_CHECK_LOCATIONS_PREFIXES = {"{}/{}-hf/pytorch_model.bin": "self", "{}/ianm/{}-hf/pytorch_model.bin": "ian's"} + def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_prefix="/weka", save_to_weka=False): - s3_client = boto3.client('s3') - s3_resource = boto3.resource('s3') + s3_client = boto3.client("s3") + s3_resource = boto3.resource("s3") cps = expand_paths(cps, s3_client) @@ -49,25 +47,27 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre conversion_status = "" # sort out paths, bucket names, and so on ... 
- path_bits = checkpoint_path.strip('/').replace('s3://', '').split('/') + path_bits = checkpoint_path.strip("/").replace("s3://", "").split("/") s3_bucket_name = path_bits[0] - s3_prefix = '/'.join(path_bits[1:]) - temp_path = '/'.join(path_bits) #checkpoint_path.replace('s3://', '').strip('/') + s3_prefix = "/".join(path_bits[1:]) + temp_path = "/".join(path_bits) # checkpoint_path.replace('s3://', '').strip('/') local_path = f"{load_dir}/{temp_path}-hf/" # the converted model may already exist in local_path or in path_found = False - potential_existing_locations = [candidate_loc.format(load_dir,temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES] + potential_existing_locations = [ + candidate_loc.format(load_dir, temp_path) for candidate_loc in WEKA_CHECK_LOCATIONS_PREFIXES + ] for loc in potential_existing_locations: if os.path.exists(loc): - existing_location = loc.replace('/pytorch_model.bin','') + existing_location = loc.replace("/pytorch_model.bin", "") path_found = True break # if one of the potential existing location has converted model in it then use that if path_found: # then there is no conversion to do. - conversion_status = 'existing' + conversion_status = "existing" converted_path = existing_location print(f"Converted Checkpoint Found: {converted_path}\n", flush=True) else: @@ -81,70 +81,74 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre # if save to weka flag is passed, then download the s3 converted model to the local path if save_to_weka: - copy_s3_to_local(s3_bucket, s3_prefix, local_path, local_path.replace(load_dir,weka_prefix), sanity_check) - conversion_status = 'existing-downloaded' + copy_s3_to_local( + s3_bucket, s3_prefix, local_path, local_path.replace(load_dir, weka_prefix), sanity_check + ) + conversion_status = "existing-downloaded" converted_path = local_path else: - conversion_status = 'existing' + conversion_status = "existing" converted_path = s3_hf_exists # if no existing conversions are found then process and save to local path if not path_found: - conversion_status = 'new' + conversion_status = "new" converted_path = local_path conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" if sanity_check: - print('SANITY CHECK MODE (not running the conversion)') - print(conversion_cmd + '\n') + print("SANITY CHECK MODE (not running the conversion)") + print(conversion_cmd + "\n") else: try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = e.output ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) - conversion_status = 'error' + error = ( + e.output + ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) + conversion_status = "error" converted_path = "" # Keep info for log.jsonl local_log = { - 'unprocessed_path': checkpoint_path, - 'converted_path': converted_path.replace(load_dir,weka_prefix), - 'conversion': conversion_status, - 'date_time': time.strftime('%b-%d-%Y_%H%M', time.localtime()), - 'error': error + "unprocessed_path": checkpoint_path, + "converted_path": converted_path.replace(load_dir, weka_prefix), + "conversion": conversion_status, + "date_time": time.strftime("%b-%d-%Y_%H%M", time.localtime()), + "error": error, } # output model checkpoint location for eval scripts curr = Path(converted_path) parent = curr.parent if parent.name not in processed: - processed[parent.name]= { - 'model_name': parent.name, - 'checkpoints_location': str(parent).replace(load_dir,weka_prefix), - 'revisions': [curr.name] + processed[parent.name] = { + "model_name": parent.name, + "checkpoints_location": str(parent).replace(load_dir, weka_prefix), + "revisions": [curr.name], } else: - processed[parent.name]['revisions'].append(curr.name) + processed[parent.name]["revisions"].append(curr.name) # Output Log if not sanity_check: - with open(os.path.join(RESULTS_DIR, 'log.jsonl'), 'a+') as fout: - fout.write(json.dumps(local_log) + '\n') + with open(os.path.join(RESULTS_DIR, "log.jsonl"), "a+") as fout: + fout.write(json.dumps(local_log) + "\n") # Output checkpoint location for eval scripts if not sanity_check: - with open(os.path.join(RESULTS_DIR, 'model_checkpoints.jsonl'), 'w') as fout: + with open(os.path.join(RESULTS_DIR, "model_checkpoints.jsonl"), "w") as fout: for _, p in processed.items(): - fout.write(json.dumps(p) + '\n') + fout.write(json.dumps(p) + "\n") def s3_path_exists(bucket, prefix, bucket_name): # look for pytorch_model.bin in directories ending with -hf or -hf-olmo. 
- objs = list(bucket.objects.filter(Prefix=prefix + '-hf/pytorch_model.bin')) + objs = list(bucket.objects.filter(Prefix=prefix + "-hf/pytorch_model.bin")) if len(objs) > 0: return f"s3://{bucket_name}/{prefix}-hf" else: - objs2 = list(bucket.objects.filter(Prefix=prefix + '-hf-olmo/pytorch_model.bin')) + objs2 = list(bucket.objects.filter(Prefix=prefix + "-hf-olmo/pytorch_model.bin")) return f"s3://{bucket_name}/{prefix}-hf-olmo" if (len(objs2) > 0) else None @@ -156,7 +160,7 @@ def copy_s3_to_local(bucket, prefix, local_path, display_name, sanity_check): target = os.path.join(local_path, os.path.relpath(obj.key, os.path.dirname(prefix))) if not os.path.exists(os.path.dirname(target)): os.makedirs(os.path.dirname(target)) - if obj.key[-1] == '/': + if obj.key[-1] == "/": continue bucket.download_file(obj.key, target) @@ -165,16 +169,16 @@ def expand_paths(cps, s3): expanded: List[str] = [] for cp in cps: - bucket = cp.split('/')[2] - segs = cp.split('*') - prefix = segs[0].replace('s3://'+bucket+'/', '') + bucket = cp.split("/")[2] + segs = cp.split("*") + prefix = segs[0].replace("s3://" + bucket + "/", "") relevant_dirs = [] skip_parent = [] - paginator = s3.get_paginator('list_objects_v2') + paginator = s3.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix) - contents = {obj["Key"]:str(Path(obj['Key']).parent) for page in page_iterator for obj in page['Contents']} + contents = {obj["Key"]: str(Path(obj["Key"]).parent) for page in page_iterator for obj in page["Contents"]} paths = set(contents.values()) for path in contents: @@ -184,14 +188,14 @@ def expand_paths(cps, s3): if parent in relevant_dirs or parent in skip_parent: continue - if p.parent.name in ['optim', 'train','model']: + if p.parent.name in ["optim", "train", "model"]: if f"{grandpa}-unsharded" in paths: # skip condition skip_parent.append(parent) continue else: relevant_dirs.append(grandpa) - elif p.name == 'model.pt': + elif p.name == "model.pt": relevant_dirs.append(parent) search_segs = [seg for i, seg in enumerate(segs) if i > 0 and seg != ""] @@ -209,8 +213,8 @@ def expand_paths(cps, s3): def read_checkpoints(f): - with open(f, 'r') as fin: - checkpoints = [line for line in fin if line and line != ''] + with open(f, "r") as fin: + checkpoints = [line for line in fin if line and line != ""] return checkpoints @@ -219,18 +223,36 @@ def main(): group_batch = parser.add_mutually_exclusive_group(required=True) group_batch.add_argument("--checkpoint-path", help="path to sharded checkpoint", type=str) - group_batch.add_argument("--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str) - parser.add_argument("--weka-load-dir", help='mounted location of weka bucket', default='/data/input', type=str) - parser.add_argument("--weka-prefix", help='weka directory prefix for output', default='/weka', type=str) - parser.add_argument("--sanity-check", help='print what would be run; do not actually run conversion', action='store_true') - parser.add_argument("--save-to-weka", help='if checkpoints are found on s3, save them to loaded weka dir', action='store_true') + group_batch.add_argument( + "--checkpoint-path-file", help="file that lists sharded checkpoint paths (batch run option)", type=str + ) + parser.add_argument("--weka-load-dir", help="mounted location of weka bucket", default="/data/input", type=str) + parser.add_argument("--weka-prefix", help="weka directory prefix for output", default="/weka", type=str) + parser.add_argument( + 
"--sanity-check", help="print what would be run; do not actually run conversion", action="store_true" + ) + parser.add_argument( + "--save-to-weka", help="if checkpoints are found on s3, save them to loaded weka dir", action="store_true" + ) args = parser.parse_args() if args.checkpoint_path is not None: - convert_checkpoint([args.checkpoint_path], load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint( + [args.checkpoint_path], + load_dir=args.weka_load_dir.rstrip("/"), + sanity_check=args.sanity_check, + weka_prefix=args.weka_prefix, + save_to_weka=args.save_to_weka, + ) else: - convert_checkpoint(read_checkpoints(args.checkpoint_path_file), load_dir=args.weka_load_dir.rstrip('/'), sanity_check=args.sanity_check, weka_prefix=args.weka_prefix, save_to_weka=args.save_to_weka) + convert_checkpoint( + read_checkpoints(args.checkpoint_path_file), + load_dir=args.weka_load_dir.rstrip("/"), + sanity_check=args.sanity_check, + weka_prefix=args.weka_prefix, + save_to_weka=args.save_to_weka, + ) if __name__ == "__main__": From d529f5a60dafadc202d2403bad0baec310aa70d7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 11 Oct 2024 16:34:48 -0700 Subject: [PATCH 73/89] fixing errors for pr --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32cc5b415..e9752a733 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added `scripts/convert_checkpoints_batch.py` and `scripts/convert_checkpoints.sh` for processing many intermediate checkpoints in batches for offline evals. - Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`. - Added support for flash attention and gradient checkpointing to `hf_olmo`. 
From 4a32be0621a4bee4527b99e791fed12e6549194a Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 25 Oct 2024 17:57:24 -0700 Subject: [PATCH 74/89] removing temp outputs --- .gitignore | 3 --- guided-trout-2f805b9.yaml | 39 +++++++++++++++++++++++++++++++++++++++ log.txt | 10 ---------- 3 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 guided-trout-2f805b9.yaml delete mode 100644 log.txt diff --git a/.gitignore b/.gitignore index e0f77ccd8..9b1e99785 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ -# beaker yaml -guided-trout-2f805b9.yaml - # build artifacts .eggs/ diff --git a/guided-trout-2f805b9.yaml b/guided-trout-2f805b9.yaml new file mode 100644 index 000000000..7607d5d52 --- /dev/null +++ b/guided-trout-2f805b9.yaml @@ -0,0 +1,39 @@ +version: v2 +tasks: + - name: main + image: + beaker: ai2/conda + command: [bash, /gantry/entrypoint.sh] + arguments: [/bin/bash, -c, 'python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir ''s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --destination-dir ''/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --keep-olmo-artifacts'] + envVars: + - name: GANTRY_VERSION + value: 1.8.3 + - name: GITHUB_REPO + value: allenai/OLMo + - name: GIT_REF + value: fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c + - name: GANTRY_TASK_NAME + value: main + - name: AWS_ACCESS_KEY_ID + secret: JENA_AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + secret: JENA_AWS_SECRET_ACCESS_KEY + - name: NO_PYTHON + value: "1" + datasets: + - mountPath: /gantry + source: + beaker: 01J721NEMRKZ4HBGNS5KV7873R + - mountPath: /data/input + source: + weka: oe-eval-default + result: + path: /results + resources: + sharedMemory: 10 GiB + context: + priority: normal + preemptible: true + constraints: + cluster: + - ai2/jupiter-cirrascale-2 diff --git a/log.txt b/log.txt deleted file mode 100644 index 16a1a62e7..000000000 --- a/log.txt +++ /dev/null @@ -1,10 +0,0 @@ - - o=======[] - __ _ _ _ |_ [] - / _` | __ _ _ _ | |_ _ _ | || | [] - \__, | / _` | | ' \ | _| | '_| \_, | _/ ]_ - |___/ \__,_| |_||_| _\__| _|_|_ _|__/ |_____| -_|"""""|_|"""""|_|"""""|_|"""""|_|"""""|_| """"| - `---------------------------------------------' - -Experiment submitted, see progress at https://beaker.org/ex/01J7446KB7EXZ35D8NST0JTNTY From 2dc26a90b7b14e10b6fa07e075d3538ff5938017 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 25 Oct 2024 18:27:08 -0700 Subject: [PATCH 75/89] fixes and cleanups --- guided-trout-2f805b9.yaml | 39 ----------------------------------- hf_olmo/convert_olmo_to_hf.py | 13 ++++-------- requirements.txt | 7 ------- 3 files changed, 4 insertions(+), 55 deletions(-) delete mode 100644 guided-trout-2f805b9.yaml delete mode 100644 requirements.txt diff --git a/guided-trout-2f805b9.yaml b/guided-trout-2f805b9.yaml deleted file mode 100644 index 7607d5d52..000000000 --- a/guided-trout-2f805b9.yaml +++ /dev/null @@ -1,39 +0,0 @@ -version: v2 -tasks: - - name: main - image: - beaker: ai2/conda - command: [bash, /gantry/entrypoint.sh] - arguments: [/bin/bash, -c, 'python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir ''s3://ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' --destination-dir ''/data/input/ai2-llm/checkpoints/cheap_decisions/dolma-v1-6-and-sources-baseline-3x-code-1B-N-1T-D-mitchish1-001/step99000-unsharded'' 
--keep-olmo-artifacts'] - envVars: - - name: GANTRY_VERSION - value: 1.8.3 - - name: GITHUB_REPO - value: allenai/OLMo - - name: GIT_REF - value: fbfda0e3eca0768728eaa8d7dbd91bcbba5d8d2c - - name: GANTRY_TASK_NAME - value: main - - name: AWS_ACCESS_KEY_ID - secret: JENA_AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - secret: JENA_AWS_SECRET_ACCESS_KEY - - name: NO_PYTHON - value: "1" - datasets: - - mountPath: /gantry - source: - beaker: 01J721NEMRKZ4HBGNS5KV7873R - - mountPath: /data/input - source: - weka: oe-eval-default - result: - path: /results - resources: - sharedMemory: 10 GiB - context: - priority: normal - preemptible: true - constraints: - cluster: - - ai2/jupiter-cirrascale-2 diff --git a/hf_olmo/convert_olmo_to_hf.py b/hf_olmo/convert_olmo_to_hf.py index 9e0c7afb6..731488e9e 100644 --- a/hf_olmo/convert_olmo_to_hf.py +++ b/hf_olmo/convert_olmo_to_hf.py @@ -284,12 +284,6 @@ def main(): help="Keep olmo-specific artifacts in the checkpoint.", ) - parser.add_argument( - "--cleanup-local-dir", - action="store_true", - help="Remove local download of the directory." - ) - args = parser.parse_args() args.destination_dir = args.destination_dir or args.checkpoint_dir @@ -314,9 +308,10 @@ def main(): upload_local_checkpoint(local_checkpoint_dir, args.destination_dir) print(f"Converted checkpoint saved to {args.destination_dir}") - if args.cleanup_local_dir: - print(f"Removing temporary local dir: {local_checkpoint_dir}") - shutil.rmtree(local_checkpoint_dir) + + # remove local dir copy + print(f"Removing temporary local dir: {local_checkpoint_dir}") + shutil.rmtree(local_checkpoint_dir) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ae2bf89c5..000000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -torch -datasets -rich -botocore -cached-path -transformers -beaker-gantry From 378aafeca14e32c694c57db26b1f290d8f19f058 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 12:57:30 -0800 Subject: [PATCH 76/89] adding beaker-gantry to dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 87bd75591..7e80af66b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", + "baker-gantry" ] [project.optional-dependencies] From 69d12f346199c1d6cee680745127826b02527b0d Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:08:40 -0800 Subject: [PATCH 77/89] adding beaker-gantry to dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7e80af66b..10bfa8927 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", - "baker-gantry" + "beaker-gantry" ] [project.optional-dependencies] From 579d61217c0c3268b92573a671a19daf8eddb74e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:16:13 -0800 Subject: [PATCH 78/89] adding beaker-gantry to dependencies --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10bfa8927..3ccd96ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,8 @@ dependencies = [ "cached_path>=1.6.2", "transformers", "importlib_resources", - "beaker-gantry" + "beaker-gantry", + "datasets" ] [project.optional-dependencies] From 3b9563dc355c697ff52dcd98f2959fd5be7b8763 Mon Sep 17 
00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:30:01 -0800 Subject: [PATCH 79/89] python version apparently has to be 3.10 above for olmo/util.py to run --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3ccd96ab2..4a6e6113b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "Open Language Model (OLMo)" authors = [ { name = "Allen Institute for Artificial Intelligence", email = "olmo@allenai.org" } ] -requires-python = ">=3.8" +requires-python = ">=3.10" license = { file = "LICENSE" } dependencies = [ "numpy<2", From 4a882a377545ea9990e6aa165fbeb31482bd24b7 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:50:04 -0800 Subject: [PATCH 80/89] err... no --- olmo/util.py | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/olmo/util.py b/olmo/util.py index aad77eb1c..9d62a2b32 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -30,6 +30,8 @@ from olmo_data.data import get_data_path +from __future__ import annotations ### TO BE REMOVED -- hack -- + from .aliases import PathOrStr from .exceptions import ( OLMoCliError, diff --git a/pyproject.toml b/pyproject.toml index 4a6e6113b..3ccd96ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ description = "Open Language Model (OLMo)" authors = [ { name = "Allen Institute for Artificial Intelligence", email = "olmo@allenai.org" } ] -requires-python = ">=3.10" +requires-python = ">=3.8" license = { file = "LICENSE" } dependencies = [ "numpy<2", From 6acbfccacaad88044bd08bdd2f8b617ba3ce44d4 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:54:08 -0800 Subject: [PATCH 81/89] err... no --- olmo/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/olmo/util.py b/olmo/util.py index 9d62a2b32..66af8b1f0 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -1,3 +1,5 @@ +from __future__ import annotations ### TO BE REMOVED + import gzip import io import json @@ -30,7 +32,6 @@ from olmo_data.data import get_data_path -from __future__ import annotations ### TO BE REMOVED -- hack -- from .aliases import PathOrStr from .exceptions import ( From 6e67a9c96db092f682ecb38f0d504fb2f9f9941e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Fri, 13 Dec 2024 13:58:34 -0800 Subject: [PATCH 82/89] tinkering --- scripts/convert_olmo_to_hf_new.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py index 0f4ebe9f0..afd74b1ee 100644 --- a/scripts/convert_olmo_to_hf_new.py +++ b/scripts/convert_olmo_to_hf_new.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from __future__ import annotations ### TO BE REMOVED + import argparse import gc import json From 21193cba6a0daa052c4a29e6aface1ce56e68c12 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 08:01:44 -0800 Subject: [PATCH 83/89] undoing changes --- olmo/util.py | 2 -- scripts/convert_olmo_to_hf_new.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/olmo/util.py b/olmo/util.py index 66af8b1f0..3f4093c7c 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -1,5 +1,3 @@ -from __future__ import annotations ### TO BE REMOVED - import gzip import io import json diff --git a/scripts/convert_olmo_to_hf_new.py b/scripts/convert_olmo_to_hf_new.py index afd74b1ee..b0752e651 100644 --- a/scripts/convert_olmo_to_hf_new.py +++ b/scripts/convert_olmo_to_hf_new.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import annotations ### TO BE REMOVED - import argparse import gc import json From 1b4da65e37dba14c87562a3b61c3f28e1522fb6c Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 08:42:21 -0800 Subject: [PATCH 84/89] fix --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 0075211bc..1ea5fa688 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -94,7 +94,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre if not path_found: conversion_status = "new" converted_path = local_path - conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5' --cleanup-local-dir" + conversion_cmd = f"python hf_olmo/convert_olmo_to_hf.py --checkpoint-dir '{checkpoint_path}' --destination-dir '{local_path}' --tokenizer 'allenai/gpt-neox-olmo-dolma-v1_5'" if sanity_check: print("SANITY CHECK MODE (not running the conversion)") From 7b6e37f2713e956a18e34257ccc215bb2d603483 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 10:33:41 -0800 Subject: [PATCH 85/89] error code updated --- scripts/convert_checkpoints_batch.py | 39 ++++++++++++++++++---------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 1ea5fa688..e935acb01 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -35,6 +35,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f">>> Total of {len(cps)} paths to process. <<<", flush=True) processed: Dict = {} + errored: Dict = {} # Convert to old-style checkpoint. for checkpoint_path in cps: @@ -103,32 +104,40 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre try: subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: - error = ( - e.output - ) ### NOT ACTUALLY WORKING CORRECTLY. FIX THIS (not catching config not found error) + print(f"Error during checkpoint conversion: {checkpoint_path}") + error = ( e.return_code, e.stderr ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) conversion_status = "error" converted_path = "" + timestamp = time.strftime("%b-%d-%Y_%H%M", time.localtime()) + # Keep info for log.jsonl local_log = { "unprocessed_path": checkpoint_path, "converted_path": converted_path.replace(load_dir, weka_prefix), "conversion": conversion_status, - "date_time": time.strftime("%b-%d-%Y_%H%M", time.localtime()), + "date_time": timestamp, "error": error, } - # output model checkpoint location for eval scripts - curr = Path(converted_path) - parent = curr.parent - if parent.name not in processed: - processed[parent.name] = { - "model_name": parent.name, - "checkpoints_location": str(parent).replace(load_dir, weka_prefix), - "revisions": [curr.name], + if conversion_status == 'error': + errored[checkpoint_path] = { + "unprocessed_path": checkpoint_path, + "date_time": timestamp, + "error": error } else: - processed[parent.name]["revisions"].append(curr.name) + # output model checkpoint location for eval scripts + curr = Path(converted_path) + parent = curr.parent + if parent.name not in processed: + processed[parent.name] = { + "model_name": parent.name, + "checkpoints_location": str(parent).replace(load_dir, weka_prefix), + "revisions": [curr.name], + } + else: + processed[parent.name]["revisions"].append(curr.name) # Output Log if not sanity_check: @@ -140,6 +149,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre with open(os.path.join(RESULTS_DIR, "model_checkpoints.jsonl"), "w") as fout: for _, p in processed.items(): fout.write(json.dumps(p) + "\n") + if len(errored) > 0: + with open(os.path.join(RESULTS_DIR, "errors.jsonl"), "w") as fout: + for _, p in errored.items(): + fout.write(json.dumps(p) + "\n") def s3_path_exists(bucket, prefix, bucket_name): From 264ce05b52280792f6dac14403a34d8bb1fe4f0e Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Mon, 16 Dec 2024 12:56:06 -0800 Subject: [PATCH 86/89] minor change to the error log --- scripts/convert_checkpoints_batch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e935acb01..e1ea66194 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -105,7 +105,10 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre subprocess.run(conversion_cmd, shell=True, check=True) except subprocess.CalledProcessError as e: print(f"Error during checkpoint conversion: {checkpoint_path}") - error = ( e.return_code, e.stderr ) ### NOT ACTUALLY WORKING CORRECTLY. 
FIX THIS (not catching config not found error) + error = { + 'error_code': e.return_code, + 'error_stderr': e.stderr + } conversion_status = "error" converted_path = "" From 2154ea80125b601972a69be758cd4f583b582d25 Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:41:18 -0800 Subject: [PATCH 87/89] fixed error --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index e1ea66194..96e44da7e 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -106,7 +106,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre except subprocess.CalledProcessError as e: print(f"Error during checkpoint conversion: {checkpoint_path}") error = { - 'error_code': e.return_code, + 'error_code': e.returncode, 'error_stderr': e.stderr } conversion_status = "error" From d4e1f42fb506443ea506ff3612ab35f7c522d7fe Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:49:46 -0800 Subject: [PATCH 88/89] edited conversion error output --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 96e44da7e..9f03b90e4 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -107,7 +107,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f"Error during checkpoint conversion: {checkpoint_path}") error = { 'error_code': e.returncode, - 'error_stderr': e.stderr + 'error_stderr': e.stdout } conversion_status = "error" converted_path = "" From f39a522f90fd61a6bf3602d8598be2d5ecea965b Mon Sep 17 00:00:00 2001 From: Jena Hwang Date: Tue, 17 Dec 2024 12:56:26 -0800 Subject: [PATCH 89/89] edited conversion error output --- scripts/convert_checkpoints_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/convert_checkpoints_batch.py b/scripts/convert_checkpoints_batch.py index 9f03b90e4..d6a3bfee1 100644 --- a/scripts/convert_checkpoints_batch.py +++ b/scripts/convert_checkpoints_batch.py @@ -107,7 +107,7 @@ def convert_checkpoint(cps, load_dir="/data/input", sanity_check=False, weka_pre print(f"Error during checkpoint conversion: {checkpoint_path}") error = { 'error_code': e.returncode, - 'error_stderr': e.stdout + 'error_stdout': e.stdout } conversion_status = "error" converted_path = ""
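Note on the error capture that PATCH 85 through PATCH 89 iterate on: subprocess.run(cmd, shell=True, check=True) does not capture the child's output, so CalledProcessError.stdout and CalledProcessError.stderr remain None unless pipes are requested — which is why the "NOT ACTUALLY WORKING CORRECTLY" comment persists across these commits. A minimal sketch of how the conversion call could actually populate the error log, assuming capturing output is acceptable here (illustrative only; run_conversion is a hypothetical helper, not code from this series):

    import subprocess

    def run_conversion(conversion_cmd: str):
        # capture_output=True wires up stdout/stderr pipes, so on failure
        # CalledProcessError.stdout / .stderr carry the child's output
        try:
            subprocess.run(conversion_cmd, shell=True, check=True, capture_output=True, text=True)
            return None
        except subprocess.CalledProcessError as e:
            return {"error_code": e.returncode, "error_stdout": e.stdout, "error_stderr": e.stderr}

In the script itself, the equivalent keyword arguments would go on the existing subprocess.run call inside convert_checkpoint.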