
32 b #121

Draft: wants to merge 143 commits into base: main

Commits (changes shown are from 122 of 143 commits)
b94e702
Save more often
dirkgr Dec 8, 2024
368abb8
Don't check for cancelation all the time
dirkgr Dec 8, 2024
c277d54
Make sure we use the same CE loss that we used for the 13B
dirkgr Dec 8, 2024
7c74d8b
We're going to 5T!
dirkgr Dec 8, 2024
53d61fe
We can live with a bigger eval batch size.
dirkgr Dec 8, 2024
514abb8
Add MMLU downstream eval
dirkgr Dec 9, 2024
011113e
Module isn't callable
dirkgr Dec 9, 2024
2577397
Qwen-ish
dirkgr Dec 9, 2024
93637a1
Make model bigger
dirkgr Dec 9, 2024
784377d
It's now a 32B.
dirkgr Dec 10, 2024
eec7e10
6T tokens
dirkgr Dec 10, 2024
bd5edee
Official save folder
dirkgr Dec 10, 2024
f516f09
6.5T tokens
dirkgr Dec 10, 2024
49264f5
Merge remote-tracking branch 'origin/main' into 32B
dirkgr Dec 10, 2024
4bb5d5c
Merged
dirkgr Dec 10, 2024
1ff1371
Change project name and location
dirkgr Dec 10, 2024
4375612
Revert "Merged"
dirkgr Dec 10, 2024
20b9b08
Revert "Module isn't callable"
dirkgr Dec 10, 2024
7736198
Revert "Make sure we use the same CE loss that we used for the 13B"
dirkgr Dec 10, 2024
8e0613f
We still want it fused!
dirkgr Dec 10, 2024
5652953
One-in-two activation checkpointing
dirkgr Dec 10, 2024
323c786
Merge remote-tracking branch 'origin/main' into 32B
dirkgr Dec 10, 2024
4f676e2
Smaller microbatch
dirkgr Dec 10, 2024
d4e63fa
Wrap 3 in 4 blocks
dirkgr Dec 10, 2024
7c22386
Don't compile the loss.
dirkgr Dec 10, 2024
f38bff4
Turn off broken eval
dirkgr Dec 11, 2024
3bf2440
Go back to mbsz of 4
dirkgr Dec 11, 2024
ab5afcf
Set drop_last for DownstreamEvaluator to False
2015aroras Dec 11, 2024
47f9545
Bring back Copa now that we have Shane's fix
dirkgr Dec 11, 2024
ee6aa90
Merge remote-tracking branch 'origin/32B' into 32B
dirkgr Dec 11, 2024
c656a41
Check if beaker loading issues are due to beaker changes by updating …
2015aroras Dec 11, 2024
7852e1e
Try hsdp with 2 nodes per replica
2015aroras Dec 11, 2024
b19e76d
Revert "Try hsdp with 2 nodes per replica"
2015aroras Dec 11, 2024
a02dd95
Try activation checkpointing 3 in 4
2015aroras Dec 12, 2024
6eaa5a3
Try activation checkpointing 3 in 4 + all feedforwards checkpointed
2015aroras Dec 12, 2024
b2a07de
Decrease microbatch size
2015aroras Dec 13, 2024
9985d31
Try activation checkpointing on just feed forwards
2015aroras Dec 13, 2024
4cc6a62
Fix name
dirkgr Dec 16, 2024
1060499
Try to run with hybrid sharding.
dirkgr Dec 16, 2024
fb2a274
More batch
dirkgr Dec 16, 2024
1073613
Revert "More batch"
dirkgr Dec 16, 2024
c553b98
There is something wrong with how the `common` object is set up.
dirkgr Dec 16, 2024
e49d4b7
We need a less sharded checkpoint and I guess this is the only way we…
dirkgr Dec 16, 2024
9608482
Revert "We need a less sharded checkpoint and I guess this is the onl…
dirkgr Dec 16, 2024
4804004
Async checkpointer may have problems with large checkpoints?
dirkgr Dec 16, 2024
fd4edb8
For loading checkpoints, it seems we need a longer timeout
dirkgr Dec 16, 2024
1f79446
Revert "Async checkpointer may have problems with large checkpoints?"
dirkgr Dec 16, 2024
072c616
Flight to safety
dirkgr Dec 16, 2024
6ba3e23
Increase microbatch size up to 2 * 4096
2015aroras Dec 17, 2024
07cc66c
Watching the 32B in a notebook
dirkgr Dec 18, 2024
18e9a32
Merge branch '32B' of https://github.com/allenai/OLMo-core into 32B
dirkgr Dec 18, 2024
2150b36
Merge branch 'main' into 32B
2015aroras Dec 19, 2024
c8cf403
Enable HSDP with pre-downloading
2015aroras Dec 19, 2024
d9cb6cf
Turn off hsdp
2015aroras Dec 19, 2024
5f2cf19
Revert "Turn off hsdp"
2015aroras Dec 19, 2024
19c8758
Add option to set thread_count
2015aroras Dec 19, 2024
9a12202
Run formatter
2015aroras Dec 19, 2024
d5e6e2b
Limit thread count
2015aroras Dec 19, 2024
ea0acce
Decrease microbatch size
2015aroras Dec 19, 2024
d2a00a7
Increase microbatch size, increase activation checkpointing
2015aroras Dec 19, 2024
016e426
Decrease microbatch size
2015aroras Dec 20, 2024
a28ca37
Decrease thread_count
2015aroras Dec 20, 2024
1c33794
Thread count 1
2015aroras Dec 20, 2024
484d01c
Back to FSDP
2015aroras Dec 20, 2024
275364c
Back to HSDP, but with less replicas
2015aroras Dec 20, 2024
54d5623
Merge branch 'main' into 32B
2015aroras Dec 20, 2024
4644e6e
Microbatch size back to 1
2015aroras Dec 20, 2024
d7ed30e
Revert "Microbatch size back to 1"
2015aroras Dec 20, 2024
0c47992
Back to FSDP
2015aroras Dec 20, 2024
246eff6
Revert "Back to FSDP"
2015aroras Dec 20, 2024
b956e3f
Enable NCCL debug
2015aroras Dec 20, 2024
f877907
More debug info
2015aroras Dec 20, 2024
58bef95
Merge branch 'main' into 32B
2015aroras Dec 20, 2024
c84708f
Disable pre_download, set higher thread count
2015aroras Dec 20, 2024
56c4ab3
FSDP with AC of selected ops
2015aroras Dec 20, 2024
b5f3a86
Back to AC of just feedforward layers
2015aroras Dec 21, 2024
3fbdeb0
Add new inloop evals
2015aroras Dec 21, 2024
b335cdf
Turn off NCCL debug
2015aroras Dec 21, 2024
30f8f59
Merge branch 'main' into 32B
2015aroras Dec 21, 2024
e17e4b8
Make checkpoint writing respect thread count config
2015aroras Dec 22, 2024
ba49cc4
Add skip step optimizer changes
2015aroras Dec 22, 2024
25ede33
Update 32B config with skip step adamw
2015aroras Dec 22, 2024
ac01e83
Try fix skip step optimizer
2015aroras Dec 22, 2024
ddd61ac
Try manual _std_mean impl
2015aroras Dec 22, 2024
973a26c
Add skip step fixes
2015aroras Dec 22, 2024
baf5700
Have separate save and load thread counts
2015aroras Dec 22, 2024
b6762d8
Decrease threads used for saving
2015aroras Dec 22, 2024
d98f06d
Skipped steps and automatic spike analysis
dirkgr Dec 22, 2024
4a68e9e
Use compile=True for optimizer
2015aroras Dec 22, 2024
d81cd12
Make gcs upload pass generation
2015aroras Dec 23, 2024
0a04034
Update CHANGELOG
2015aroras Dec 23, 2024
5acc7eb
Run formatter
2015aroras Dec 23, 2024
213b03e
Make generation 0 when object does not exist
2015aroras Dec 23, 2024
b4994b0
Merge branch 'shanea/fix-upload-retries' into 32B
2015aroras Dec 23, 2024
3b84351
Run formatting
2015aroras Dec 23, 2024
178d9ad
Remove unneeded import
2015aroras Dec 23, 2024
0b737aa
Add missing reload
2015aroras Dec 23, 2024
3e6f9f1
Updated notebook
dirkgr Dec 23, 2024
663d63a
Updated dashboard
dirkgr Dec 24, 2024
496919b
Update the notebook
dirkgr Dec 24, 2024
a1854bd
Updated notebook
dirkgr Dec 27, 2024
f2de5f4
Retry on bad request
dirkgr Dec 28, 2024
33c0f58
Add some more retries
dirkgr Dec 28, 2024
86afc43
Updated the notebook
dirkgr Dec 29, 2024
2e45a79
Update the dashboard
dirkgr Dec 30, 2024
e4e8fbb
Fix the way we use the step in the optimizer
dirkgr Dec 31, 2024
146caaf
Dashboard update
dirkgr Dec 31, 2024
393a462
Update dashboard
dirkgr Jan 3, 2025
d39c59d
New report
dirkgr Jan 6, 2025
16983c4
Dashboard update
dirkgr Jan 7, 2025
5e4d04f
No more ephemeral checkpoints
dirkgr Jan 8, 2025
eba0418
Don't eval so much
dirkgr Jan 8, 2025
5605001
When you wait on someone, you bring them water.
dirkgr Jan 8, 2025
7ce7efa
Updating the dashboard
dirkgr Jan 8, 2025
05aa94f
Reorder ranks in GCP
dirkgr Jan 9, 2025
9c86bf9
Rank 0 needs to remain rank 0
dirkgr Jan 9, 2025
e27b91d
Slightly less checkpointing
dirkgr Jan 9, 2025
52b9b77
Revert "Slightly less checkpointing"
dirkgr Jan 9, 2025
f045eee
Turn off failure propagation to make slack notifier work better
2015aroras Jan 13, 2025
ddb3084
New dashboard
dirkgr Jan 14, 2025
72e0ed1
Merge branch '32B' of https://github.com/allenai/OLMo-core into 32B
dirkgr Jan 14, 2025
d1d8dcb
hopefully make GCS client calls more robust
epwalsh Jan 14, 2025
a0700e8
Catch user exceptions as well as system exceptions when training fails
2015aroras Jan 15, 2025
0595cf8
Revert "Catch user exceptions as well as system exceptions when train…
2015aroras Jan 15, 2025
74c6960
Dashboard
dirkgr Jan 16, 2025
df46d5c
Merge remote-tracking branch 'origin/32B' into 32B
dirkgr Jan 16, 2025
985785c
Suppress Google checksum warnings
2015aroras Jan 16, 2025
6cc9e99
Setup kernel cache for PyTorch
2015aroras Jan 16, 2025
6c31495
Dashboard
dirkgr Jan 17, 2025
f47f6f5
minor clean up
epwalsh Jan 17, 2025
db0df12
Add profiler
dirkgr Jan 20, 2025
7f98496
Dashboard
dirkgr Jan 20, 2025
be4e788
clean up rank reordering
epwalsh Jan 21, 2025
bfa6a8d
move script to launch module so it's available in package
epwalsh Jan 21, 2025
b1ad693
remove old
epwalsh Jan 21, 2025
fde5f68
fix merge conflicts
epwalsh Jan 21, 2025
4264050
clean up
epwalsh Jan 21, 2025
a7b4507
Merge branch 'main' into 32B
epwalsh Jan 21, 2025
5fbc50e
throttle uploads
epwalsh Jan 21, 2025
7f6a6d0
Add annealing config
epwalsh Jan 22, 2025
f72cd46
update annealing config for dolmino mix
epwalsh Jan 23, 2025
a3d6672
minor refactor
epwalsh Jan 23, 2025
7c573b2
update mix
epwalsh Jan 23, 2025
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -15,14 +15,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added new LR schedulers: `LinearWithWarmup`, `InvSqrtWithWarmup`, `ConstantWithWarmup`, `SequentialScheduler`.
- Added option to pre-download checkpoint files from remote storage before trying to load a checkpoint.
- Added a callback for sending Slack notifications.
- Added `SkipStepAdamW` optimizer.

### Changed

- Changed storage of shared shard state in sharded checkpoints from smallest shard to lowest rank (normally 0).
- Changed underlying AdamW implementation.

### Fixed

- Added missing `weights_only=False` argument to fix loading train checkpoints with newer versions of PyTorch.
- Fixed bug where GCS upload does not retry on transient failures.

## [v1.7.0](https://github.com/allenai/OLMo-core/releases/tag/v1.7.0) - 2024-11-27

15 changes: 11 additions & 4 deletions src/olmo_core/distributed/checkpoint/__init__.py
@@ -63,6 +63,7 @@ def save_state_dict(
state_dict: Dict[str, Any],
process_group: Optional[dist.ProcessGroup] = None,
save_overwrite: bool = False,
thread_count: Optional[int] = None,
):
"""
Save an arbitrary state dictionary to a distributed format that can loaded again with
@@ -80,7 +81,7 @@
dir = _prepare_env_for_save(dir, process_group=process_group, save_overwrite=save_overwrite)
dist_cp.state_dict_saver.save(
state_dict,
storage_writer=RemoteFileSystemWriter(dir),
storage_writer=RemoteFileSystemWriter(dir, thread_count=thread_count),
process_group=process_group,
)

@@ -93,6 +94,7 @@ def save_model_and_optim_state(
*,
process_group: Optional[dist.ProcessGroup] = None,
save_overwrite: bool = False,
thread_count: Optional[int] = None,
) -> None:
"""
Save model and optimizer state dictionaries. The model state can be a sharded model, in which
@@ -123,7 +125,7 @@
planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True)
dist_cp.state_dict_saver.save(
state_dict,
storage_writer=RemoteFileSystemWriter(dir),
storage_writer=RemoteFileSystemWriter(dir, thread_count=thread_count),
process_group=process_group,
planner=planner,
)
@@ -137,6 +139,7 @@ def async_save_model_and_optim_state(
*,
process_group: Optional[dist.ProcessGroup] = None,
save_overwrite: bool = False,
thread_count: Optional[int] = None,
) -> Future[None]:
"""
An async version of :func:`save_model_and_optim_state()`.
@@ -148,7 +151,7 @@
planner = DefaultSavePlanner(dedup_save_to_lowest_rank=True)
return dist_cp.state_dict_saver.async_save(
state_dict,
storage_writer=RemoteFileSystemWriter(dir),
storage_writer=RemoteFileSystemWriter(dir, thread_count=thread_count),
process_group=process_group,
planner=planner,
)
@@ -164,6 +167,7 @@ def load_model_and_optim_state(
key_mapping: Optional[Dict[str, str]] = None,
pre_download: bool = False,
work_dir: Optional[PathOrStr] = None,
thread_count: Optional[int] = None,
):
"""
Load model and optimizer state in-place from a checkpoint saved via :func:`save_model_and_optim_state()`.
@@ -201,10 +205,13 @@
This dictionary should map current keys to keys in the checkpoint to be loaded.
:param pre_download: Download and cache relevant remote checkpoint files before trying to read from them.
:param work_dir: A working directory for caching files/directories.
:param thread_count: Set the number of threads used for certain operations.
"""
dir = normalize_path(dir)
state_dict = _prepare_state_dict(model, optim, process_group=process_group)
reader = RemoteFileSystemReader(dir, pre_download=pre_download, work_dir=work_dir)
reader = RemoteFileSystemReader(
dir, thread_count=thread_count, pre_download=pre_download, work_dir=work_dir
)

if key_mapping is not None:
metadata = reader.read_metadata()
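Note on the new `thread_count` parameter threaded through the save/load helpers above: a minimal usage sketch follows. It assumes the usual `(dir, model, optim)` positional arguments hidden in the folded parts of these signatures, and the path, model, and thread counts are illustrative only, not the 32B run's settings.

import torch
import torch.nn as nn

from olmo_core.distributed.checkpoint import (
    load_model_and_optim_state,
    save_model_and_optim_state,
)

model = nn.Linear(8, 8)                        # stand-in for the real model
optim = torch.optim.AdamW(model.parameters())

# Fewer writer threads eases pressure on remote storage when saving...
save_model_and_optim_state(
    "gs://my-bucket/checkpoints/step1000",     # hypothetical location
    model,
    optim,
    save_overwrite=True,
    thread_count=2,
)

# ...while pre-downloading plus a modest thread count can speed up loading.
load_model_and_optim_state(
    "gs://my-bucket/checkpoints/step1000",
    model,
    optim,
    pre_download=True,
    thread_count=4,
)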
1 change: 1 addition & 0 deletions src/olmo_core/distributed/utils.py
@@ -92,6 +92,7 @@ def init_distributed(backend: str = "nccl", timeout: timedelta = timedelta(minut
"enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",
)
set_env_var("NCCL_SOCKET_IFNAME", "enp0s12")
set_env_var("NCCL_DEBUG_SUBSYS", "INIT,NET")

if backend_supports_cuda(backend):
# Set CUDA device.
1 change: 1 addition & 0 deletions src/olmo_core/internal/common.py
@@ -102,6 +102,7 @@ def build_launch_config(
# Setup python environment.
"conda shell.bash activate base",
"pip install -e '.[all]'",
"pip install --upgrade beaker-py",
# Quickly try a new version of PyTorch like this
# "pip install --upgrade --pre torch==2.6.0.dev20241112+cu121 --index-url https://download.pytorch.org/whl/nightly/cu121",
"pip freeze",
1 change: 1 addition & 0 deletions src/olmo_core/internal/experiment.py
@@ -130,6 +130,7 @@ def build_common_components(
root_dir=root_dir,
cmd=[script, cmd_to_launch, run_name, cluster, *overrides],
cluster=cluster,
nccl_debug=False,
)

beaker_user = get_beaker_username()
43 changes: 31 additions & 12 deletions src/olmo_core/io.py
@@ -532,16 +532,25 @@ def _get_gcs_client():


def _gcs_is_retriable(exc: Exception) -> bool:
from google.api_core.exceptions import BadRequest
from google.api_core.retry import if_transient_error

return if_transient_error(exc) or isinstance(exc, requests.exceptions.Timeout)
return (
if_transient_error(exc)
or isinstance(exc, requests.exceptions.Timeout)
or isinstance(exc, BadRequest) # Weird choice, but Google throws this transiently
)


def _get_gcs_retry():
from google.api_core.retry import Retry

return Retry(
predicate=_gcs_is_retriable, initial=1.0, maximum=10.0, multiplier=2.0, timeout=600.0
predicate=_gcs_is_retriable, # NOTE: it appears google might ignore this
initial=1.0,
maximum=10.0,
multiplier=2.0,
timeout=600.0,
)
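A reviewer further down notes that wrapping these calls in the outer `@retriable(...)` decorator stretches the worst case from roughly 10 minutes to roughly 30. A back-of-the-envelope sketch of that arithmetic, assuming the outer wrapper makes up to three attempts (its actual defaults are not shown in this diff):

inner_deadline_s = 600.0   # timeout of the inner google-api-core Retry above
outer_attempts = 3         # assumed attempt count for the outer @retriable()

# Each outer attempt can burn through the full inner deadline before giving up.
worst_case_minutes = outer_attempts * inner_deadline_s / 60
print(worst_case_minutes)  # 30.0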


@@ -554,7 +563,7 @@ def _get_gcs_conditional_retry():
return ConditionalRetryPolicy(_get_gcs_retry(), is_generation_specified, ["query_params"])


@retriable()
@retriable(retry_condition=_gcs_is_retriable)
def _gcs_file_size(bucket_name: str, key: str) -> int:
from google.api_core.exceptions import NotFound

@@ -569,35 +578,45 @@ def _gcs_file_size(bucket_name: str, key: str) -> int:
return blob.size


@retriable()
@retriable(retry_condition=_gcs_is_retriable)
def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes: int) -> bytes:
from google.api_core.exceptions import NotFound

storage_client = _get_gcs_client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(key)
try:
blob.reload()
blob.reload(retry=_get_gcs_retry())
except NotFound:
raise FileNotFoundError(f"gs://{bucket_name}/{key}")
return blob.download_as_bytes(
start=bytes_start, end=bytes_start + num_bytes - 1, retry=_get_gcs_retry()
)


@retriable()
@retriable(retry_condition=_gcs_is_retriable)

Review comment (Contributor): This general approach sort of blows up our retry time from 10 mins to 30 mins. Sort of not a fan.
Review comment (Contributor): But at least it looks like it works.
Review comment (Member): We could always reduce the deadline/timeout

def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):
storage_client = _get_gcs_client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(key)
if not save_overwrite and blob.exists():
raise FileExistsError(
f"gs://{bucket_name}/{key} already exists. Use save_overwrite to overwrite it."
)
blob.upload_from_filename(source, retry=_get_gcs_conditional_retry())

generation: int = 0
if blob.exists(retry=_get_gcs_retry()):
if not save_overwrite:
raise FileExistsError(
f"gs://{bucket_name}/{key} already exists. Use save_overwrite to overwrite it."
)

blob.reload(retry=_get_gcs_retry())
assert blob.generation is not None
generation = blob.generation

blob.upload_from_filename(
source, if_generation_match=generation, retry=_get_gcs_conditional_retry()
)


@retriable()
@retriable(retry_condition=_gcs_is_retriable)
def _gcs_clear_directory(bucket_name: str, prefix: str):
from google.api_core.exceptions import NotFound

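Context for the `_gcs_upload` change above: google-cloud-storage only retries uploads that carry a generation precondition, which is what the `is_generation_specified` predicate inside `ConditionalRetryPolicy` checks, so passing `if_generation_match` is what makes the retries take effect at all. A standalone sketch of the same pattern, with hypothetical bucket and object names:

from google.cloud import storage
from google.cloud.storage.retry import DEFAULT_RETRY_IF_GENERATION_SPECIFIED

client = storage.Client()
blob = client.bucket("my-bucket").blob("checkpoints/step1000/model.pt")  # hypothetical

# generation == 0 means "only create the object if it does not already exist";
# a real generation number pins the overwrite to the exact version just observed.
generation = 0
if blob.exists():
    blob.reload()  # populates blob.generation
    assert blob.generation is not None
    generation = blob.generation

# With if_generation_match set, the upload is idempotent, so the conditional
# retry policy will actually retry it on transient failures.
blob.upload_from_filename(
    "model.pt",
    if_generation_match=generation,
    retry=DEFAULT_RETRY_IF_GENERATION_SPECIFIED,
)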
10 changes: 9 additions & 1 deletion src/olmo_core/launch/beaker.py
@@ -323,6 +323,14 @@ def build_experiment_spec(self, torchrun: bool = True) -> ExperimentSpec:
]

if torchrun:
entrypoint_script.append(
"export BEAKER_REPLICA_RANK=$("
"python src/scripts/reorder_ranks_in_gcp.py "
"${BEAKER_REPLICA_RANK} "
"${BEAKER_REPLICA_COUNT} "
"${BEAKER_LEADER_REPLICA_HOSTNAME}"
")"
)
entrypoint_script.append(" ".join(self._get_torchrun_cmd()) + ' "$@"')
else:
entrypoint_script.append('python "$@"')
@@ -341,7 +349,7 @@ def build_experiment_spec(self, torchrun: bool = True) -> ExperimentSpec:
leader_selection=self.num_nodes > 1,
host_networking=self.num_nodes > 1
or any(["augusta" in cluster for cluster in self.clusters]),
propagate_failure=True if self.num_nodes > 1 else None,
propagate_failure=False,
propagate_preemption=True if self.num_nodes > 1 else None,
synchronized_start_timeout="90m" if self.num_nodes > 1 else None,
resources=TaskResources(gpu_count=self.num_gpus, shared_memory="10GiB"),
15 changes: 9 additions & 6 deletions src/olmo_core/nn/transformer/config.py
@@ -460,19 +460,22 @@ def olmo2_13B(cls, vocab_size: int, **kwargs) -> "TransformerConfig":
)

@classmethod
def olmo2_26B(cls, vocab_size: int, **kwargs) -> "TransformerConfig":
def olmo2_32B(cls, vocab_size: int, **kwargs) -> "TransformerConfig":
"""
A 26B OLMo model config.
A 32B OLMo model config.
"""
d_model = 5120

Review comment (Member): this is a very narrow model then... are you sure about that?
Review comment (Member Author): It's a clone of Qwen 32. The tradeoffs are, narrow d_model, wide FFN, GQA, lots of layers.

return cls.llama_like(
vocab_size=vocab_size,
d_model=7168,
n_layers=kwargs.pop("n_layers", 40),
n_heads=kwargs.pop("n_heads", 56),
d_model=d_model,
n_layers=kwargs.pop("n_layers", 64),
n_heads=kwargs.pop("n_heads", 40),
n_kv_heads=kwargs.pop("n_kv_heads", 8),
block_name=kwargs.pop("block_name", TransformerBlockType.reordered_norm),
qk_norm=kwargs.pop("qk_norm", True),
rope_theta=kwargs.pop("rope_theta", 500_000),
hidden_size_multiple_of=kwargs.pop("hidden_size_multiple_of", 1024),
hidden_size_multiple_of=kwargs.pop("hidden_size_multiple_of", 512),
hidden_size_multiplier=kwargs.pop("hidden_size_multiplier", 27648 / (8 * d_model / 3)),
layer_norm_eps=1e-6,
**kwargs,
)
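
To put numbers on the review exchange above (narrow `d_model`, wide FFN, GQA, many layers): a rough parameter count from the values in this hunk, assuming a SwiGLU feed-forward with three projections, untied embeddings, and a vocabulary of roughly 100k tokens; norms and biases are ignored.

d_model, n_layers, n_heads, n_kv_heads = 5120, 64, 40, 8
ffn_hidden = 27648      # hidden_size_multiplier * (8 * d_model / 3), a multiple of 512
vocab_size = 100_352    # assumption: roughly the OLMo tokenizer vocabulary

head_dim = d_model // n_heads                   # 128
attn = 2 * d_model * d_model                    # Q and output projections
attn += 2 * d_model * (n_kv_heads * head_dim)   # GQA K and V projections
ffn = 3 * d_model * ffn_hidden                  # SwiGLU: gate, up, down
per_layer = attn + ffn

embeddings = 2 * vocab_size * d_model           # untied input and output embeddings
total = n_layers * per_layer + embeddings
print(f"{total / 1e9:.1f}B parameters")         # ~32.2B, hence the rename to olmo2_32B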
5 changes: 4 additions & 1 deletion src/olmo_core/optim/__init__.py
@@ -1,5 +1,5 @@
from .adam import AdamConfig
from .adamw import AdamWConfig
from .adamw import AdamW, AdamWConfig, SkipStepAdamW, SkipStepAdamWConfig
from .config import OptimConfig, OptimGroupOverride
from .lion import Lion, LionConfig, SkipStepLion, SkipStepLionConfig
from .scheduler import (
@@ -19,6 +19,9 @@
"SkipStepOptimizer",
"AdamWConfig",
"AdamConfig",
"AdamW",
"SkipStepAdamWConfig",
"SkipStepAdamW",
"LionConfig",
"Lion",
"SkipStepLionConfig",
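
A hedged usage sketch for the newly exported `SkipStepAdamW`: it assumes `SkipStepAdamWConfig` mirrors the existing `AdamWConfig` interface (standard AdamW hyperparameters plus a `build(model)` method), which this diff does not show, and the hyperparameters are illustrative rather than the 32B run's settings.

import torch.nn as nn

from olmo_core.optim import SkipStepAdamWConfig

model = nn.Linear(8, 8)  # stand-in for the real transformer

# The "skip step" behavior is meant to drop optimizer updates on loss/grad-norm
# spikes (see "Skipped steps and automatic spike analysis" in the commit list)
# instead of letting a single bad batch derail training.
optim_config = SkipStepAdamWConfig(
    lr=6e-4,
    betas=(0.9, 0.95),
    weight_decay=0.1,
)
optim = optim_config.build(model)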