32B with Hybrid Sharding #134

Draft: dirkgr wants to merge 67 commits into 32B from dirkg/32BHybrid

Commits (67)
1a6adde
Experiments with hybrid sharding
dirkgr Dec 23, 2024
638d8b2
Updated notebook
dirkgr Dec 23, 2024
6cea547
One replica per node
dirkgr Dec 23, 2024
bbe0d98
Two nodes
dirkgr Dec 24, 2024
4a567d6
Updated dashboard
dirkgr Dec 24, 2024
8c666a5
Load optimizer and model in the other order
dirkgr Dec 24, 2024
31555fb
Update the notebook
dirkgr Dec 24, 2024
1c4ae16
Updated notebook
dirkgr Dec 27, 2024
d315524
Merge remote-tracking branch 'origin/32B' into dirkg/32BHybrid
dirkgr Dec 30, 2024
4f85f4d
Problem doesn't repro with four nodes
dirkgr Dec 30, 2024
2ce06ae
SkipAdam optimizer has problems during startup?
dirkgr Dec 30, 2024
1558306
Go back to FSDP to see if we can survive more than 9 steps
dirkgr Dec 30, 2024
6e92ed2
Merge remote-tracking branch 'origin/32B' into dirkg/32BHybrid
dirkgr Jan 3, 2025
fe2bb8b
Turn hybrid back on
dirkgr Jan 3, 2025
068b96e
Get faster answers
dirkgr Jan 3, 2025
56be57c
More informative error message
dirkgr Jan 3, 2025
ad773fe
Forgot about GPUs
dirkgr Jan 3, 2025
d8c2d2a
Try it with more nodes
dirkgr Jan 3, 2025
f751c77
Put SkipStep back
dirkgr Jan 4, 2025
388f30c
Don't have that many nodes available now
dirkgr Jan 5, 2025
79d3370
Merge remote-tracking branch 'origin/32B' into dirkg/32BHybrid
dirkgr Jan 6, 2025
4cae22b
Adds a way to load unsharded checkpoints
dirkgr Jan 6, 2025
a2b9bdd
Turn off compiling the optimizer
dirkgr Jan 6, 2025
b5aa0f8
Checkpoint directories work different now
dirkgr Jan 6, 2025
9f65f58
30 minute timeout
dirkgr Jan 7, 2025
ed5a94c
Support `key_mapping` when loading unsharded checkpoints
dirkgr Jan 7, 2025
94a08bc
Don't broadcast from 0, because apparently that hangs
dirkgr Jan 7, 2025
fff3276
More logging
dirkgr Jan 7, 2025
549693b
I think this is always `None`, but I want to be sure.
dirkgr Jan 7, 2025
18a96f4
Better log message
dirkgr Jan 7, 2025
98a5a2d
Even better messaging
dirkgr Jan 7, 2025
0b957c6
Log all threads before downloading
dirkgr Jan 7, 2025
df62dd5
Let's see download progress
dirkgr Jan 7, 2025
9afadd2
Make printing stack traces more readable
dirkgr Jan 7, 2025
0b8f047
Don't log all threads
dirkgr Jan 7, 2025
7166a09
15 minute timeout
dirkgr Jan 7, 2025
f4f2a58
Revert "Let's see download progress"
dirkgr Jan 7, 2025
67547b5
Use the rank we mean
dirkgr Jan 7, 2025
56c53b2
Finalize config after applying overrides
dirkgr Jan 7, 2025
16d99b4
Download checkpoint before doing anything else
dirkgr Jan 7, 2025
8e32582
We don't have gsutil in the path.
dirkgr Jan 7, 2025
2e976a7
Install google cloud CLI in the Docker container
dirkgr Jan 7, 2025
9457dcc
Installing this way doesn't work
dirkgr Jan 7, 2025
a93d01d
Less output from `gsutil`
dirkgr Jan 8, 2025
899686a
Disable evals so we can experiment faster
dirkgr Jan 8, 2025
50f60c3
Revert "Disable evals so we can experiment faster"
dirkgr Jan 8, 2025
0caa78a
Adds an unsharding script
dirkgr Jan 8, 2025
05979c2
Adds gcloud utilities to the image
dirkgr Jan 8, 2025
034dea9
Merge branch 'dirkg/32BHybrid' of https://github.com/allenai/OLMo-cor…
dirkgr Jan 8, 2025
24db150
Bring back selective activation checkpointing
dirkgr Jan 8, 2025
82dd576
Merge branch 'dirkg/32BHybrid' of https://github.com/allenai/OLMo-cor…
dirkgr Jan 8, 2025
7d0b268
Checkpoint more stuff
dirkgr Jan 8, 2025
e55b84b
Fix directory naming
dirkgr Jan 8, 2025
51ae3a4
Initialize modules that are wrapped
dirkgr Jan 8, 2025
a2f010e
More informative error
dirkgr Jan 8, 2025
8d69ce1
Build parameter groups even with fancy activation checkpointing schemes
dirkgr Jan 8, 2025
c13e4cf
Don't preserve randomness
dirkgr Jan 8, 2025
e39a11c
We can use globbing to checkpoint
dirkgr Jan 8, 2025
aace4af
Switch back to full sharding
dirkgr Jan 8, 2025
c79584e
More precise checkpointing
dirkgr Jan 8, 2025
e1f5956
Make a baseline with 16 nodes
dirkgr Jan 8, 2025
af57282
Checkpoint attention
dirkgr Jan 9, 2025
1759a53
Checkpoint some more
dirkgr Jan 9, 2025
5395984
Revert "Checkpoint some more"
dirkgr Jan 9, 2025
76db353
Revert "Checkpoint attention"
dirkgr Jan 9, 2025
50d93cf
Reorder ranks in GCP
dirkgr Jan 9, 2025
3a6ebff
Rank 0 needs to remain rank 0
dirkgr Jan 9, 2025
7 changes: 7 additions & 0 deletions src/Dockerfile
```diff
@@ -57,6 +57,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     git && \
     rm -rf /var/lib/apt/lists/*
 
+# Install google cloud CLI
+RUN apt-get update && apt-get install -y apt-transport-https ca-certificates curl gnupg && \
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
+    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    apt-get update && \
+    apt-get install -y google-cloud-cli
+
 # Install MLNX OFED user-space drivers
 # See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
 ENV MOFED_VER="24.01-0.3.3.1"
```
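Several commits above ("We don't have gsutil in the path", "Installing this way doesn't work") deal with getting these tools onto the PATH. A minimal sanity check one could run inside an image built from this Dockerfile (assuming only that the container has Python):

```python
# Verify the Google Cloud tools the training code shells out to are on PATH.
import shutil

for tool in ("gcloud", "gsutil"):
    assert shutil.which(tool) is not None, f"{tool} is not on PATH"
```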
2 changes: 1 addition & 1 deletion src/olmo_core/data/data_loader.py
```diff
@@ -489,7 +489,7 @@ def __init__(
             assert isinstance(self.dataset, NumpyFSLDataset)
             if self.rank_batch_size % self.dataset.sequence_length != 0:
                 raise OLMoConfigurationError(
-                    "rank batch size (in tokens) must be divisible by sequence length"
+                    f"rank batch size (in tokens) must be divisible by sequence length; got rbs={self.rank_batch_size}, sl={self.dataset.sequence_length}"
                 )
 
     @property
```
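The sharpened message now reports the offending values. For intuition, the invariant it guards (the numbers below are made up):

```python
# Each rank must receive a whole number of fixed-length sequences per batch.
rank_batch_size = 8192   # tokens per rank per step
sequence_length = 4096   # tokens per sequence

assert rank_batch_size % sequence_length == 0   # 2 sequences per rank: OK
assert 8192 % 3000 != 0                         # this pairing would raise OLMoConfigurationError
```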
172 changes: 118 additions & 54 deletions src/olmo_core/distributed/checkpoint/__init__.py
```diff
@@ -39,8 +39,9 @@
 from torch.distributed.checkpoint.metadata import Metadata
 
 from olmo_core.aliases import PathOrStr
-from olmo_core.io import clear_directory, dir_is_empty, is_url, normalize_path
-from olmo_core.utils import gc_cuda, wait_for
+from olmo_core.io import clear_directory, dir_is_empty, is_url, normalize_path, resource_path, file_exists
+from olmo_core.utils import gc_cuda, wait_for, log_all_threads
+from . import safetensors_util
 
 from ..utils import barrier, get_fs_local_rank, is_distributed
 from .filesystem import RemoteFileSystemReader, RemoteFileSystemWriter
@@ -207,71 +208,134 @@ def load_model_and_optim_state(
     :param work_dir: A working directory for caching files/directories.
     :param thread_count: Set the number of threads used for certain operations.
     """
-    dir = normalize_path(dir)
-    state_dict = _prepare_state_dict(model, optim, process_group=process_group)
-    reader = RemoteFileSystemReader(
-        dir, thread_count=thread_count, pre_download=pre_download, work_dir=work_dir
-    )
+    assert process_group is None
 
-    if key_mapping is not None:
-        metadata = reader.read_metadata()
-        for current_key, original_key in key_mapping.items():
-            if f"model.{original_key}" not in metadata.state_dict_metadata:
-                continue
+    dir = normalize_path(dir)
 
-            log.info(f"Mapping current param '{current_key}' to '{original_key}' in checkpoint")
-            state_dict["model"][original_key] = state_dict["model"].pop(current_key)
+    can_load_unsharded = (
+        file_exists(f"{dir}_unsharded/model.safetensors") and
+        file_exists(f"{dir}_unsharded/optim.safetensors")
+    )
 
-            if optim is None:
-                continue
+    if can_load_unsharded:
+        if get_fs_local_rank() == 0:
+            log.info(f"Local rank 0 loading {dir}/model.safetensors")
+            model_path = resource_path(dir, "model.safetensors", local_cache=work_dir)
+            log.info(f"Local rank 0 loaded {dir}/model.safetensors")
+            dist.barrier()
+        else:
+            log.info("Nonzero local rank waiting for rank 0 to load model.safetensors")
+            dist.barrier()
+            log.info("Nonzero local rank loading model.safetensors")
+            model_path = resource_path(dir, "model.safetensors", local_cache=work_dir)
+            log.info("Nonzero local rank loaded model.safetensors")
 
-            state_dict["optim"]["state"][original_key] = state_dict["optim"]["state"].pop(
-                current_key
-            )
-            for group in state_dict["optim"]["param_groups"]:
-                if current_key in group["params"]:
-                    idx = group["params"].index(current_key)
-                    group["params"][idx] = original_key
-                    break
+        model_state_dict = safetensors_util.safetensors_file_to_state_dict(model_path)
+        if key_mapping is not None:
+            for current_key, original_key in key_mapping.items():
+                if original_key in model_state_dict:
+                    assert current_key not in model_state_dict, f"Mapping {original_key} to {current_key} in the model state dict would overwrite existing {current_key}"
+                    model_state_dict[current_key] = model_state_dict.pop(original_key)
+
+        sd_options = dist_cp_sd.StateDictOptions(
+            strict=True,
+            full_state_dict=True,
+            broadcast_from_rank0=False,
+        )
+        dist_cp_sd.set_model_state_dict(model, model_state_dict, options=sd_options)
+        del model_path
+        del model_state_dict
+        gc_cuda()
+
+        if optim is not None:
+            if get_fs_local_rank() == 0:
+                optim_path = resource_path(dir, "optim.safetensors", local_cache=work_dir)
+                dist.barrier()
+            else:
+                dist.barrier()
+                optim_path = resource_path(dir, "optim.safetensors", local_cache=work_dir)
+
+            optim_state_dict = safetensors_util.safetensors_file_to_state_dict(optim_path)
+            if key_mapping is not None:
+                for current_key, original_key in key_mapping.items():
+                    if original_key in optim_state_dict["state"]:
+                        assert current_key not in optim_state_dict["state"], f"Mapping {original_key} to {current_key} in the optimizer state dict would overwrite existing {current_key}"
+                        optim_state_dict["state"][current_key] = optim_state_dict["state"].pop(original_key)
+                        for group in optim_state_dict["param_groups"]:
+                            if original_key in group["params"]:
+                                idx = group["params"].index(original_key)
+                                group["params"][idx] = current_key
+                                break
+
+            dist_cp_sd.set_optimizer_state_dict(model, optim, optim_state_dict, options=sd_options)
+            del optim_path
+            del optim_state_dict
+            gc_cuda()
+    else:
+        state_dict = _prepare_state_dict(model, optim, process_group=process_group)
+        reader = RemoteFileSystemReader(
+            dir, thread_count=thread_count, pre_download=pre_download, work_dir=work_dir
+        )
 
-    dist_cp.load(
-        state_dict,
-        checkpoint_id=dir,
-        storage_reader=reader,
-        process_group=process_group,
-    )
+        if key_mapping is not None:
+            metadata = reader.read_metadata()
+            for current_key, original_key in key_mapping.items():
+                if f"model.{original_key}" not in metadata.state_dict_metadata:
+                    continue
+
+                log.info(f"Mapping current param '{current_key}' to '{original_key}' in checkpoint")
+                state_dict["model"][original_key] = state_dict["model"].pop(current_key)
+
+                if optim is None:
+                    continue
+
+                state_dict["optim"]["state"][original_key] = state_dict["optim"]["state"].pop(
+                    current_key
+                )
+                for group in state_dict["optim"]["param_groups"]:
+                    if current_key in group["params"]:
+                        idx = group["params"].index(current_key)
+                        group["params"][idx] = original_key
+                        break
+
+        dist_cp.load(
+            state_dict,
+            checkpoint_id=dir,
+            storage_reader=reader,
+            process_group=process_group,
+        )
 
-    if key_mapping is not None:
-        metadata = reader.read_metadata()
-        for current_key, original_key in key_mapping.items():
-            if f"model.{original_key}" not in metadata.state_dict_metadata:
-                continue
+        if key_mapping is not None:
+            metadata = reader.read_metadata()
+            for current_key, original_key in key_mapping.items():
+                if f"model.{original_key}" not in metadata.state_dict_metadata:
+                    continue
 
-            state_dict["model"][current_key] = state_dict["model"].pop(original_key)
+                state_dict["model"][current_key] = state_dict["model"].pop(original_key)
 
-            if optim is None:
-                continue
+                if optim is None:
+                    continue
 
-            state_dict["optim"]["state"][current_key] = state_dict["optim"]["state"].pop(
-                original_key
-            )
-            for group in state_dict["optim"]["param_groups"]:
-                if original_key in group["params"]:
-                    idx = group["params"].index(original_key)
-                    group["params"][idx] = current_key
-                    break
+                state_dict["optim"]["state"][current_key] = state_dict["optim"]["state"].pop(
+                    original_key
+                )
+                for group in state_dict["optim"]["param_groups"]:
+                    if original_key in group["params"]:
+                        idx = group["params"].index(original_key)
+                        group["params"][idx] = current_key
+                        break
 
-    dist_cp_sd.set_model_state_dict(
-        model, state_dict["model"], options=dist_cp_sd.StateDictOptions(strict=True)
-    )
-    gc_cuda()
+        dist_cp_sd.set_model_state_dict(
+            model, state_dict["model"], options=dist_cp_sd.StateDictOptions(strict=True)
+        )
+        gc_cuda()
 
-    if optim is not None:
-        dist_cp_sd.set_optimizer_state_dict(
-            model, optim, state_dict["optim"], options=dist_cp_sd.StateDictOptions(strict=True)
-        )
-        gc_cuda()
+        if optim is not None:
+            dist_cp_sd.set_optimizer_state_dict(
+                model, optim, state_dict["optim"], options=dist_cp_sd.StateDictOptions(strict=True)
+            )
+            gc_cuda()
 
 
 def unshard_checkpoint(
     dir: PathOrStr,
```
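The `key_mapping` handling above renames checkpoint keys to match the live model's parameter names. A standalone illustration of the rename semantics (dict contents are made up; no distributed setup required):

```python
# current parameter name -> name as it appears in the checkpoint
key_mapping = {"w_qkv.weight": "att_proj.weight"}

# as read from model.safetensors (value elided)
model_state_dict = {"att_proj.weight": "tensor..."}

for current_key, original_key in key_mapping.items():
    if original_key in model_state_dict:
        # refuse to silently clobber an existing entry
        assert current_key not in model_state_dict
        model_state_dict[current_key] = model_state_dict.pop(original_key)

assert "w_qkv.weight" in model_state_dict  # key now matches the live model
```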
82 changes: 82 additions & 0 deletions src/olmo_core/distributed/checkpoint/safetensors_util.py
```python
import base64
import pickle
from dataclasses import dataclass
from typing import Dict, Optional, Tuple

import safetensors.torch
import torch

from olmo_core.aliases import PathOrStr

__all__ = [
    "state_dict_to_safetensors_file",
    "safetensors_file_to_state_dict",
]


@dataclass(eq=True, frozen=True)
class STKey:
    keys: Tuple
    value_is_pickled: bool


def encode_key(key: STKey) -> str:
    b = pickle.dumps((key.keys, key.value_is_pickled))
    b = base64.urlsafe_b64encode(b)
    return str(b, "ASCII")


def decode_key(key: str) -> STKey:
    b = base64.urlsafe_b64decode(key)
    keys, value_is_pickled = pickle.loads(b)
    return STKey(keys, value_is_pickled)


def flatten_dict(d: Dict) -> Dict[STKey, torch.Tensor]:
    result = {}
    for key, value in d.items():
        if isinstance(value, torch.Tensor):
            result[STKey((key,), False)] = value
        elif isinstance(value, dict):
            value = flatten_dict(value)
            for inner_key, inner_value in value.items():
                result[STKey((key,) + inner_key.keys, inner_key.value_is_pickled)] = inner_value
        else:
            pickled = bytearray(pickle.dumps(value))
            pickled_tensor = torch.frombuffer(pickled, dtype=torch.uint8)
            result[STKey((key,), True)] = pickled_tensor
    return result


def unflatten_dict(d: Dict[STKey, torch.Tensor]) -> Dict:
    result: Dict = {}

    for key, value in d.items():
        if key.value_is_pickled:
            value = pickle.loads(value.numpy().data)

        target_dict = result
        for k in key.keys[:-1]:
            new_target_dict = target_dict.get(k)
            if new_target_dict is None:
                new_target_dict = {}
                target_dict[k] = new_target_dict
            target_dict = new_target_dict
        target_dict[key.keys[-1]] = value

    return result


def state_dict_to_safetensors_file(state_dict: Dict, filename: PathOrStr):
    state_dict = flatten_dict(state_dict)
    state_dict = {encode_key(k): v for k, v in state_dict.items()}
    safetensors.torch.save_file(state_dict, filename)


def safetensors_file_to_state_dict(filename: PathOrStr, map_location: Optional[str] = None) -> Dict:
    if map_location is None:
        map_location = "cpu"
    state_dict = safetensors.torch.load_file(filename, device=map_location)
    state_dict = {decode_key(k): v for k, v in state_dict.items()}
    return unflatten_dict(state_dict)
```
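Since the module is self-contained, a round-trip sketch shows the intended contract: tensors are stored natively, everything else is pickled into `uint8` tensors, and nesting is encoded into the keys (the file path below is illustrative; module path per the diff header):

```python
import torch

from olmo_core.distributed.checkpoint.safetensors_util import (
    safetensors_file_to_state_dict,
    state_dict_to_safetensors_file,
)

state = {
    "model": {"w": torch.zeros(2, 2)},   # tensors round-trip natively
    "optim": {"step": 100},              # non-tensors are pickled to uint8 tensors
}
state_dict_to_safetensors_file(state, "/tmp/example.safetensors")
restored = safetensors_file_to_state_dict("/tmp/example.safetensors")

assert torch.equal(restored["model"]["w"], state["model"]["w"])
assert restored["optim"]["step"] == 100
```

Note that the base64-pickled keys and pickled values make this a private serialization detail rather than an interchange format other safetensors consumers could read meaningfully.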
3 changes: 2 additions & 1 deletion src/olmo_core/distributed/utils.py
```diff
@@ -67,7 +67,7 @@ def init_distributed(backend: str = "nccl", timeout: timedelta = timedelta(minut
         set_env_var("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7")
         set_env_var("NCCL_NET_GDR_LEVEL", "PIX")
         set_env_var("NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", "0")
-        set_env_var("NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", "600000")
+        set_env_var("NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", str(30 * 60 * 1000))
         set_env_var("NCCL_NVLS_ENABLE", "0")
         set_env_var("NCCL_USE_SNAP", "1")
         set_env_var("NCCL_FASTRAK_USE_LLCM", "1")
@@ -93,6 +93,7 @@ def init_distributed(backend: str = "nccl", timeout: timedelta = timedelta(minut
         )
         set_env_var("NCCL_SOCKET_IFNAME", "enp0s12")
         set_env_var("NCCL_DEBUG_SUBSYS", "INIT,NET")
+        set_env_var("TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC", str(15 * 60))
 
     if backend_supports_cuda(backend):
         # Set CUDA device.
```
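Written out, the two timeouts are 30 * 60 * 1000 = 1,800,000 ms (30 minutes) and 15 * 60 = 900 s (15 minutes), matching the "30 minute timeout" and "15 minute timeout" commits. A plausible sketch of the `set_env_var` helper used here, assuming it only fills in variables the launcher has not already exported (the real implementation lives elsewhere in olmo_core and may differ):

```python
import os

def set_env_var(name: str, value: str, override: bool = False) -> None:
    # Assumed semantics: respect values already set in the environment.
    if override or name not in os.environ:
        os.environ[name] = value
```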
4 changes: 2 additions & 2 deletions src/olmo_core/internal/experiment.py
```diff
@@ -229,11 +229,11 @@ def build_config(
         trainer=trainer,
     )
 
+    config = config.merge(overrides)
+
     if finalize_config is not None:
         finalize_config(config)
 
-    config = config.merge(overrides)
-
     if config.model.float8_config is not None and config.model.float8_config.enabled:
         config.trainer.add_callback(
             "float8_handler", Float8HandlerCallback(config=config.model.float8_config)
```
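Per the "Finalize config after applying overrides" commit, this reordering means `finalize_config` now sees the fully merged config, so any settings it derives reflect command-line overrides rather than pre-override values.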
8 changes: 8 additions & 0 deletions src/olmo_core/launch/beaker.py
```diff
@@ -323,6 +323,14 @@ def build_experiment_spec(self, torchrun: bool = True) -> ExperimentSpec:
         ]
 
         if torchrun:
+            entrypoint_script.append(
+                "export BEAKER_REPLICA_RANK=$("
+                "python src/scripts/reorder_ranks_in_gcp.py "
+                "${BEAKER_REPLICA_RANK} "
+                "${BEAKER_REPLICA_COUNT} "
+                "${BEAKER_LEADER_REPLICA_HOSTNAME}"
+                ")"
+            )
             entrypoint_script.append(" ".join(self._get_torchrun_cmd()) + ' "$@"')
         else:
             entrypoint_script.append('python "$@"')
```
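The exported `BEAKER_REPLICA_RANK` becomes whatever `reorder_ranks_in_gcp.py` prints to stdout. That script is not included in this diff view; a hypothetical sketch of its contract, consistent with the "Reorder ranks in GCP" and "Rank 0 needs to remain rank 0" commits:

```python
# Hypothetical sketch only -- the real src/scripts/reorder_ranks_in_gcp.py is
# not shown in this PR view. Contract: read (rank, replica_count,
# leader_hostname) from argv and print the possibly-remapped rank to stdout.
import socket
import sys

rank = int(sys.argv[1])
replica_count = int(sys.argv[2])
leader_hostname = sys.argv[3]

if socket.gethostname() == leader_hostname:
    print(0)  # the leader must keep rank 0
else:
    # Non-leaders could be re-sorted by GCP topology here; as a placeholder,
    # keep the original rank but never collide with the leader's rank 0.
    print(rank if rank != 0 else replica_count - 1)
```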
4 changes: 4 additions & 0 deletions src/olmo_core/nn/transformer/init.py
```diff
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ActivationWrapper
 
 from olmo_core.config import StrEnum
 
@@ -76,6 +77,9 @@ def init_attention(
         if self == InitMethod.normalized:
             std = d_model**-0.5
 
+        if isinstance(m, ActivationWrapper):
+            m = m._checkpoint_wrapped_module
+
         if isinstance(m, Attention):
             for w in (m.w_q, m.w_k, m.w_v):
                 self._init_linear(w, std=std, generator=generator)
```
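Why the unwrap is needed: activation-checkpoint wrappers change a module's type, so without it the `isinstance(m, Attention)` check would silently skip wrapped blocks and leave them uninitialized (the "Initialize modules that are wrapped" commit). A standalone sketch, where `Attention` is a stand-in rather than olmo_core's class:

```python
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    ActivationWrapper,
    checkpoint_wrapper,
)

class Attention(nn.Module):  # stand-in for olmo_core's Attention
    def __init__(self):
        super().__init__()
        self.w_q = nn.Linear(8, 8)

m = checkpoint_wrapper(Attention())
assert not isinstance(m, Attention)        # the wrapper hides the real type
assert isinstance(m, ActivationWrapper)
inner = m._checkpoint_wrapped_module       # unwrap, as the diff does
assert isinstance(inner, Attention)
```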