Commit 7586733

wip: alignment context

Signed-off-by: Kyle Sayers <[email protected]>

1 parent 559ad81

5 files changed, +67 −24 lines


examples/quantization_w4a16/llama3_example.py

Lines changed: 4 additions & 1 deletion
@@ -1,3 +1,5 @@
+import torch
+from compressed_tensors import force_cpu_offload
 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer

@@ -9,9 +11,10 @@

 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
-    device_map="auto",
+    # device_map="auto",
     torch_dtype="auto",
 )
+force_cpu_offload(model, execution_device=torch.device("cuda"))
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

 # Select calibration dataset.
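
This swaps device_map="auto" for explicit CPU offloading: weights live on CPU and are moved to the GPU only while each module executes. force_cpu_offload comes from compressed_tensors; a minimal sketch of roughly equivalent behavior using accelerate's public cpu_offload helper (an assumption about force_cpu_offload's semantics, not a verified drop-in substitute; the model ID is taken from the example's context):

# Rough functional analogue using accelerate's public API (assumed behavior)
import torch
from accelerate import cpu_offload
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

# attach align-device hooks: parameters stay on CPU and are copied to the
# CUDA execution device only for the duration of each module's forward pass
cpu_offload(model, execution_device=torch.device("cuda"))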

src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
+        # TODO: warn about cpu offloading
+
         model_device = get_execution_device(model)

         LifecycleCallbacks.calibration_epoch_start()
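
One way the TODO above might be resolved, sketched with helpers that already appear elsewhere in this commit (has_offloaded_params from compressed_tensors, the loguru logger); this is hypothetical, not part of the commit:

from compressed_tensors import has_offloaded_params
from loguru import logger

def _warn_if_offloaded(model):
    # the basic pipeline onloads/offloads every module on every forward pass,
    # so dispatched (offloaded) models pay a large memory-movement cost
    if any(has_offloaded_params(module) for module in model.modules()):
        logger.warning(
            "Model has offloaded parameters; the basic pipeline will repeatedly "
            "move weights between CPU and GPU. Consider the sequential pipeline."
        )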

src/llmcompressor/pipelines/registry.py

Lines changed: 2 additions & 2 deletions
@@ -75,12 +75,12 @@ def _validate_infer_pipeline(modifiers: List[Modifier]) -> str:
         quant_modifier = active_qmods[0]
         config = quant_modifier.resolve_quantization_config()
         if config.requires_calibration_data():
-            return "basic"
+            return "sequential"
         else:
             return "datafree"

     if any(isinstance(modifier, SmoothQuantModifier) for modifier in modifiers):
-        return "basic"
+        return "sequential"

     return "datafree"
src/llmcompressor/pipelines/sequential/helpers.py

Lines changed: 34 additions & 1 deletion
@@ -5,6 +5,7 @@
 from typing import Any, Dict, List, Optional, Set

 import torch
+from accelerate.hooks import AlignDevicesHook
 from compressed_tensors import has_offloaded_params
 from compressed_tensors.quantization import find_name_or_class_matches
 from loguru import logger
@@ -23,7 +24,12 @@

 from .ast_helpers import autowrap_forwards

-__all__ = ["trace_subgraphs", "Subgraph", "get_targets_from_modifiers"]
+__all__ = [
+    "trace_subgraphs",
+    "Subgraph",
+    "get_targets_from_modifiers",
+    "keep_onload_context",
+]


 @dataclass
@@ -485,3 +491,30 @@ def is_ancestor(module: Module) -> bool:

     is_ancestor(model)
     return ancestors
+
+
+@contextlib.contextmanager
+def keep_onload_context():
+    original_pre_forward = AlignDevicesHook.pre_forward
+    onloaded_modules = dict()
+
+    # onload once and disable any future onloading/offloading steps
+    def keep_onload_pre_forward(self: AlignDevicesHook, module, *args, **kwargs):
+        ret = original_pre_forward(self, module, *args, **kwargs)
+        if module not in onloaded_modules:
+            onloaded_modules[module] = (self, self.offload)
+            self.offload = False
+        return ret
+
+    # use the patched pre_forward function within the context
+    with patch_attr(AlignDevicesHook, "pre_forward", keep_onload_pre_forward):
+        yield
+
+    # manually offload all modules that were onloaded
+    for module, (hook, offload) in onloaded_modules.items():
+        hook.offload = offload
+        hook.post_forward(module, None)
+
+
+# def is_cpu_offloaded():
+#
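
keep_onload_context leans on patch_attr, whose import sits outside this hunk. A minimal sketch of the semantics the helper needs (the real utility also handles attributes that do not yet exist on the object; this version assumes the attribute is present):

import contextlib

@contextlib.contextmanager
def patch_attr(obj, attr_name, new_value):
    # temporarily replace obj.attr_name, restoring the original on exit
    original = getattr(obj, attr_name)
    setattr(obj, attr_name, new_value)
    try:
        yield
    finally:
        setattr(obj, attr_name, original)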

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 25 additions & 20 deletions
@@ -1,16 +1,17 @@
 from typing import TYPE_CHECKING

 import torch
-import tqdm
 from compressed_tensors.utils import get_execution_device
 from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm

 from llmcompressor.core import LifecycleCallbacks, active_session
 from llmcompressor.modifiers.utils.hooks import HooksMixin
 from llmcompressor.pipelines.cache import IntermediatesCache
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pipelines.sequential.helpers import (
     get_targets_from_modifiers,
+    keep_onload_context,
     trace_subgraphs,
 )
 from llmcompressor.utils.helpers import DisableQuantization, calibration_forward_context
@@ -51,6 +52,8 @@ def __call__(
         """
         session = active_session()

+        # TODO: warn about not cpu offloading
+
         # prepare to trace subgraphs
         modifiers = session.get_modifiers()
         sequential_targets = get_targets_from_modifiers(modifiers, model)
@@ -59,37 +62,39 @@ def __call__(
         # trace subgraphs
         sample_input = next(iter(dataloader))
         subgraphs = trace_subgraphs(model, sample_input, sequential_targets, ignore)
+        num_subgraphs = len(subgraphs)

         LifecycleCallbacks.calibration_epoch_start()

         with calibration_forward_context(model), DisableQuantization(model):
             # prepare intermediates cache
             model_device = get_execution_device(model)
-            intermediates = IntermediatesCache.from_dataloader(dataloader, model_device)
+            activations = IntermediatesCache.from_dataloader(dataloader, model_device)

-            num_subgraphs = len(subgraphs)
             for subgraph_index, subgraph in enumerate(subgraphs):
                 # prepare tqdm description texts
                 calib_desc = f"({subgraph_index + 1}/{num_subgraphs}): Calibrating"
                 prop_desc = f"({subgraph_index + 1}/{num_subgraphs}): Propagating"

-                # do a preliminary pass to trigger modifier hooks
-                for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=calib_desc):
-                    inputs = intermediates.fetch(batch_idx, subgraph.input_names)
-                    subgraph.forward(model, **inputs)
-
-                LifecycleCallbacks.sequential_epoch_end()
-
-                # this pass does not trigger modifier hooks
-                # and is only used for capturing outputs from newly compressed modules
-                with HooksMixin.disable_hooks():
-                    for batch_idx in tqdm.tqdm(range(len(dataloader)), desc=prop_desc):
-                        inputs = intermediates.fetch(batch_idx, subgraph.input_names)
-                        output = subgraph.forward(model, **inputs)
-
-                        if subgraph_index < num_subgraphs - 1:
-                            intermediates.update(batch_idx, output)
-                            intermediates.delete(batch_idx, subgraph.consumed_names)
+                # reduce memory movement by keeping modules onloaded
+                with keep_onload_context():
+                    # do a preliminary pass to trigger modifier hooks
+                    for batch_idx in tqdm(range(len(dataloader)), desc=calib_desc):
+                        inputs = activations.fetch(batch_idx, subgraph.input_names)
+                        subgraph.forward(model, **inputs)
+
+                    LifecycleCallbacks.sequential_epoch_end()
+
+                    # this pass does not trigger modifier hooks
+                    # and is only used for capturing outputs of newly compressed modules
+                    with HooksMixin.disable_hooks():
+                        for batch_idx in tqdm(range(len(dataloader)), desc=prop_desc):
+                            inputs = activations.fetch(batch_idx, subgraph.input_names)
+                            output = subgraph.forward(model, **inputs)
+
+                            if subgraph_index < num_subgraphs - 1:
+                                activations.update(batch_idx, output)
+                                activations.delete(batch_idx, subgraph.consumed_names)

         # redundant, finish any remaining compression
         LifecycleCallbacks.calibration_epoch_end()
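
Taken together, wrapping both passes over a subgraph in keep_onload_context means each module's weights are onloaded once per subgraph rather than once per batch. A hypothetical standalone demonstration of the effect (assumes a CUDA device and uses accelerate's cpu_offload to attach the AlignDevicesHook that the context patches):

import torch
from accelerate import cpu_offload
from llmcompressor.pipelines.sequential.helpers import keep_onload_context

module = torch.nn.Linear(8, 8)
cpu_offload(module, execution_device=torch.device("cuda"))

x = torch.randn(1, 8, device="cuda")
with keep_onload_context():
    module(x)                    # first call onloads the weights
    print(module.weight.device)  # cuda: offloading is disabled inside the context
    module(x)                    # no cpu<->gpu weight transfer on repeat calls
print(module.weight.device)     # offloaded again on exit (shows meta; the CPU
                                 # copy lives in the hook's weights_map)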
