     infer_initial_time_step,
     sample_decode,
 )
-from axlearn.common.embedding import TransformerTextEmbeddings
+from axlearn.common.embedding import BaseEmbedding, TransformerTextEmbeddings
 from axlearn.common.layers import Dropout, LayerNorm, set_dropout_rate_recursively
 from axlearn.common.logit_modifiers import LogitsToLogitsFn
 from axlearn.common.module import (
@@ -446,7 +446,7 @@ class Config(BaseLayer.Config):
         # explicitly.
         dropout_rate: float = 0.0
         # Vector from input ids table.
-        emb: TransformerTextEmbeddings.Config = TransformerTextEmbeddings.default_config()
+        emb: BaseEmbedding.Config = TransformerTextEmbeddings.default_config()
         # Transformer model trunk.
         transformer: BaseStackedTransformerLayer.Config = StackedTransformerLayer.default_config()
         # Layer norm applied to transformer output.
@@ -519,25 +519,25 @@ def _forward_for_mode(
         emb_batch = {**input_batch}
         emb_batch["inputs"] = emb_batch["input_ids"]

-        x = self.emb(input_batch=emb_batch)
-
         if mode == ForwardMode.FORWARD:
-            transformer_state, x = (
-                None,
-                self.transformer(
-                    x,
-                    self_attention_logit_biases=self_attention_logit_biases,
-                    target_segment_ids=input_segment_ids,
-                    target_positions=positions,
-                    cross_attention_data=cross_attention_data,
-                    cross_attention_logit_biases=cross_attention_logit_biases,
-                ),
+            x = self.emb(input_batch=emb_batch)
+            x = self.transformer(
+                x,
+                self_attention_logit_biases=self_attention_logit_biases,
+                target_segment_ids=input_segment_ids,
+                target_positions=positions,
+                cross_attention_data=cross_attention_data,
+                cross_attention_logit_biases=cross_attention_logit_biases,
             )
+            cached_states = None
         elif mode == ForwardMode.INIT_STATES:
             assert cached_states is not None
             if input_segment_ids is not None:
                 raise ValueError("input_segment_ids is not supported in INIT_STATES.")
-            transformer_state, x = self.transformer.init_states(
+            cached_states["emb"], x = self.emb.extend_step(
+                cached_states=cached_states["emb"], input_batch=emb_batch
+            )
+            cached_states["transformer_state"], x = self.transformer.init_states(
                 time_step=cached_states["transformer_state"],
                 data=x,
                 self_attention_logit_biases=self_attention_logit_biases,
@@ -548,7 +548,10 @@ def _forward_for_mode(
             assert cached_states is not None
             if input_segment_ids is not None:
                 raise ValueError("input_segment_ids is not supported in EXTEND_STEP.")
-            transformer_state, x = self.transformer.extend_step(
+            cached_states["emb"], x = self.emb.extend_step(
+                cached_states=cached_states["emb"], input_batch=emb_batch
+            )
+            cached_states["transformer_state"], x = self.transformer.extend_step(
                 cached_states=cached_states["transformer_state"],
                 data=x,
                 self_attention_logit_biases=self_attention_logit_biases,
@@ -588,7 +591,7 @@ def _forward_for_mode(
             logits = self._output_logits_modifier(logits)
         logits = with_sharding_constraint(logits, PartitionSpec(*self.config.logits_partition_spec))
         # TODO(markblee): Rename to just "transformer". "transformer_state" is a bit redundant.
-        return dict(transformer_state=transformer_state), dict(logits=logits, hidden_states=x)
+        return cached_states, dict(logits=logits, hidden_states=x)

     def forward(
         self,
@@ -647,12 +650,14 @@ def init_states(
     ) -> NestedTensor:
         """See `BaseDecoder.init_states` for details."""
         cfg: Decoder.Config = self.config
-        init_state, _ = self.transformer.init_states(
+        emb = self.emb.init_states(batch_size=batch_size, dtype=dtype)
+        transformer_state, _ = self.transformer.init_states(
             time_step=None,
             data=TensorSpec([batch_size, max_sequence_length, cfg.dim], dtype=dtype),
         )
         return dict(
-            transformer_state=init_state,
+            emb=emb,
+            transformer_state=transformer_state,
             input_ids=jnp.full(
                 (batch_size, max_sequence_length), cfg.pad_token_id, dtype=jnp.int32
             ),
@@ -677,13 +682,14 @@ def prefill_states(
         See `BaseDecoder.prefill_states` for details.
         """
         validate_contains_paths(input_batch, paths=["input_ids"])
-        input_ids = input_batch["input_ids"]
+        input_ids: Tensor = input_batch["input_ids"]
         input_segment_ids = input_batch.get("input_segment_ids", None)
         positions = input_batch.get("positions", None)

+        emb = self.emb.init_states(batch_size=input_ids.shape[0], dtype=self.dtype())
         states, outputs = self._forward_for_mode(
             mode=ForwardMode.INIT_STATES,
-            cached_states=dict(transformer_state=time_step),
+            cached_states=dict(emb=emb, transformer_state=time_step),
             input_batch=input_batch,
             # TODO(markblee): Consider supporting packed inputs for more efficient prefilling.
             self_attention_logit_biases=self.compute_attention_logit_biases(
@@ -748,14 +754,13 @@ def extend_step(
             cached_states=cached_states,
             **kwargs,
         )
-        updated_states = dict(
+        updated_states.update(
             input_ids=updated_inputs,
             # There are some non-greedy DFS/BFS and sliding attention algorithms that
             # recursively search through potentials.
             # They backtrace to some anchor time step after exploring for t steps.
             # This requires tracking time_step separately from the attention time_step.
             time_step=cached_states["time_step"] + 1,
-            **updated_states,
         )
         return updated_states, outputs

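Taken together, these changes move the embedding layer's decode cache into the same nested `cached_states` dict that already carries the transformer cache, so prefill and per-token decoding thread a single pytree. A minimal, self-contained sketch of that threading pattern, using hypothetical `prefill`/`step` helpers and a toy state layout rather than the real `Decoder.prefill_states` / `Decoder.extend_step`:

```python
import jax.numpy as jnp

MAX_LEN = 8  # toy maximum sequence length


def prefill(prompt_ids: jnp.ndarray) -> dict:
    """Hypothetical stand-in for prefill_states: builds the nested decode state."""
    batch, prompt_len = prompt_ids.shape
    time_step = jnp.full((batch,), prompt_len, dtype=jnp.int32)
    return dict(
        # New in this change: the embedding layer keeps its own cache entry.
        emb=dict(time_step=time_step),
        transformer_state=dict(time_step=time_step),
        input_ids=jnp.pad(prompt_ids, ((0, 0), (0, MAX_LEN - prompt_len))),
        time_step=time_step,
    )


def step(cached_states: dict, token_ids: jnp.ndarray) -> dict:
    """Hypothetical stand-in for extend_step: each sub-layer updates its own entry."""
    updated = dict(cached_states)
    updated["emb"] = dict(time_step=cached_states["emb"]["time_step"] + 1)
    updated["transformer_state"] = dict(
        time_step=cached_states["transformer_state"]["time_step"] + 1
    )
    # Mirror `updated_states.update(...)`: append the new ids, advance time_step.
    t = cached_states["time_step"]
    batch = jnp.arange(t.shape[0])
    updated["input_ids"] = cached_states["input_ids"].at[batch, t].set(token_ids)
    updated["time_step"] = t + 1
    return updated


states = prefill(jnp.array([[5, 7, 9]], dtype=jnp.int32))  # prompt of length 3
states = step(states, jnp.array([11], dtype=jnp.int32))    # decode one more token
```

Because `_forward_for_mode` now returns the same (updated) dict it was given, adding the `emb` entry does not change how callers drive the decode loop; they simply keep passing the whole dict back in.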