@@ -799,23 +799,18 @@ def __init__(
     def forward(self, x1, x2, v, _hyena_use_cp=True):
         """Shape specification for inputs and outputs.
 
-        Input shapes: bs, seq_length, (num_groups, group_size)
-        Output shapes: bs, seq_length, num_groups, group_size
+        Input shapes: bs, (num_groups, group_size), seq_length
+        Output shapes: bs, (num_groups, group_size), seq_length
         """
-        B, L, G, DG = x1.shape
+        B, GDG, L = x1.shape
+        x1, x2, v = x1[..., :L], x2[..., :L], v[..., :L]
 
         # CP control
         if _hyena_use_cp:
             cp_group = get_context_parallel_group()
         else:
             cp_group = None
 
-        x1 = rearrange(x1, "b l g dg -> b (g dg) l", g=self.num_groups, dg=self.group_dim)
-        x2 = rearrange(x2, "b l g dg -> b (g dg) l", g=self.num_groups, dg=self.group_dim)
-        v = rearrange(v, "b l g dg -> b (g dg) l", g=self.num_groups, dg=self.group_dim)
-
-        x1, x2, v = x1[..., :L], x2[..., :L], v[..., :L]
-
         # The kernel length must be adjusted in CP settings
         _L_kernel = L if cp_group is None else L * len(torch.distributed.get_process_group_ranks(cp_group))
         if self.use_medium_hyena:
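The context lines above keep the existing context-parallel kernel-length adjustment unchanged. As a rough illustration only (the local length and world size below are arbitrary assumptions, not values from this diff), the adjustment works out as:

# Hedged illustration of the unchanged CP kernel-length logic above: each rank sees
# L local tokens of a sequence spanning L * cp_world_size tokens across the
# context-parallel group, so the filter length is scaled up accordingly.
L = 2048           # local sequence length on this rank (assumed value)
cp_world_size = 4  # len(torch.distributed.get_process_group_ranks(cp_group)), assumed
_L_kernel = L if cp_world_size == 1 else L * cp_world_size
print(_L_kernel)   # 8192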
@@ -869,7 +864,7 @@ def forward(self, x1, x2, v, _hyena_use_cp=True):
         if cp_group is not None and len(torch.distributed.get_process_group_ranks(cp_group)) > 1:
             z = AllToAllSingleFunction.apply(z, cp_group, "full_to_split", True)
             # [B, H, L // num_ranks]
-        return rearrange(z, "b d l -> b l d")
+        return z  # [B, (G, DG), L]
 
     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
         """Sharded state dictionary for the ParallelHyenaOperator."""
@@ -972,15 +967,10 @@ def __init__(
     def forward(self, x1, x2, v, _hyena_use_cp=True):
         """Shape specification for inputs and outputs.
 
-        Input shapes: bs, seq_length, (num_groups, group_size)
-        Output shapes: bs, seq_length, num_groups, group_size
+        Input shapes: bs, (num_groups, group_size), seq_length
+        Output shapes: bs, (num_groups, group_size), seq_length
         """
-        B, L, G, DG = x1.shape
-
-        x1 = rearrange(x1, "b l g dg -> b (g dg) l")
-        x2 = rearrange(x2, "b l g dg -> b (g dg) l")
-        v = rearrange(v, "b l g dg -> b (g dg) l")
-
+        B, GDG, L = x1.shape
 
         x1, x2, v = x1[..., :L], x2[..., :L], v[..., :L]
 
         z = x2 * v if self.pregate else v
@@ -993,7 +983,7 @@ def forward(self, x1, x2, v, _hyena_use_cp=True):
 
         z = x1 * z if self.postgate else z
 
-        return rearrange(z, "b d l -> b l d")
+        return z  # [B, (G, DG), L]
 
     def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
         """Sharded state dictionary for the ParallelShortHyenaOperator."""