Commit 0be0fbd

Improve assume_pure SPMD functionality (#9360)
1 parent 55a7540 commit 0be0fbd

File tree: 4 files changed, +123 -12 lines changed

test/test_assume_pure_spmd.py

Lines changed: 79 additions & 1 deletion
```diff
@@ -1,13 +1,26 @@
+from copy import copy
 import os
 import sys
 import unittest
 
 import numpy as np
 import torch
+import torch.nn as nn
 import torch_xla
 import torch_xla.runtime as xr
-from torch_xla.experimental.assume_pure import assume_pure
+from torch_xla.experimental.assume_pure import PureModule, assume_pure
 from torch_xla.distributed.spmd import mark_sharding, mark_sharding_with_gradients, set_global_mesh, get_1d_mesh, Mesh
+from torch_xla.distributed.spmd.xla_sharding import apply_xla_patch_to_nn_linear
+
+
+def get_2d_mesh(name1: str, name2: str):
+  num_devices = xr.global_runtime_device_count()
+  dim1_size = 2
+  assert num_devices % 2 == 0
+  dim2_size = num_devices // dim1_size
+  devices = np.arange(xr.global_runtime_device_count())
+  mesh_shape = (dim1_size, dim2_size)
+  return Mesh(devices, mesh_shape=mesh_shape, axis_names=(name1, name2))
 
 
 class AssumePureSpmdTest(unittest.TestCase):
@@ -56,6 +69,44 @@ def test_assume_pure_works_with_mark_sharding_with_gradients(self):
     self.assertIn(f'devices=[{N}',
                   torch_xla._XLAC._get_xla_sharding_spec(x.grad))
 
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipIf(
+      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
+      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
+  )
+  def test_assume_pure_works_with_mark_sharding_nested(self):
+    mesh = get_2d_mesh("model", "batch")
+    set_global_mesh(mesh)
+    x = torch.randn((8, 4, 5, 128), device='xla')
+    result = assume_pure(mark_sharding)(x, mesh,
+                                        (("model", "batch"), None, None, None))
+    torch_xla.sync(wait=True)
+    N = xr.global_runtime_device_count()
+    self.assertIn(f'devices=[{N}',
+                  torch_xla._XLAC._get_xla_sharding_spec(result))
+
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipIf(
+      torch.cuda.is_available() or os.environ.get('PJRT_DEVICE') == 'CUDA',
+      "TODO(https://github.com/pytorch/xla/issues/9017): Get these tests working on GPU"
+  )
+  def test_assume_pure_works_with_mark_sharding_with_gradients_nested(self):
+    mesh = get_2d_mesh("model", "batch")
+    set_global_mesh(mesh)
+    x = torch.randn((8, 4, 5, 128)).to('xla').requires_grad_(True)
+    result = assume_pure(mark_sharding_with_gradients)(
+        x, mesh, (("model", "batch"), None, None, None))
+    result.sum().backward()
+    torch_xla.sync(wait=True)
+    N = xr.global_runtime_device_count()
+    self.assertIn(f'devices=[{N}',
+                  torch_xla._XLAC._get_xla_sharding_spec(result))
+    assert x.grad is not None
+    self.assertIn(f'devices=[{N}',
+                  torch_xla._XLAC._get_xla_sharding_spec(x.grad))
+
   @unittest.skipUnless(xr.global_runtime_device_count() > 1,
                        "Multiple devices required")
   @unittest.skipIf(
@@ -94,6 +145,33 @@ def test_convert_to_jax_mesh_shuffled(self):
         np.array([dev['coords'] for dev in torch_xla_devices.flatten()]),
     )
 
+  @unittest.skipUnless(xr.global_runtime_device_count() > 1,
+                       "Multiple devices required")
+  @unittest.skipUnless(os.environ.get('PJRT_DEVICE') == 'TPU', "TPU only test")
+  def test_pure_module(self):
+    """Test tracing `nn.Linear` and `EinsumLinear` with `assume_pure`."""
+    for transform in [apply_xla_patch_to_nn_linear, lambda x: x]:
+      with torch_xla.device():
+        # Arrange
+        original = nn.Linear(4, 8)
+        replaced = PureModule(transform(copy(original)))
+        inputs = torch.ones((4,))
+        torch_xla.sync()
+
+        # Act
+        original_output = original(inputs)
+        original_output.sum().backward()
+        replaced_output = replaced(inputs)
+        replaced_output.sum().backward()
+        torch_xla.sync()
+
+        # Assert
+        torch.testing.assert_close(original_output, replaced_output)
+        torch.testing.assert_close(original.weight.grad,
+                                   replaced._module.weight.grad)
+        torch.testing.assert_close(original.bias.grad,
+                                   replaced._module.bias.grad)
+
 
 if __name__ == '__main__':
   test = unittest.main()
```
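For context, the nested partition specs exercised by the new tests shard a single tensor dimension across more than one mesh axis. Below is a minimal sketch of that usage outside the test harness; it assumes an even number of XLA devices, and the shapes and axis names simply mirror the tests above (nothing here is part of the commit itself).

```python
# Minimal sketch, assuming an even number of XLA devices (e.g. a TPU slice).
# Shapes and axis names mirror the tests above; this is illustrative only.
import numpy as np
import torch
import torch_xla
import torch_xla.runtime as xr
from torch_xla.distributed.spmd import Mesh, mark_sharding, set_global_mesh
from torch_xla.experimental.assume_pure import assume_pure

num_devices = xr.global_runtime_device_count()
mesh = Mesh(
    np.arange(num_devices),
    mesh_shape=(2, num_devices // 2),
    axis_names=('model', 'batch'))
set_global_mesh(mesh)

x = torch.randn((8, 4, 5, 128), device='xla')
# Dim 0 is sharded across both the 'model' and 'batch' axes; the rest replicate.
x = assume_pure(mark_sharding)(x, mesh, (('model', 'batch'), None, None, None))
torch_xla.sync(wait=True)
```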

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -644,9 +644,8 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   tx = maybe_get_torchax()
   if tx is not None and isinstance(t, tx.tensor.Tensor):
     from jax.sharding import PartitionSpec as P, NamedSharding
-    op_sharding = tuple(str(i) if i is not None else i for i in partition_spec)
     jmesh = mesh.get_jax_mesh()
-    t.shard_(NamedSharding(jmesh, P(*op_sharding)))
+    t.shard_(NamedSharding(jmesh, P(*partition_spec)))
     return t
 
   op_sharding = mesh.get_op_sharding(partition_spec)
@@ -986,8 +985,9 @@ def apply_xla_patch_to_nn_linear(module: torch.nn.Module):
   for name, child in module.named_children():
     if isinstance(child,
                   torch.nn.Linear) and not isinstance(child, EinsumLinear):
-      einsum_linear = EinsumLinear(
-          child.in_features, child.out_features, bias=child.bias is not None)
+      with torch.device('meta'):
+        einsum_linear = EinsumLinear(
+            child.in_features, child.out_features, bias=child.bias is not None)
       einsum_linear.load_state_dict(
           child.state_dict(), strict=True, assign=True)
       setattr(module, name, einsum_linear)
```
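A note on the `mark_sharding` change: `jax.sharding.PartitionSpec` already accepts a tuple of mesh axis names per tensor dimension, so stringifying each entry would mangle nested specs such as `('model', 'batch')`. A small sketch of the kind of spec that now passes through unchanged (axis names are illustrative):

```python
# Sketch only: PartitionSpec takes a tuple of mesh axis names per dimension,
# which is what forwarding partition_spec directly preserves.
from jax.sharding import PartitionSpec as P

spec = P(('model', 'batch'), None, None, None)  # dim 0 sharded over both axes
```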

torch_xla/experimental/assume_pure.py

Lines changed: 30 additions & 3 deletions
```diff
@@ -1,9 +1,9 @@
-from copy import copy
-from functools import wraps
+from functools import wraps, partial
 from typing import Dict
 
 import torch
-from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
+import torch.nn as nn
+from torch.utils._pytree import tree_flatten, tree_unflatten
 import torch_xla
 from torch_xla._internal.jax_workarounds import requires_jax
 import torch_xla.core.xla_builder as xb
@@ -48,6 +48,33 @@ def j2t_autograd(fn):
       fn, call_jax=lambda fn, *args: xb.call_jax(fn, args))
 
 
+class PureModule(nn.Module):
+  """Wraps a module whose forward pass is known to be free of side-effects and whose
+  behavior only depends on the inputs.
+
+  It behaves as if decorating the wrapped module's functionalized forward pass with `@assume_pure`.
+
+  This wrapper has a few advantages over the underlying module:
+  - `PureModule`s will only be traced once.
+  - Framework profile scopes added via `xp.Trace` will show up in both the forward
+    and the backward pass.
+  """
+
+  def __init__(self, module: nn.Module) -> None:
+    super().__init__()
+    self._module = module
+    self._pure_forward = assume_pure(partial(_pure_forward, self._module))
+
+  def forward(self, *args, **kwargs):
+    params = dict(self._module.named_parameters())
+    buffers = dict(self._module.named_buffers())
+    return self._pure_forward(params, buffers, args, kwargs)
+
+
+def _pure_forward(module, params, buffers, args, kwargs):
+  return torch.func.functional_call(module, (params, buffers), args, kwargs)
+
+
 def make_fake_inputs(input):
   """Creates a fake input for the given input torch tensor. If the input
   is not a tensor, it returns the input as is.
```
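For reference, a minimal usage sketch of the new `PureModule` wrapper on an XLA device; the layer and input sizes are illustrative and not taken from the commit.

```python
# Minimal sketch: wrap a side-effect-free module so its functionalized forward
# pass is traced through `assume_pure`. Sizes here are illustrative.
import torch
import torch.nn as nn
import torch_xla
from torch_xla.experimental.assume_pure import PureModule

with torch_xla.device():
  layer = PureModule(nn.Linear(128, 64))
  out = layer(torch.randn(8, 128))
  out.sum().backward()  # grads accumulate on layer._module.weight / .bias
torch_xla.sync()
```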

torchax/torchax/ops/jaten.py

Lines changed: 10 additions & 4 deletions
```diff
@@ -1346,7 +1346,7 @@ def reduce_fn(a, b):
 try:
 
   @op(torch.ops.xla.max_pool2d_forward)
-  def _xla_max_pool2d_foward(*args, **kwargs):
+  def _xla_max_pool2d_forward(*args, **kwargs):
     return _aten_max_pool2d_with_indices(*args, **kwargs)[0]
 
   @op(torch.ops.xla.aot_mark_sharding)
@@ -1357,11 +1357,17 @@ def _xla_aot_mark_sharding(t, mesh: str, partition_spec: str):
     pmesh = xs.Mesh.from_str(mesh)
     assert pmesh is not None
     partition_spec_eval = ast.literal_eval(partition_spec)
-    op_sharding = tuple(
-        str(i) if i is not None else i for i in partition_spec_eval)
     jmesh = pmesh.get_jax_mesh()
     return jax.lax.with_sharding_constraint(
-        t, NamedSharding(jmesh, P(*op_sharding)))
+        t, NamedSharding(jmesh, P(*partition_spec_eval)))
+
+  @op(torch.ops.xla.einsum_linear_forward)
+  def _xla_einsum_linear_forward(input, weight, bias):
+    with jax.named_scope('einsum_linear_forward'):
+      product = jax.numpy.einsum('...n,mn->...m', input, weight)
+      if bias is not None:
+        return product + bias
+      return product
 
 except AttributeError:
   pass
```
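The new `einsum_linear_forward` lowering uses the contraction `'...n,mn->...m'`, which is the usual linear-layer product `input @ weight.T`, matching `torch.nn.Linear` semantics. A quick sketch of that equivalence (array shapes are illustrative):

```python
# Sketch: '...n,mn->...m' contracts input features against weight rows,
# i.e. input @ weight.T, the same product nn.Linear computes.
import jax.numpy as jnp
import numpy as np

x = np.random.randn(8, 4).astype(np.float32)   # (..., in_features)
w = np.random.randn(16, 4).astype(np.float32)  # (out_features, in_features)
np.testing.assert_allclose(
    jnp.einsum('...n,mn->...m', x, w), x @ w.T, rtol=1e-5)
```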
