
Commit e4df499

Encapsulate Mesh invariants
1 parent a3ef52e commit e4df499

2 files changed: +101, -15 lines


Diff for: test/spmd/test_xla_sharding.py

+81, -2
@@ -1,9 +1,10 @@
 import copy
 
-import unittest
-from unittest.mock import patch
+from collections import OrderedDict
 import math
 import numpy as np
+import unittest
+from unittest.mock import patch
 import sys
 
 import torch
@@ -1565,6 +1566,84 @@ def test_mark_sharding_with_gradients_annotation(self):
     # Check that the gradient has sharding.
     self.assertIn(sharding_spec, x_grad_sharding)
 
+  def test_valid_mesh_creation(self):
+    mesh_shape = (1, self.n_devices)
+    axis_names = ('data', 'model')
+    mesh = xs.Mesh(self.device_ids, mesh_shape, axis_names)
+
+    self.assertEqual(mesh.device_ids.tolist(), list(range(self.n_devices)))
+    self.assertEqual(mesh.mesh_shape, mesh_shape)
+    self.assertEqual(mesh.axis_names, axis_names)
+
+  def test_valid_mesh_without_axis_names(self):
+    mesh_shape = (1, self.n_devices)
+    mesh = xs.Mesh(self.device_ids, mesh_shape)
+
+    self.assertEqual(mesh.device_ids.tolist(), list(range(self.n_devices)))
+    self.assertEqual(mesh.mesh_shape, mesh_shape)
+    self.assertIsNone(mesh.axis_names)
+
+  def test_invalid_axis_names_length(self):
+    mesh_shape = (1, self.n_devices)
+    axis_names = ('data', 'model', 'extra')
+
+    with self.assertRaisesRegex(
+        AssertionError, "Number of axis names .* must match mesh dimensions"):
+      xs.Mesh(self.device_ids, mesh_shape, axis_names)
+
+  def test_duplicate_axis_names(self):
+    mesh_shape = (1, self.n_devices)
+    axis_names = ('data', 'data')
+
+    with self.assertRaisesRegex(AssertionError, "Axis names must be unique"):
+      xs.Mesh(self.device_ids, mesh_shape, axis_names)
+
+  def test_invalid_device_count(self):
+    mesh_shape = (2, self.n_devices)
+
+    with self.assertRaisesRegex(AssertionError,
+                                "Number of device IDs .* must match mesh size"):
+      xs.Mesh(self.device_ids, mesh_shape)
+
+  @unittest.skipIf(xr.global_runtime_device_count() == 1,
+                   "Multiple devices needed for duplicated device IDs")
+  def test_duplicate_device_ids(self):
+    mesh_shape = (1, self.n_devices)
+    duplicate_ids = np.array([0] * self.n_devices)
+
+    with self.assertRaisesRegex(AssertionError, "Device IDs must be unique"):
+      xs.Mesh(duplicate_ids, mesh_shape)
+
+  def test_device_ids_out_of_bounds(self):
+    mesh_shape = (1, self.n_devices)
+    invalid_ids = np.array([self.n_devices + 1] * self.n_devices)
+
+    with self.assertRaisesRegex(AssertionError,
+                                "Device IDs must be less than mesh size"):
+      xs.Mesh(invalid_ids, mesh_shape)
+
+  def test_mesh_size(self):
+    mesh_shape = (1, self.n_devices)
+    mesh = xs.Mesh(self.device_ids, mesh_shape)
+    self.assertEqual(mesh.size(), self.n_devices)
+
+  def test_mesh_shape_method(self):
+    mesh_shape = (1, self.n_devices)
+    axis_names = ('data', 'model')
+    mesh = xs.Mesh(self.device_ids, mesh_shape, axis_names)
+
+    expected_shape = OrderedDict([('data', 1), ('model', self.n_devices)])
+    self.assertEqual(mesh.shape(), expected_shape)
+
+  def test_get_logical_mesh(self):
+    mesh_shape = (2, 2)
+    device_ids = np.array([0, 1, 2, 3])
+    mesh = xs.Mesh(device_ids, mesh_shape)
+
+    expected_logical_mesh = np.array([[0, 1], [2, 3]])
+    np.testing.assert_array_equal(mesh.get_logical_mesh(),
+                                  expected_logical_mesh)
+
 
 if __name__ == '__main__':
   test = unittest.main()
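For context, the following is a small illustrative sketch (not part of the commit) of what the new constructor-level checks look like from a caller's point of view. The 4-device count and the duplicated axis names are assumptions chosen for the example; the tests above derive the mesh size from self.n_devices instead.

# Illustrative sketch only, not from this commit; assumes a runtime with 4 XLA devices.
import numpy as np
import torch_xla.distributed.spmd as xs

device_ids = np.arange(4)  # assumption: 4 devices, matching a (1, 4) mesh

# Duplicate axis names are now rejected in Mesh.__init__ itself,
# which is what test_duplicate_axis_names exercises.
try:
  xs.Mesh(device_ids, (1, 4), ('data', 'data'))
except AssertionError as e:
  print(e)  # e.g. "Axis names must be unique, got: ('data', 'data')"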

Diff for: torch_xla/distributed/spmd/xla_sharding.py

+20, -13
@@ -69,14 +69,25 @@ def __init__(self,
                axis_names: Optional[tuple[str, ...]] = None):
     if not isinstance(device_ids, np.ndarray):
       device_ids = np.array(device_ids)
-    assert (axis_names is None) or (len(mesh_shape) == len(axis_names))
-    assert axis_names is None or (len(set(axis_names)) == len(axis_names))
-    assert (len(device_ids) == np.prod(mesh_shape))
-    assert len(device_ids) == len(np.unique(device_ids))
+    assert len(device_ids) > 0, "This requires XLA supported device(s)."
+
+    if axis_names is not None:
+      assert len(mesh_shape) == len(axis_names), \
+          f"Number of axis names ({len(axis_names)}) must match mesh dimensions ({len(mesh_shape)})"
+      assert len(set(axis_names)) == len(axis_names), \
+          f"Axis names must be unique, got: {axis_names}"
+
+    expected_devices = np.prod(mesh_shape)
+    assert len(device_ids) == expected_devices, \
+        f"Number of device IDs ({len(device_ids)}) must match mesh size ({expected_devices})"
+    assert len(device_ids) == len(np.unique(device_ids)), \
+        f"Device IDs must be unique, got: {device_ids}"
+
     self.device_ids = device_ids
     self.mesh_shape = mesh_shape
     self.axis_names = axis_names
-    assert all(d < self.size() for d in device_ids)
+    assert all(d < self.size() for d in device_ids), \
+        f"Device IDs must be less than mesh size ({self.size()}), got: {device_ids}"
 
   def size(self):
     return np.prod(self.mesh_shape)
@@ -555,16 +566,14 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
     >>> linear = nn.Linear(32, 10).to(xm.xla_device())
     >>> xs.mark_sharding(linear.weight, mesh, (None, 1)) # 2-way model parallel
   """
-  num_devices = xr.global_runtime_device_count()
-  assert num_devices > 0, "This requires XLA supported device(s)."
-  assert mesh.size() == num_devices, \
-    f"{mesh.mesh_shape} is not mappable over {num_devices} devices."
   # We only allow fully specified `partition_spec` to be applicable, as opposed
   # to filling in the unspecified replicated dims. Fully specified `partiion_spec`
   # should be of the same rank as `t`. This is to support partial replication
   # where the group assignment may vary with different input ranks.
   assert len(t.shape) == len(partition_spec), \
     f"Partition spec length ({len(partition_spec)}) should be equal to the input rank ({len(t.shape)})."
+  assert len(partition_spec) == mesh.size(), \
+    f"Partition spec length ({len(partition_spec)}) should be equal to the mesh size ({mesh.size()})."
 
   op_sharding = mesh.get_op_sharding(partition_spec)
   annotate_func = torch_xla._XLAC._xla_mark_sharding
@@ -603,16 +612,14 @@ def mark_sharding_with_gradients(
 
   This version can also be used in AOTAutograd.
   """
-  num_devices = xr.global_runtime_device_count()
-  assert num_devices > 0, "This requires XLA supported device(s)."
-  assert mesh.size() == num_devices, \
-    f"{mesh.mesh_shape} is not mappable over {num_devices} devices."
   # We only allow fully specified `partition_spec` to be applicable, as opposed
   # to filling in the unspecified replicated dims. Fully specified `partiion_spec`
   # should be of the same rank as `t`. This is to support partial replication
   # where the group assignment may vary with different input ranks.
   assert len(t.shape) == len(partition_spec), \
     f"Partition spec length ({len(partition_spec)}) should be equal to the input rank ({len(t.shape)})."
+  assert len(partition_spec) == mesh.size(), \
+    f"Partition spec length ({len(partition_spec)}) should be equal to the mesh size ({mesh.size()})."
 
   return MarkShardingFunction.apply(t, mesh, partition_spec)
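As a usage-level illustration (not part of the commit): with the checks moved into Mesh.__init__, an ill-formed mesh now fails at construction time with a descriptive message rather than later inside mark_sharding. The mesh shape below is an assumption chosen to trip the new device-count check.

# Illustrative sketch only, not from this commit.
import numpy as np
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs

num_devices = xr.global_runtime_device_count()
device_ids = np.arange(num_devices)

# A (2, num_devices) mesh would need 2 * num_devices device IDs, so this
# fails immediately with "Number of device IDs ... must match mesh size"
# instead of surfacing when the mesh is first used for sharding.
try:
  xs.Mesh(device_ids, (2, num_devices))
except AssertionError as e:
  print(e)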
