Commit 9b0b02f

fix all_gather_into_tensor test and logic (#9332)
1 parent 0be0fbd commit 9b0b02f

2 files changed: +45 −14 lines

test/pjrt/test_collective_ops_tpu.py

Lines changed: 19 additions & 9 deletions
@@ -167,19 +167,25 @@ def callable(input):
       return input.cpu()

   @staticmethod
-  def _all_gather_into_tensor(use_dynamo: bool):
+  def _all_gather_into_tensor(use_dynamo: bool, mode: str):
     met.clear_all()

     def callable(output, input):
-      dist.all_gather_into_tensor(output_tensor, input, None)
-      return output_tensor
+      dist.all_gather_into_tensor(output, input, None)
+      return output

     dist.init_process_group("xla", init_method='xla://')
     device = torch_xla.device()
     input = torch.tensor([xr.global_ordinal()],
                          dtype=torch.float,
                          device=device)
-    output_tensor = torch.empty((1, xr.world_size()), device=device)
+    if mode == "stack":
+      output_tensor = torch.empty((xr.world_size(), 1), device=device)
+    elif mode == "concat":
+      output_tensor = torch.empty((xr.world_size(),), device=device)
+    else:
+      raise ValueError(f"mode must be either 'stack' or 'concat'")
+
     f = torch.compile(callable, backend='openxla') if use_dynamo else callable
     f(output_tensor, input)
     torch_xla.sync()

@@ -278,13 +284,17 @@ def test_all_reduce(self, use_dynamo):
     for index, val in results.items():
       torch.testing.assert_close(val, expected)

-  @parameterized.named_parameters(('dynamo', True), ('nondynamo', False))
-  def test_all_gather_into_tensor(self, use_dynamo):
+  @parameterized.product(dynamo=[True, False], mode=["stack", "concat"])
+  def test_all_gather_into_tensor(self, dynamo, mode):
+    if dynamo and mode == "stack":
+      self.skipTest("https://github.com/pytorch/pytorch/issues/155632")
     results = pjrt.run_multiprocess(
-        self._all_gather_into_tensor, use_dynamo=use_dynamo)
+        self._all_gather_into_tensor, use_dynamo=dynamo, mode=mode)
     expected = torch.arange(
-        tpu.num_expected_global_devices(), dtype=torch.float).unsqueeze(0)
-    for index, val in results.items():
+        tpu.num_expected_global_devices(), dtype=torch.float)
+    if mode == "stack":
+      expected = expected.unsqueeze(1)
+    for _, val in results.items():
       torch.testing.assert_close(val, expected)

   @parameterized.named_parameters(('dynamo', True), ('nondynamo', False))

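For context on what the parameterized test now checks: dist.all_gather_into_tensor accepts an output tensor that is either the concatenation of all per-rank inputs along dim 0 or a stack of them along a new dim 0. Below is a minimal sketch, not part of the commit, that exercises both layouts; it assumes an already-initialized process group, and the helper name gather_both_layouts is illustrative.

import torch
import torch.distributed as dist

def gather_both_layouts(rank: int, world_size: int, device: torch.device):
  # Each rank contributes a single scalar wrapped in a 1-element tensor,
  # mirroring the test's per-rank input of shape (1,).
  inp = torch.tensor([float(rank)], device=device)

  # "concat" layout: inputs concatenated along dim 0 -> shape (world_size,).
  concat_out = torch.empty((world_size,), device=device)
  dist.all_gather_into_tensor(concat_out, inp)

  # "stack" layout: inputs stacked along a new leading dim -> (world_size, 1).
  stack_out = torch.empty((world_size, 1), device=device)
  dist.all_gather_into_tensor(stack_out, inp)

  return concat_out, stack_out

With four ranks, concat_out ends up as tensor([0., 1., 2., 3.]) and stack_out as tensor([[0.], [1.], [2.], [3.]]), which matches the test's expected value of torch.arange over the device count, with the extra unsqueeze(1) only in the "stack" case.
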
torch_xla/distributed/xla_backend.py

Lines changed: 26 additions & 5 deletions
@@ -5,7 +5,7 @@
 from torch_xla._internal import rendezvous
 import logging
 import os
-from torch._C._distributed_c10d import ProcessGroup
+from torch._C._distributed_c10d import ProcessGroup, AllgatherOptions


 def _create_xla_process_group(prefix_store, rank, size, timeout):

@@ -81,16 +81,37 @@ def allreduce(self, tensors, all_reduce_options):
     xm.all_reduce(reduce_type, tensors, groups=self._mesh, pin_layout=False)
     return _ret_work(tensors)

-  # method for dist.all_gather_into_tensor under eager mode.
-  def _allgather_base(self, output_tensor, input_tensor, opts):
-    return self.allgather(output_tensor, input_tensor, opts)
+  # This method is called for dist.all_gather_into_tensor under eager mode.
+  # https://docs.pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_into_tensor
+  def _allgather_base(self, output_tensor: torch.Tensor,
+                      input_tensor: torch.Tensor, opts: AllgatherOptions):
+    is_scalar = (input_tensor.dim() == 0)
+    if is_scalar:
+      input_tensor = torch.reshape(input_tensor, (1,))
+
+    result = xm.all_gather(
+        input_tensor, dim=0, groups=self._mesh, pin_layout=False)
+
+    if result.shape == output_tensor.shape:
+      output_tensor.copy_(result, non_blocking=True)
+      return _ret_work([output_tensor])
+
+    stacked_result = torch.stack(
+        torch.split(result, input_tensor.shape[0], dim=0), dim=0)
+    if stacked_result.shape == output_tensor.shape:
+      output_tensor.copy_(stacked_result, non_blocking=True)
+      return _ret_work([output_tensor])
+
+    msg = f"Input shape {input_tensor.shape} and output shape {output_tensor.shape} are not compatible for all_gather_into_tensor. Input must be stacked or concatenated to create output."
+    raise ValueError(msg)

   def allgather(self, output_tensors_list, input_tensors, opts=None):
     for input_tensor, output_tensors in zip(input_tensors, output_tensors_list):
       is_scalar = (input_tensor.dim() == 0)
       if is_scalar:
         input_tensor = torch.reshape(input_tensor, (1,))
-      result = xm.all_gather(input_tensor, groups=self._mesh, pin_layout=False)
+      result = xm.all_gather(
+          input_tensor, dim=0, groups=self._mesh, pin_layout=False)
       for i, slice in enumerate(torch.split(result, input_tensor.shape[0])):
         with torch.no_grad():
           output_tensors[i].copy_(

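To make the new shape dispatch in _allgather_base easier to follow, here is a standalone sketch, not from the commit, of the same decision with the collective replaced by a precomputed gathered tensor so it runs without a process group; the function and parameter names are illustrative.

import torch

def fill_allgather_output(gathered: torch.Tensor, per_rank_shape: torch.Size,
                          output: torch.Tensor) -> torch.Tensor:
  # Case 1: the caller wants the concatenated layout; shapes already match.
  if gathered.shape == output.shape:
    output.copy_(gathered)
    return output

  # Case 2: the caller wants the stacked layout; split the flat gather result
  # back into per-rank chunks and stack them along a new leading dimension.
  stacked = torch.stack(torch.split(gathered, per_rank_shape[0], dim=0), dim=0)
  if stacked.shape == output.shape:
    output.copy_(stacked)
    return output

  # Neither layout fits: mirror the backend's error for incompatible shapes.
  raise ValueError(
      f"Gathered shape {gathered.shape} cannot fill output shape "
      f"{output.shape}; output must be the concatenation or the stack of the "
      f"per-rank inputs.")

For example, with four ranks each contributing a tensor of shape (1,), gathered has shape (4,): an output of shape (4,) is filled directly (the concatenated layout), while an output of shape (4, 1) goes through the split-and-stack path, which are exactly the two cases the updated test covers.
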