
Commit 96f0a64

Add out_sharding argument to call methods for standard layers
1 parent 5109e2c commit 96f0a64

3 files changed: +23 additions, -10 deletions

flax/nnx/nn/linear.py

Lines changed: 10 additions & 7 deletions
@@ -29,6 +29,7 @@
 from flax.nnx.nn import dtypes, initializers
 from flax.typing import (
   Dtype,
+  Optional,
   Shape,
   Initializer,
   PrecisionLike,
@@ -38,6 +39,7 @@
   LaxPadding,
   PromoteDtypeFn,
   EinsumT,
+  Sharding
 )
 
 Array = jax.Array
@@ -246,7 +248,7 @@ def bias_init_wrap(rng, shape, dtype):
     else:
       self.bias = nnx.data(None)
 
-  def __call__(self, inputs: Array) -> Array:
+  def __call__(self, inputs: Array, out_sharding: Optional[Sharding] = None) -> Array:
     """Applies a linear transformation to the inputs along multiple dimensions.
 
     Args:
@@ -288,7 +290,7 @@ def __call__(self, inputs: Array) -> Array:
     # user custom dot_general/dot_general_cls which may not have
     # preferred_element_type argument to avoid breaking
     # existing code
-    dot_general_kwargs = {}
+    dot_general_kwargs = {'out_sharding': out_sharding}
     if self.preferred_element_type is not None:
       dot_general_kwargs["preferred_element_type"] = self.preferred_element_type
     out = dot_general(
@@ -393,7 +395,7 @@ def __init__(
     self.promote_dtype = promote_dtype
     self.preferred_element_type = preferred_element_type
 
-  def __call__(self, inputs: Array) -> Array:
+  def __call__(self, inputs: Array, out_sharding: Optional[Sharding] = None) -> Array:
     """Applies a linear transformation to the inputs along the last dimension.
 
     Args:
@@ -412,7 +414,7 @@ def __call__(self, inputs: Array) -> Array:
     # user custom self.dot_general method which may not have
     # preferred_element_type argument to avoid breaking
     # existing code
-    dot_general_kwargs = {}
+    dot_general_kwargs = {'out_sharding': out_sharding}
     if self.preferred_element_type is not None:
       dot_general_kwargs["preferred_element_type"] = self.preferred_element_type
     y = self.dot_general(
@@ -521,7 +523,7 @@ def __init__(
     self.preferred_element_type = preferred_element_type
 
   def __call__(
-    self, inputs: Array, einsum_str: tp.Optional[str] = None
+    self, inputs: Array, einsum_str: tp.Optional[str] = None, out_sharding: Optional[Sharding] = None
   ) -> Array:
     """Applies a linear transformation to the inputs along the last dimension.
 
@@ -557,7 +559,7 @@ def __call__(
     # user custom self.einsum_op method which may not have
     # preferred_element_type argument to avoid breaking
     # existing code
-    einsum_op_kwargs = {}
+    einsum_op_kwargs = {'out_sharding': out_sharding}
     if self.preferred_element_type is not None:
       einsum_op_kwargs["preferred_element_type"] = self.preferred_element_type
 
@@ -1065,7 +1067,7 @@ def __init__(
     else:
       self.bias = nnx.data(None)
 
-  def __call__(self, inputs: Array) -> Array:
+  def __call__(self, inputs: Array, out_sharding: Optional[Sharding] = None) -> Array:
     """Applies a transposed convolution to the inputs.
 
     Behaviour mirrors of ``jax.lax.conv_transpose``.
@@ -1142,6 +1144,7 @@ def maybe_broadcast(
       transpose_kernel=self.transpose_kernel,
       precision=self.precision,
       preferred_element_type=self.preferred_element_type,
+      out_sharding=out_sharding,
     )
 
     if self.padding == 'CIRCULAR':
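
Taken together, the linear.py changes thread the new keyword straight through to the underlying dot_general, einsum, or conv_transpose call, so a caller running under an explicitly sharded mesh can request the output layout per call. A minimal usage sketch, not part of the commit, assuming a JAX release with explicit sharding support (AxisType.Explicit, jax.sharding.reshard, jax.typeof) and a Flax build that includes this change; it mirrors the test added further down:

  # Sketch only: request the output layout of nnx.Linear at call time.
  import jax
  import jax.numpy as jnp
  from flax import nnx
  from jax.sharding import PartitionSpec as P, AxisType, reshard

  mesh = jax.make_mesh((2, 2), ("X", "Y"),
                       axis_types=(AxisType.Explicit, AxisType.Explicit))
  with jax.set_mesh(mesh):
    layer = nnx.Linear(2, 4, rngs=nnx.Rngs(0))
    x = reshard(jnp.ones((2, 2)), P("X", None))     # batch dimension sharded over "X"
    # Without out_sharding, the output keeps the layout inferred from the inputs.
    y_default = layer(x)                            # float32[2@X,4]
    # With out_sharding, the requested PartitionSpec is forwarded to dot_general.
    y_sharded = layer(x, out_sharding=P("X", "Y"))  # float32[2@X,4@Y]
    print(jax.typeof(y_default), jax.typeof(y_sharded))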

flax/typing.py

Lines changed: 2 additions & 2 deletions
@@ -132,7 +132,7 @@ class Out(Generic[T]):
 LogicalPartitionSpecPytree = Any # pylint: disable=invalid-name
 PartitionSpecPytree = Any # pylint: disable=invalid-name
 
-Sharding = tuple[AxisName, ...]
+Sharding = Union[tuple[AxisName, ...], jax.sharding.PartitionSpec, jax.sharding.Sharding]
 
 A = TypeVar('A')
 
@@ -233,4 +233,4 @@ def from_any(cls, x):
 class PromoteDtypeFn(Protocol):
   def __call__(
     self, args: TupleArg, /, *, dtype: Any = None, inexact: bool = True
-  ) -> TupleArg: ...
+  ) -> TupleArg: ...
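
Widening the alias keeps the legacy axis-name tuples used for partitioning metadata valid while also admitting PartitionSpecs and concrete jax.sharding.Sharding objects, which is what the new out_sharding parameters are annotated with. A small illustration, not part of the commit, of the three forms the alias now accepts (the names and the "data" axis are made up for the example):

  # Illustration only: values that now satisfy the flax.typing.Sharding annotation.
  import jax
  from jax.sharding import NamedSharding, PartitionSpec
  from flax.typing import Sharding

  tuple_form: Sharding = ("data", None)                  # legacy tuple[AxisName, ...]
  spec_form: Sharding = PartitionSpec("data", None)      # jax.sharding.PartitionSpec
  mesh = jax.make_mesh((jax.device_count(),), ("data",))
  named_form: Sharding = NamedSharding(mesh, spec_form)  # a concrete jax.sharding.Sharding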

tests/nnx/spmd_test.py

Lines changed: 11 additions & 1 deletion
@@ -20,7 +20,7 @@
 from flax import nnx
 import jax
 import jax.numpy as jnp
-from jax.sharding import PartitionSpec as P, NamedSharding
+from jax.sharding import PartitionSpec as P, NamedSharding, AxisType, reshard
 import optax
 
 
@@ -211,6 +211,16 @@ def test_eager_sharding_context(self, use_eager_sharding):
     else:
       assert not has_sharding_spec(w)
 
+  def test_out_sharding(self):
+    mesh = jax.make_mesh((2, 2), ("X", "Y"),
+                         axis_types=(AxisType.Explicit, AxisType.Explicit))
+    with jax.set_mesh(mesh):
+      replicated_array = jnp.arange(4).reshape(2, 2)
+      sharded_array = reshard(replicated_array, P("X", None))
+      l = nnx.Linear(2,4, rngs=nnx.Rngs(0))
+      assert 'float32[2@X,4]' in str(jax.typeof(l(sharded_array)))
+      assert 'float32[2@X,4@Y]' in str(jax.typeof(l(sharded_array, out_sharding=P("X", "Y"))))
+
   @parameterized.product(use_hijax=[True, False])
   def test_logical_rules(self, use_hijax):
     self.enter_context(nnx.use_hijax(use_hijax))
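
Note that the (2, 2) mesh in test_out_sharding needs at least four devices. A hedged sketch of one way to get them on a CPU-only machine; the XLA flag is standard, but the exact device count and where you set it are up to the reader:

  # Sketch only: fake four host CPU devices so jax.make_mesh((2, 2), ...) succeeds.
  # This must run before jax initializes its backend (e.g. in a conftest.py or
  # exported in the shell environment before invoking pytest).
  import os
  os.environ["XLA_FLAGS"] = (
    os.environ.get("XLA_FLAGS", "") + " --xla_force_host_platform_device_count=4"
  )

  import jax
  assert jax.device_count() >= 4  # the test's mesh can now be built

With the fake devices in place, running the spmd test file with pytest and -k test_out_sharding exercises the new argument end to end.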
