
Commit f33f6e8

Add test and fix comment
1 parent c7f3893 commit f33f6e8

2 files changed: +60 -3 lines changed

New test file (+60 lines):
@@ -0,0 +1,60 @@
import unittest
import torch
import torch_xla2 as tx
import torch_xla2.export
import torch_xla2.train
from torch.testing._internal.common_utils import TestCase


class TrainTest(unittest.TestCase):

    def setUp(self):
        torch.manual_seed(0)
        torch_xla2.enable_accuracy_mode()

    def test_scan_module(self):
        x = torch.arange(300).reshape(3, 100).to(torch.float32)
        layers = [
            torch.nn.Linear(100, 100),
            torch.nn.Linear(100, 100),
            torch.nn.Linear(100, 100),
            torch.nn.Linear(100, 100),
        ]
        # repeatedly applies the linear layers
        result = x
        for layer in layers:
            result = layer(result)

        model = tx.train.ScannedModule(layers)

        with torch_xla2.default_env():
            x = x.to('jax')
            model.to('jax')
            result2 = model(x)
            torch.testing.assert_allclose(result, result2.to('cpu'))

    def test_train_step_can_run(self):
        import optax
        with torch_xla2.default_env():
            model = torch.nn.Linear(100, 100)
            model.to('jax')
            weights = model.state_dict()
            x = torch.randn(2, 100).to('jax')
            y = torch.tensor([1, 2]).to('jax')

            def model_fn(weight, buffers, args):
                return torch.func.functional_call(model, weight, args)

            loss_fn = torch.nn.CrossEntropyLoss()

            optimizer = optax.adam(0.01)
            opt_state = tx.interop.call_jax(optimizer.init, weights)

            step = tx.train.make_train_step(model_fn, loss_fn, optimizer)
            print(step(weights, {}, opt_state, x, y))


if __name__ == '__main__':
    unittest.main()

experimental/torch_xla2/torch_xla2/train.py (-3 lines):
@@ -30,9 +30,6 @@ def make_train_step(model_fn,
     optax_optimizer: the optimizer from optax library. for example, optax.adam
     remat_policy: One of jax.ad_checkpoint.checkpoint_policies, specifies how
       to do gradient checkpointing. If None, then it means checkpoint everything.
-    mark_fsdp_sharding_axis: str. A string name for marking sharding for
-      fsdp. It must be an axis that exists in the current mesh.
-      if None, then no sharding is specified (i.e. for single device)
   """
   env = torch_xla2.default_env()
   def loss(weights, buffers, args, label): # inputs are XLATensor
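
For context, a minimal sketch of driving make_train_step with an explicit remat policy, assuming the signature implied by the docstring above (model_fn, loss_fn, optax_optimizer, remat_policy). The remat_policy keyword and the choice of jax.checkpoint_policies.nothing_saveable are illustrative assumptions; the rest mirrors test_train_step_can_run from the new test file.

import jax
import optax
import torch
import torch_xla2 as tx
import torch_xla2.interop
import torch_xla2.train

with tx.default_env():
    model = torch.nn.Linear(100, 100)
    model.to('jax')
    weights = model.state_dict()

    def model_fn(weight, buffers, args):
        # Functional call so the step can differentiate w.r.t. the weight pytree.
        return torch.func.functional_call(model, weight, args)

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = optax.adam(0.01)
    opt_state = tx.interop.call_jax(optimizer.init, weights)

    # Assumption: remat_policy is accepted as a keyword, per the docstring;
    # nothing_saveable recomputes all activations during the backward pass.
    step = tx.train.make_train_step(
        model_fn, loss_fn, optimizer,
        remat_policy=jax.checkpoint_policies.nothing_saveable)

    x = torch.randn(2, 100).to('jax')
    y = torch.tensor([1, 2]).to('jax')
    print(step(weights, {}, opt_state, x, y))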
