
Commit 2ce56cc

all tests pass.
1 parent: 60a9b96

File tree: 2 files changed, +116 -69 lines changed


test/test_pallas.py

Lines changed: 37 additions & 54 deletions
@@ -890,11 +890,10 @@ def _test_quantized_matmul(
       in_block_size=None,
       atol=1.5,
       n_bits=8,
-  ):
+  ):
     x = torch.randn((bs, n_input_features), dtype=dtype)
     w = torch.randn((n_output_features, n_input_features), dtype=dtype)
-    min_val, max_val = torch.aminmax(
-        w, dim=1)  # min_val, max_val [out_dim]
+    min_val, max_val = torch.aminmax(w, dim=1)  # min_val, max_val [out_dim]
     int_min = -2**(n_bits - 1)
     int_max = 2**(n_bits - 1) - 1
     scalar, zero_point = determine_qparams(
@@ -913,21 +912,30 @@ def _test_quantized_matmul(
     x_copy = x.clone()
     w_copy = w.clone()
     expected = F.linear(x_copy, w_copy)
-
+
     x_xla = x.to("xla")
     w_int_xla = w_int.to("xla")
     scalar_xla = scalar.to("xla")
     if use_dynamo:
-      def quantized_matmul_wrapper(x, w_int, scalar):
-        return torch.ops.xla.quantized_matmul(
-            x, w_int, scalar, quantize_activation=quantize_activation, batch_block_size=batch_block_size,
-            out_block_size=out_block_size, in_block_size=in_block_size)
 
-      quantized_matmul = torch.compile(quantized_matmul_wrapper, backend="openxla")
+      def quantized_matmul_wrapper(x, w_int, scalar, quantize_activation,
+                                   batch_block_size, out_block_size,
+                                   in_block_size):
+        return torch.ops.xla.quantized_matmul(
+            x,
+            w_int,
+            scalar,
+            quantize_activation=quantize_activation,
+            batch_block_size=batch_block_size,
+            out_block_size=out_block_size,
+            in_block_size=in_block_size)
+
+      quantized_matmul = torch.compile(
+          quantized_matmul_wrapper, backend="openxla")
     else:
       from torch_xla.experimental.custom_kernel import quantized_matmul
       quantized_matmul = quantized_matmul
-
+
     actual = quantized_matmul(
         x_xla,
         w_int_xla,
@@ -936,68 +944,43 @@ def quantized_matmul_wrapper(x, w_int, scalar):
         batch_block_size=batch_block_size,
         out_block_size=out_block_size,
         in_block_size=in_block_size).cpu()
-
+
     self.assertEqual(actual.shape, expected.shape)
     self.assertEqual(actual.dtype, expected.dtype)
-    self.assertTrue(
-        torch.allclose(
-            actual, expected, atol=atol))
+    self.assertTrue(torch.allclose(actual, expected, atol=atol))
 
-
-  @parameterized.product(
-      seq_lens=[[(1, 1328), (5, 18), (500, 563)]],
-      num_heads=[(32, 8), (8, 1)],
-      dtype=[(torch.bfloat16, torch.bfloat16),
-             (torch.bfloat16, torch.float8_e5m2)],
-      sm_scale=[1.0, 0.5],
-      sliding_window=[None, 128],
-      soft_cap=[None, 10.0],
-      pad_tokens_and_seqs=[False, True])
-  @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
-                   "This test only works on TPUv4+.")
-  def test_quantized_matmul_with_dynamo(
-      self,
-      seq_lens,
-      num_heads,
-      dtype,
-      sm_scale,
-      sliding_window,
-      soft_cap,
-      pad_tokens_and_seqs,
-  ):
-    ...
-
-  # @parameterized.product(
-  #     dtype=[torch.bfloat16],
-  #     bs=[128],
-  #     n_input_features=[128],
-  #     n_output_features=[128],
-  #     quantize_activation=[True],
-  #     # block_sizes=[(None, None, None), (128, 128, 128)],
-  #     kernel_block_sizes=[(128, 128, 128)],
-  # )
   @parameterized.product(
       dtype=[torch.bfloat16, torch.float32],
-      bs=[128, 256],
-      n_input_features=[128, 256],
-      n_output_features=[128, 256],
+      bs=[256, 512],
+      n_input_features=[256, 512],
+      n_output_features=[256, 512],
       quantize_activation=[True],
-      # block_sizes=[(None, None, None), (128, 128, 128)],
-      kernel_block_sizes=[(128, 128, 128)],
+      kernel_block_sizes=[(None, None, None), (256, 256, 256)],
+      use_dynamo=[True, False],
   )
   @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 5,
                    "This test only works on TPUv5+.")
-  def test_quantized_matmul_wrapper_without_dynamo(
+  def test_quantized_matmul_wrapper(
       self,
       dtype,
       bs,
       n_input_features,
       n_output_features,
       quantize_activation,
      kernel_block_sizes,
+      use_dynamo,
   ):
     batch_block_size, out_block_size, in_block_size = kernel_block_sizes
-    self._test_quantized_matmul(dtype, bs, n_input_features, n_output_features, quantize_activation, use_dynamo=False, batch_block_size=batch_block_size, out_block_size=out_block_size, in_block_size=in_block_size)
+    self._test_quantized_matmul(
+        dtype,
+        bs,
+        n_input_features,
+        n_output_features,
+        quantize_activation,
+        use_dynamo=use_dynamo,
+        batch_block_size=batch_block_size,
+        out_block_size=out_block_size,
+        in_block_size=in_block_size)
 
   @unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
                    "This test only works on TPUv4+.")

torch_xla/experimental/custom_kernel.py

Lines changed: 79 additions & 15 deletions
@@ -1061,26 +1061,32 @@ def ragged_paged_attention(
   ])
   return output[0]
 
+
 @requires_jax
 def quantized_matmul(
-  x: torch.Tensor,
-  w: torch.Tensor,
-  scalar: torch.Tensor,
-  zero_point: torch.Tensor | None = None,
-  block_size: torch.Tensor | None = None,
-  quantize_activation: bool = False,
-  batch_block_size: int | None = None,
-  out_block_size: int | None = None,
-  in_block_size: int | None = None,
-  vmem_limit_bytes: int | None = 64 * 1024 * 1024,
+    x: torch.Tensor,
+    w: torch.Tensor,
+    scalar: torch.Tensor,
+    zero_point: torch.Tensor | None = None,
+    block_size: torch.Tensor | None = None,
+    quantize_activation: bool = False,
+    batch_block_size: int | None = None,
+    out_block_size: int | None = None,
+    in_block_size: int | None = None,
+    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
 ) -> torch.Tensor:
   from torch_xla.experimental.pallas_kernels.quantized_matmul_kernel import quantized_matmul
   return xb.call_jax(
-      quantized_matmul,
-      (x, w, scalar, zero_point, block_size, quantize_activation),
-      {"batch_block_size": batch_block_size, "out_block_size": out_block_size, "in_block_size": in_block_size, "vmem_limit_bytes": vmem_limit_bytes}
-  )
-
+      quantized_matmul, (x, w, scalar), {
+          "zero_point": zero_point,
+          "block_size": block_size,
+          "quantize_activation": quantize_activation,
+          "batch_block_size": batch_block_size,
+          "out_block_size": out_block_size,
+          "in_block_size": in_block_size,
+          "vmem_limit_bytes": vmem_limit_bytes
+      })
+
 
 def _multi_queries_paged_attention_nonkernel(
     q,  # [batch_size, query_len, num_heads, head_size]
@@ -1646,3 +1652,61 @@ def gmm_non_xla(lhs: torch.Tensor,
 
   # we only need to return the tensor with correct shape for meta tensor.
   return torch.empty(lhs.size()[0], rhs_dim_size, device=lhs.device)
+
+
+# @requires_jax
+# def quantized_matmul(
+#     x: torch.Tensor,
+#     w: torch.Tensor,
+#     scalar: torch.Tensor,
+#     zero_point: torch.Tensor | None = None,
+#     block_size: torch.Tensor | None = None,
+#     quantize_activation: bool = False,
+#     batch_block_size: int | None = None,
+#     out_block_size: int | None = None,
+#     in_block_size: int | None = None,
+#     vmem_limit_bytes: int | None = 64 * 1024 * 1024,
+# ) -> torch.Tensor:
+
+XLA_LIB.define(
+    "quantized_matmul(Tensor x, Tensor w, Tensor scalar, Tensor? zero_point=None, Tensor? block_size=None, bool quantize_activation=False, int? batch_block_size=None, int? out_block_size=None, int? in_block_size=None, int? vmem_limit_bytes=None) -> Tensor",
+)
+
+
+@impl(XLA_LIB, "quantized_matmul", "XLA")
+def quantized_matmul_xla(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    scalar: torch.Tensor,
+    zero_point: torch.Tensor | None = None,
+    block_size: torch.Tensor | None = None,
+    quantize_activation: bool = False,
+    batch_block_size: int | None = None,
+    out_block_size: int | None = None,
+    in_block_size: int | None = None,
+    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
+) -> torch.Tensor:
+  return quantized_matmul(x, w, scalar, zero_point, block_size,
+                          quantize_activation, batch_block_size, out_block_size,
+                          in_block_size, vmem_limit_bytes)
+
+
+@impl(XLA_LIB, "quantized_matmul", "CompositeExplicitAutograd")
+def quantized_matmul_non_xla(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    scalar: torch.Tensor,
+    zero_point: torch.Tensor | None = None,
+    block_size: torch.Tensor | None = None,
+    quantize_activation: bool = False,
+    batch_block_size: int | None = None,
+    out_block_size: int | None = None,
+    in_block_size: int | None = None,
+    vmem_limit_bytes: int | None = 64 * 1024 * 1024,
+) -> torch.Tensor:
+  # This will be called when dynamo use fake tensor to construct the fake output.
+  # We need to make sure output tensor's shape is correct.
+  if x.device != torch.device("meta"):
+    warnings.warn(
+        f'XLA quantized_matmul should only be applied to tensors on XLA device')
+  return torch.empty(x.shape[0], w.shape[0], device=x.device)
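A hedged usage sketch of the op this commit registers (not taken from the repo): once torch_xla.experimental.custom_kernel has been imported, torch.ops.xla.quantized_matmul should be callable on XLA tensors both eagerly and through torch.compile with the openxla backend, matching the two paths the updated test exercises. The op name and keyword arguments follow the XLA_LIB.define schema above; tensor shapes and the per-output-channel scale layout are assumptions based on the test.

# Assumes a TPU-backed torch_xla install; shapes and scale layout are illustrative.
import torch
import torch_xla.experimental.custom_kernel  # noqa: F401  (registers torch.ops.xla.quantized_matmul)

x = torch.randn(256, 256, dtype=torch.bfloat16).to("xla")                   # activations [bs, in]
w_int = torch.randint(-128, 128, (512, 256), dtype=torch.int8).to("xla")    # int8 weights [out, in]
scalar = torch.randn(512, dtype=torch.float32).abs().to("xla")              # per-output-channel scales

# Eager (LazyTensor) path, analogous to the test's use_dynamo=False branch.
out = torch.ops.xla.quantized_matmul(
    x, w_int, scalar,
    quantize_activation=True,
    batch_block_size=256, out_block_size=256, in_block_size=256)

# Dynamo path, mirroring quantized_matmul_wrapper in the updated test.
def wrapper(x, w_int, scalar):
  return torch.ops.xla.quantized_matmul(x, w_int, scalar, quantize_activation=True)

compiled = torch.compile(wrapper, backend="openxla")
out_compiled = compiled(x, w_int, scalar)
print(out.shape, out_compiled.shape)  # both should be [256, 512]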
