Skip to content

Commit 7f1d4b2

Browse files
authored
[tosa] : Move casting to integer domain after all computations for quantize_per_tensor. (#4487)
The existing lowering of `quantize_per_tensor` to TOSA is incorrect: it casts to the integer domain before the zero-point (ZP) addition, which produces an incorrect result. Here's a numerical example illustrating the bug (thanks to AI for formatting it nicely). Consider an example input = 3.4501, Scale = 1/66.933334, Zero Point = -128. BUGGY Code (Original) ``` Step 1: Scale → 3.4501 × 66.933334 = 230.95 Step 2: Round → 231.0 Step 3: Cast to i8 → 231 → 127 (clamped by int8 range!) Step 4: Add ZP → 127 + (-128) = -1 ❌ Step 5: Clamp → clamp(-1, -128, 127) = -1 ``` FIXED Code ``` Step 1: Scale → 3.4501 × 66.933334 = 230.95 Step 2: Round → 231.0 Step 3: Add ZP → 231.0 + (-128.0) = 103.0 Step 4: Clamp → clamp(103.0, -128, 127) = 103.0 Step 5: Cast to i8 → 103 ✓ ```
1 parent 56e635e commit 7f1d4b2

File tree

4 files changed

+86
-43
lines changed

4 files changed

+86
-43
lines changed

lib/Conversion/TorchToTosa/TorchToTosa.cpp

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10226,36 +10226,44 @@ LogicalResult ConvertAtenOp<AtenQuantizePerTensorOp>::matchAndRewriteImpl(
1022610226
op, "failed to implement round-half-to-even with TOSA ops");
1022710227
}
1022810228

10229-
// Cast to the destination integer type.
10230-
auto intermediateIntTy = resultTy.clone(resultElemTy);
10231-
Value castToInt =
10232-
tosa::CastOp::create(rewriter, loc, intermediateIntTy, *rounded);
10233-
10234-
// Add the zero point.
10235-
Value zpTensor =
10236-
tosa::createZeroPointTensor(rewriter, loc, intermediateIntTy, zpConst)
10229+
// Add the zero point
10230+
Value zpTensorFloat =
10231+
tosa::getConstTensor<float>(rewriter, op, static_cast<float>(zpConst), {},
10232+
inputElemTy)
1023710233
.value();
10238-
if (mlir::tosa::EqualizeRanks(rewriter, loc, castToInt, zpTensor).failed())
10234+
if (mlir::tosa::EqualizeRanks(rewriter, loc, *rounded, zpTensorFloat)
10235+
.failed())
1023910236
return failure();
10240-
Value withZp = tosa::AddOp::create(rewriter, loc, intermediateIntTy,
10241-
castToInt, zpTensor);
10242-
10243-
// Clamp the result to the valid range of the quantized type.
10244-
std::optional<int64_t> minInt,
10245-
maxInt; // no initialization needed as we want to clamp to the numeric
10246-
// limits of the type
10247-
IntegerAttr minIntAttr, maxIntAttr;
10237+
Value withZp =
10238+
tosa::AddOp::create(rewriter, loc, inputTy, *rounded, zpTensorFloat);
10239+
10240+
// Clamp the result to the valid range of the result/quantized type
10241+
std::optional<int64_t> minInt, maxInt;
10242+
IntegerAttr minIntAttr, maxIntAttr; // no initialization needed as we want to
10243+
// clamp to the numeric limits of the type
1024810244
if (failed(tosa::getIntegerClampAttrs(rewriter, op, resultElemTy, minInt,
1024910245
maxInt, minIntAttr, maxIntAttr))) {
1025010246
return failure();
1025110247
}
10248+
10249+
// Create float clamp attributes (clamp happens with integer range based on
10250+
// the result/quantized type but in the domain of the input type to preserve
10251+
// numeric)
10252+
auto minFloat = static_cast<float>(minIntAttr.getInt());
10253+
auto maxFloat = static_cast<float>(maxIntAttr.getInt());
10254+
auto minFloatAttr = rewriter.getFloatAttr(inputElemTy, minFloat);
10255+
auto maxFloatAttr = rewriter.getFloatAttr(inputElemTy, maxFloat);
10256+
1025210257
Value clamped = tosa::ClampOp::create(
10253-
rewriter, loc, resultTy, withZp, minIntAttr, maxIntAttr,
10258+
rewriter, loc, inputTy, withZp, minFloatAttr, maxFloatAttr,
1025410259
/*nan_mode=*/
1025510260
tosa::NanPropagationModeAttr::get(rewriter.getContext(),
1025610261
tosa::NanPropagationMode::PROPAGATE));
1025710262

10258-
rewriter.replaceOp(op, clamped);
10263+
// Cast to the destination integer type
10264+
Value castToInt = tosa::CastOp::create(rewriter, loc, resultTy, clamped);
10265+
10266+
rewriter.replaceOp(op, castToInt);
1025910267
return success();
1026010268
}
1026110269

projects/pt1/e2e_testing/xfail_sets.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -829,6 +829,7 @@
829829
"QuantizedMLP_basic",
830830
"QuantizedNoLayer_basic",
831831
"QuantizedSingleLayer_basic",
832+
"QuantizePerTensorModule_basic",
832833
"RandnDtypeDeviceModule_basic",
833834
"RandnGeneratorF64Module_basic",
834835
"RandnGeneratorModule_basic",
@@ -3195,6 +3196,7 @@
31953196
"QuantizedReluInt8_basic",
31963197
"QuantizedReluInt32_basic",
31973198
"QuantizedReluUint8_basic",
3199+
"QuantizePerTensorModule_basic",
31983200
"RandIntDtypeModule_basic",
31993201
"RandIntModule_basic",
32003202
"RandIntPinMemoryModule_basic",
@@ -4802,6 +4804,7 @@
48024804
"QuantizedReluInt8_basic",
48034805
"QuantizedReluUint8_basic",
48044806
"QuantizedSingleLayer_basic",
4807+
"QuantizePerTensorModule_basic",
48054808
"RandIntDtypeModule_basic",
48064809
"RandIntModule_basic",
48074810
"RandIntPinMemoryModule_basic",

projects/pt1/python/torch_mlir_e2e_test/test_suite/quantized_models.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import torch
77
from torch import nn
8+
import torch.ao.quantization.fx._decomposed
89

910
from torch_mlir_e2e_test.framework import TestUtils
1011
from torch_mlir_e2e_test.registry import register_test_case
@@ -206,3 +207,33 @@ def forward(self, a):
206207
@register_test_case(module_factory=lambda: FakeQuantizePerTensorAffineCachemaskModule())
207208
def FakeQuantizePerTensorAffineCachemaskModule_basic(module, tu: TestUtils):
208209
module.forward(tu.rand(6, 4))
210+
211+
212+
# ==============================================================================
213+
214+
215+
class QuantizePerTensorModule(torch.nn.Module):
216+
def __init__(self):
217+
super().__init__()
218+
219+
@export
220+
@annotate_args(
221+
[
222+
None,
223+
([1, 64, 112, 112], torch.float32, True),
224+
]
225+
)
226+
def forward(self, x):
227+
scale = 0.014940238557755947
228+
zp = -128
229+
quant_min = -128
230+
quant_max = 127
231+
return torch.ops.quantized_decomposed.quantize_per_tensor.default(
232+
x, scale, zp, quant_min, quant_max, torch.int8
233+
)
234+
235+
236+
@register_test_case(module_factory=lambda: QuantizePerTensorModule())
237+
def QuantizePerTensorModule_basic(module, tu: TestUtils):
238+
# use values within [-5, 5] to ensure we run into overflow/underflow
239+
module.forward(10 * torch.rand(1, 64, 112, 112) - 5)

test/Conversion/TorchToTosa/quantization.mlir

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,30 +45,31 @@ func.func @AtenMmQint8(%arg0: !torch.vtensor<[3,4],si8>, %arg1: !torch.vtensor<[
4545

4646
// -----
4747
// CHECK-LABEL: func.func @quantization_per_tensor(
48-
// CHECK-SAME: %[[IN:.*]]: !torch.vtensor<[2,4,4],f32>) -> !torch.vtensor<[2,4,4],!torch.qint8> {
49-
// CHECK: %[[ZP:.*]] = "tosa.const"() <{values = dense<3> : tensor<1x1x1xi8>}> : () -> tensor<1x1x1xi8>
50-
// CHECK: %[[C2:.*]] = "tosa.const"() <{values = dense<2.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
51-
// CHECK: %[[CHALF:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
52-
// CHECK: %[[C10:.*]] = "tosa.const"() <{values = dense<1.000000e+01> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
53-
// CHECK: %[[MUL_SHIFT:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
54-
// CHECK: %[[IN_TENSOR:.*]] = torch_c.to_builtin_tensor %[[IN]] : !torch.vtensor<[2,4,4],f32> -> tensor<2x4x4xf32>
55-
// CHECK: %[[RESCALE:.*]] = tosa.mul %[[IN_TENSOR]], %[[C10]], %[[MUL_SHIFT]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
56-
// CHECK: %[[FLOOR:.*]] = tosa.floor %[[RESCALE]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
57-
// CHECK: %[[FRAC:.*]] = tosa.sub %[[RESCALE]], %[[FLOOR]] : (tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
58-
// CHECK: %[[CEIL:.*]] = tosa.ceil %[[RESCALE]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
59-
// CHECK: %[[FLOOR_DIV_BY_2:.*]] = tosa.mul %[[FLOOR]], %[[CHALF]], %[[MUL_SHIFT]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
60-
// CHECK: %[[FLOOR_DIV:.*]] = tosa.floor %[[FLOOR_DIV_BY_2]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
61-
// CHECK: %[[EVEN_COMP:.*]] = tosa.mul %[[FLOOR_DIV]], %[[C2]], %[[MUL_SHIFT]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
62-
// CHECK: %[[FLOOR_INPUT_EVEN:.*]] = tosa.equal %[[FLOOR]], %[[EVEN_COMP]] : (tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xi1>
63-
// CHECK: %[[FRAC_EQ_HALF:.*]] = tosa.equal %[[FRAC]], %[[CHALF]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>) -> tensor<2x4x4xi1>
64-
// CHECK: %[[GRTR:.*]] = tosa.greater %[[CHALF]], %[[FRAC]] : (tensor<1x1x1xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xi1>
65-
// CHECK: %[[AND:.*]] = tosa.logical_and %[[FRAC_EQ_HALF]], %[[FLOOR_INPUT_EVEN]] : (tensor<2x4x4xi1>, tensor<2x4x4xi1>) -> tensor<2x4x4xi1>
66-
// CHECK: %[[OR:.*]] = tosa.logical_or %[[GRTR]], %[[AND]] : (tensor<2x4x4xi1>, tensor<2x4x4xi1>) -> tensor<2x4x4xi1>
67-
// CHECK: %[[SELECT:.*]] = tosa.select %[[OR]], %[[FLOOR]], %[[CEIL]] : (tensor<2x4x4xi1>, tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
68-
// CHECK: %[[CAST:.*]] = tosa.cast %[[SELECT]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xi8>
69-
// CHECK: %[[ADD:.*]] = tosa.add %[[CAST]], %[[ZP]] : (tensor<2x4x4xi8>, tensor<1x1x1xi8>) -> tensor<2x4x4xi8>
70-
// CHECK: %[[RES:.*]] = torch_c.from_builtin_tensor %[[ADD]] : tensor<2x4x4xi8> -> !torch.vtensor<[2,4,4],!torch.qint8>
71-
// CHECK: return %[[RES]]
48+
// CHECK-SAME: %[[ARG0:.*]]: !torch.vtensor<[2,4,4],f32>) -> !torch.vtensor<[2,4,4],!torch.qint8> {
49+
// CHECK: %[[VAL_0:.*]] = "tosa.const"() <{values = dense<3.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
50+
// CHECK: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2.000000e+00> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
51+
// CHECK: %[[VAL_2:.*]] = "tosa.const"() <{values = dense<5.000000e-01> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
52+
// CHECK: %[[VAL_3:.*]] = "tosa.const"() <{values = dense<1.000000e+01> : tensor<1x1x1xf32>}> : () -> tensor<1x1x1xf32>
53+
// CHECK: %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
54+
// CHECK: %[[TO_BUILTIN_TENSOR_0:.*]] = torch_c.to_builtin_tensor %[[ARG0]] : !torch.vtensor<[2,4,4],f32> -> tensor<2x4x4xf32>
55+
// CHECK: %[[MUL_0:.*]] = tosa.mul %[[TO_BUILTIN_TENSOR_0]], %[[VAL_3]], %[[VAL_4]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
56+
// CHECK: %[[FLOOR_0:.*]] = tosa.floor %[[MUL_0]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
57+
// CHECK: %[[SUB_0:.*]] = tosa.sub %[[MUL_0]], %[[FLOOR_0]] : (tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
58+
// CHECK: %[[CEIL_0:.*]] = tosa.ceil %[[MUL_0]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
59+
// CHECK: %[[MUL_1:.*]] = tosa.mul %[[FLOOR_0]], %[[VAL_2]], %[[VAL_4]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
60+
// CHECK: %[[FLOOR_1:.*]] = tosa.floor %[[MUL_1]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
61+
// CHECK: %[[MUL_2:.*]] = tosa.mul %[[FLOOR_1]], %[[VAL_1]], %[[VAL_4]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>, tensor<1xi8>) -> tensor<2x4x4xf32>
62+
// CHECK: %[[EQUAL_0:.*]] = tosa.equal %[[FLOOR_0]], %[[MUL_2]] : (tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xi1>
63+
// CHECK: %[[EQUAL_1:.*]] = tosa.equal %[[SUB_0]], %[[VAL_2]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>) -> tensor<2x4x4xi1>
64+
// CHECK: %[[GREATER_0:.*]] = tosa.greater %[[VAL_2]], %[[SUB_0]] : (tensor<1x1x1xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xi1>
65+
// CHECK: %[[LOGICAL_AND_0:.*]] = tosa.logical_and %[[EQUAL_1]], %[[EQUAL_0]] : (tensor<2x4x4xi1>, tensor<2x4x4xi1>) -> tensor<2x4x4xi1>
66+
// CHECK: %[[LOGICAL_OR_0:.*]] = tosa.logical_or %[[GREATER_0]], %[[LOGICAL_AND_0]] : (tensor<2x4x4xi1>, tensor<2x4x4xi1>) -> tensor<2x4x4xi1>
67+
// CHECK: %[[SELECT_0:.*]] = tosa.select %[[LOGICAL_OR_0]], %[[FLOOR_0]], %[[CEIL_0]] : (tensor<2x4x4xi1>, tensor<2x4x4xf32>, tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
68+
// CHECK: %[[ADD_0:.*]] = tosa.add %[[SELECT_0]], %[[VAL_0]] : (tensor<2x4x4xf32>, tensor<1x1x1xf32>) -> tensor<2x4x4xf32>
69+
// CHECK: %[[CLAMP_0:.*]] = tosa.clamp %[[ADD_0]] {max_val = 1.270000e+02 : f32, min_val = -1.280000e+02 : f32} : (tensor<2x4x4xf32>) -> tensor<2x4x4xf32>
70+
// CHECK: %[[CAST_0:.*]] = tosa.cast %[[CLAMP_0]] : (tensor<2x4x4xf32>) -> tensor<2x4x4xi8>
71+
// CHECK: %[[FROM_BUILTIN_TENSOR_0:.*]] = torch_c.from_builtin_tensor %[[CAST_0]] : tensor<2x4x4xi8> -> !torch.vtensor<[2,4,4],!torch.qint8>
72+
// CHECK: return %[[FROM_BUILTIN_TENSOR_0]]
7273
func.func @quantization_per_tensor(%arg0: !torch.vtensor<[2,4,4],f32>) -> !torch.vtensor<[2,4,4],!torch.qint8> {
7374
%dtype = torch.constant.int 12
7475
%scale = torch.constant.float 0.1

0 commit comments

Comments
 (0)