Commit 958d2c8

Use ceilMode from pytorch MaxPool in shlo to get correct output shape (#4167)
PyTorch passes a `ceilMode` argument to the MaxPool op. Currently, the torch-to-stablehlo conversion does not use it when determining the StableHLO output shape. This change applies the same `ceilMode` logic as PyTorch when calculating the output shape; during conversion, we make up the size difference using extra padding.

Changes:
- Use the `ceilMode` param the way torch uses it.
- Added tests for both the floor and ceil cases.
1 parent 66aa960 commit 958d2c8
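
For concreteness, here is a minimal standalone sketch (not part of the commit) of the PyTorch output-size rule described above, evaluated with the shapes the new tests use (56×56 input, 3×3 kernel, stride 2, padding 0, dilation 1). The helper name `poolOutputSize` is hypothetical:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring PyTorch's pooling output-size formula
// (see aten/src/ATen/native/Pool.h): adding adj = stride - 1 before the
// division rounds it up when ceil_mode is set.
int64_t poolOutputSize(int64_t input, int64_t kernel, int64_t stride,
                       int64_t pad, int64_t dilation, bool ceilMode) {
  const int64_t numerator = input + 2 * pad - dilation * (kernel - 1) - 1;
  const int64_t adj = ceilMode ? stride - 1 : 0;
  int64_t output = (numerator + adj) / stride + 1;
  // Edge-case correction: the last pooling window must start inside the
  // input (or its left padding); otherwise drop it.
  if (ceilMode && (output - 1) * stride >= input + pad)
    --output;
  return output;
}

int main() {
  // Shapes from the new tests: 56x56 input, 3x3 kernel, stride 2, pad 0.
  const int64_t floorOut = poolOutputSize(56, 3, 2, 0, 1, /*ceilMode=*/false);
  const int64_t ceilOut = poolOutputSize(56, 3, 2, 0, 1, /*ceilMode=*/true);
  assert(floorOut == 27 && ceilOut == 28);
  // The conversion bridges the difference with extra symmetric padding:
  // (28 - 27) * stride = 2 per spatial dim, split 1 front + 1 back.
  printf("floor: %lld, ceil: %lld\n", (long long)floorOut, (long long)ceilOut);
  return 0;
}
```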

2 files changed: 116 additions & 16 deletions


lib/Conversion/TorchToStablehlo/Pooling.cpp

Lines changed: 44 additions & 16 deletions
```diff
@@ -478,24 +478,52 @@ class ConvertAtenMaxPoolOp : public ConvertAtenOp<AtenOpT> {
     Value initVal =
         createInitialValueForAtenPoolingOp(op, inputElemTy, rewriter);
 
-    if (Dim == 1) {
-      stablehloPadding[stablehloPadding.size() - 2] = padding[0];
-      stablehloPadding[stablehloPadding.size() - 1] = padding[0];
-    } else if (Dim == 2) {
-      stablehloPadding[stablehloPadding.size() - 4] = padding[0];
-      stablehloPadding[stablehloPadding.size() - 3] = padding[0];
-      stablehloPadding[stablehloPadding.size() - 2] = padding[1];
-      stablehloPadding[stablehloPadding.size() - 1] = padding[1];
-    } else if (Dim == 3) {
-      stablehloPadding[stablehloPadding.size() - 6] = padding[0];
-      stablehloPadding[stablehloPadding.size() - 5] = padding[0];
-      stablehloPadding[stablehloPadding.size() - 4] = padding[1];
-      stablehloPadding[stablehloPadding.size() - 3] = padding[1];
-      stablehloPadding[stablehloPadding.size() - 2] = padding[2];
-      stablehloPadding[stablehloPadding.size() - 1] = padding[2];
-    } else {
+    if (Dim < 1 || Dim > 3) {
       assert(false && "Unsupported pooling dimension");
     }
+
+    const size_t spatialIdxStart = inputRank - Dim;
+
+    for (int i = 0; i < Dim; i++) {
+      const size_t frontPadIdx = (spatialIdxStart + i) * 2;
+      const size_t backPadIdx = (spatialIdxStart + i) * 2 + 1;
+
+      // torch padding is symmetric
+      stablehloPadding[frontPadIdx] = padding[i];
+      stablehloPadding[backPadIdx] = padding[i];
+
+      if (ceilMode) {
+        // Match PyTorch output shape with extra padding. See
+        // https://github.com/pytorch/pytorch/blob/c5de6ff079e3e5b453d6ff5190c90f02db458928/aten/src/ATen/native/Pool.h#L79
+        // PyTorch output size formula:
+        // 1. Calculate base output size:
+        //    output = (input + 2*pad - dilation*(kernel-1) - 1+adj) / stride + 1
+        //    where adj = (stride-1) if ceil_mode else 0
+        // 2. Apply edge case correction:
+        //    if ((output-1) * stride >= input + pad_l) --output;
+
+        const int64_t inputSize = inputTy.getDimSize(spatialIdxStart + i);
+        const int64_t numerator = (inputSize + 2 * padding[i] -
+                                   dilation[i] * (kernelSize[i] - 1) - 1);
+        const int64_t floor_output_size = (numerator) / stride[i] + 1;
+        const int64_t adj = (stride[i] - 1);
+        int64_t ceil_output_size = std::ceil((numerator + adj) / stride[i]) + 1;
+
+        // Ensure last pooling starts inside input
+        if ((ceil_output_size - 1) * stride[i] >= inputSize + padding[i]) {
+          ceil_output_size--;
+        }
+
+        // Add extra padding to make output size same as torch
+        if (ceil_output_size > floor_output_size) {
+          const int64_t sizeDiff = ceil_output_size - floor_output_size;
+          const int64_t extraPadding = sizeDiff * stride[i];
+          stablehloPadding[frontPadIdx] += extraPadding / 2;
+          stablehloPadding[backPadIdx] += extraPadding - extraPadding / 2;
+        }
+      }
+    }
+
     auto windowDimensions = rewriter.getDenseI64ArrayAttr(stablehloKernelSize);
     auto windowStrides = rewriter.getDenseI64ArrayAttr(stablehloStride);
     DenseI64ArrayAttr baseDilations;
```
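
The loop above also replaces the old per-rank if/else chain with index arithmetic over `stablehloPadding`, which holds one (front, back) pair per input dimension with the spatial dimensions last. A small standalone sketch (assuming a rank-4 NCHW input and `Dim = 2`; variable names taken from the diff) confirms the computed indices land on the same slots the old code addressed relative to `size()`:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Rank-4 NCHW input, 2-D pooling: stablehloPadding holds rank * 2 entries,
  // one (front, back) pair per dimension, spatial dimensions last.
  const size_t inputRank = 4;
  const int Dim = 2;
  const size_t spatialIdxStart = inputRank - Dim; // H is dim 2, W is dim 3

  for (int i = 0; i < Dim; i++) {
    const size_t frontPadIdx = (spatialIdxStart + i) * 2;
    const size_t backPadIdx = (spatialIdxStart + i) * 2 + 1;
    // i = 0 (H): 4 and 5; i = 1 (W): 6 and 7 -- the same slots the old code
    // addressed as size() - 4, size() - 3, size() - 2, size() - 1.
    printf("spatial dim %zu -> front %zu, back %zu\n", spatialIdxStart + i,
           frontPadIdx, backPadIdx);
  }
  return 0;
}
```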

test/Conversion/TorchToStablehlo/pooling.mlir

Lines changed: 72 additions & 0 deletions
```diff
@@ -65,6 +65,78 @@ func.func @torch.aten.max_pool2d$padding(%arg0: !torch.vtensor<[?,?,?,?],f32>) -
   return %3 : !torch.vtensor<[?,?,?,?],f32>
 }
 
+// -----
+
+// CHECK-LABEL: func.func @torch.aten.max_pool2d$ceiloff(
+// CHECK-SAME: %[[VAL_0:.*]]: !torch.vtensor<[1,256,56,56],f32>) -> !torch.vtensor<[1,256,27,27],f32> {
+// CHECK: %[[VAL_1:.*]] = torch_c.to_builtin_tensor %[[VAL_0]] : !torch.vtensor<[1,256,56,56],f32> -> tensor<1x256x56x56xf32>
+// CHECK: %int3 = torch.constant.int 3
+// CHECK: %int2 = torch.constant.int 2
+// CHECK: %int1 = torch.constant.int 1
+// CHECK: %false = torch.constant.bool false
+// CHECK: %int0 = torch.constant.int 0
+// CHECK: %[[VAL_2:.*]] = torch.prim.ListConstruct %int3, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_3:.*]] = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_4:.*]] = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_5:.*]] = stablehlo.constant dense<0xFF800000> : tensor<f32>
+// CHECK: %[[VAL_6:.*]] = "stablehlo.reduce_window"(%[[VAL_1]], %[[VAL_5]])
+// CHECK{LITERAL}: <{padding = dense<0> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
+// CHECK: ^bb0(%[[VAL_8:.*]]: tensor<f32>, %[[VAL_9:.*]]: tensor<f32>):
+// CHECK: %[[VAL_10:.*]] = stablehlo.maximum %[[VAL_8]], %[[VAL_9]] : tensor<f32>
+// CHECK: stablehlo.return %[[VAL_10]] : tensor<f32>
+// CHECK: }) : (tensor<1x256x56x56xf32>, tensor<f32>) -> tensor<1x256x27x27xf32>
+// CHECK: %[[VAL_7:.*]] = torch_c.from_builtin_tensor %[[VAL_6]] : tensor<1x256x27x27xf32> -> !torch.vtensor<[1,256,27,27],f32>
+// CHECK: return %[[VAL_7]] : !torch.vtensor<[1,256,27,27],f32>
+func.func @torch.aten.max_pool2d$ceiloff(%arg0: !torch.vtensor<[1,256,56,56],f32>) -> !torch.vtensor<[1,256,27,27],f32> {
+  %int3 = torch.constant.int 3
+  %int2 = torch.constant.int 2
+  %int1 = torch.constant.int 1
+  %false = torch.constant.bool false
+  %int0 = torch.constant.int 0
+  %0 = torch.prim.ListConstruct %int3, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
+  %1 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %2 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.aten.max_pool2d %arg0, %0, %1, %2, %3, %false : !torch.vtensor<[1,256,56,56],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,256,27,27],f32>
+  return %4 : !torch.vtensor<[1,256,27,27],f32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @torch.aten.max_pool2d$ceilon(
+// CHECK-SAME: %[[VAL_0:.*]]: !torch.vtensor<[1,256,56,56],f32>) -> !torch.vtensor<[1,256,28,28],f32> {
+// CHECK: %[[VAL_1:.*]] = torch_c.to_builtin_tensor %[[VAL_0]] : !torch.vtensor<[1,256,56,56],f32> -> tensor<1x256x56x56xf32>
+// CHECK: %int3 = torch.constant.int 3
+// CHECK: %int2 = torch.constant.int 2
+// CHECK: %int1 = torch.constant.int 1
+// CHECK: %true = torch.constant.bool true
+// CHECK: %int0 = torch.constant.int 0
+// CHECK: %[[VAL_2:.*]] = torch.prim.ListConstruct %int3, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_3:.*]] = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_4:.*]] = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+// CHECK: %[[VAL_5:.*]] = stablehlo.constant dense<0xFF800000> : tensor<f32>
+// CHECK: %[[VAL_6:.*]] = "stablehlo.reduce_window"(%[[VAL_1]], %[[VAL_5]])
+// CHECK{LITERAL}: <{padding = dense<[[0, 0], [0, 0], [1, 1], [1, 1]]> : tensor<4x2xi64>, window_dilations = array<i64: 1, 1, 1, 1>, window_dimensions = array<i64: 1, 1, 3, 3>, window_strides = array<i64: 1, 1, 2, 2>}> ({
+// CHECK: ^bb0(%[[VAL_8:.*]]: tensor<f32>, %[[VAL_9:.*]]: tensor<f32>):
+// CHECK: %[[VAL_10:.*]] = stablehlo.maximum %[[VAL_8]], %[[VAL_9]] : tensor<f32>
+// CHECK: stablehlo.return %[[VAL_10]] : tensor<f32>
+// CHECK: }) : (tensor<1x256x56x56xf32>, tensor<f32>) -> tensor<1x256x28x28xf32>
+// CHECK: %[[VAL_7:.*]] = torch_c.from_builtin_tensor %[[VAL_6]] : tensor<1x256x28x28xf32> -> !torch.vtensor<[1,256,28,28],f32>
+// CHECK: return %[[VAL_7]] : !torch.vtensor<[1,256,28,28],f32>
+func.func @torch.aten.max_pool2d$ceilon(%arg0: !torch.vtensor<[1,256,56,56],f32>) -> !torch.vtensor<[1,256,28,28],f32> {
+  %int3 = torch.constant.int 3
+  %int2 = torch.constant.int 2
+  %int1 = torch.constant.int 1
+  %true = torch.constant.bool true
+  %int0 = torch.constant.int 0
+  %0 = torch.prim.ListConstruct %int3, %int3 : (!torch.int, !torch.int) -> !torch.list<int>
+  %1 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %2 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.aten.max_pool2d %arg0, %0, %1, %2, %3, %true : !torch.vtensor<[1,256,56,56],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,256,28,28],f32>
+  return %4 : !torch.vtensor<[1,256,28,28],f32>
+}
+
 
 // -----
 
```
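
The two tests above exercise exactly this arithmetic: with ceil mode off, the 56×56 input pools to 27×27 under all-zero padding (`dense<0>`); with ceil mode on, each spatial dimension receives (28 − 27) × stride = 2 extra padding, split 1 front and 1 back (`dense<[[0, 0], [0, 0], [1, 1], [1, 1]]>`), so the `stablehlo.reduce_window` produces the 28×28 output shape PyTorch reports.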
