ROCm · sstamenk · Jan 9, 2026
diff --git a/test/inductor/test_online_softmax.py b/test/inductor/test_online_softmax.py
@@ -7,6 +7,7 @@
 import torch._inductor.config as inductor_config
 import torch.nn.functional as F
 from torch._dynamo.utils import rmse, same
+from torch._inductor.runtime.hints import DeviceProperties
 from torch._inductor.test_case import run_tests, TestCase
 from torch._inductor.utils import run_and_get_code
 from torch.testing._internal.common_utils import (
@@ -138,8 +139,19 @@ def test_prepare_softmax(self, dim, nrow):
         self.assertTrue(same(ref, act, tol=1e-2))
 
         if nrow == 2048 and dim == 0:
-            # split reduction is triggered. We have multiple kernels
-            self.assertTrue(code.count("def triton") >= 2)
+            num_kernels = 2
+            # split reduction may be triggered depending on the device's SM/CU count.
+            # The heuristic in num_splits() in ir.py returns split=1 (no split) when:
+            #   numel_hint >= num_sm * 2 * 32
+            # When dim=0, numel_hint (output size) = ncol (2048)
+            # split is expected only when num_sm > 32
+            props = DeviceProperties.create(torch.device(GPU_TYPE))
+            num_sm = props.multi_processor_count
+            split_not_expected = 2048 >= num_sm * 2 * 32
+            if split_not_expected:
+                num_kernels = 1
+
+            self.assertTrue(code.count("def triton") >= num_kernels)
         else:
             if nrow == 2 and dim == 0:
                 # persistent reduction triggered