Test performance and correctness checks and reassoc=true

Sameeranjoshi · Sameeranjoshi · commit b0035f82a6ab · 2025-08-03T22:52:32.000-07:00
F32:
 Correctness: Pass
 Benchmark: Fails with PM issue (run_benchmarks=true)

BF16:
 Correctness: Fails
 Benchmarks: Pass (but this may not be meaningful, as the results are wrong)

Run commands:
python run.py delete_out_reduction $IREE_DIR --xrt_dir=$XRT_DIR --peano_dir=$PEANO_DIR \
--target_device="npu4" --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \
--xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS --tests Reduction
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
@@ -1039,9 +1039,6 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu
         shell_out(config.reset_npu_script, verbose=config.verbose)
 
     start = time.monotonic_ns()
-    print(f"Run command iree_run_exe: {run_args}")
-    print(f"Run command iree_run_exe: {test_dir}")
-    print(f"Run command iree_run_exe: {config.verbose}")
     shell_out(run_args, test_dir, config.verbose)
     run_time = time.monotonic_ns() - start
 
@@ -1478,7 +1475,7 @@ def aie_vs_baseline(
             name,
             output_type,
         )
-        print(f"SAM: {aie_output}")
+
         summary_string = compare(baseline_value, aie_output, rtol, atol)
         if summary_string:
             print(summary_string)
@@ -2489,13 +2486,14 @@ def __init__(self):
                 )
 
         # Reduction op tests:
-        for data_type in ["bf16", "f32"]:
+        for data_type in ["bf16"]:
+            custom_input = 1.0 * np.ones((8, 512), dtype=np.float16)  # bf16
+            # custom_input = 1.0 * np.ones((8, 512), dtype=np.float32)  # f32
             self.register(
                 Reduction(
                     file_base_name=f"reduction_sum_{data_type}",
                     function_name=f"reduction_sum",
                     test_params=TestParams(
-                        name_suffix=data_type,  # used in final test name
                         tile_pipeline="general-copy",
                         run_on_target=["npu4"],
                         use_chess=False,
@@ -2507,10 +2505,8 @@ def __init__(self):
                         lower_to_aie_pipeline="objectFifo",
                         n_repeats=1,
                         n_kernel_runs=1,
-                        aie_compilation_flags=[
-                            "--iree-amdaie-num-rows=4",
-                            "--iree-amdaie-num-cols=4",
-                        ],
+                        preset_inputs={1: custom_input},
+                        aie_compilation_flags=["--iree-hal-target-backends=amd-aie"],
                     ),
                 )
             )
diff --git a/build_tools/ci/cpu_comparison/test_files/reduction_sum_bf16.mlir b/build_tools/ci/cpu_comparison/test_files/reduction_sum_bf16.mlir
@@ -1,12 +1,12 @@
 // These lines are required for e2e numerical testing:
-// input 8x1024xbf16
+// input 8x512xbf16
 // output 8xbf16
 
 // Constraints:<D0xD1>
 // D0 = [8, no-limit]
 // D1 = [16, 1024]
 
-!in_ty = tensor<8x1024xbf16>
+!in_ty = tensor<8x512xbf16>
 !out_ty = tensor<8xbf16>
 
 func.func @reduction_sum(%arg0: !in_ty) -> !out_ty {
diff --git a/build_tools/ci/cpu_comparison/test_files/reduction_sum_f32.mlir b/build_tools/ci/cpu_comparison/test_files/reduction_sum_f32.mlir
@@ -1,12 +1,13 @@
 // These lines are required for e2e numerical testing:
-// input 1024x256xf32
-// output 1024xf32
+// input 8x512xf32
+// output 8xf32
 
 // Constraints:<D0xD1>
+// Format: [Min, Max]
 // D0 = [2, no-limit]
 // D1 = [2, 256]
-!in_ty = tensor<1024x256xf32>
-!out_ty = tensor<1024xf32>
+!in_ty = tensor<8x512xf32>
+!out_ty = tensor<8xf32>
 
 func.func @reduction_sum(%arg0: !in_ty) -> !out_ty {
   %cst = arith.constant 0.0 : f32
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -924,7 +924,11 @@ void addMLIRAIELoweringPasses(OpPassManager &pm,
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
   pm.addPass(aievec::createConvertAIEVecToLLVMPass());
-  pm.addPass(createConvertVectorToLLVMPass());
+  {
+    ConvertVectorToLLVMPassOptions opts{};
+    opts.reassociateFPReductions = true;  // allow FP reassociation
+    pm.addPass(createConvertVectorToLLVMPass(opts));
+  }
   pm.addPass(memref::createExpandStridedMetadataPass());
   pm.addPass(createLowerAffinePass());
   pm.addPass(createConvertMathToLLVMPass());