From ea74c75cebc92ba0ece13dd3d3434a5d3c9f771e Mon Sep 17 00:00:00 2001
From: pvasireddy-amd <pvasired@amd.com>
Date: Fri, 6 Dec 2024 08:59:53 -0700
Subject: [PATCH] Fixing dynamic objectFifo  (#1907)

Co-authored-by: AndraBisca <andrab@amd.com>
---
 .../AIEObjectFifoStatefulTransform.cpp        |  13 +-
 .../dynamic_object_fifo/nested_loops/aie2.py  |   5 +-
 .../dynamic_object_fifo/ping_pong/aie2.py     |   2 +-
 .../dynamic_object_fifo/reduction/aie2.py     |   2 +-
 .../sliding_window/aie2.py                    |   5 +-
 .../sliding_window/test.cpp                   |   1 +
 .../sliding_window_conditional/aie.mlir       | 201 ++++++++++++++++++
 .../sliding_window_conditional/aie2.py        |  80 -------
 .../sliding_window_conditional/run.lit        |  10 +
 .../sliding_window_conditional/test.cpp       |   1 +
 .../two_core_sliding_window/aie2.py           |   5 +-
 .../two_core_sliding_window/test.cpp          |   1 +
 .../dynamic_lowering_flag_test.mlir           |  50 ++---
 .../dynamic_lowering_test.mlir                | 114 +++++-----
 14 files changed, 320 insertions(+), 170 deletions(-)
 create mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
 delete mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
 create mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 0d2e6c5821..6e9af55a07 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -1061,7 +1061,7 @@ struct AIEObjectFifoStatefulTransformPass
         builder.getUnknownLoc(), globalNextIndex,
         ValueRange(ArrayRef({index.getResult()})));
     Value val = builder.create<arith::ConstantOp>(
-        oldCounter.getLoc(), builder.getIndexAttr(relOp.getSize()));
+        oldCounter.getLoc(), builder.getI32IntegerAttr(relOp.getSize()));
     Value sum = builder.create<arith::AddIOp>(val.getLoc(), oldCounter, val);
     Value newCounter = builder.create<arith::RemSIOp>(sum.getLoc(), sum, size);
     builder.create<memref::StoreOp>(size.getLoc(), newCounter, globalNextIndex,
@@ -1091,7 +1091,7 @@ struct AIEObjectFifoStatefulTransformPass
         builder.setInsertionPoint(coreOp);
         auto memrefTy =
             MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
-                            builder.getIndexType());
+                            builder.getI32Type());
         auto globalNextIndex = builder.create<BufferOp>(
             builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
             /*sym_name*/ nullptr, /*address*/ nullptr,
@@ -1109,14 +1109,14 @@ struct AIEObjectFifoStatefulTransformPass
         int index = 0;
         builder.setInsertionPointToStart(&(coreOp.getBody().front()));
         Value initVal = builder.create<arith::ConstantOp>(
-            builder.getUnknownLoc(), builder.getIndexAttr(0));
+            builder.getUnknownLoc(), builder.getI32IntegerAttr(0));
         for (auto i : fifoSizes) {
           auto indexOp = builder.create<arith::ConstantOp>(
               initVal.getLoc(), builder.getIndexAttr(index));
           globalIndices[i.first] = indexOp;
           index++;
           auto size = builder.create<arith::ConstantOp>(
-              indexOp.getLoc(), builder.getIndexAttr(i.second));
+              indexOp.getLoc(), builder.getI32IntegerAttr(i.second));
           constantSizes[i.first] = size;
           builder.create<memref::StoreOp>(
               size.getLoc(), initVal, globalNextIndex,
@@ -1153,10 +1153,13 @@ struct AIEObjectFifoStatefulTransformPass
 
               // Create a switch for each subview access
               builder.setInsertionPointAfter(accessOp);
-              auto switchIndex = builder.create<memref::LoadOp>(
+              auto switchIndexAsInteger = builder.create<memref::LoadOp>(
                   builder.getUnknownLoc(), globalNextIndex,
                   ValueRange(
                       ArrayRef({globalIndices[{createOp, port}].getResult()})));
+              auto switchIndex = builder.create<arith::IndexCastOp>(
+                  builder.getUnknownLoc(), builder.getIndexType(),
+                  switchIndexAsInteger);
               unsigned caseRegionCounts = fifoSizes[{createOp, port}];
               SmallVector<int64_t, 4> caseValues;
               for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 8c41a9868e..7c2b664a65 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
+
 import numpy as np
 
 from aie.dialects.aie import *
diff --git a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
index 03a25b90db..19dc7c6e3c 100644
--- a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
index 3f04ed0f1f..4814d27dae 100644
--- a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 8b91d2e434..129b69eae5 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
+# CHECK: PASS!
+
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
 from aie.helpers.dialects.ext.scf import _for as range_
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
index 648924ac4f..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
@@ -28,6 +28,7 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
new file mode 100644
index 0000000000..87197925b1
--- /dev/null
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
@@ -0,0 +1,201 @@
+module {
+  aie.device(npu1_1col) {
+    memref.global "public" @output_fifo_cons : memref<10xi32>
+    memref.global "public" @output_fifo : memref<10xi32>
+    memref.global "public" @input_fifo_cons : memref<10xi32>
+    memref.global "public" @input_fifo : memref<10xi32>
+    func.func private @add_10_i32(memref<10xi32>, memref<10xi32>, memref<10xi32>)
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 0 : i32, sym_name = "output_fifo_cons_prod_lock"}
+    %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"}
+    %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> 
+    %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> 
+    %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"}
+    %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"}
+    %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> 
+    %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> 
+    %input_fifo_cons_buff_2 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_2"} : memref<10xi32> 
+    %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 3 : i32, sym_name = "input_fifo_cons_prod_lock"}
+    %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"}
+    %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 0 : i32, sym_name = "input_fifo_prod_lock"}
+    %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
+    aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
+    %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c0_0 = arith.constant 0 : index
+      %c2_i32 = arith.constant 2 : i32
+      memref.store %c0_i32, %buffer_0_2[%c0_0] : memref<2xi32>
+      %c1 = arith.constant 1 : index
+      %c3_i32 = arith.constant 3 : i32
+      memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
+      %c0_1 = arith.constant 0 : index
+      %c10 = arith.constant 10 : index
+      %c1_2 = arith.constant 1 : index
+      scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
+        aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
+        %0 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
+        %1 = arith.index_cast %0 : i32 to index
+        %2 = scf.index_switch %1 -> memref<10xi32> 
+        case 0 {
+          scf.yield %output_fifo_buff_0 : memref<10xi32>
+        }
+        case 1 {
+          scf.yield %output_fifo_buff_1 : memref<10xi32>
+        }
+        default {
+          scf.yield %output_fifo_buff_0 : memref<10xi32>
+        }
+        %3 = arith.cmpi eq, %arg0, %c0_1 : index
+        %4 = arith.subi %c10, %c1_2 : index
+        %5 = arith.cmpi eq, %arg0, %4 : index
+        scf.if %3 {
+          aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
+          %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+          %9 = arith.index_cast %8 : i32 to index
+          %10 = scf.index_switch %9 -> memref<10xi32> 
+          case 0 {
+            scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+          }
+          case 1 {
+            scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+          }
+          case 2 {
+            scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+          }
+          default {
+            scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+          }
+          func.call @add_10_i32(%10, %10, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+        } else {
+          scf.if %5 {
+            aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2)
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %9 = arith.index_cast %8 : i32 to index
+            %10 = scf.index_switch %9 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            %11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %12 = arith.index_cast %11 : i32 to index
+            %13 = scf.index_switch %12 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
+            %14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %c2_4 = arith.constant 2 : i32
+            %15 = arith.addi %14, %c2_4 : i32
+            %16 = arith.remsi %15, %c3_i32 : i32
+            memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
+          } else {
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %9 = arith.index_cast %8 : i32 to index
+            %10 = scf.index_switch %9 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            %11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %12 = arith.index_cast %11 : i32 to index
+            %13 = scf.index_switch %12 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
+            %14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %c1_4 = arith.constant 1 : i32
+            %15 = arith.addi %14, %c1_4 : i32
+            %16 = arith.remsi %15, %c3_i32 : i32
+            memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
+          }
+        }
+        aie.use_lock(%output_fifo_cons_lock, Release, 1)
+        %6 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
+        %c1_3 = arith.constant 1 : i32
+        %7 = arith.addi %6, %c1_3 : i32
+        %8 = arith.remsi %7, %c2_i32 : i32
+        memref.store %8, %buffer_0_2[%c0_0] : memref<2xi32>
+      }
+      aie.end
+    } {link_with = "kernel.o"}
+    aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)
+    aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+    }
+    aie.shim_dma_allocation @output_fifo(S2MM, 0, 0)
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb4)
+    ^bb1:  // 2 preds: ^bb0, ^bb3
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_0 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_1 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb3
+    ^bb3:  // pred: ^bb2
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_2 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb4:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb5, ^bb7)
+    ^bb5:  // 2 preds: ^bb4, ^bb6
+      aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_fifo_buff_0 : memref<10xi32>, 0, 10)
+      aie.use_lock(%output_fifo_prod_lock, Release, 1)
+      aie.next_bd ^bb6
+    ^bb6:  // pred: ^bb5
+      aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_fifo_buff_1 : memref<10xi32>, 0, 10)
+      aie.use_lock(%output_fifo_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb7:  // pred: ^bb4
+      aie.end
+    }
+  }
+}
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
deleted file mode 100644
index d7eae0bc31..0000000000
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 AMD Inc.
-
-# REQUIRES: ryzen_ai, valid_xchess_license
-#
-# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
-# RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
-import numpy as np
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.helpers.dialects.ext.scf import _for as range_
-from aie.extras.context import mlir_mod_ctx
-
-N = 100
-n_rows = 10
-dev = AIEDevice.npu1_1col
-col = 0
-
-
-def sliding_window():
-    with mlir_mod_ctx() as ctx:
-
-        @device(dev)
-        def device_body():
-            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
-
-            # Tile declarations
-            ShimTile = tile(col, 0)
-            ComputeTile = tile(col, 2)
-
-            # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 3, subtensor_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, subtensor_ty)
-
-            # AIE Core Function declarations
-            add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
-            )
-
-            # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                for i in range_(10):
-                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    if i == 0:
-                        elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
-                        add_10_i32(elemInPre, elemInPre, elemOut)
-                    elif i == 9:
-                        elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsInPost[0], elemsInPost[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 2)
-                    else:
-                        elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsIn[0], elemsIn[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 1)
-
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
-
-            @runtime_sequence(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                dma_wait(of_out)
-
-    print(ctx.module)
-
-
-sliding_window()
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
new file mode 100644
index 0000000000..6220c2ec10
--- /dev/null
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, valid_xchess_license
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
index 648924ac4f..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
@@ -28,6 +28,7 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index 4fba84bb83..a48d6149ba 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
+# CHECK: PASS!
+
 import numpy as np
 
 from aie.dialects.aie import *
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
index 648924ac4f..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
@@ -28,6 +28,7 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
index e91c1f9f21..16c028b6c3 100644
--- a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
+++ b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
@@ -11,20 +11,21 @@
 // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
 
 // CHECK:     %core_0_2 = aie.core(%tile_0_2) {
+// CHECK:       %c0_i32 = arith.constant 0 : i32
 // CHECK:       %c0 = arith.constant 0 : index
-// CHECK:       %c0_0 = arith.constant 0 : index
-// CHECK:       %c2 = arith.constant 2 : index
-// CHECK:       memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:       %c2_i32 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:       %c1 = arith.constant 1 : index
-// CHECK:       %c2_1 = arith.constant 2 : index
-// CHECK:       memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:       %c0_2 = arith.constant 0 : index
-// CHECK:       %c1_3 = arith.constant 1 : index
+// CHECK:       %c2_i32_0 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:       %c0_1 = arith.constant 0 : index
+// CHECK:       %c1_2 = arith.constant 1 : index
 // CHECK:       %c10 = arith.constant 10 : index
-// CHECK:       scf.for %arg0 = %c0_2 to %c10 step %c1_3 {
+// CHECK:       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %1 = scf.index_switch %0 -> memref<10xi32> 
+// CHECK:         %0 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %1 = arith.index_cast %0 : i32 to index
+// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -35,8 +36,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %3 = scf.index_switch %2 -> memref<10xi32> 
+// CHECK:         %3 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %4 = arith.index_cast %3 : i32 to index
+// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -46,19 +48,19 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @passthrough_10_i32(%5, %2) : (memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-// CHECK:         %4 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c1_4 = arith.constant 1 : index
-// CHECK:         %5 = arith.addi %4, %c1_4 : index
-// CHECK:         %6 = arith.remsi %5, %c2_1 : index
-// CHECK:         memref.store %6, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:         %6 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c1_i32 = arith.constant 1 : i32
+// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
+// CHECK:         %8 = arith.remsi %7, %c2_i32_0 : i32
+// CHECK:         memref.store %8, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_5 = arith.constant 1 : index
-// CHECK:         %8 = arith.addi %7, %c1_5 : index
-// CHECK:         %9 = arith.remsi %8, %c2 : index
-// CHECK:         memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %9 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32_3 = arith.constant 1 : i32
+// CHECK:         %10 = arith.addi %9, %c1_i32_3 : i32
+// CHECK:         %11 = arith.remsi %10, %c2_i32 : i32
+// CHECK:         memref.store %11, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:       }
 // CHECK:       aie.end
 // CHECK:     } {dynamic_objfifo_lowering = true}
@@ -78,7 +80,7 @@
 // CHECK:         func.call @passthrough_10_i32(%input_fifo2_cons_buff_1, %output_fifo2_buff_1) : (memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1)
 // CHECK:         aie.use_lock(%output_fifo2_cons_lock, Release, 1)
-// CHECK:       }        
+// CHECK:       }
 // CHECK:       aie.end
 // CHECK:     }
 // CHECK:     aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)
diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
index c169c8472e..087b8e5a2a 100644
--- a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
+++ b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
@@ -35,21 +35,22 @@
 // CHECK:       %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
 // CHECK:       aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
 // CHECK:       aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
-// CHECK:       %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
+// CHECK:       %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
 // CHECK:       %core_0_2 = aie.core(%tile_0_2) {
+// CHECK:         %c0_i32 = arith.constant 0 : i32
 // CHECK:         %c0 = arith.constant 0 : index
-// CHECK:         %c0_0 = arith.constant 0 : index
-// CHECK:         %c2 = arith.constant 2 : index
-// CHECK:         memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %c2_i32 = arith.constant 2 : i32
+// CHECK:         memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         %c1 = arith.constant 1 : index
-// CHECK:         %c3 = arith.constant 3 : index
-// CHECK:         memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c0_1 = arith.constant 0 : index
-// CHECK:         %c1_2 = arith.constant 1 : index
+// CHECK:         %c3_i32 = arith.constant 3 : i32
+// CHECK:         memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c0_0 = arith.constant 0 : index
+// CHECK:         %c1_1 = arith.constant 1 : index
 // CHECK:         %c9 = arith.constant 9 : index
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %1 = scf.index_switch %0 -> memref<10xi32> 
+// CHECK:         %0 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %1 = arith.index_cast %0 : i32 to index
+// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -60,8 +61,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %3 = scf.index_switch %2 -> memref<10xi32> 
+// CHECK:         %3 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %4 = arith.index_cast %3 : i32 to index
+// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -74,17 +76,18 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @add_10_i32(%3, %3, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @add_10_i32(%5, %5, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %4 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_3 = arith.constant 1 : index
-// CHECK:         %5 = arith.addi %4, %c1_3 : index
-// CHECK:         %6 = arith.remsi %5, %c2 : index
-// CHECK:         memref.store %6, %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         scf.for %arg0 = %c0_1 to %c9 step %c1_2 {
+// CHECK:         %6 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32 = arith.constant 1 : i32
+// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
+// CHECK:         %8 = arith.remsi %7, %c2_i32 : i32
+// CHECK:         memref.store %8, %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         scf.for %arg0 = %c0_0 to %c9 step %c1_1 {
 // CHECK:           aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:           %19 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:           %20 = scf.index_switch %19 -> memref<10xi32> 
+// CHECK:           %24 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:           %25 = arith.index_cast %24 : i32 to index
+// CHECK:           %26 = scf.index_switch %25 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:           }
@@ -95,8 +98,9 @@
 // CHECK:             scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:           }
 // CHECK:           aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:           %21 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %22 = scf.index_switch %21 -> memref<10xi32> 
+// CHECK:           %27 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %28 = arith.index_cast %27 : i32 to index
+// CHECK:           %29 = scf.index_switch %28 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:           }
@@ -109,8 +113,9 @@
 // CHECK:           default {
 // CHECK:             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:           }
-// CHECK:           %23 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %24 = scf.index_switch %23 -> memref<10xi32> 
+// CHECK:           %30 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %31 = arith.index_cast %30 : i32 to index
+// CHECK:           %32 = scf.index_switch %31 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:           }
@@ -123,23 +128,24 @@
 // CHECK:           default {
 // CHECK:             scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:           }
-// CHECK:           func.call @add_10_i32(%22, %24, %20) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:           func.call @add_10_i32(%29, %32, %26) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:           aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-// CHECK:           %25 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %c1_6 = arith.constant 1 : index
-// CHECK:           %26 = arith.addi %25, %c1_6 : index
-// CHECK:           %27 = arith.remsi %26, %c3 : index
-// CHECK:           memref.store %27, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:           %33 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %c1_i32_4 = arith.constant 1 : i32
+// CHECK:           %34 = arith.addi %33, %c1_i32_4 : i32
+// CHECK:           %35 = arith.remsi %34, %c3_i32 : i32
+// CHECK:           memref.store %35, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:           aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:           %28 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:           %c1_7 = arith.constant 1 : index
-// CHECK:           %29 = arith.addi %28, %c1_7 : index
-// CHECK:           %30 = arith.remsi %29, %c2 : index
-// CHECK:           memref.store %30, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:           %36 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:           %c1_i32_5 = arith.constant 1 : i32
+// CHECK:           %37 = arith.addi %36, %c1_i32_5 : i32
+// CHECK:           %38 = arith.remsi %37, %c2_i32 : i32
+// CHECK:           memref.store %38, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %8 = scf.index_switch %7 -> memref<10xi32> 
+// CHECK:         %9 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %10 = arith.index_cast %9 : i32 to index
+// CHECK:         %11 = scf.index_switch %10 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -150,8 +156,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %9 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %10 = scf.index_switch %9 -> memref<10xi32> 
+// CHECK:         %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %13 = arith.index_cast %12 : i32 to index
+// CHECK:         %14 = scf.index_switch %13 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -164,8 +171,9 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         %11 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %12 = scf.index_switch %11 -> memref<10xi32> 
+// CHECK:         %15 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %16 = arith.index_cast %15 : i32 to index
+// CHECK:         %17 = scf.index_switch %16 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:         }
@@ -178,19 +186,19 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @add_10_i32(%10, %12, %8) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @add_10_i32(%14, %17, %11) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
-// CHECK:         %13 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c2_4 = arith.constant 2 : index
-// CHECK:         %14 = arith.addi %13, %c2_4 : index
-// CHECK:         %15 = arith.remsi %14, %c3 : index
-// CHECK:         memref.store %15, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:         %18 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c2_i32_2 = arith.constant 2 : i32
+// CHECK:         %19 = arith.addi %18, %c2_i32_2 : i32
+// CHECK:         %20 = arith.remsi %19, %c3_i32 : i32
+// CHECK:         memref.store %20, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %16 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_5 = arith.constant 1 : index
-// CHECK:         %17 = arith.addi %16, %c1_5 : index
-// CHECK:         %18 = arith.remsi %17, %c2 : index
-// CHECK:         memref.store %18, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %21 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32_3 = arith.constant 1 : i32
+// CHECK:         %22 = arith.addi %21, %c1_i32_3 : i32
+// CHECK:         %23 = arith.remsi %22, %c2_i32 : i32
+// CHECK:         memref.store %23, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         aie.end
 // CHECK:       }
 // CHECK:       aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)