From f0a60bacc912f194596211f305d15973f5f3fa93 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 11:58:31 -0700
Subject: [PATCH 01/46] Adding flag to run command

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py           | 2 +-
 test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py              | 2 +-
 test/npu-xrt/dynamic_object_fifo/reduction/aie2.py              | 2 +-
 test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py         | 2 +-
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 .../npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 8c41a9868e..dee6558c3e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
index 03a25b90db..0a8c1112d8 100644
--- a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
index 3f04ed0f1f..eb5440e4cd 100644
--- a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 8b91d2e434..84a1f12db7 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # XFAIL: *
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index d7eae0bc31..d22c2848bf 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # XFAIL: *
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index 4fba84bb83..0ecb7adcd2 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # XFAIL: *

From 21b5a0fdf443ce609b1a25f1fc3e9c8758360ae9 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 12:26:24 -0700
Subject: [PATCH 02/46] Correcting the CHECK message

---
 test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py         | 2 +-
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 .../npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 84a1f12db7..37222b8a78 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -12,7 +12,7 @@
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
+# CHECK: PASS!
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
 from aie.helpers.dialects.ext.scf import _for as range_
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index d22c2848bf..c93e1b21c9 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -12,7 +12,7 @@
 # RUN: %python --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
+# CHECK: PASS!
 import numpy as np
 
 from aie.dialects.aie import *
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index 0ecb7adcd2..d0b0f53d36 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -12,7 +12,7 @@
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
-# XFAIL: *
+# CHECK: PASS!
 import numpy as np
 
 from aie.dialects.aie import *

From f5d41ebdd84afd174db5a6b40f5927a8eda114a6 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 12:57:05 -0700
Subject: [PATCH 03/46] Verifying as programming example

---
 .../dyn_objFifo/nested_loops/Makefile         |  66 +++++++++
 .../dyn_objFifo/nested_loops/aie2.py          |  73 +++++++++
 .../dyn_objFifo/nested_loops/kernel.cc        |  22 +++
 .../dyn_objFifo/nested_loops/test.cpp         | 139 ++++++++++++++++++
 4 files changed, 300 insertions(+)
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/Makefile
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/aie2.py
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/kernel.cc
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/test.cpp

diff --git a/programming_examples/dyn_objFifo/nested_loops/Makefile b/programming_examples/dyn_objFifo/nested_loops/Makefile
new file mode 100644
index 0000000000..0216ac75da
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/Makefile
@@ -0,0 +1,80 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+#
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables that point to the Xilinx runtime (XRT)
+# should be set up by an environment setup script already.
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
+
+# ---
+
+# Remove a target whose recipe failed, so that e.g. a failed aie2.py run does
+# not leave behind a truncated build/aie.mlir that later builds treat as
+# up to date.
+.DELETE_ON_ERROR:
+
+# Directory containing this Makefile; sources are resolved relative to it.
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include
+XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
+XRT_LIBS=-lxrt_coreutil
+CXX=g++-13 -ggdb
+
+# Build outputs. test.cpp defaults to build/final.xclbin and build/insts.txt,
+# so `make run` must be invoked from the directory that contains build/.
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+host_target?=build/test
+
+.PHONY: all
+all: ${xclbin_target} ${host_target}
+
+# Generate the MLIR design by running the Python description.
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+# Compile the AIE core kernel with the Chess compiler wrapper.
+build/kernel.o: ${srcdir}/kernel.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+# Build the xclbin and the NPU instruction file. aiecc.py runs inside build/,
+# hence the target-relative names and the ../ prefix on the MLIR input.
+${xclbin_target}: build/aie.mlir build/kernel.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+
+# Build the host test application. test.cpp takes no size macros, so none are
+# passed on the command line.
+${host_target}: ${srcdir}/test.cpp ${xclbin_target}
+	mkdir -p ${@D}
+	${CXX} ${XRT_FLAGS} -o $@ $< ${XRT_LIBS}
+
+.PHONY: run
+run: ${host_target}
+	./${host_target}
+
+# Run the setup_xclbin_firmware.sh script shipped under XILINX_XRT on the xclbin.
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+.PHONY: clean
+clean:
+	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/nested_loops/aie2.py b/programming_examples/dyn_objFifo/nested_loops/aie2.py
new file mode 100644
index 0000000000..dee6558c3e
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/aie2.py
@@ -0,0 +1,73 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+# REQUIRES: ryzen_ai, valid_xchess_license
+#
+# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe | FileCheck %s
+# CHECK: PASS!
+import numpy as np
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.helpers.dialects.ext.scf import _for as range_
+from aie.extras.context import mlir_mod_ctx
+
+N = 50  # last entry of `sizes` for the input transfer (mem=A)
+O = 250  # last entry of `sizes` for the output transfer (mem=C)
+n_rows = 5  # tensor_ty below holds N // n_rows int32 values
+dev = AIEDevice.npu1_1col
+col = 0  # column index used for both tiles
+
+
+def nested_loops():  # builds the design and prints the resulting MLIR module
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            tensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
+
+            # Tile declarations: row 0 for the shim tile, row 2 for the compute tile
+            ShimTile = tile(col, 0)
+            ComputeTile = tile(col, 2)
+
+            # AIE-array data movement with object fifos (shim -> core -> shim)
+            of_in = object_fifo("in", ShimTile, ComputeTile, 2, tensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)
+
+            # AIE Core Function declarations (symbol defined in kernel.cc)
+            passthrough_10_i32 = external_func(
+                "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
+            )
+
+            # Set up compute tiles: every acquired input element is passed
+            # through to 5 separately acquired output elements
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                for _ in range_(5):
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    for _ in range_(5):
+                        elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                        passthrough_10_i32(elemIn, elemOut)
+                        of_out.release(ObjectFifoPort.Produce, 1)
+                    of_in.release(ObjectFifoPort.Consume, 1)
+
+            # To/from AIE-array data movement: A feeds of_in, C receives of_out
+            @runtime_sequence(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                npu_dma_memcpy_nd(
+                    metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N], issue_token=True
+                )
+                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, O])
+                dma_wait(of_in, of_out)
+
+    print(ctx.module)
+
+
+nested_loops()
diff --git a/programming_examples/dyn_objFifo/nested_loops/kernel.cc b/programming_examples/dyn_objFifo/nested_loops/kernel.cc
new file mode 100644
index 0000000000..d5a796add2
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/kernel.cc
@@ -0,0 +1,26 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <aie_api/aie.hpp>
+
+// Copies the first N elements of `in` to `out`, converting each element from
+// T_in to T_out by assignment. The buffers must not overlap (__restrict).
+template <typename T_in, typename T_out, unsigned long N>
+void passthrough(const T_in *__restrict in, T_out *__restrict out) {
+  // The index has the same type as N to avoid a signed/unsigned comparison.
+  for (unsigned long i = 0; i < N; i++) {
+    out[i] = in[i];
+  }
+}
+
+extern "C" {
+
+// C-linkage entry point: copies 10 ints from `in` to `out`.
+void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
+  passthrough<int, int, 10>(in, out);
+}
+}
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/nested_loops/test.cpp b/programming_examples/dyn_objFifo/nested_loops/test.cpp
new file mode 100644
index 0000000000..ecd9a90d51
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/test.cpp
@@ -0,0 +1,169 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#ifndef XCLBIN
+#define XCLBIN "build/final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "build/insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define INPUT_SIZE (50 * sizeof(int))   // in bytes
+#define OUTPUT_SIZE (250 * sizeof(int)) // in bytes
+#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define WIDTH 10
+// Parenthesized so the macros expand safely inside larger expressions; the
+// cast keeps the row-loop comparisons below signed.
+#define INPUT_ROWS ((int)(INPUT_SIZE / WIDTH_SIZE))
+#define OUTPUT_ROWS ((int)(OUTPUT_SIZE / WIDTH_SIZE))
+
+// Reads the instruction file at `instr_path`, parsing one hexadecimal 32-bit
+// word per line. Throws std::runtime_error if the file cannot be opened or a
+// line cannot be parsed.
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  if (!instr_file) {
+    throw std::runtime_error("Unable to open instruction file " + instr_path +
+                             "\n");
+  }
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  // Checked explicitly rather than with assert() so that the check is kept
+  // when NDEBUG is defined.
+  if (instr_v.empty()) {
+    std::cerr << "Instruction file " << INSTS_TXT << " is empty" << std::endl;
+    return 1;
+  }
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin: the first kernel whose name starts with
+  // KERNEL_NAME.
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  // Dereferencing end() is undefined behavior, so fail cleanly instead.
+  if (xkernel_it == xkernels.end()) {
+    std::cerr << "No kernel matching " << KERNEL_NAME << " in " << XCLBIN
+              << std::endl;
+    return 1;
+  }
+  std::string kernel_name = xkernel_it->get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(uint32_t),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_input =
+      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_output =
+      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  // Fill the input so that every element of row i holds the value i.
+  int *buf_input = bo_input.map<int *>();
+  std::cout << std::endl << std::endl << "Input: " << std::endl;
+  for (int i = 0; i < INPUT_ROWS; i++) {
+    std::cout << "row " << i << " : ";
+    for (int j = 0; j < WIDTH; j++) {
+      buf_input[i * WIDTH + j] = i;
+      std::cout << buf_input[i * WIDTH + j] << " ";
+    }
+    std::cout << std::endl << std::endl;
+  }
+  int *buf_output = bo_output.map<int *>();
+  memset(buf_output, 0, OUTPUT_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *buf_instr = bo_instr.map<void *>();
+  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(uint32_t));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  // Each input row is expected five times in a row in the output.
+  bool pass = true;
+  std::cout << std::endl << "Output: " << std::endl;
+  int expected_output = 0;
+  int five_repetitions = 0;
+  for (int i = 0; i < OUTPUT_ROWS; i++) {
+    std::cout << "row " << i << std::endl;
+    if (five_repetitions == 5) {
+      expected_output++;
+      five_repetitions = 0;
+    }
+    for (int j = 0; j < WIDTH; j++) {
+      std::cout << "expected: " << expected_output << ", ";
+      std::cout << "got: " << buf_output[i * WIDTH + j] << std::endl;
+      pass &= buf_output[i * WIDTH + j] == expected_output;
+    }
+    std::cout << std::endl << std::endl;
+    five_repetitions++;
+  }
+  std::cout << std::endl << std::endl;
+  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
+
+  // Report failure through the exit status as well as the printed verdict.
+  return pass ? 0 : 1;
+}
\ No newline at end of file

From 77513c8a9346feaa52c6e17528556624013ed778 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 13:48:24 -0700
Subject: [PATCH 04/46] Checking the flags that caused the issue

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index dee6558c3e..19945d46c2 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!

From 29cf679c03f97883c335ea7a0e9cb4e660c62dac Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 14:24:53 -0700
Subject: [PATCH 05/46] Revert "Checking the flags that caused the issue"

This reverts commit 77513c8a9346feaa52c6e17528556624013ed778.
---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 19945d46c2..dee6558c3e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!

From 1db8fbf8bacc500fc004b4dd76bdd51672b94fcf Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 14:29:26 -0700
Subject: [PATCH 06/46] Maybe clang version

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index dee6558c3e..8f1146f34a 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,8 +10,8 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
+# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 

From a3bd5192038d0f0b81a58a831b573ee62a7529fc Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 5 Nov 2024 14:39:52 -0700
Subject: [PATCH 07/46] Revert "Maybe clang version"

This reverts commit 1db8fbf8bacc500fc004b4dd76bdd51672b94fcf.
---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 8f1146f34a..dee6558c3e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,8 +10,8 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 

From 8945f79d57cc38d5e3cda41ef35b9f865adccf2a Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 09:33:00 -0700
Subject: [PATCH 08/46] may be compiler

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index dee6558c3e..c390a745fd 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,7 +10,7 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: g++ %S/test.cpp -o test.exe -std=c++13 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np

From 46b1a921e7ffca26a70d2f916a3dda0dc10e67f9 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 09:47:56 -0700
Subject: [PATCH 09/46] one other check

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index c390a745fd..a1338bcf39 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,7 +10,7 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: g++ %S/test.cpp -o test.exe -std=c++13 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: g++-13 -ggdb %S/test.cpp -o test.exe -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np

From 7fee57d1579bbbdd085da3a4f0f6dae8b5b8f983 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 10:42:07 -0700
Subject: [PATCH 10/46] Same run command for all

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py           | 2 +-
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index a1338bcf39..dee6558c3e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,7 +10,7 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: g++-13 -ggdb %S/test.cpp -o test.exe -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index c93e1b21c9..366552907b 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!

From 60fc2918bcd0ffd3fabbf592a5c02c313dec97f7 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 11:41:15 -0700
Subject: [PATCH 11/46] change chess

---
 test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py         | 2 +-
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 .../npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 37222b8a78..c6dd53838d 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -5,7 +5,7 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, valid_xchess_license
+# REQUIRES: ryzen_ai, chess
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index c93e1b21c9..366552907b 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index d0b0f53d36..f9539fecfe 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -5,7 +5,7 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, valid_xchess_license
+# REQUIRES: ryzen_ai, chess
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir

From da418086ff1f98c74cfea8b9178ade2e17882b87 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 11:42:41 -0700
Subject: [PATCH 12/46] Require chess in sliding_window_conditional (missed in previous commit)

---
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index 366552907b..1c8922df1b 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -5,7 +5,7 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, valid_xchess_license
+# REQUIRES: ryzen_ai, chess
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir

From d13e6076dadb314a129f4650dc08481ada0570a9 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 12:10:19 -0700
Subject: [PATCH 13/46] Check: pass -x/-k/-i arguments to test.exe in sliding-window tests

---
 test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py         | 2 +-
 .../dynamic_object_fifo/sliding_window_conditional/aie2.py      | 2 +-
 .../npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index c6dd53838d..7baa366452 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -11,7 +11,7 @@
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
+# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index 1c8922df1b..3f1159da13 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -11,7 +11,7 @@
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
+# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index f9539fecfe..3c53c21cd8 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -11,7 +11,7 @@
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
+# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 

From dd87d0b71544cb453dd0a5c3fda0cd5fe2389dc9 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 14:38:08 -0700
Subject: [PATCH 14/46] Check two_core_sliding_window against working test file (local instr loader, build/ paths)

---
 .../two_core_sliding_window/test.cpp          | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
index 648924ac4f..ccf951d8dd 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "final.xclbin"
+#define XCLBIN "build/final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "insts.txt"
+#define INSTS_TXT "build/insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -33,9 +33,24 @@
 
 #include "test_utils.h"
 
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
 int main(int argc, const char *argv[]) {
 
-  std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
   assert(instr_v.size() > 0);
 
   // Get a device handle

From 30b7f84e25b139f386b97733adc06b51cab927c0 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 15:04:20 -0700
Subject: [PATCH 15/46] Add local load_instr_sequence and build/ default paths to sliding_window test

---
 .../sliding_window/test.cpp                   | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
index 648924ac4f..2ffbd6ba56 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "final.xclbin"
+#define XCLBIN "build/final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "insts.txt"
+#define INSTS_TXT "build/insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -33,6 +33,21 @@
 
 #include "test_utils.h"
 
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
 int main(int argc, const char *argv[]) {
 
   std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);

From 8f4df8a5fe44dee60f570ed52ffb655954f67a43 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 16:07:31 -0700
Subject: [PATCH 16/46] Define WIDTH in sliding_window_conditional test (instr load problem)

---
 .../dynamic_object_fifo/sliding_window_conditional/test.cpp    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
index 648924ac4f..6668c96421 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
@@ -25,9 +25,10 @@
 #define KERNEL_NAME "MLIR_AIE"
 #endif
 
-#define INPUT_SIZE (100 * sizeof(int))  // in bytes
+#define INPUT_SIZE (100 * sizeof(int))   // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define WIDTH 10
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 

From 401a55b61d4121239b0b4750cb0fd677165dfb0a Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 16:10:36 -0700
Subject: [PATCH 17/46] Fix comment alignment in sliding_window_conditional test.cpp

---
 .../dynamic_object_fifo/sliding_window_conditional/test.cpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
index 6668c96421..c25d9358f6 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
@@ -25,7 +25,7 @@
 #define KERNEL_NAME "MLIR_AIE"
 #endif
 
-#define INPUT_SIZE (100 * sizeof(int))   // in bytes
+#define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
 #define WIDTH 10

From 2b6e8ec7c58cddf009781e80b2f8f85a85d16e6c Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 16:24:02 -0700
Subject: [PATCH 18/46] Restore test_utils loader and default paths, define WIDTH in remaining sliding-window tests

---
 .../sliding_window/test.cpp                   | 20 +++--------------
 .../two_core_sliding_window/test.cpp          | 22 ++++---------------
 2 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
index 2ffbd6ba56..c25d9358f6 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
+#define XCLBIN "final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
+#define INSTS_TXT "insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -28,26 +28,12 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define WIDTH 10
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
 #include "test_utils.h"
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
 int main(int argc, const char *argv[]) {
 
   std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
index ccf951d8dd..c25d9358f6 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
+#define XCLBIN "final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
+#define INSTS_TXT "insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -28,29 +28,15 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define WIDTH 10
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
 #include "test_utils.h"
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
 int main(int argc, const char *argv[]) {
 
-  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
   assert(instr_v.size() > 0);
 
   // Get a device handle

From a8fabc07aaad4fea1f70af8db4c66be64706511b Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 6 Nov 2024 17:00:46 -0700
Subject: [PATCH 19/46] Revert sliding_window test.cpp to local load_instr_sequence and build/ paths

---
 .../sliding_window/test.cpp                   | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
index c25d9358f6..5c78b0e986 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "final.xclbin"
+#define XCLBIN "build/final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "insts.txt"
+#define INSTS_TXT "build/insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -28,15 +28,27 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define WIDTH 10
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
-#include "test_utils.h"
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
 
 int main(int argc, const char *argv[]) {
 
-  std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
   assert(instr_v.size() > 0);
 
   // Get a device handle

From 3b4f1afaf137e0f00d2bec02a60b4d3d5c84d18c Mon Sep 17 00:00:00 2001
From: AndraBisca <andrab@amd.com>
Date: Tue, 12 Nov 2024 13:25:00 -0700
Subject: [PATCH 20/46] Update makefile-common

---
 test/npu-xrt/makefile-common | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/test/npu-xrt/makefile-common b/test/npu-xrt/makefile-common
index 51e9a19245..bdde6760b6 100644
--- a/test/npu-xrt/makefile-common
+++ b/test/npu-xrt/makefile-common
@@ -1,17 +1,19 @@
-# Contains common definitions used across the Makefiles of npu-xrt tests.
-
 # VITIS related variables
-VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
-VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
-VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib
-VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib
+AIETOOLS_DIR ?= $(shell realpath $(dir $(shell which xchesscc))/../)
+AIE_INCLUDE_DIR ?= ${AIETOOLS_DIR}/data/versal_prod/lib
+AIE2_INCLUDE_DIR ?= ${AIETOOLS_DIR}/data/aie_ml/lib
+
+AIEOPT_DIR ?= $(shell realpath $(dir $(shell which aie-opt))/..)
+
+WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined
 
-CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include
-CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
-CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR}
+CHESSCC1_FLAGS = -f -p me -P ${AIE_INCLUDE_DIR} -I ${AIETOOLS_DIR}/include
+CHESSCC2_FLAGS = -f -p me -P ${AIE2_INCLUDE_DIR} -I ${AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
+CHESS_FLAGS = -P ${AIE_INCLUDE_DIR}
 
-CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include 
-CHESSCCWRAP2_FLAGS = aie2 -I ${VITIS_AIETOOLS_DIR}/include 
+CHESSCCWRAP1_FLAGS = aie -I ${AIETOOLS_DIR}/include 
+CHESSCCWRAP2_FLAGS = aie2 -I ${AIETOOLS_DIR}/include
+PEANOWRAP2_FLAGS = -O2 -v -std=c++20 --target=aie2-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include 
 
 TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no)
 ifeq ($(TEST_POWERSHELL),yes)

From dced95e6e71ce1b3b1984ae97194372ecf50338a Mon Sep 17 00:00:00 2001
From: AndraBisca <andrab@amd.com>
Date: Wed, 13 Nov 2024 13:19:27 -0700
Subject: [PATCH 21/46] Revert "Update makefile-common"

This reverts commit 3b4f1afaf137e0f00d2bec02a60b4d3d5c84d18c.
---
 test/npu-xrt/makefile-common | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/test/npu-xrt/makefile-common b/test/npu-xrt/makefile-common
index bdde6760b6..51e9a19245 100644
--- a/test/npu-xrt/makefile-common
+++ b/test/npu-xrt/makefile-common
@@ -1,19 +1,17 @@
-# VITIS related variables
-AIETOOLS_DIR ?= $(shell realpath $(dir $(shell which xchesscc))/../)
-AIE_INCLUDE_DIR ?= ${AIETOOLS_DIR}/data/versal_prod/lib
-AIE2_INCLUDE_DIR ?= ${AIETOOLS_DIR}/data/aie_ml/lib
-
-AIEOPT_DIR ?= $(shell realpath $(dir $(shell which aie-opt))/..)
+# Contains common definitions used across the Makefiles of npu-xrt tests.
 
-WARNING_FLAGS = -Wno-parentheses -Wno-attributes -Wno-macro-redefined
+# VITIS related variables
+VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
+VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
+VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib
+VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib
 
-CHESSCC1_FLAGS = -f -p me -P ${AIE_INCLUDE_DIR} -I ${AIETOOLS_DIR}/include
-CHESSCC2_FLAGS = -f -p me -P ${AIE2_INCLUDE_DIR} -I ${AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
-CHESS_FLAGS = -P ${AIE_INCLUDE_DIR}
+CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include
+CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
+CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR}
 
-CHESSCCWRAP1_FLAGS = aie -I ${AIETOOLS_DIR}/include 
-CHESSCCWRAP2_FLAGS = aie2 -I ${AIETOOLS_DIR}/include
-PEANOWRAP2_FLAGS = -O2 -v -std=c++20 --target=aie2-none-unknown-elf ${WARNING_FLAGS} -DNDEBUG -I ${AIEOPT_DIR}/include 
+CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include 
+CHESSCCWRAP2_FLAGS = aie2 -I ${VITIS_AIETOOLS_DIR}/include 
 
 TEST_POWERSHELL := $(shell command -v powershell.exe >/dev/null 2>&1 && echo yes || echo no)
 ifeq ($(TEST_POWERSHELL),yes)

From 98f8b0d2cd78666e7f4aef2a6712a6b2347ec708 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 13 Nov 2024 19:32:06 -0700
Subject: [PATCH 22/46] Testing sliding window test cases as examples

---
 .../dyn_objFifo/sliding_window/Makefile       |  66 +++++++++
 .../dyn_objFifo/sliding_window/aie2.py        |  76 ++++++++++
 .../dyn_objFifo/sliding_window/kernel.cc      |  24 +++
 .../dyn_objFifo/sliding_window/test.cpp       | 138 ++++++++++++++++++
 .../sliding_window_conditional/Makefile       |  66 +++++++++
 .../sliding_window_conditional/aie2.py        |  74 ++++++++++
 .../sliding_window_conditional/kernel.cc      |  24 +++
 .../sliding_window_conditional/test.cpp       | 138 ++++++++++++++++++
 .../two_core_sliding_window/Makefile          |  66 +++++++++
 .../two_core_sliding_window/aie2.py           |  90 ++++++++++++
 .../two_core_sliding_window/kernel.cc         |  38 +++++
 .../two_core_sliding_window/test.cpp          | 138 ++++++++++++++++++
 12 files changed, 938 insertions(+)
 create mode 100644 programming_examples/dyn_objFifo/sliding_window/Makefile
 create mode 100644 programming_examples/dyn_objFifo/sliding_window/aie2.py
 create mode 100644 programming_examples/dyn_objFifo/sliding_window/kernel.cc
 create mode 100644 programming_examples/dyn_objFifo/sliding_window/test.cpp
 create mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
 create mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
 create mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
 create mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
 create mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
 create mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
 create mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
 create mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp

diff --git a/programming_examples/dyn_objFifo/sliding_window/Makefile b/programming_examples/dyn_objFifo/sliding_window/Makefile
new file mode 100644
index 0000000000..0216ac75da
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window/Makefile
@@ -0,0 +1,66 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables that point to the Xilinx runtime (XRT)
+# should be set up by an environment setup script already.
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
+
+# ---
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
+XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
+XRT_LIBS=-lxrt_coreutil
+CXX=g++-13 -ggdb 
+
+#mlir_target?=build/aie.mlir
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+host_target?=build/test
+
+.PHONY: all
+all: ${xclbin_target} ${host_target}
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/kernel.o: ${srcdir}/kernel.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+${xclbin_target}: build/aie.mlir build/kernel.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+
+${host_target}: ${srcdir}/test.cpp ${xclbin_target}
+	mkdir -p ${@D}
+	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
+
+.PHONY: run
+run: ${host_target}
+	./${host_target}
+
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+.PHONY: clean
+clean:
+	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window/aie2.py b/programming_examples/dyn_objFifo/sliding_window/aie2.py
new file mode 100644
index 0000000000..08d92c73e1
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window/aie2.py
@@ -0,0 +1,76 @@
+# dynamic_object_fifo/sliding_window/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.dialects.ext.scf import _for as range_
+from aie.extras.context import mlir_mod_ctx
+
+N = 100
+n_rows = 10
+dev = AIEDevice.npu1_1col
+col = 0
+
+
+def sliding_window():
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            memRef_ty = T.memref(N // n_rows, T.i32())
+
+            # Tile declarations
+            ShimTile = tile(col, 0)
+            ComputeTile = tile(col, 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty)
+
+            # AIE Core Function declarations
+            add_10_i32 = external_func(
+                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+            )
+
+            # Set up compute tiles
+
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
+                elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
+                call(add_10_i32, [elemInPre, elemInPre, elemOutPre])
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+                for _ in range_(8):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
+                    call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+
+                elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1)
+                elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
+                call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost])
+                of_in.release(ObjectFifoPort.Consume, 2)
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @runtime_sequence(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+sliding_window()
diff --git a/programming_examples/dyn_objFifo/sliding_window/kernel.cc b/programming_examples/dyn_objFifo/sliding_window/kernel.cc
new file mode 100644
index 0000000000..ddb474e102
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window/kernel.cc
@@ -0,0 +1,24 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <aie_api/aie.hpp>
+
+template <typename T_in, typename T_out, unsigned long N>
+void add(const T_in *__restrict inA, const T_in *__restrict inB,
+         T_out *__restrict out) {
+  for (int i = 0; i < N; i++) {
+    out[i] = inA[i] + inB[i];
+  }
+}
+
+extern "C" {
+
+void add_10_i32(const int *__restrict inA, const int *__restrict inB,
+                int *__restrict out) {
+  add<int, int, 10>(inA, inB, out);
+}
+}
diff --git a/programming_examples/dyn_objFifo/sliding_window/test.cpp b/programming_examples/dyn_objFifo/sliding_window/test.cpp
new file mode 100644
index 0000000000..3cd72ab880
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window/test.cpp
@@ -0,0 +1,138 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#ifndef XCLBIN
+#define XCLBIN "build/final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "build/insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define INPUT_SIZE (100 * sizeof(int))  // in bytes
+#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
+#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
+#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+  while (std::getline(instr_file, line)) {
+    std::istringstream iss(line);
+    uint32_t a;
+    if (!(iss >> std::hex >> a)) {
+      throw std::runtime_error("Unable to parse instruction file\n");
+    }
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_input =
+      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_output =
+      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  int *buf_input = bo_input.map<int *>();
+  std::cout << std::endl << std::endl << "Input: " << std::endl;
+  for (int i = 0; i < INPUT_ROWS; i++) {
+    std::cout << "row " << i << " : ";
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
+      buf_input[i * INPUT_ROWS + j] = i;
+      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
+    }
+    std::cout << std::endl << std::endl;
+  }
+  int *buf_output = bo_output.map<int *>();
+  memset(buf_output, 0, OUTPUT_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *buf_instr = bo_instr.map<void *>();
+  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  bool pass = true;
+  std::cout << std::endl << "Output: " << std::endl;
+  for (int i = 0; i < OUTPUT_ROWS; i++) {
+    std::cout << "row " << i << std::endl;
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
+      int expected_output = 0;
+      if (i == 0) {
+        expected_output = buf_input[i * INPUT_ROWS] * 2;
+      } else {
+        expected_output =
+            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
+      }
+      std::cout << "expected: " << expected_output << ", ";
+      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
+      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
+    }
+    std::cout << std::endl << std::endl;
+  }
+  std::cout << std::endl << std::endl;
+  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
+
+  return 0;
+}
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile b/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
new file mode 100644
index 0000000000..0216ac75da
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
@@ -0,0 +1,66 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables that point to the Xilinx runtime (XRT)
+# should be set up by an environment setup script already.
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
+
+# ---
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
+XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
+XRT_LIBS=-lxrt_coreutil
+CXX=g++-13 -ggdb 
+
+#mlir_target?=build/aie.mlir
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+host_target?=build/test
+
+.PHONY: all
+all: ${xclbin_target} ${host_target}
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/kernel.o: ${srcdir}/kernel.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+${xclbin_target}: build/aie.mlir build/kernel.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+
+${host_target}: ${srcdir}/test.cpp ${xclbin_target}
+	mkdir -p ${@D}
+	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
+
+.PHONY: run
+run: ${host_target}
+	./${host_target}
+
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+.PHONY: clean
+clean:
+	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py b/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
new file mode 100644
index 0000000000..8ab2dfa636
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
@@ -0,0 +1,74 @@
+# dynamic_object_fifo/sliding_window_conditional/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.dialects.ext.scf import _for as range_
+from aie.extras.context import mlir_mod_ctx
+
+N = 100
+n_rows = 10
+dev = AIEDevice.npu1_1col
+col = 0
+
+
+def sliding_window():
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            memRef_ty = T.memref(N // n_rows, T.i32())
+
+            # Tile declarations
+            ShimTile = tile(col, 0)
+            ComputeTile = tile(col, 2)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty)
+
+            # AIE Core Function declarations
+            add_10_i32 = external_func(
+                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+            )
+
+            # Set up compute tiles
+
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                for i in range_(10):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    if i == 0:
+                        elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
+                        call(add_10_i32, [elemInPre, elemInPre, elemOut])
+                    elif i == 9:
+                        elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
+                        call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOut])
+                        of_in.release(ObjectFifoPort.Consume, 2)
+                    else:
+                        elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
+                        call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
+                        of_in.release(ObjectFifoPort.Consume, 1)
+
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @runtime_sequence(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+sliding_window()
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc b/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
new file mode 100644
index 0000000000..ddb474e102
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
@@ -0,0 +1,24 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <aie_api/aie.hpp>
+
+template <typename T_in, typename T_out, unsigned long N>
+void add(const T_in *__restrict inA, const T_in *__restrict inB,
+         T_out *__restrict out) {
+  for (int i = 0; i < N; i++) {
+    out[i] = inA[i] + inB[i];
+  }
+}
+
+extern "C" {
+
+void add_10_i32(const int *__restrict inA, const int *__restrict inB,
+                int *__restrict out) {
+  add<int, int, 10>(inA, inB, out);
+}
+}
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp b/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
new file mode 100644
index 0000000000..3cd72ab880
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
@@ -0,0 +1,138 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#ifndef XCLBIN
+#define XCLBIN "build/final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "build/insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define INPUT_SIZE (100 * sizeof(int))  // in bytes
+#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
+#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define INPUT_ROWS (INPUT_SIZE / WIDTH_SIZE)   // number of input rows
+#define OUTPUT_ROWS (OUTPUT_SIZE / WIDTH_SIZE) // number of output rows
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  // Parse one hexadecimal 32-bit word per line of the instruction file.
+  std::ifstream instr_file(instr_path);
+  if (!instr_file)
+    throw std::runtime_error("Unable to open instruction file\n");
+  std::vector<uint32_t> instr_v;
+  for (std::string line; std::getline(instr_file, line);) {
+    uint32_t a;
+    if (!(std::istringstream(line) >> std::hex >> a))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+  // Runs the design once on the NPU and checks every output row.
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin (first kernel whose name starts with
+  // KERNEL_NAME)
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_input =
+      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_output =
+      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  // Row i holds value i; stride INPUT_ROWS works since rows == row width.
+  int *buf_input = bo_input.map<int *>();
+  std::cout << std::endl << std::endl << "Input: " << std::endl;
+  for (int i = 0; i < INPUT_ROWS; i++) {
+    std::cout << "row " << i << " : ";
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
+      buf_input[i * INPUT_ROWS + j] = i;
+      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
+    }
+    std::cout << std::endl << std::endl;
+  }
+  int *buf_output = bo_output.map<int *>();
+  memset(buf_output, 0, OUTPUT_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *buf_instr = bo_instr.map<void *>();
+  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  // Expected: row 0 is doubled; row i > 0 is input row i - 1 plus row i.
+  bool pass = true;
+  std::cout << std::endl << "Output: " << std::endl;
+  for (int i = 0; i < OUTPUT_ROWS; i++) {
+    std::cout << "row " << i << std::endl;
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
+      int expected_output = 0;
+      if (i == 0) {
+        expected_output = buf_input[i * INPUT_ROWS] * 2;
+      } else {
+        expected_output =
+            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
+      }
+      std::cout << "expected: " << expected_output << ", ";
+      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
+      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
+    }
+    std::cout << std::endl << std::endl;
+  }
+  std::cout << std::endl << std::endl;
+  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
+  // Report a mismatch through the exit status as well as the printed verdict.
+  return pass ? 0 : 1;
+}
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile b/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
new file mode 100644
index 0000000000..4e423e1df1
--- /dev/null
+++ b/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
@@ -0,0 +1,66 @@
+##===- Makefile -----------------------------------------------------------===##
+# 
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# 
+##===----------------------------------------------------------------------===##
+
+# ---
+
+# The following environment variables that point to the Xilinx runtime (XRT)
+# should be set up by an environment setup script already.
+XILINX_XRT?=/opt/xilinx/xrt
+XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
+
+# ---
+
+srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
+
+XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
+XILINX_XRT_LIB?=${XILINX_XRT}/lib
+
+CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
+XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
+XRT_LIBS=-lxrt_coreutil
+CXX=g++-13 -ggdb 
+
+#mlir_target?=build/aie.mlir
+xclbin_target?=build/final.xclbin
+insts_target?=build/insts.txt
+host_target?=build/test
+
+.PHONY: all
+all: ${xclbin_target} ${host_target}
+
+build/aie.mlir: ${srcdir}/aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/kernel.o: ${srcdir}/kernel.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
+
+${xclbin_target}: build/aie.mlir build/kernel.o
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+
+${host_target}: ${srcdir}/test.cpp ${xclbin_target}
+	mkdir -p ${@D}
+	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
+
+.PHONY: run
+run: ${host_target}
+	./${host_target}
+
+xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
+.PHONY: sign
+sign: ${xclbin_target}
+	${xclbin_sign} -dev Phoenix -xclbin $<
+
+.PHONY: clean
+clean:
+	-rm -r build
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py b/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
new file mode 100644
index 0000000000..e815fada7c
--- /dev/null
+++ b/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
@@ -0,0 +1,90 @@
+# dynamic_object_fifo/two_core_sliding_window/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.extras.dialects.ext.scf import _for as range_
+from aie.extras.context import mlir_mod_ctx
+
+N = 100
+n_rows = 10
+dev = AIEDevice.npu1_1col
+col = 0
+
+
+def two_core_sliding_window():
+    with mlir_mod_ctx() as ctx:
+
+        @device(dev)
+        def device_body():
+            memRef_ty = T.memref(N // n_rows, T.i32())
+
+            # Tile declarations
+            ShimTile = tile(col, 0)
+            ComputeTile = tile(col, 2)
+            ComputeTile2 = tile(col, 4)
+
+            # AIE-array data movement with object fifos
+            of_in = object_fifo("in", ShimTile, ComputeTile, 2, memRef_ty)
+            of_in2 = object_fifo("in2", ComputeTile, ComputeTile2, 3, memRef_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+            # AIE Core Function declarations
+            passthrough_10_i32 = external_func(
+                "passthrough_10_i32", inputs=[memRef_ty, memRef_ty]
+            )
+            add_10_i32 = external_func(
+                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+            )
+
+            # Set up compute tiles
+
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                for _ in range_(10):
+                    elemOut = of_in2.acquire(ObjectFifoPort.Produce, 1)
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    call(passthrough_10_i32, [elemIn, elemOut])
+                    of_in.release(ObjectFifoPort.Consume, 1)
+                    of_in2.release(ObjectFifoPort.Produce, 1)
+
+            @core(ComputeTile2, "kernel.o")
+            def core_body():
+                elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
+                elemInPre = of_in2.acquire(ObjectFifoPort.Consume, 1)
+                call(add_10_i32, [elemInPre, elemInPre, elemOutPre])
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+                for _ in range_(8):
+                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    elemsIn = of_in2.acquire(ObjectFifoPort.Consume, 2)
+                    call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
+                    of_in2.release(ObjectFifoPort.Consume, 1)
+                    of_out.release(ObjectFifoPort.Produce, 1)
+
+                elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1)
+                elemsInPost = of_in2.acquire(ObjectFifoPort.Consume, 2)
+                call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost])
+                of_in2.release(ObjectFifoPort.Consume, 2)
+                of_out.release(ObjectFifoPort.Produce, 1)
+
+            # To/from AIE-array data movement
+            tensor_ty = T.memref(N, T.i32())
+
+            @runtime_sequence(tensor_ty, tensor_ty)
+            def sequence(A, C):
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
+
+    print(ctx.module)
+
+
+two_core_sliding_window()
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc b/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
new file mode 100644
index 0000000000..7e4515193c
--- /dev/null
+++ b/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
@@ -0,0 +1,38 @@
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <aie_api/aie.hpp>
+
+template <typename T_in, typename T_out, unsigned long N>
+void passthrough(const T_in *__restrict in, T_out *__restrict out) {
+  for (int i = 0; i < N; i++) {
+    out[i] = in[i];
+  }
+}
+
+extern "C" {
+
+void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
+  passthrough<int, int, 10>(in, out);
+}
+}
+
+template <typename T_in, typename T_out, unsigned long N>
+void add(const T_in *__restrict inA, const T_in *__restrict inB,
+         T_out *__restrict out) {
+  for (int i = 0; i < N; i++) {
+    out[i] = inA[i] + inB[i];
+  }
+}
+
+extern "C" {
+
+void add_10_i32(const int *__restrict inA, const int *__restrict inB,
+                int *__restrict out) {
+  add<int, int, 10>(inA, inB, out);
+}
+}
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp b/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp
new file mode 100644
index 0000000000..3cd72ab880
--- /dev/null
+++ b/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp
@@ -0,0 +1,138 @@
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2024 AMD Inc.
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#ifndef XCLBIN
+#define XCLBIN "build/final.xclbin"
+#endif
+
+#ifndef INSTS_TXT
+#define INSTS_TXT "build/insts.txt"
+#endif
+
+#ifndef KERNEL_NAME
+#define KERNEL_NAME "MLIR_AIE"
+#endif
+
+#define INPUT_SIZE (100 * sizeof(int))  // in bytes
+#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
+#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+#define INPUT_ROWS (INPUT_SIZE / WIDTH_SIZE)   // number of input rows
+#define OUTPUT_ROWS (OUTPUT_SIZE / WIDTH_SIZE) // number of output rows
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  // Parse one hexadecimal 32-bit word per line of the instruction file.
+  std::ifstream instr_file(instr_path);
+  if (!instr_file)
+    throw std::runtime_error("Unable to open instruction file\n");
+  std::vector<uint32_t> instr_v;
+  for (std::string line; std::getline(instr_file, line);) {
+    uint32_t a;
+    if (!(std::istringstream(line) >> std::hex >> a))
+      throw std::runtime_error("Unable to parse instruction file\n");
+    instr_v.push_back(a);
+  }
+  return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+  // Runs the design once on the NPU and checks every output row.
+  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  assert(instr_v.size() > 0);
+
+  // Get a device handle
+  unsigned int device_index = 0;
+  xrt::device device = xrt::device(device_index);
+
+  // Load the xclbin
+  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
+
+  // Get the kernel from the xclbin (first kernel whose name starts with
+  // KERNEL_NAME)
+  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
+  xrt::xclbin::kernel xkernel = *std::find_if(
+      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
+        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
+      });
+  std::string kernel_name = xkernel.get_name();
+  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
+  device.register_xclbin(xclbin);
+
+  // get a hardware context
+  xrt::hw_context context(device, xclbin.get_uuid());
+
+  // get a kernel handle
+  auto kernel = xrt::kernel(context, kernel_name);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_input =
+      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_output =
+      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  // Row i holds value i; stride INPUT_ROWS works since rows == row width.
+  int *buf_input = bo_input.map<int *>();
+  std::cout << std::endl << std::endl << "Input: " << std::endl;
+  for (int i = 0; i < INPUT_ROWS; i++) {
+    std::cout << "row " << i << " : ";
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
+      buf_input[i * INPUT_ROWS + j] = i;
+      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
+    }
+    std::cout << std::endl << std::endl;
+  }
+  int *buf_output = bo_output.map<int *>();
+  memset(buf_output, 0, OUTPUT_SIZE);
+
+  // Instruction buffer for DMA configuration
+  void *buf_instr = bo_instr.map<void *>();
+  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  // Expected: row 0 is doubled; row i > 0 is input row i - 1 plus row i.
+  bool pass = true;
+  std::cout << std::endl << "Output: " << std::endl;
+  for (int i = 0; i < OUTPUT_ROWS; i++) {
+    std::cout << "row " << i << std::endl;
+    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
+      int expected_output = 0;
+      if (i == 0) {
+        expected_output = buf_input[i * INPUT_ROWS] * 2;
+      } else {
+        expected_output =
+            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
+      }
+      std::cout << "expected: " << expected_output << ", ";
+      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
+      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
+    }
+    std::cout << std::endl << std::endl;
+  }
+  std::cout << std::endl << std::endl;
+  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
+  // Report a mismatch through the exit status as well as the printed verdict.
+  return pass ? 0 : 1;
+}

From 130211597211f7db93c7c9475bd1e0f4867793fb Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 13 Nov 2024 19:54:55 -0700
Subject: [PATCH 23/46] Updated aie2.py with the latest python bindings

---
 .../dyn_objFifo/sliding_window/aie2.py        |  4 +--
 .../sliding_window_conditional/aie2.py        | 27 ++++++++--------
 .../two_core_sliding_window/aie2.py           | 32 +++++++++----------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/programming_examples/dyn_objFifo/sliding_window/aie2.py b/programming_examples/dyn_objFifo/sliding_window/aie2.py
index 08d92c73e1..57d5efb1a5 100644
--- a/programming_examples/dyn_objFifo/sliding_window/aie2.py
+++ b/programming_examples/dyn_objFifo/sliding_window/aie2.py
@@ -6,11 +6,9 @@
 #
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 
-import sys
-
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
-from aie.extras.dialects.ext.scf import _for as range_
+from aie.helpers.dialects.ext.scf import _for as range_
 from aie.extras.context import mlir_mod_ctx
 
 N = 100
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py b/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
index 8ab2dfa636..83719bc8e8 100644
--- a/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
+++ b/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
@@ -6,11 +6,11 @@
 #
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 
-import sys
+import numpy as np
 
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
-from aie.extras.dialects.ext.scf import _for as range_
+from aie.helpers.dialects.ext.scf import _for as range_
 from aie.extras.context import mlir_mod_ctx
 
 N = 100
@@ -24,49 +24,48 @@ def sliding_window():
 
         @device(dev)
         def device_body():
-            memRef_ty = T.memref(N // n_rows, T.i32())
+            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
 
             # Tile declarations
             ShimTile = tile(col, 0)
             ComputeTile = tile(col, 2)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile, 3, subtensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, subtensor_ty)
 
             # AIE Core Function declarations
             add_10_i32 = external_func(
-                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
             )
 
             # Set up compute tiles
-
             @core(ComputeTile, "kernel.o")
             def core_body():
                 for i in range_(10):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                     if i == 0:
                         elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
-                        call(add_10_i32, [elemInPre, elemInPre, elemOut])
+                        add_10_i32(elemInPre, elemInPre, elemOut)
                     elif i == 9:
                         elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOut])
+                        add_10_i32(elemsInPost[0], elemsInPost[1], elemOut)
                         of_in.release(ObjectFifoPort.Consume, 2)
                     else:
                         elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
+                        add_10_i32(elemsIn[0], elemsIn[1], elemOut)
                         of_in.release(ObjectFifoPort.Consume, 1)
 
                 of_out.release(ObjectFifoPort.Produce, 1)
 
             # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
+            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
 
             @runtime_sequence(tensor_ty, tensor_ty)
             def sequence(A, C):
-                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                dma_wait(of_out)
 
     print(ctx.module)
 
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py b/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
index e815fada7c..c0d7c805ee 100644
--- a/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
+++ b/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
@@ -6,11 +6,11 @@
 #
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 
-import sys
+import numpy as np
 
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
-from aie.extras.dialects.ext.scf import _for as range_
+from aie.helpers.dialects.ext.scf import _for as range_
 from aie.extras.context import mlir_mod_ctx
 
 N = 100
@@ -24,7 +24,7 @@ def two_core_sliding_window():
 
         @device(dev)
         def device_body():
-            memRef_ty = T.memref(N // n_rows, T.i32())
+            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
 
             # Tile declarations
             ShimTile = tile(col, 0)
@@ -32,16 +32,16 @@ def device_body():
             ComputeTile2 = tile(col, 4)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 2, memRef_ty)
-            of_in2 = object_fifo("in2", ComputeTile, ComputeTile2, 3, memRef_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile, 2, subtensor_ty)
+            of_in2 = object_fifo("in2", ComputeTile, ComputeTile2, 3, subtensor_ty)
+            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, subtensor_ty)
 
             # AIE Core Function declarations
             passthrough_10_i32 = external_func(
-                "passthrough_10_i32", inputs=[memRef_ty, memRef_ty]
+                "passthrough_10_i32", inputs=[subtensor_ty, subtensor_ty]
             )
             add_10_i32 = external_func(
-                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
             )
 
             # Set up compute tiles
@@ -51,7 +51,7 @@ def core_body():
                 for _ in range_(10):
                     elemOut = of_in2.acquire(ObjectFifoPort.Produce, 1)
                     elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
-                    call(passthrough_10_i32, [elemIn, elemOut])
+                    passthrough_10_i32(elemIn, elemOut)
                     of_in.release(ObjectFifoPort.Consume, 1)
                     of_in2.release(ObjectFifoPort.Produce, 1)
 
@@ -59,30 +59,30 @@ def core_body():
             def core_body():
                 elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
                 elemInPre = of_in2.acquire(ObjectFifoPort.Consume, 1)
-                call(add_10_i32, [elemInPre, elemInPre, elemOutPre])
+                add_10_i32(elemInPre, elemInPre, elemOutPre)
                 of_out.release(ObjectFifoPort.Produce, 1)
 
                 for _ in range_(8):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                     elemsIn = of_in2.acquire(ObjectFifoPort.Consume, 2)
-                    call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
+                    add_10_i32(elemsIn[0], elemsIn[1], elemOut)
                     of_in2.release(ObjectFifoPort.Consume, 1)
                     of_out.release(ObjectFifoPort.Produce, 1)
 
                 elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1)
                 elemsInPost = of_in2.acquire(ObjectFifoPort.Consume, 2)
-                call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost])
+                add_10_i32(elemsInPost[0], elemsInPost[1], elemOutPost)
                 of_in2.release(ObjectFifoPort.Consume, 2)
                 of_out.release(ObjectFifoPort.Produce, 1)
 
             # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
+            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
 
             @runtime_sequence(tensor_ty, tensor_ty)
             def sequence(A, C):
-                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                dma_wait(of_out)
 
     print(ctx.module)
 

From 57237c981cce19345e615cc83f424a13e479c123 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 21 Nov 2024 10:29:00 -0700
Subject: [PATCH 24/46] Lit file

---
 programming_examples/dyn_objFifo/lit.local.cfg | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 programming_examples/dyn_objFifo/lit.local.cfg

diff --git a/programming_examples/dyn_objFifo/lit.local.cfg b/programming_examples/dyn_objFifo/lit.local.cfg
new file mode 100644
index 0000000000..64cca87fdf
--- /dev/null
+++ b/programming_examples/dyn_objFifo/lit.local.cfg
@@ -0,0 +1,11 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+config.suffixes = ['.lit']
+
+if 'AIE2' not in config.vitis_components:
+    config.unsupported = True

From 5072fe5de4080525727ed229efad59d953393679 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 09:43:56 -0700
Subject: [PATCH 25/46] Finding the problem #Attempt 1

---
 .../dyn_objFifo/nested_loops/Makefile         | 10 +++---
 .../dyn_objFifo/nested_loops/aie.mlir         | 36 +++++++++++++++++++
 .../dyn_objFifo/nested_loops/run_makefile.lit |  9 +++++
 3 files changed, 50 insertions(+), 5 deletions(-)
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/aie.mlir
 create mode 100644 programming_examples/dyn_objFifo/nested_loops/run_makefile.lit

diff --git a/programming_examples/dyn_objFifo/nested_loops/Makefile b/programming_examples/dyn_objFifo/nested_loops/Makefile
index 0216ac75da..7a3b9545a5 100644
--- a/programming_examples/dyn_objFifo/nested_loops/Makefile
+++ b/programming_examples/dyn_objFifo/nested_loops/Makefile
@@ -35,18 +35,18 @@ host_target?=build/test
 .PHONY: all
 all: ${xclbin_target} ${host_target}
 
-build/aie.mlir: ${srcdir}/aie2.py
-	mkdir -p ${@D}
-	python3 $< > $@
+# build/aie.mlir: ${srcdir}/aie2.py
+# 	mkdir -p ${@D}
+# 	python3 $< > $@
 
 build/kernel.o: ${srcdir}/kernel.cc
 	mkdir -p ${@D}
 	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
 
-${xclbin_target}: build/aie.mlir build/kernel.o
+${xclbin_target}: ${srcdir}/aie.mlir build/kernel.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
+				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${srcdir}/${<:%=../%}
 
 ${host_target}: ${srcdir}/test.cpp ${xclbin_target}
 	mkdir -p ${@D}
diff --git a/programming_examples/dyn_objFifo/nested_loops/aie.mlir b/programming_examples/dyn_objFifo/nested_loops/aie.mlir
new file mode 100644
index 0000000000..1fb0cda89c
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/aie.mlir
@@ -0,0 +1,36 @@
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<10xi32>> 
+    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<10xi32>> 
+    func.func private @passthrough_10_i32(memref<10xi32>, memref<10xi32>)
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c5 = arith.constant 5 : index
+      %c1 = arith.constant 1 : index
+      scf.for %arg0 = %c0 to %c5 step %c1 {
+        %0 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<10xi32>>
+        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
+        %c0_0 = arith.constant 0 : index
+        %c5_1 = arith.constant 5 : index
+        %c1_2 = arith.constant 1 : index
+        scf.for %arg1 = %c0_0 to %c5_1 step %c1_2 {
+          %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<10xi32>>
+          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
+          func.call @passthrough_10_i32(%1, %3) : (memref<10xi32>, memref<10xi32>) -> ()
+          aie.objectfifo.release @out(Produce, 1)
+        }
+        aie.objectfifo.release @in(Consume, 1)
+      }
+      aie.end
+    } {link_with = "kernel.o"}
+    aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 50][0, 0, 0, 1]) {id = 1 : i64, issue_token = true, metadata = @in} : memref<10xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 250][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<10xi32>
+      aiex.npu.dma_wait {symbol = @in}
+      aiex.npu.dma_wait {symbol = @out}
+    }
+  }
+}
+
diff --git a/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit b/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit
new file mode 100644
index 0000000000..507b70720a
--- /dev/null
+++ b/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, xchess 
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
\ No newline at end of file

From 624a961c1a5c3310f52c151010bc9b287541cdf2 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 12:48:13 -0700
Subject: [PATCH 26/46] Second attempt

---
 .../dyn_objFifo/sliding_window/run_makefile.lit          | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 programming_examples/dyn_objFifo/sliding_window/run_makefile.lit

diff --git a/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit b/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit
new file mode 100644
index 0000000000..507b70720a
--- /dev/null
+++ b/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit
@@ -0,0 +1,9 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, xchess 
+//
+// RUN: make -f %S/Makefile clean
+// RUN: make -f %S/Makefile 
+// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
+// CHECK: PASS!
\ No newline at end of file

From f01462390a331a87e4dbecb51c276260472c9628 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 13:48:15 -0700
Subject: [PATCH 27/46] Python to mlir

---
 .../dynamic_object_fifo/nested_loops/aie2.py  | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index dee6558c3e..c13fba401e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -7,12 +7,28 @@
 
 # REQUIRES: ryzen_ai, valid_xchess_license
 #
-# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
-# RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
-# CHECK: PASS!
+# RUN: %python %s | FileCheck %s
+#CHECK: %core_0_2 = aie.core(%tile_0_2) {
+#CHECK:       %c0 = arith.constant 0 : index
+#CHECK:       %c5 = arith.constant 5 : index
+#CHECK:       %c1 = arith.constant 1 : index
+#CHECK:       scf.for %arg0 = %c0 to %c5 step %c1 {
+#CHECK:         %0 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<10xi32>>
+#CHECK:         %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
+#CHECK:         %c0_0 = arith.constant 0 : index
+#CHECK:         %c5_1 = arith.constant 5 : index
+#CHECK:         %c1_2 = arith.constant 1 : index
+#CHECK:         scf.for %arg1 = %c0_0 to %c5_1 step %c1_2 {
+#CHECK:           %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<10xi32>>
+#CHECK:           %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
+#CHECK:           func.call @passthrough_10_i32(%1, %3) : (memref<10xi32>, memref<10xi32>) -> ()
+#CHECK:           aie.objectfifo.release @out(Produce, 1)
+#CHECK:         }
+#CHECK:         aie.objectfifo.release @in(Consume, 1)
+#CHECK:       }
+#CHECK:       aie.end
+#CHECK:     } {link_with = "kernel.o"}
+
 import numpy as np
 
 from aie.dialects.aie import *

From 9f090a97631a4c4b4c3f36d597caa9bebb7410a3 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 13:58:36 -0700
Subject: [PATCH 28/46] aie-opt with dynamic object fifo flag lowering

---
 .../dynamic_object_fifo/nested_loops/aie2.py  | 98 +++++++++++++++----
 1 file changed, 77 insertions(+), 21 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index c13fba401e..612a4d85c3 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -7,27 +7,83 @@
 
 # REQUIRES: ryzen_ai, valid_xchess_license
 #
-# RUN: %python %s | FileCheck %s
-#CHECK: %core_0_2 = aie.core(%tile_0_2) {
-#CHECK:       %c0 = arith.constant 0 : index
-#CHECK:       %c5 = arith.constant 5 : index
-#CHECK:       %c1 = arith.constant 1 : index
-#CHECK:       scf.for %arg0 = %c0 to %c5 step %c1 {
-#CHECK:         %0 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<10xi32>>
-#CHECK:         %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
-#CHECK:         %c0_0 = arith.constant 0 : index
-#CHECK:         %c5_1 = arith.constant 5 : index
-#CHECK:         %c1_2 = arith.constant 1 : index
-#CHECK:         scf.for %arg1 = %c0_0 to %c5_1 step %c1_2 {
-#CHECK:           %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<10xi32>>
-#CHECK:           %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
-#CHECK:           func.call @passthrough_10_i32(%1, %3) : (memref<10xi32>, memref<10xi32>) -> ()
-#CHECK:           aie.objectfifo.release @out(Produce, 1)
-#CHECK:         }
-#CHECK:         aie.objectfifo.release @in(Consume, 1)
-#CHECK:       }
-#CHECK:       aie.end
-#CHECK:     } {link_with = "kernel.o"}
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: aie-opt --aie-objectFifo-stateful-transform=dynamic-objFifos ./aie2.mlir | FileCheck %s
+# CHECK: %tile_0_0 = aie.tile(0, 0)
+# CHECK: %tile_0_2 = aie.tile(0, 2)
+# CHECK: %out_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "out_cons_prod_lock"}
+# CHECK: %out_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "out_cons_cons_lock"}
+# CHECK: %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<10xi32> 
+# CHECK: %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<10xi32> 
+# CHECK: %out_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "out_prod_lock"}
+# CHECK: %out_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "out_cons_lock"}
+# CHECK: %in_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in_cons_buff_0"} : memref<10xi32> 
+# CHECK: %in_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in_cons_buff_1"} : memref<10xi32> 
+# CHECK: %in_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in_cons_prod_lock"}
+# CHECK: %in_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in_cons_cons_lock"}
+# CHECK: %in_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "in_prod_lock"}
+# CHECK: %in_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "in_cons_lock"}
+# CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
+# CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
+# CHECK: func.func private @passthrough_10_i32(memref<10xi32>, memref<10xi32>)
+# CHECK: %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
+# CHECK: %core_0_2 = aie.core(%tile_0_2) {
+# CHECK:   %c0 = arith.constant 0 : index
+# CHECK:   %c0_0 = arith.constant 0 : index
+# CHECK:   %c2 = arith.constant 2 : index
+# CHECK:   memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+# CHECK:   %c1 = arith.constant 1 : index
+# CHECK:   %c2_1 = arith.constant 2 : index
+# CHECK:   memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
+# CHECK:   %c0_2 = arith.constant 0 : index
+# CHECK:   %c5 = arith.constant 5 : index
+# CHECK:   %c1_3 = arith.constant 1 : index
+# CHECK:   scf.for %arg0 = %c0_2 to %c5 step %c1_3 {
+# CHECK:     aie.use_lock(%in_cons_cons_lock, AcquireGreaterEqual, 1)
+# CHECK:     %0 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+# CHECK:     %1 = scf.index_switch %0 -> memref<10xi32> 
+# CHECK:     case 0 {
+# CHECK:       scf.yield %in_cons_buff_0 : memref<10xi32>
+# CHECK:     }
+# CHECK:     case 1 {
+# CHECK:       scf.yield %in_cons_buff_1 : memref<10xi32>
+# CHECK:     }
+# CHECK:     default {
+# CHECK:       scf.yield %in_cons_buff_0 : memref<10xi32>
+# CHECK:     }
+# CHECK:     %c0_4 = arith.constant 0 : index
+# CHECK:     %c5_5 = arith.constant 5 : index
+# CHECK:     %c1_6 = arith.constant 1 : index
+# CHECK:     scf.for %arg1 = %c0_4 to %c5_5 step %c1_6 {
+# CHECK:       aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
+# CHECK:       %5 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
+# CHECK:       %6 = scf.index_switch %5 -> memref<10xi32> 
+# CHECK:       case 0 {
+# CHECK:         scf.yield %out_buff_0 : memref<10xi32>
+# CHECK:       }
+# CHECK:       case 1 {
+# CHECK:         scf.yield %out_buff_1 : memref<10xi32>
+# CHECK:       }
+# CHECK:       default {
+# CHECK:         scf.yield %out_buff_0 : memref<10xi32>
+# CHECK:       }
+# CHECK:       func.call @passthrough_10_i32(%1, %6) : (memref<10xi32>, memref<10xi32>) -> ()
+# CHECK:       aie.use_lock(%out_cons_lock, Release, 1)
+# CHECK:       %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
+# CHECK:       %c1_8 = arith.constant 1 : index
+# CHECK:       %8 = arith.addi %7, %c1_8 : index
+# CHECK:       %9 = arith.remsi %8, %c2 : index
+# CHECK:       memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex>
+# CHECK:     }
+# CHECK:     aie.use_lock(%in_cons_prod_lock, Release, 1)
+# CHECK:     %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+# CHECK:     %c1_7 = arith.constant 1 : index
+# CHECK:     %3 = arith.addi %2, %c1_7 : index
+# CHECK:     %4 = arith.remsi %3, %c2_1 : index
+# CHECK:     memref.store %4, %buffer_0_2[%c1] : memref<2xindex>
+# CHECK:   }
+# CHECK:   aie.end
+# CHECK: } {link_with = "kernel.o"}
 
 import numpy as np
 

From bb43c9592e967b3e7b834c037985a38c9973fb1c Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 15:08:31 -0700
Subject: [PATCH 29/46] Removing core to see the problem

---
 .../dynamic_object_fifo/nested_loops/aie2.py  | 91 +------------------
 1 file changed, 5 insertions(+), 86 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 612a4d85c3..68ca1bce14 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -7,83 +7,12 @@
 
 # REQUIRES: ryzen_ai, valid_xchess_license
 #
+# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: aie-opt --aie-objectFifo-stateful-transform=dynamic-objFifos ./aie2.mlir | FileCheck %s
-# CHECK: %tile_0_0 = aie.tile(0, 0)
-# CHECK: %tile_0_2 = aie.tile(0, 2)
-# CHECK: %out_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "out_cons_prod_lock"}
-# CHECK: %out_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "out_cons_cons_lock"}
-# CHECK: %out_buff_0 = aie.buffer(%tile_0_2) {sym_name = "out_buff_0"} : memref<10xi32> 
-# CHECK: %out_buff_1 = aie.buffer(%tile_0_2) {sym_name = "out_buff_1"} : memref<10xi32> 
-# CHECK: %out_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "out_prod_lock"}
-# CHECK: %out_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "out_cons_lock"}
-# CHECK: %in_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "in_cons_buff_0"} : memref<10xi32> 
-# CHECK: %in_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "in_cons_buff_1"} : memref<10xi32> 
-# CHECK: %in_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "in_cons_prod_lock"}
-# CHECK: %in_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in_cons_cons_lock"}
-# CHECK: %in_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "in_prod_lock"}
-# CHECK: %in_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "in_cons_lock"}
-# CHECK: aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
-# CHECK: aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
-# CHECK: func.func private @passthrough_10_i32(memref<10xi32>, memref<10xi32>)
-# CHECK: %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
-# CHECK: %core_0_2 = aie.core(%tile_0_2) {
-# CHECK:   %c0 = arith.constant 0 : index
-# CHECK:   %c0_0 = arith.constant 0 : index
-# CHECK:   %c2 = arith.constant 2 : index
-# CHECK:   memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
-# CHECK:   %c1 = arith.constant 1 : index
-# CHECK:   %c2_1 = arith.constant 2 : index
-# CHECK:   memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
-# CHECK:   %c0_2 = arith.constant 0 : index
-# CHECK:   %c5 = arith.constant 5 : index
-# CHECK:   %c1_3 = arith.constant 1 : index
-# CHECK:   scf.for %arg0 = %c0_2 to %c5 step %c1_3 {
-# CHECK:     aie.use_lock(%in_cons_cons_lock, AcquireGreaterEqual, 1)
-# CHECK:     %0 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-# CHECK:     %1 = scf.index_switch %0 -> memref<10xi32> 
-# CHECK:     case 0 {
-# CHECK:       scf.yield %in_cons_buff_0 : memref<10xi32>
-# CHECK:     }
-# CHECK:     case 1 {
-# CHECK:       scf.yield %in_cons_buff_1 : memref<10xi32>
-# CHECK:     }
-# CHECK:     default {
-# CHECK:       scf.yield %in_cons_buff_0 : memref<10xi32>
-# CHECK:     }
-# CHECK:     %c0_4 = arith.constant 0 : index
-# CHECK:     %c5_5 = arith.constant 5 : index
-# CHECK:     %c1_6 = arith.constant 1 : index
-# CHECK:     scf.for %arg1 = %c0_4 to %c5_5 step %c1_6 {
-# CHECK:       aie.use_lock(%out_prod_lock, AcquireGreaterEqual, 1)
-# CHECK:       %5 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-# CHECK:       %6 = scf.index_switch %5 -> memref<10xi32> 
-# CHECK:       case 0 {
-# CHECK:         scf.yield %out_buff_0 : memref<10xi32>
-# CHECK:       }
-# CHECK:       case 1 {
-# CHECK:         scf.yield %out_buff_1 : memref<10xi32>
-# CHECK:       }
-# CHECK:       default {
-# CHECK:         scf.yield %out_buff_0 : memref<10xi32>
-# CHECK:       }
-# CHECK:       func.call @passthrough_10_i32(%1, %6) : (memref<10xi32>, memref<10xi32>) -> ()
-# CHECK:       aie.use_lock(%out_cons_lock, Release, 1)
-# CHECK:       %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-# CHECK:       %c1_8 = arith.constant 1 : index
-# CHECK:       %8 = arith.addi %7, %c1_8 : index
-# CHECK:       %9 = arith.remsi %8, %c2 : index
-# CHECK:       memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex>
-# CHECK:     }
-# CHECK:     aie.use_lock(%in_cons_prod_lock, Release, 1)
-# CHECK:     %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-# CHECK:     %c1_7 = arith.constant 1 : index
-# CHECK:     %3 = arith.addi %2, %c1_7 : index
-# CHECK:     %4 = arith.remsi %3, %c2_1 : index
-# CHECK:     memref.store %4, %buffer_0_2[%c1] : memref<2xindex>
-# CHECK:   }
-# CHECK:   aie.end
-# CHECK: } {link_with = "kernel.o"}
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+# CHECK: PASS!
 
 import numpy as np
 
@@ -119,16 +48,6 @@ def device_body():
                 "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
             )
 
-            # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                for _ in range_(5):
-                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
-                    for _ in range_(5):
-                        elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                        passthrough_10_i32(elemIn, elemOut)
-                        of_out.release(ObjectFifoPort.Produce, 1)
-                    of_in.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
             @runtime_sequence(tensor_ty, tensor_ty)

From 7797c5da91c38b813d9e725daeccca3a0a048bf3 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 15:25:20 -0700
Subject: [PATCH 30/46] Remove inner loop

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 68ca1bce14..6ad086d3a4 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -48,6 +48,16 @@ def device_body():
                 "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
             )
 
+            # Set up compute tiles
+            @core(ComputeTile, "kernel.o")
+            def core_body():
+                for _ in range_(5):
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    # for _ in range_(5):
+                    #     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                    #     passthrough_10_i32(elemIn, elemOut)
+                    #     of_out.release(ObjectFifoPort.Produce, 1)
+                    of_in.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
             @runtime_sequence(tensor_ty, tensor_ty)

From a8eec7bb6b501611bc43b8ccb875d2c01a08f5f6 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 15:42:04 -0700
Subject: [PATCH 31/46] Remove body of outer loop

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 6ad086d3a4..0638d9b59e 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -52,12 +52,12 @@ def device_body():
             @core(ComputeTile, "kernel.o")
             def core_body():
                 for _ in range_(5):
-                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    # elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
                     # for _ in range_(5):
                     #     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                     #     passthrough_10_i32(elemIn, elemOut)
                     #     of_out.release(ObjectFifoPort.Produce, 1)
-                    of_in.release(ObjectFifoPort.Consume, 1)
+                    # of_in.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
             @runtime_sequence(tensor_ty, tensor_ty)

From ada9589d16f1989b13d6cd1f157711a35807527d Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Tue, 3 Dec 2024 16:00:12 -0700
Subject: [PATCH 32/46] Problem may be because of the usage of index_cast for
 switch index

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 0638d9b59e..8ae31698ab 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -40,8 +40,8 @@ def device_body():
             ComputeTile = tile(col, 2)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 2, tensor_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile, 1, tensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 1, tensor_ty)
 
             # AIE Core Function declarations
             passthrough_10_i32 = external_func(
@@ -52,12 +52,12 @@ def device_body():
             @core(ComputeTile, "kernel.o")
             def core_body():
                 for _ in range_(5):
-                    # elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
+                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
                     # for _ in range_(5):
                     #     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
                     #     passthrough_10_i32(elemIn, elemOut)
                     #     of_out.release(ObjectFifoPort.Produce, 1)
-                    # of_in.release(ObjectFifoPort.Consume, 1)
+                    of_in.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
             @runtime_sequence(tensor_ty, tensor_ty)

From 7e05750aa4d1ffed1e77846393d2f8f8c97af71d Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 11:08:01 -0700
Subject: [PATCH 33/46] Global buffer (?)

---
 .../Transforms/AIEObjectFifoStatefulTransform.cpp  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 0d2e6c5821..437ec55fb5 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -1130,13 +1130,13 @@ struct AIEObjectFifoStatefulTransformPass
         //    - globalNextIndex: load index and use it to index_switch (one
         //    IndexSwithOp per AccessOp)
         WalkResult res = coreOp.walk([&](Operation *op) {
-          if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
-            ObjectFifoCreateOp createOp = relOp.getObjectFifo();
-            ObjectFifoPort port = relOp.getPort();
-            updateGlobalNextIndex(builder, relOp, globalNextIndex,
-                                  globalIndices[{createOp, port}],
-                                  constantSizes[{createOp, port}]);
-          }
+          // if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
+          //   ObjectFifoCreateOp createOp = relOp.getObjectFifo();
+          //   ObjectFifoPort port = relOp.getPort();
+          //   updateGlobalNextIndex(builder, relOp, globalNextIndex,
+          //                         globalIndices[{createOp, port}],
+          //                         constantSizes[{createOp, port}]);
+          // }
           if (auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op)) {
             std::vector<ObjectFifoSubviewAccessOp> accessOps;
             for (auto u : acqOp->getUsers())

From dd69ad786899b0fd22b132c75d363ead8f6100b1 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 11:38:03 -0700
Subject: [PATCH 34/46] Previous one didn't remove the global buffer completely

---
 .../AIEObjectFifoStatefulTransform.cpp        | 214 +++++++++---------
 1 file changed, 107 insertions(+), 107 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 437ec55fb5..c96730808d 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -1089,113 +1089,113 @@ struct AIEObjectFifoStatefulTransformPass
             fifoSizes[{op, port}] = op.size();
         });
         builder.setInsertionPoint(coreOp);
-        auto memrefTy =
-            MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
-                            builder.getIndexType());
-        auto globalNextIndex = builder.create<BufferOp>(
-            builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
-            /*sym_name*/ nullptr, /*address*/ nullptr,
-            /*initial_value*/ nullptr, /*mem_bank*/ nullptr);
-
-        // Initialize all counters in the global buffers to 0.
-        // Also, keep a map of the ConstantOps for the indices per OF
-        // and a map with the ConstantOps for the sizes per OF.
-        std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
-                 arith::ConstantOp>
-            globalIndices;
-        std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
-                 arith::ConstantOp>
-            constantSizes;
-        int index = 0;
-        builder.setInsertionPointToStart(&(coreOp.getBody().front()));
-        Value initVal = builder.create<arith::ConstantOp>(
-            builder.getUnknownLoc(), builder.getIndexAttr(0));
-        for (auto i : fifoSizes) {
-          auto indexOp = builder.create<arith::ConstantOp>(
-              initVal.getLoc(), builder.getIndexAttr(index));
-          globalIndices[i.first] = indexOp;
-          index++;
-          auto size = builder.create<arith::ConstantOp>(
-              indexOp.getLoc(), builder.getIndexAttr(i.second));
-          constantSizes[i.first] = size;
-          builder.create<memref::StoreOp>(
-              size.getLoc(), initVal, globalNextIndex,
-              ValueRange(ArrayRef({indexOp.getResult()})));
-        }
-
-        // Walk the code:
-        // - after each ObjectFifoReleaseOp:
-        //    - globalNextIndex: add #rel modulo objfifo depth
-        // - before each ObjectFifoAcquireOp:
-        //    - globalNextIndex: load index and use it to index_switch (one
-        //    IndexSwithOp per AccessOp)
-        WalkResult res = coreOp.walk([&](Operation *op) {
-          // if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
-          //   ObjectFifoCreateOp createOp = relOp.getObjectFifo();
-          //   ObjectFifoPort port = relOp.getPort();
-          //   updateGlobalNextIndex(builder, relOp, globalNextIndex,
-          //                         globalIndices[{createOp, port}],
-          //                         constantSizes[{createOp, port}]);
-          // }
-          if (auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op)) {
-            std::vector<ObjectFifoSubviewAccessOp> accessOps;
-            for (auto u : acqOp->getUsers())
-              if (auto accessOp = dyn_cast<ObjectFifoSubviewAccessOp>(u))
-                accessOps.push_back(accessOp);
-
-            for (auto accessOp : accessOps) {
-              ObjectFifoCreateOp createOp = acqOp.getObjectFifo();
-              ObjectFifoPort port = acqOp.getPort();
-
-              // Single switch case
-              if (fifoSizes[{createOp, port}] == 1)
-                return WalkResult::advance();
-
-              // Create a switch for each subview access
-              builder.setInsertionPointAfter(accessOp);
-              auto switchIndex = builder.create<memref::LoadOp>(
-                  builder.getUnknownLoc(), globalNextIndex,
-                  ValueRange(
-                      ArrayRef({globalIndices[{createOp, port}].getResult()})));
-              unsigned caseRegionCounts = fifoSizes[{createOp, port}];
-              SmallVector<int64_t, 4> caseValues;
-              for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
-                caseValues.push_back(i);
-              }
-              auto cases =
-                  DenseI64ArrayAttr::get(builder.getContext(), caseValues);
-              auto switchOp = builder.create<scf::IndexSwitchOp>(
-                  switchIndex.getLoc(),
-                  TypeRange({buffersPerFifo[createOp][0].getType()}),
-                  switchIndex, cases, caseRegionCounts);
-              // Create default case of IndexSwitchOp
-              builder.createBlock(&switchOp.getDefaultRegion());
-              auto bufferIndex = (accessOp.getIndex()) % createOp.size();
-              builder.setInsertionPointToStart(&(switchOp.getDefaultBlock()));
-              builder.create<scf::YieldOp>(
-                  builder.getUnknownLoc(),
-                  buffersPerFifo[createOp][bufferIndex].getResult());
-              for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
-                // Create other cases of IndexSwitchOp
-                builder.createBlock(&switchOp.getCaseRegions()[i]);
-                builder.setInsertionPoint(&switchOp.getCaseBlock(i),
-                                          switchOp.getCaseBlock(i).begin());
-                int bufferToBeAccesed =
-                    (accessOp.getIndex() + i) % fifoSizes[{createOp, port}];
-                builder.create<scf::YieldOp>(
-                    switchOp.getCaseRegions()[i].getLoc(),
-                    buffersPerFifo[createOp][bufferToBeAccesed].getResult());
-              }
-
-              // Replace all uses of accessed objectfifo buffers with
-              // results of switchOps
-              accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0));
-            }
-          }
-          return WalkResult::advance();
-        });
-        if (res.wasInterrupted())
-          return failure();
+        // auto memrefTy =
+        //     MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
+        //                     builder.getIndexType());
+        // auto globalNextIndex = builder.create<BufferOp>(
+        //     builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
+        //     /*sym_name*/ nullptr, /*address*/ nullptr,
+        //     /*initial_value*/ nullptr, /*mem_bank*/ nullptr);
+
+        // // Initialize all counters in the global buffers to 0.
+        // // Also, keep a map of the ConstantOps for the indices per OF
+        // // and a map with the ConstantOps for the sizes per OF.
+        // std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
+        //          arith::ConstantOp>
+        //     globalIndices;
+        // std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
+        //          arith::ConstantOp>
+        //     constantSizes;
+        // int index = 0;
+        // builder.setInsertionPointToStart(&(coreOp.getBody().front()));
+        // Value initVal = builder.create<arith::ConstantOp>(
+        //     builder.getUnknownLoc(), builder.getIndexAttr(0));
+        // for (auto i : fifoSizes) {
+        //   auto indexOp = builder.create<arith::ConstantOp>(
+        //       initVal.getLoc(), builder.getIndexAttr(index));
+        //   globalIndices[i.first] = indexOp;
+        //   index++;
+        //   auto size = builder.create<arith::ConstantOp>(
+        //       indexOp.getLoc(), builder.getIndexAttr(i.second));
+        //   constantSizes[i.first] = size;
+        //   builder.create<memref::StoreOp>(
+        //       size.getLoc(), initVal, globalNextIndex,
+        //       ValueRange(ArrayRef({indexOp.getResult()})));
+        // }
+
+        // // Walk the code:
+        // // - after each ObjectFifoReleaseOp:
+        // //    - globalNextIndex: add #rel modulo objfifo depth
+        // // - before each ObjectFifoAcquireOp:
+        // //    - globalNextIndex: load index and use it to index_switch (one
+        // //    IndexSwithOp per AccessOp)
+        // WalkResult res = coreOp.walk([&](Operation *op) {
+        //   if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
+        //     ObjectFifoCreateOp createOp = relOp.getObjectFifo();
+        //     ObjectFifoPort port = relOp.getPort();
+        //     updateGlobalNextIndex(builder, relOp, globalNextIndex,
+        //                           globalIndices[{createOp, port}],
+        //                           constantSizes[{createOp, port}]);
+        //   }
+        //   if (auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op)) {
+        //     std::vector<ObjectFifoSubviewAccessOp> accessOps;
+        //     for (auto u : acqOp->getUsers())
+        //       if (auto accessOp = dyn_cast<ObjectFifoSubviewAccessOp>(u))
+        //         accessOps.push_back(accessOp);
+
+        //     for (auto accessOp : accessOps) {
+        //       ObjectFifoCreateOp createOp = acqOp.getObjectFifo();
+        //       ObjectFifoPort port = acqOp.getPort();
+
+        //       // Single switch case
+        //       if (fifoSizes[{createOp, port}] == 1)
+        //         return WalkResult::advance();
+
+        //       // Create a switch for each subview access
+        //       builder.setInsertionPointAfter(accessOp);
+        //       auto switchIndex = builder.create<memref::LoadOp>(
+        //           builder.getUnknownLoc(), globalNextIndex,
+        //           ValueRange(
+        //               ArrayRef({globalIndices[{createOp, port}].getResult()})));
+        //       unsigned caseRegionCounts = fifoSizes[{createOp, port}];
+        //       SmallVector<int64_t, 4> caseValues;
+        //       for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
+        //         caseValues.push_back(i);
+        //       }
+        //       auto cases =
+        //           DenseI64ArrayAttr::get(builder.getContext(), caseValues);
+        //       auto switchOp = builder.create<scf::IndexSwitchOp>(
+        //           switchIndex.getLoc(),
+        //           TypeRange({buffersPerFifo[createOp][0].getType()}),
+        //           switchIndex, cases, caseRegionCounts);
+        //       // Create default case of IndexSwitchOp
+        //       builder.createBlock(&switchOp.getDefaultRegion());
+        //       auto bufferIndex = (accessOp.getIndex()) % createOp.size();
+        //       builder.setInsertionPointToStart(&(switchOp.getDefaultBlock()));
+        //       builder.create<scf::YieldOp>(
+        //           builder.getUnknownLoc(),
+        //           buffersPerFifo[createOp][bufferIndex].getResult());
+        //       for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
+        //         // Create other cases of IndexSwitchOp
+        //         builder.createBlock(&switchOp.getCaseRegions()[i]);
+        //         builder.setInsertionPoint(&switchOp.getCaseBlock(i),
+        //                                   switchOp.getCaseBlock(i).begin());
+        //         int bufferToBeAccesed =
+        //             (accessOp.getIndex() + i) % fifoSizes[{createOp, port}];
+        //         builder.create<scf::YieldOp>(
+        //             switchOp.getCaseRegions()[i].getLoc(),
+        //             buffersPerFifo[createOp][bufferToBeAccesed].getResult());
+        //       }
+
+        //       // Replace all uses of accessed objectfifo buffers with
+        //       // results of switchOps
+        //       accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0));
+        //     }
+        //   }
+        //   return WalkResult::advance();
+        // });
+        // if (res.wasInterrupted())
+        //   return failure();
       }
     }
     return success();

From 6d3f7502c6d4a25848a364f07302e56727d93dbc Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 13:06:33 -0700
Subject: [PATCH 35/46] Confirming that global buffer is the problem

---
 .../AIEObjectFifoStatefulTransform.cpp        | 214 +++++++++---------
 1 file changed, 107 insertions(+), 107 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index c96730808d..5af8b87fde 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -1089,113 +1089,113 @@ struct AIEObjectFifoStatefulTransformPass
             fifoSizes[{op, port}] = op.size();
         });
         builder.setInsertionPoint(coreOp);
-        // auto memrefTy =
-        //     MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
-        //                     builder.getIndexType());
-        // auto globalNextIndex = builder.create<BufferOp>(
-        //     builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
-        //     /*sym_name*/ nullptr, /*address*/ nullptr,
-        //     /*initial_value*/ nullptr, /*mem_bank*/ nullptr);
-
-        // // Initialize all counters in the global buffers to 0.
-        // // Also, keep a map of the ConstantOps for the indices per OF
-        // // and a map with the ConstantOps for the sizes per OF.
-        // std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
-        //          arith::ConstantOp>
-        //     globalIndices;
-        // std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
-        //          arith::ConstantOp>
-        //     constantSizes;
-        // int index = 0;
-        // builder.setInsertionPointToStart(&(coreOp.getBody().front()));
-        // Value initVal = builder.create<arith::ConstantOp>(
-        //     builder.getUnknownLoc(), builder.getIndexAttr(0));
-        // for (auto i : fifoSizes) {
-        //   auto indexOp = builder.create<arith::ConstantOp>(
-        //       initVal.getLoc(), builder.getIndexAttr(index));
-        //   globalIndices[i.first] = indexOp;
-        //   index++;
-        //   auto size = builder.create<arith::ConstantOp>(
-        //       indexOp.getLoc(), builder.getIndexAttr(i.second));
-        //   constantSizes[i.first] = size;
-        //   builder.create<memref::StoreOp>(
-        //       size.getLoc(), initVal, globalNextIndex,
-        //       ValueRange(ArrayRef({indexOp.getResult()})));
-        // }
-
-        // // Walk the code:
-        // // - after each ObjectFifoReleaseOp:
-        // //    - globalNextIndex: add #rel modulo objfifo depth
-        // // - before each ObjectFifoAcquireOp:
-        // //    - globalNextIndex: load index and use it to index_switch (one
-        // //    IndexSwithOp per AccessOp)
-        // WalkResult res = coreOp.walk([&](Operation *op) {
-        //   if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
-        //     ObjectFifoCreateOp createOp = relOp.getObjectFifo();
-        //     ObjectFifoPort port = relOp.getPort();
-        //     updateGlobalNextIndex(builder, relOp, globalNextIndex,
-        //                           globalIndices[{createOp, port}],
-        //                           constantSizes[{createOp, port}]);
-        //   }
-        //   if (auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op)) {
-        //     std::vector<ObjectFifoSubviewAccessOp> accessOps;
-        //     for (auto u : acqOp->getUsers())
-        //       if (auto accessOp = dyn_cast<ObjectFifoSubviewAccessOp>(u))
-        //         accessOps.push_back(accessOp);
-
-        //     for (auto accessOp : accessOps) {
-        //       ObjectFifoCreateOp createOp = acqOp.getObjectFifo();
-        //       ObjectFifoPort port = acqOp.getPort();
-
-        //       // Single switch case
-        //       if (fifoSizes[{createOp, port}] == 1)
-        //         return WalkResult::advance();
-
-        //       // Create a switch for each subview access
-        //       builder.setInsertionPointAfter(accessOp);
-        //       auto switchIndex = builder.create<memref::LoadOp>(
-        //           builder.getUnknownLoc(), globalNextIndex,
-        //           ValueRange(
-        //               ArrayRef({globalIndices[{createOp, port}].getResult()})));
-        //       unsigned caseRegionCounts = fifoSizes[{createOp, port}];
-        //       SmallVector<int64_t, 4> caseValues;
-        //       for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
-        //         caseValues.push_back(i);
-        //       }
-        //       auto cases =
-        //           DenseI64ArrayAttr::get(builder.getContext(), caseValues);
-        //       auto switchOp = builder.create<scf::IndexSwitchOp>(
-        //           switchIndex.getLoc(),
-        //           TypeRange({buffersPerFifo[createOp][0].getType()}),
-        //           switchIndex, cases, caseRegionCounts);
-        //       // Create default case of IndexSwitchOp
-        //       builder.createBlock(&switchOp.getDefaultRegion());
-        //       auto bufferIndex = (accessOp.getIndex()) % createOp.size();
-        //       builder.setInsertionPointToStart(&(switchOp.getDefaultBlock()));
-        //       builder.create<scf::YieldOp>(
-        //           builder.getUnknownLoc(),
-        //           buffersPerFifo[createOp][bufferIndex].getResult());
-        //       for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
-        //         // Create other cases of IndexSwitchOp
-        //         builder.createBlock(&switchOp.getCaseRegions()[i]);
-        //         builder.setInsertionPoint(&switchOp.getCaseBlock(i),
-        //                                   switchOp.getCaseBlock(i).begin());
-        //         int bufferToBeAccesed =
-        //             (accessOp.getIndex() + i) % fifoSizes[{createOp, port}];
-        //         builder.create<scf::YieldOp>(
-        //             switchOp.getCaseRegions()[i].getLoc(),
-        //             buffersPerFifo[createOp][bufferToBeAccesed].getResult());
-        //       }
-
-        //       // Replace all uses of accessed objectfifo buffers with
-        //       // results of switchOps
-        //       accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0));
-        //     }
-        //   }
-        //   return WalkResult::advance();
-        // });
-        // if (res.wasInterrupted())
-        //   return failure();
+        auto memrefTy =
+            MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
+                            builder.getIndexType());
+        auto globalNextIndex = builder.create<BufferOp>(
+            builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
+            /*sym_name*/ nullptr, /*address*/ nullptr,
+            /*initial_value*/ nullptr, /*mem_bank*/ nullptr);
+
+        // Initialize all counters in the global buffers to 0.
+        // Also, keep a map of the ConstantOps for the indices per OF
+        // and a map with the ConstantOps for the sizes per OF.
+        std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
+                 arith::ConstantOp>
+            globalIndices;
+        std::map<std::pair<ObjectFifoCreateOp, ObjectFifoPort>,
+                 arith::ConstantOp>
+            constantSizes;
+        int index = 0;
+        builder.setInsertionPointToStart(&(coreOp.getBody().front()));
+        Value initVal = builder.create<arith::ConstantOp>(
+            builder.getUnknownLoc(), builder.getIndexAttr(0));
+        for (auto i : fifoSizes) {
+          auto indexOp = builder.create<arith::ConstantOp>(
+              initVal.getLoc(), builder.getIndexAttr(index));
+          globalIndices[i.first] = indexOp;
+          index++;
+          auto size = builder.create<arith::ConstantOp>(
+              indexOp.getLoc(), builder.getIndexAttr(i.second));
+          constantSizes[i.first] = size;
+          // builder.create<memref::StoreOp>(
+          //     size.getLoc(), initVal, globalNextIndex,
+          //     ValueRange(ArrayRef({indexOp.getResult()})));
+        }
+
+        // Walk the code:
+        // - after each ObjectFifoReleaseOp:
+        //    - globalNextIndex: add #rel modulo objfifo depth
+        // - before each ObjectFifoAcquireOp:
+        //    - globalNextIndex: load index and use it to index_switch (one
+        //    IndexSwithOp per AccessOp)
+        WalkResult res = coreOp.walk([&](Operation *op) {
+          if (auto relOp = dyn_cast<ObjectFifoReleaseOp>(op)) {
+            ObjectFifoCreateOp createOp = relOp.getObjectFifo();
+            ObjectFifoPort port = relOp.getPort();
+            updateGlobalNextIndex(builder, relOp, globalNextIndex,
+                                  globalIndices[{createOp, port}],
+                                  constantSizes[{createOp, port}]);
+          }
+          if (auto acqOp = dyn_cast<ObjectFifoAcquireOp>(op)) {
+            std::vector<ObjectFifoSubviewAccessOp> accessOps;
+            for (auto u : acqOp->getUsers())
+              if (auto accessOp = dyn_cast<ObjectFifoSubviewAccessOp>(u))
+                accessOps.push_back(accessOp);
+
+            for (auto accessOp : accessOps) {
+              ObjectFifoCreateOp createOp = acqOp.getObjectFifo();
+              ObjectFifoPort port = acqOp.getPort();
+
+              // Single switch case
+              if (fifoSizes[{createOp, port}] == 1)
+                return WalkResult::advance();
+
+              // Create a switch for each subview access
+              builder.setInsertionPointAfter(accessOp);
+              auto switchIndex = builder.create<memref::LoadOp>(
+                  builder.getUnknownLoc(), globalNextIndex,
+                  ValueRange(
+                      ArrayRef({globalIndices[{createOp, port}].getResult()})));
+              unsigned caseRegionCounts = fifoSizes[{createOp, port}];
+              SmallVector<int64_t, 4> caseValues;
+              for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
+                caseValues.push_back(i);
+              }
+              auto cases =
+                  DenseI64ArrayAttr::get(builder.getContext(), caseValues);
+              auto switchOp = builder.create<scf::IndexSwitchOp>(
+                  switchIndex.getLoc(),
+                  TypeRange({buffersPerFifo[createOp][0].getType()}),
+                  switchIndex, cases, caseRegionCounts);
+              // Create default case of IndexSwitchOp
+              builder.createBlock(&switchOp.getDefaultRegion());
+              auto bufferIndex = (accessOp.getIndex()) % createOp.size();
+              builder.setInsertionPointToStart(&(switchOp.getDefaultBlock()));
+              builder.create<scf::YieldOp>(
+                  builder.getUnknownLoc(),
+                  buffersPerFifo[createOp][bufferIndex].getResult());
+              for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {
+                // Create other cases of IndexSwitchOp
+                builder.createBlock(&switchOp.getCaseRegions()[i]);
+                builder.setInsertionPoint(&switchOp.getCaseBlock(i),
+                                          switchOp.getCaseBlock(i).begin());
+                int bufferToBeAccesed =
+                    (accessOp.getIndex() + i) % fifoSizes[{createOp, port}];
+                builder.create<scf::YieldOp>(
+                    switchOp.getCaseRegions()[i].getLoc(),
+                    buffersPerFifo[createOp][bufferToBeAccesed].getResult());
+              }
+
+              // Replace all uses of accessed objectfifo buffers with
+              // results of switchOps
+              accessOp.getOutput().replaceAllUsesWith(switchOp.getResult(0));
+            }
+          }
+          return WalkResult::advance();
+        });
+        if (res.wasInterrupted())
+          return failure();
       }
     }
     return success();

From 71cbd258186b6810302e0392f96bb9f510bc471b Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 14:10:08 -0700
Subject: [PATCH 36/46] Solution worked locally

---
 .../AIEObjectFifoStatefulTransform.cpp        | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
index 5af8b87fde..6e9af55a07 100644
--- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
@@ -1061,7 +1061,7 @@ struct AIEObjectFifoStatefulTransformPass
         builder.getUnknownLoc(), globalNextIndex,
         ValueRange(ArrayRef({index.getResult()})));
     Value val = builder.create<arith::ConstantOp>(
-        oldCounter.getLoc(), builder.getIndexAttr(relOp.getSize()));
+        oldCounter.getLoc(), builder.getI32IntegerAttr(relOp.getSize()));
     Value sum = builder.create<arith::AddIOp>(val.getLoc(), oldCounter, val);
     Value newCounter = builder.create<arith::RemSIOp>(sum.getLoc(), sum, size);
     builder.create<memref::StoreOp>(size.getLoc(), newCounter, globalNextIndex,
@@ -1091,7 +1091,7 @@ struct AIEObjectFifoStatefulTransformPass
         builder.setInsertionPoint(coreOp);
         auto memrefTy =
             MemRefType::get(SmallVector<int64_t>{(int64_t)fifoSizes.size()},
-                            builder.getIndexType());
+                            builder.getI32Type());
         auto globalNextIndex = builder.create<BufferOp>(
             builder.getUnknownLoc(), memrefTy, coreOp.getTile(),
             /*sym_name*/ nullptr, /*address*/ nullptr,
@@ -1109,18 +1109,18 @@ struct AIEObjectFifoStatefulTransformPass
         int index = 0;
         builder.setInsertionPointToStart(&(coreOp.getBody().front()));
         Value initVal = builder.create<arith::ConstantOp>(
-            builder.getUnknownLoc(), builder.getIndexAttr(0));
+            builder.getUnknownLoc(), builder.getI32IntegerAttr(0));
         for (auto i : fifoSizes) {
           auto indexOp = builder.create<arith::ConstantOp>(
               initVal.getLoc(), builder.getIndexAttr(index));
           globalIndices[i.first] = indexOp;
           index++;
           auto size = builder.create<arith::ConstantOp>(
-              indexOp.getLoc(), builder.getIndexAttr(i.second));
+              indexOp.getLoc(), builder.getI32IntegerAttr(i.second));
           constantSizes[i.first] = size;
-          // builder.create<memref::StoreOp>(
-          //     size.getLoc(), initVal, globalNextIndex,
-          //     ValueRange(ArrayRef({indexOp.getResult()})));
+          builder.create<memref::StoreOp>(
+              size.getLoc(), initVal, globalNextIndex,
+              ValueRange(ArrayRef({indexOp.getResult()})));
         }
 
         // Walk the code:
@@ -1153,10 +1153,13 @@ struct AIEObjectFifoStatefulTransformPass
 
               // Create a switch for each subview access
               builder.setInsertionPointAfter(accessOp);
-              auto switchIndex = builder.create<memref::LoadOp>(
+              auto switchIndexAsInteger = builder.create<memref::LoadOp>(
                   builder.getUnknownLoc(), globalNextIndex,
                   ValueRange(
                       ArrayRef({globalIndices[{createOp, port}].getResult()})));
+              auto switchIndex = builder.create<arith::IndexCastOp>(
+                  builder.getUnknownLoc(), builder.getIndexType(),
+                  switchIndexAsInteger);
               unsigned caseRegionCounts = fifoSizes[{createOp, port}];
               SmallVector<int64_t, 4> caseValues;
               for (int i = 0; i < fifoSizes[{createOp, port}]; ++i) {

From a6f7e11ba6bab6f4ffc1438e7aaaca761dc122a8 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 14:18:48 -0700
Subject: [PATCH 37/46] Removing unnecessary changes

---
 .../npu-xrt/dynamic_object_fifo/nested_loops/aie2.py | 12 ++++++------
 .../dynamic_object_fifo/sliding_window/aie2.py       |  4 ++--
 .../sliding_window_conditional/aie2.py               |  4 ++--
 .../two_core_sliding_window/aie2.py                  |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 8ae31698ab..e9dc107466 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -40,8 +40,8 @@ def device_body():
             ComputeTile = tile(col, 2)
 
             # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 1, tensor_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 1, tensor_ty)
+            of_in = object_fifo("in", ShimTile, ComputeTile, 2, tensor_ty)
+            of_out = object_fifo("out", ComputeTile, ShimTile, 2, tensor_ty)
 
             # AIE Core Function declarations
             passthrough_10_i32 = external_func(
@@ -53,10 +53,10 @@ def device_body():
             def core_body():
                 for _ in range_(5):
                     elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
-                    # for _ in range_(5):
-                    #     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    #     passthrough_10_i32(elemIn, elemOut)
-                    #     of_out.release(ObjectFifoPort.Produce, 1)
+                    for _ in range_(5):
+                        elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
+                        passthrough_10_i32(elemIn, elemOut)
+                        of_out.release(ObjectFifoPort.Produce, 1)
                     of_in.release(ObjectFifoPort.Consume, 1)
 
             # To/from AIE-array data movement
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 7baa366452..37222b8a78 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -5,13 +5,13 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, chess
+# REQUIRES: ryzen_ai, valid_xchess_license
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+# RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index 3f1159da13..366552907b 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -5,13 +5,13 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, chess
+# REQUIRES: ryzen_ai, valid_xchess_license
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+# RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index 3c53c21cd8..d0b0f53d36 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -5,13 +5,13 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, chess
+# REQUIRES: ryzen_ai, valid_xchess_license
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+# RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
 import numpy as np
 

From 265dde0447a271c99dfe70e6604a445cddc218eb Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 14:34:04 -0700
Subject: [PATCH 38/46] Removed dyn_objFifo examples from programming_examples
 and modified tests: 2 sliding_window examples are still failing

---
 .../dyn_objFifo/lit.local.cfg                 |  11 --
 .../dyn_objFifo/nested_loops/Makefile         |  66 --------
 .../dyn_objFifo/nested_loops/aie.mlir         |  36 -----
 .../dyn_objFifo/nested_loops/run_makefile.lit |   9 --
 .../dyn_objFifo/sliding_window/Makefile       |  66 --------
 .../dyn_objFifo/sliding_window/aie2.py        |  74 ---------
 .../dyn_objFifo/sliding_window/kernel.cc      |  24 ---
 .../sliding_window/run_makefile.lit           |   9 --
 .../dyn_objFifo/sliding_window/test.cpp       | 138 -----------------
 .../sliding_window_conditional/Makefile       |  66 --------
 .../sliding_window_conditional/aie2.py        |  73 ---------
 .../sliding_window_conditional/kernel.cc      |  24 ---
 .../sliding_window_conditional/test.cpp       | 138 -----------------
 .../two_core_sliding_window/Makefile          |  66 --------
 .../two_core_sliding_window/aie2.py           |  90 -----------
 .../two_core_sliding_window/kernel.cc         |  38 -----
 .../two_core_sliding_window/test.cpp          | 138 -----------------
 .../dynamic_lowering_flag_test.mlir           | 146 ++++++++++++++----
 .../dynamic_lowering_test.mlir                | 114 +++++++-------
 19 files changed, 175 insertions(+), 1151 deletions(-)
 delete mode 100644 programming_examples/dyn_objFifo/lit.local.cfg
 delete mode 100644 programming_examples/dyn_objFifo/nested_loops/Makefile
 delete mode 100644 programming_examples/dyn_objFifo/nested_loops/aie.mlir
 delete mode 100644 programming_examples/dyn_objFifo/nested_loops/run_makefile.lit
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window/Makefile
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window/aie2.py
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window/kernel.cc
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window/run_makefile.lit
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window/test.cpp
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
 delete mode 100644 programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
 delete mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
 delete mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
 delete mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
 delete mode 100644 programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp

diff --git a/programming_examples/dyn_objFifo/lit.local.cfg b/programming_examples/dyn_objFifo/lit.local.cfg
deleted file mode 100644
index 64cca87fdf..0000000000
--- a/programming_examples/dyn_objFifo/lit.local.cfg
+++ /dev/null
@@ -1,11 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2023 AMD Inc.
-
-config.suffixes = ['.lit']
-
-if 'AIE2' not in config.vitis_components:
-    config.unsupported = True
diff --git a/programming_examples/dyn_objFifo/nested_loops/Makefile b/programming_examples/dyn_objFifo/nested_loops/Makefile
deleted file mode 100644
index 7a3b9545a5..0000000000
--- a/programming_examples/dyn_objFifo/nested_loops/Makefile
+++ /dev/null
@@ -1,66 +0,0 @@
-##===- Makefile -----------------------------------------------------------===##
-# 
-# This file licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# 
-##===----------------------------------------------------------------------===##
-
-# ---
-
-# The following environment variables that point to the Xilinx runtime (XRT)
-# should be set up by an environment setup script already.
-XILINX_XRT?=/opt/xilinx/xrt
-XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
-
-# ---
-
-srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-
-XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
-XILINX_XRT_LIB?=${XILINX_XRT}/lib
-
-CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
-XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
-XRT_LIBS=-lxrt_coreutil
-CXX=g++-13 -ggdb 
-
-#mlir_target?=build/aie.mlir
-xclbin_target?=build/final.xclbin
-insts_target?=build/insts.txt
-host_target?=build/test
-
-.PHONY: all
-all: ${xclbin_target} ${host_target}
-
-# build/aie.mlir: ${srcdir}/aie2.py
-# 	mkdir -p ${@D}
-# 	python3 $< > $@
-
-build/kernel.o: ${srcdir}/kernel.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
-
-${xclbin_target}: ${srcdir}/aie.mlir build/kernel.o
-	mkdir -p ${@D}
-	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${srcdir}/${<:%=../%}
-
-${host_target}: ${srcdir}/test.cpp ${xclbin_target}
-	mkdir -p ${@D}
-	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
-
-.PHONY: run
-run: ${host_target}
-	./${host_target}
-
-xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
-.PHONY: sign
-sign: ${xclbin_target}
-	${xclbin_sign} -dev Phoenix -xclbin $<
-
-.PHONY: clean
-clean:
-	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/nested_loops/aie.mlir b/programming_examples/dyn_objFifo/nested_loops/aie.mlir
deleted file mode 100644
index 1fb0cda89c..0000000000
--- a/programming_examples/dyn_objFifo/nested_loops/aie.mlir
+++ /dev/null
@@ -1,36 +0,0 @@
-module {
-  aie.device(npu1_1col) {
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<10xi32>> 
-    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<10xi32>> 
-    func.func private @passthrough_10_i32(memref<10xi32>, memref<10xi32>)
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c5 = arith.constant 5 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c5 step %c1 {
-        %0 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<10xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
-        %c0_0 = arith.constant 0 : index
-        %c5_1 = arith.constant 5 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c5_1 step %c1_2 {
-          %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<10xi32>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<10xi32>> -> memref<10xi32>
-          func.call @passthrough_10_i32(%1, %3) : (memref<10xi32>, memref<10xi32>) -> ()
-          aie.objectfifo.release @out(Produce, 1)
-        }
-        aie.objectfifo.release @in(Consume, 1)
-      }
-      aie.end
-    } {link_with = "kernel.o"}
-    aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 50][0, 0, 0, 1]) {id = 1 : i64, issue_token = true, metadata = @in} : memref<10xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 250][0, 0, 0, 1]) {id = 0 : i64, metadata = @out} : memref<10xi32>
-      aiex.npu.dma_wait {symbol = @in}
-      aiex.npu.dma_wait {symbol = @out}
-    }
-  }
-}
-
diff --git a/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit b/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit
deleted file mode 100644
index 507b70720a..0000000000
--- a/programming_examples/dyn_objFifo/nested_loops/run_makefile.lit
+++ /dev/null
@@ -1,9 +0,0 @@
-// (c) Copyright 2024 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// REQUIRES: ryzen_ai, xchess 
-//
-// RUN: make -f %S/Makefile clean
-// RUN: make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window/Makefile b/programming_examples/dyn_objFifo/sliding_window/Makefile
deleted file mode 100644
index 0216ac75da..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window/Makefile
+++ /dev/null
@@ -1,66 +0,0 @@
-##===- Makefile -----------------------------------------------------------===##
-# 
-# This file licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# 
-##===----------------------------------------------------------------------===##
-
-# ---
-
-# The following environment variables that point to the Xilinx runtime (XRT)
-# should be set up by an environment setup script already.
-XILINX_XRT?=/opt/xilinx/xrt
-XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
-
-# ---
-
-srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-
-XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
-XILINX_XRT_LIB?=${XILINX_XRT}/lib
-
-CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
-XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
-XRT_LIBS=-lxrt_coreutil
-CXX=g++-13 -ggdb 
-
-#mlir_target?=build/aie.mlir
-xclbin_target?=build/final.xclbin
-insts_target?=build/insts.txt
-host_target?=build/test
-
-.PHONY: all
-all: ${xclbin_target} ${host_target}
-
-build/aie.mlir: ${srcdir}/aie2.py
-	mkdir -p ${@D}
-	python3 $< > $@
-
-build/kernel.o: ${srcdir}/kernel.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
-
-${xclbin_target}: build/aie.mlir build/kernel.o
-	mkdir -p ${@D}
-	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
-
-${host_target}: ${srcdir}/test.cpp ${xclbin_target}
-	mkdir -p ${@D}
-	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
-
-.PHONY: run
-run: ${host_target}
-	./${host_target}
-
-xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
-.PHONY: sign
-sign: ${xclbin_target}
-	${xclbin_sign} -dev Phoenix -xclbin $<
-
-.PHONY: clean
-clean:
-	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window/aie2.py b/programming_examples/dyn_objFifo/sliding_window/aie2.py
deleted file mode 100644
index 57d5efb1a5..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window/aie2.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# dynamic_object_fifo/sliding_window/aie2.py -*- Python -*-
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.helpers.dialects.ext.scf import _for as range_
-from aie.extras.context import mlir_mod_ctx
-
-N = 100
-n_rows = 10
-dev = AIEDevice.npu1_1col
-col = 0
-
-
-def sliding_window():
-    with mlir_mod_ctx() as ctx:
-
-        @device(dev)
-        def device_body():
-            memRef_ty = T.memref(N // n_rows, T.i32())
-
-            # Tile declarations
-            ShimTile = tile(col, 0)
-            ComputeTile = tile(col, 2)
-
-            # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 3, memRef_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, memRef_ty)
-
-            # AIE Core Function declarations
-            add_10_i32 = external_func(
-                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
-            )
-
-            # Set up compute tiles
-
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
-                elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
-                call(add_10_i32, [elemInPre, elemInPre, elemOutPre])
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-                for _ in range_(8):
-                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
-                    call(add_10_i32, [elemsIn[0], elemsIn[1], elemOut])
-                    of_in.release(ObjectFifoPort.Consume, 1)
-                    of_out.release(ObjectFifoPort.Produce, 1)
-
-                elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1)
-                elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
-                call(add_10_i32, [elemsInPost[0], elemsInPost[1], elemOutPost])
-                of_in.release(ObjectFifoPort.Consume, 2)
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            tensor_ty = T.memref(N, T.i32())
-
-            @runtime_sequence(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
-    print(ctx.module)
-
-
-sliding_window()
diff --git a/programming_examples/dyn_objFifo/sliding_window/kernel.cc b/programming_examples/dyn_objFifo/sliding_window/kernel.cc
deleted file mode 100644
index ddb474e102..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window/kernel.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <aie_api/aie.hpp>
-
-template <typename T_in, typename T_out, unsigned long N>
-void add(const T_in *__restrict inA, const T_in *__restrict inB,
-         T_out *__restrict out) {
-  for (int i = 0; i < N; i++) {
-    out[i] = inA[i] + inB[i];
-  }
-}
-
-extern "C" {
-
-void add_10_i32(const int *__restrict inA, const int *__restrict inB,
-                int *__restrict out) {
-  add<int, int, 10>(inA, inB, out);
-}
-}
diff --git a/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit b/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit
deleted file mode 100644
index 507b70720a..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window/run_makefile.lit
+++ /dev/null
@@ -1,9 +0,0 @@
-// (c) Copyright 2024 Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// REQUIRES: ryzen_ai, xchess 
-//
-// RUN: make -f %S/Makefile clean
-// RUN: make -f %S/Makefile 
-// RUN: %run_on_npu make -f %S/Makefile run | FileCheck %s
-// CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window/test.cpp b/programming_examples/dyn_objFifo/sliding_window/test.cpp
deleted file mode 100644
index 3cd72ab880..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window/test.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <iomanip>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
-#endif
-
-#ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
-#endif
-
-#ifndef KERNEL_NAME
-#define KERNEL_NAME "MLIR_AIE"
-#endif
-
-#define INPUT_SIZE (100 * sizeof(int))  // in bytes
-#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
-#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
-#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
-  assert(instr_v.size() > 0);
-
-  // Get a device handle
-  unsigned int device_index = 0;
-  xrt::device device = xrt::device(device_index);
-
-  // Load the xclbin
-  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
-
-  // Get the kernel from the xclbin
-  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
-  xrt::xclbin::kernel xkernel = *std::find_if(
-      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
-        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
-      });
-  std::string kernel_name = xkernel.get_name();
-  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  auto kernel = xrt::kernel(context, kernel_name);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_input =
-      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_output =
-      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  int *buf_input = bo_input.map<int *>();
-  std::cout << std::endl << std::endl << "Input: " << std::endl;
-  for (int i = 0; i < INPUT_ROWS; i++) {
-    std::cout << "row " << i << " : ";
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
-      buf_input[i * INPUT_ROWS + j] = i;
-      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
-    }
-    std::cout << std::endl << std::endl;
-  }
-  int *buf_output = bo_output.map<int *>();
-  memset(buf_output, 0, OUTPUT_SIZE);
-
-  // Instruction buffer for DMA configuration
-  void *buf_instr = bo_instr.map<void *>();
-  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
-  ert_cmd_state r = run.wait();
-  if (r != ERT_CMD_STATE_COMPLETED) {
-    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
-    return 1;
-  }
-
-  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  bool pass = true;
-  std::cout << std::endl << "Output: " << std::endl;
-  for (int i = 0; i < OUTPUT_ROWS; i++) {
-    std::cout << "row " << i << std::endl;
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
-      int expected_output = 0;
-      if (i == 0) {
-        expected_output = buf_input[i * INPUT_ROWS] * 2;
-      } else {
-        expected_output =
-            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
-      }
-      std::cout << "expected: " << expected_output << ", ";
-      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
-      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
-    }
-    std::cout << std::endl << std::endl;
-  }
-  std::cout << std::endl << std::endl;
-  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
-
-  return 0;
-}
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile b/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
deleted file mode 100644
index 0216ac75da..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window_conditional/Makefile
+++ /dev/null
@@ -1,66 +0,0 @@
-##===- Makefile -----------------------------------------------------------===##
-# 
-# This file licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# 
-##===----------------------------------------------------------------------===##
-
-# ---
-
-# The following environment variables that point to the Xilinx runtime (XRT)
-# should be set up by an environment setup script already.
-XILINX_XRT?=/opt/xilinx/xrt
-XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
-
-# ---
-
-srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-
-XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
-XILINX_XRT_LIB?=${XILINX_XRT}/lib
-
-CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
-XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
-XRT_LIBS=-lxrt_coreutil
-CXX=g++-13 -ggdb 
-
-#mlir_target?=build/aie.mlir
-xclbin_target?=build/final.xclbin
-insts_target?=build/insts.txt
-host_target?=build/test
-
-.PHONY: all
-all: ${xclbin_target} ${host_target}
-
-build/aie.mlir: ${srcdir}/aie2.py
-	mkdir -p ${@D}
-	python3 $< > $@
-
-build/kernel.o: ${srcdir}/kernel.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
-
-${xclbin_target}: build/aie.mlir build/kernel.o
-	mkdir -p ${@D}
-	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
-
-${host_target}: ${srcdir}/test.cpp ${xclbin_target}
-	mkdir -p ${@D}
-	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
-
-.PHONY: run
-run: ${host_target}
-	./${host_target}
-
-xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
-.PHONY: sign
-sign: ${xclbin_target}
-	${xclbin_sign} -dev Phoenix -xclbin $<
-
-.PHONY: clean
-clean:
-	-rm -r build
\ No newline at end of file
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py b/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
deleted file mode 100644
index 83719bc8e8..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window_conditional/aie2.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# dynamic_object_fifo/sliding_window/aie2.py -*- Python -*-
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
-
-import numpy as np
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.helpers.dialects.ext.scf import _for as range_
-from aie.extras.context import mlir_mod_ctx
-
-N = 100
-n_rows = 10
-dev = AIEDevice.npu1_1col
-col = 0
-
-
-def sliding_window():
-    with mlir_mod_ctx() as ctx:
-
-        @device(dev)
-        def device_body():
-            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
-
-            # Tile declarations
-            ShimTile = tile(col, 0)
-            ComputeTile = tile(col, 2)
-
-            # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 3, subtensor_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, subtensor_ty)
-
-            # AIE Core Function declarations
-            add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
-            )
-
-            # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                for i in range_(10):
-                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    if i == 0:
-                        elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
-                        add_10_i32(elemInPre, elemInPre, elemOut)
-                    elif i == 9:
-                        elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsInPost[0], elemsInPost[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 2)
-                    else:
-                        elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsIn[0], elemsIn[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 1)
-
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
-
-            @runtime_sequence(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                dma_wait(of_out)
-
-    print(ctx.module)
-
-
-sliding_window()
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc b/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
deleted file mode 100644
index ddb474e102..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window_conditional/kernel.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <aie_api/aie.hpp>
-
-template <typename T_in, typename T_out, unsigned long N>
-void add(const T_in *__restrict inA, const T_in *__restrict inB,
-         T_out *__restrict out) {
-  for (int i = 0; i < N; i++) {
-    out[i] = inA[i] + inB[i];
-  }
-}
-
-extern "C" {
-
-void add_10_i32(const int *__restrict inA, const int *__restrict inB,
-                int *__restrict out) {
-  add<int, int, 10>(inA, inB, out);
-}
-}
diff --git a/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp b/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
deleted file mode 100644
index 3cd72ab880..0000000000
--- a/programming_examples/dyn_objFifo/sliding_window_conditional/test.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <iomanip>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
-#endif
-
-#ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
-#endif
-
-#ifndef KERNEL_NAME
-#define KERNEL_NAME "MLIR_AIE"
-#endif
-
-#define INPUT_SIZE (100 * sizeof(int))  // in bytes
-#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
-#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
-#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
-  assert(instr_v.size() > 0);
-
-  // Get a device handle
-  unsigned int device_index = 0;
-  xrt::device device = xrt::device(device_index);
-
-  // Load the xclbin
-  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
-
-  // Get the kernel from the xclbin
-  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
-  xrt::xclbin::kernel xkernel = *std::find_if(
-      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
-        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
-      });
-  std::string kernel_name = xkernel.get_name();
-  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  auto kernel = xrt::kernel(context, kernel_name);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_input =
-      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_output =
-      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  int *buf_input = bo_input.map<int *>();
-  std::cout << std::endl << std::endl << "Input: " << std::endl;
-  for (int i = 0; i < INPUT_ROWS; i++) {
-    std::cout << "row " << i << " : ";
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
-      buf_input[i * INPUT_ROWS + j] = i;
-      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
-    }
-    std::cout << std::endl << std::endl;
-  }
-  int *buf_output = bo_output.map<int *>();
-  memset(buf_output, 0, OUTPUT_SIZE);
-
-  // Instruction buffer for DMA configuration
-  void *buf_instr = bo_instr.map<void *>();
-  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
-  ert_cmd_state r = run.wait();
-  if (r != ERT_CMD_STATE_COMPLETED) {
-    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
-    return 1;
-  }
-
-  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  bool pass = true;
-  std::cout << std::endl << "Output: " << std::endl;
-  for (int i = 0; i < OUTPUT_ROWS; i++) {
-    std::cout << "row " << i << std::endl;
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
-      int expected_output = 0;
-      if (i == 0) {
-        expected_output = buf_input[i * INPUT_ROWS] * 2;
-      } else {
-        expected_output =
-            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
-      }
-      std::cout << "expected: " << expected_output << ", ";
-      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
-      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
-    }
-    std::cout << std::endl << std::endl;
-  }
-  std::cout << std::endl << std::endl;
-  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
-
-  return 0;
-}
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile b/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
deleted file mode 100644
index 4e423e1df1..0000000000
--- a/programming_examples/dyn_objFifo/two_core_sliding_window/Makefile
+++ /dev/null
@@ -1,66 +0,0 @@
-##===- Makefile -----------------------------------------------------------===##
-# 
-# This file licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# Copyright (C) 2024, Advanced Micro Devices, Inc.
-# 
-##===----------------------------------------------------------------------===##
-
-# ---
-
-# The following environment variables that point to the Xilinx runtime (XRT)
-# should be set up by an environment setup script already.
-XILINX_XRT?=/opt/xilinx/xrt
-XILINX_VITIS?=$(shell realpath $(dir $(shell which vitis))/../)
-
-# ---
-
-srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
-
-XILINX_XRT_INCLUDE?=${XILINX_XRT}/include
-XILINX_XRT_LIB?=${XILINX_XRT}/lib
-
-CHESSCCWRAP2_FLAGS=aie2 -I${XILINX_VITIS}/aietools/include 
-XRT_FLAGS=-I${XILINX_XRT_INCLUDE} -L${XILINX_XRT_LIB}
-XRT_LIBS=-lxrt_coreutil
-CXX=g++-13 -ggdb 
-
-#mlir_target?=build/aie.mlir
-xclbin_target?=build/final.xclbin
-insts_target?=build/insts.txt
-host_target?=build/test
-
-.PHONY: all
-all: ${xclbin_target} ${host_target}
-
-build/aie.mlir: ${srcdir}/aie2.py
-	mkdir -p ${@D}
-	python3 $< > $@
-
-build/kernel.o: ${srcdir}/kernel.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $< -o ${@F}
-
-${xclbin_target}: build/aie.mlir build/kernel.o
-	mkdir -p ${@D}
-	cd ${@D} && aiecc.py -v --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--dynamic-objFifos --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} ${<:%=../%}
-
-${host_target}: ${srcdir}/test.cpp ${xclbin_target}
-	mkdir -p ${@D}
-	${CXX} ${XRT_FLAGS} -DM=$M -DN=$N -o $@ $< ${XRT_LIBS}
-
-.PHONY: run
-run: ${host_target}
-	./${host_target}
-
-xclbin_sign=${XILINX_XRT}/amdxdna/setup_xclbin_firmware.sh 
-.PHONY: sign
-sign: ${xclbin_target}
-	${xclbin_sign} -dev Phoenix -xclbin $<
-
-.PHONY: clean
-clean:
-	-rm -r build
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py b/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
deleted file mode 100644
index c0d7c805ee..0000000000
--- a/programming_examples/dyn_objFifo/two_core_sliding_window/aie2.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# dynamic_object_fifo/two_core_sliding_window/aie2.py -*- Python -*-
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
-
-import numpy as np
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.helpers.dialects.ext.scf import _for as range_
-from aie.extras.context import mlir_mod_ctx
-
-N = 100
-n_rows = 10
-dev = AIEDevice.npu1_1col
-col = 0
-
-
-def two_core_sliding_window():
-    with mlir_mod_ctx() as ctx:
-
-        @device(dev)
-        def device_body():
-            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
-
-            # Tile declarations
-            ShimTile = tile(col, 0)
-            ComputeTile = tile(col, 2)
-            ComputeTile2 = tile(col, 4)
-
-            # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 2, subtensor_ty)
-            of_in2 = object_fifo("in2", ComputeTile, ComputeTile2, 3, subtensor_ty)
-            of_out = object_fifo("out", ComputeTile2, ShimTile, 2, subtensor_ty)
-
-            # AIE Core Function declarations
-            passthrough_10_i32 = external_func(
-                "passthrough_10_i32", inputs=[subtensor_ty, subtensor_ty]
-            )
-            add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
-            )
-
-            # Set up compute tiles
-
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                for _ in range_(10):
-                    elemOut = of_in2.acquire(ObjectFifoPort.Produce, 1)
-                    elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
-                    passthrough_10_i32(elemIn, elemOut)
-                    of_in.release(ObjectFifoPort.Consume, 1)
-                    of_in2.release(ObjectFifoPort.Produce, 1)
-
-            @core(ComputeTile2, "kernel.o")
-            def core_body():
-                elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
-                elemInPre = of_in2.acquire(ObjectFifoPort.Consume, 1)
-                add_10_i32(elemInPre, elemInPre, elemOutPre)
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-                for _ in range_(8):
-                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    elemsIn = of_in2.acquire(ObjectFifoPort.Consume, 2)
-                    add_10_i32(elemsIn[0], elemsIn[1], elemOut)
-                    of_in2.release(ObjectFifoPort.Consume, 1)
-                    of_out.release(ObjectFifoPort.Produce, 1)
-
-                elemOutPost = of_out.acquire(ObjectFifoPort.Produce, 1)
-                elemsInPost = of_in2.acquire(ObjectFifoPort.Consume, 2)
-                add_10_i32(elemsInPost[0], elemsInPost[1], elemOutPost)
-                of_in2.release(ObjectFifoPort.Consume, 2)
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
-
-            @runtime_sequence(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                dma_wait(of_out)
-
-    print(ctx.module)
-
-
-two_core_sliding_window()
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc b/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
deleted file mode 100644
index 7e4515193c..0000000000
--- a/programming_examples/dyn_objFifo/two_core_sliding_window/kernel.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <aie_api/aie.hpp>
-
-template <typename T_in, typename T_out, unsigned long N>
-void passthrough(const T_in *__restrict in, T_out *__restrict out) {
-  for (int i = 0; i < N; i++) {
-    out[i] = in[i];
-  }
-}
-
-extern "C" {
-
-void passthrough_10_i32(const int *__restrict in, int *__restrict out) {
-  passthrough<int, int, 10>(in, out);
-}
-}
-
-template <typename T_in, typename T_out, unsigned long N>
-void add(const T_in *__restrict inA, const T_in *__restrict inB,
-         T_out *__restrict out) {
-  for (int i = 0; i < N; i++) {
-    out[i] = inA[i] + inB[i];
-  }
-}
-
-extern "C" {
-
-void add_10_i32(const int *__restrict inA, const int *__restrict inB,
-                int *__restrict out) {
-  add<int, int, 10>(inA, inB, out);
-}
-}
diff --git a/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp b/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp
deleted file mode 100644
index 3cd72ab880..0000000000
--- a/programming_examples/dyn_objFifo/two_core_sliding_window/test.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// (c) Copyright 2024 AMD Inc.
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <iomanip>
-
-#include "xrt/xrt_bo.h"
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-#ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
-#endif
-
-#ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
-#endif
-
-#ifndef KERNEL_NAME
-#define KERNEL_NAME "MLIR_AIE"
-#endif
-
-#define INPUT_SIZE (100 * sizeof(int))  // in bytes
-#define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
-#define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
-#define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
-
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
-
-int main(int argc, const char *argv[]) {
-
-  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
-  assert(instr_v.size() > 0);
-
-  // Get a device handle
-  unsigned int device_index = 0;
-  xrt::device device = xrt::device(device_index);
-
-  // Load the xclbin
-  xrt::xclbin xclbin = xrt::xclbin(XCLBIN);
-
-  // Get the kernel from the xclbin
-  std::vector<xrt::xclbin::kernel> xkernels = xclbin.get_kernels();
-  xrt::xclbin::kernel xkernel = *std::find_if(
-      xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) {
-        return k.get_name().rfind(KERNEL_NAME, 0) == 0;
-      });
-  std::string kernel_name = xkernel.get_name();
-  assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0);
-
-  device.register_xclbin(xclbin);
-
-  // get a hardware context
-  xrt::hw_context context(device, xclbin.get_uuid());
-
-  // get a kernel handle
-  auto kernel = xrt::kernel(context, kernel_name);
-
-  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_input =
-      xrt::bo(device, INPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_output =
-      xrt::bo(device, OUTPUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-  int *buf_input = bo_input.map<int *>();
-  std::cout << std::endl << std::endl << "Input: " << std::endl;
-  for (int i = 0; i < INPUT_ROWS; i++) {
-    std::cout << "row " << i << " : ";
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_input[0]); j++) {
-      buf_input[i * INPUT_ROWS + j] = i;
-      std::cout << buf_input[i * INPUT_ROWS + j] << " ";
-    }
-    std::cout << std::endl << std::endl;
-  }
-  int *buf_output = bo_output.map<int *>();
-  memset(buf_output, 0, OUTPUT_SIZE);
-
-  // Instruction buffer for DMA configuration
-  void *buf_instr = bo_instr.map<void *>();
-  memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int));
-
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-  unsigned int opcode = 3;
-  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output);
-  ert_cmd_state r = run.wait();
-  if (r != ERT_CMD_STATE_COMPLETED) {
-    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
-    return 1;
-  }
-
-  bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  bool pass = true;
-  std::cout << std::endl << "Output: " << std::endl;
-  for (int i = 0; i < OUTPUT_ROWS; i++) {
-    std::cout << "row " << i << std::endl;
-    for (int j = 0; j < WIDTH_SIZE / sizeof(buf_output[0]); j++) {
-      int expected_output = 0;
-      if (i == 0) {
-        expected_output = buf_input[i * INPUT_ROWS] * 2;
-      } else {
-        expected_output =
-            buf_input[(i - 1) * INPUT_ROWS] + buf_input[i * INPUT_ROWS];
-      }
-      std::cout << "expected: " << expected_output << ", ";
-      std::cout << "got: " << buf_output[i * OUTPUT_ROWS + j] << std::endl;
-      pass &= buf_output[i * OUTPUT_ROWS + j] == expected_output;
-    }
-    std::cout << std::endl << std::endl;
-  }
-  std::cout << std::endl << std::endl;
-  std::cout << (pass ? "PASS!" : "FAIL.") << std::endl;
-
-  return 0;
-}
diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
index e91c1f9f21..bdc59cf3a4 100644
--- a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
+++ b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
@@ -10,21 +10,66 @@
 
 // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
 
+// CHECK:   aie.device(npu1_1col) {
+// CHECK:     memref.global "public" @output_fifo2_cons : memref<10xi32>
+// CHECK:     memref.global "public" @output_fifo2 : memref<10xi32>
+// CHECK:     memref.global "public" @input_fifo2_cons : memref<10xi32>
+// CHECK:     memref.global "public" @input_fifo2 : memref<10xi32>
+// CHECK:     memref.global "public" @output_fifo_cons : memref<10xi32>
+// CHECK:     memref.global "public" @output_fifo : memref<10xi32>
+// CHECK:     memref.global "public" @input_fifo_cons : memref<10xi32>
+// CHECK:     memref.global "public" @input_fifo : memref<10xi32>
+// CHECK:     func.func @passthrough_10_i32(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
+// CHECK:       return
+// CHECK:     }
+// CHECK:     %tile_0_0 = aie.tile(0, 0)
+// CHECK:     %tile_0_2 = aie.tile(0, 2)
+// CHECK:     %tile_0_4 = aie.tile(0, 4)
+// CHECK:     %output_fifo2_cons_prod_lock = aie.lock(%tile_0_0, 6) {init = 1 : i32, sym_name = "output_fifo2_cons_prod_lock"}
+// CHECK:     %output_fifo2_cons_cons_lock = aie.lock(%tile_0_0, 7) {init = 0 : i32, sym_name = "output_fifo2_cons_cons_lock"}
+// CHECK:     %output_fifo2_buff_0 = aie.buffer(%tile_0_4) {sym_name = "output_fifo2_buff_0"} : memref<10xi32> 
+// CHECK:     %output_fifo2_buff_1 = aie.buffer(%tile_0_4) {sym_name = "output_fifo2_buff_1"} : memref<10xi32> 
+// CHECK:     %output_fifo2_prod_lock = aie.lock(%tile_0_4, 2) {init = 2 : i32, sym_name = "output_fifo2_prod_lock"}
+// CHECK:     %output_fifo2_cons_lock = aie.lock(%tile_0_4, 3) {init = 0 : i32, sym_name = "output_fifo2_cons_lock"}
+// CHECK:     %input_fifo2_cons_buff_0 = aie.buffer(%tile_0_4) {sym_name = "input_fifo2_cons_buff_0"} : memref<10xi32> 
+// CHECK:     %input_fifo2_cons_buff_1 = aie.buffer(%tile_0_4) {sym_name = "input_fifo2_cons_buff_1"} : memref<10xi32> 
+// CHECK:     %input_fifo2_cons_prod_lock = aie.lock(%tile_0_4, 0) {init = 2 : i32, sym_name = "input_fifo2_cons_prod_lock"}
+// CHECK:     %input_fifo2_cons_cons_lock = aie.lock(%tile_0_4, 1) {init = 0 : i32, sym_name = "input_fifo2_cons_cons_lock"}
+// CHECK:     %input_fifo2_prod_lock = aie.lock(%tile_0_0, 4) {init = 1 : i32, sym_name = "input_fifo2_prod_lock"}
+// CHECK:     %input_fifo2_cons_lock = aie.lock(%tile_0_0, 5) {init = 0 : i32, sym_name = "input_fifo2_cons_lock"}
+// CHECK:     %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "output_fifo_cons_prod_lock"}
+// CHECK:     %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"}
+// CHECK:     %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> 
+// CHECK:     %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> 
+// CHECK:     %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"}
+// CHECK:     %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"}
+// CHECK:     %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> 
+// CHECK:     %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> 
+// CHECK:     %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "input_fifo_cons_prod_lock"}
+// CHECK:     %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"}
+// CHECK:     %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "input_fifo_prod_lock"}
+// CHECK:     %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
+// CHECK:     aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
+// CHECK:     aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
+// CHECK:     aie.flow(%tile_0_0, DMA : 1, %tile_0_4, DMA : 0)
+// CHECK:     aie.flow(%tile_0_4, DMA : 0, %tile_0_0, DMA : 1)
+// CHECK:     %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
 // CHECK:     %core_0_2 = aie.core(%tile_0_2) {
+// CHECK:       %c0_i32 = arith.constant 0 : i32
 // CHECK:       %c0 = arith.constant 0 : index
-// CHECK:       %c0_0 = arith.constant 0 : index
-// CHECK:       %c2 = arith.constant 2 : index
-// CHECK:       memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:       %c2_i32 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:       %c1 = arith.constant 1 : index
-// CHECK:       %c2_1 = arith.constant 2 : index
-// CHECK:       memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:       %c0_2 = arith.constant 0 : index
-// CHECK:       %c1_3 = arith.constant 1 : index
+// CHECK:       %c2_i32_0 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:       %c0_1 = arith.constant 0 : index
+// CHECK:       %c1_2 = arith.constant 1 : index
 // CHECK:       %c10 = arith.constant 10 : index
-// CHECK:       scf.for %arg0 = %c0_2 to %c10 step %c1_3 {
+// CHECK:       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %1 = scf.index_switch %0 -> memref<10xi32> 
+// CHECK:         %0 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %1 = arith.index_cast %0 : i32 to index
+// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -35,8 +80,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %3 = scf.index_switch %2 -> memref<10xi32> 
+// CHECK:         %3 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %4 = arith.index_cast %3 : i32 to index
+// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -46,39 +92,75 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @passthrough_10_i32(%3, %1) : (memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @passthrough_10_i32(%5, %2) : (memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-// CHECK:         %4 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c1_4 = arith.constant 1 : index
-// CHECK:         %5 = arith.addi %4, %c1_4 : index
-// CHECK:         %6 = arith.remsi %5, %c2_1 : index
-// CHECK:         memref.store %6, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:         %6 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c1_i32 = arith.constant 1 : i32
+// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
+// CHECK:         %8 = arith.remsi %7, %c2_i32_0 : i32
+// CHECK:         memref.store %8, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_5 = arith.constant 1 : index
-// CHECK:         %8 = arith.addi %7, %c1_5 : index
-// CHECK:         %9 = arith.remsi %8, %c2 : index
-// CHECK:         memref.store %9, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %9 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32_3 = arith.constant 1 : i32
+// CHECK:         %10 = arith.addi %9, %c1_i32_3 : i32
+// CHECK:         %11 = arith.remsi %10, %c2_i32 : i32
+// CHECK:         memref.store %11, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:       }
 // CHECK:       aie.end
 // CHECK:     } {dynamic_objfifo_lowering = true}
+// CHECK:     %buffer_0_4 = aie.buffer(%tile_0_4) : memref<2xi32> 
 // CHECK:     %core_0_4 = aie.core(%tile_0_4) {
+// CHECK:       %c0_i32 = arith.constant 0 : i32
 // CHECK:       %c0 = arith.constant 0 : index
+// CHECK:       %c2_i32 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_4[%c0] : memref<2xi32>
 // CHECK:       %c1 = arith.constant 1 : index
+// CHECK:       %c2_i32_0 = arith.constant 2 : i32
+// CHECK:       memref.store %c0_i32, %buffer_0_4[%c1] : memref<2xi32>
+// CHECK:       %c0_1 = arith.constant 0 : index
+// CHECK:       %c1_2 = arith.constant 1 : index
 // CHECK:       %c10 = arith.constant 10 : index
-// CHECK:       %c2 = arith.constant 2 : index
-// CHECK:       scf.for %arg0 = %c0 to %c10 step %c2 {
-// CHECK:         aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         func.call @passthrough_10_i32(%input_fifo2_cons_buff_0, %output_fifo2_buff_0) : (memref<10xi32>, memref<10xi32>) -> ()
-// CHECK:         aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1)
-// CHECK:         aie.use_lock(%output_fifo2_cons_lock, Release, 1)
+// CHECK:       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
 // CHECK:         aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1)
+// CHECK:         %0 = memref.load %buffer_0_4[%c0] : memref<2xi32>
+// CHECK:         %1 = arith.index_cast %0 : i32 to index
+// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
+// CHECK:         case 0 {
+// CHECK:           scf.yield %output_fifo2_buff_0 : memref<10xi32>
+// CHECK:         }
+// CHECK:         case 1 {
+// CHECK:           scf.yield %output_fifo2_buff_1 : memref<10xi32>
+// CHECK:         }
+// CHECK:         default {
+// CHECK:           scf.yield %output_fifo2_buff_0 : memref<10xi32>
+// CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         func.call @passthrough_10_i32(%input_fifo2_cons_buff_1, %output_fifo2_buff_1) : (memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         %3 = memref.load %buffer_0_4[%c1] : memref<2xi32>
+// CHECK:         %4 = arith.index_cast %3 : i32 to index
+// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
+// CHECK:         case 0 {
+// CHECK:           scf.yield %input_fifo2_cons_buff_0 : memref<10xi32>
+// CHECK:         }
+// CHECK:         case 1 {
+// CHECK:           scf.yield %input_fifo2_cons_buff_1 : memref<10xi32>
+// CHECK:         }
+// CHECK:         default {
+// CHECK:           scf.yield %input_fifo2_cons_buff_0 : memref<10xi32>
+// CHECK:         }
+// CHECK:         func.call @passthrough_10_i32(%5, %2) : (memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1)
+// CHECK:         %6 = memref.load %buffer_0_4[%c1] : memref<2xi32>
+// CHECK:         %c1_i32 = arith.constant 1 : i32
+// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
+// CHECK:         %8 = arith.remsi %7, %c2_i32_0 : i32
+// CHECK:         memref.store %8, %buffer_0_4[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo2_cons_lock, Release, 1)
-// CHECK:       }        
+// CHECK:         %9 = memref.load %buffer_0_4[%c0] : memref<2xi32>
+// CHECK:         %c1_i32_3 = arith.constant 1 : i32
+// CHECK:         %10 = arith.addi %9, %c1_i32_3 : i32
+// CHECK:         %11 = arith.remsi %10, %c2_i32 : i32
+// CHECK:         memref.store %11, %buffer_0_4[%c0] : memref<2xi32>
+// CHECK:       }
 // CHECK:       aie.end
 // CHECK:     }
 // CHECK:     aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)
diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
index c169c8472e..087b8e5a2a 100644
--- a/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
+++ b/test/objectFifo-stateful-transform/dynamic_lowering_test.mlir
@@ -35,21 +35,22 @@
 // CHECK:       %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
 // CHECK:       aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
 // CHECK:       aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
-// CHECK:       %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
+// CHECK:       %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
 // CHECK:       %core_0_2 = aie.core(%tile_0_2) {
+// CHECK:         %c0_i32 = arith.constant 0 : i32
 // CHECK:         %c0 = arith.constant 0 : index
-// CHECK:         %c0_0 = arith.constant 0 : index
-// CHECK:         %c2 = arith.constant 2 : index
-// CHECK:         memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %c2_i32 = arith.constant 2 : i32
+// CHECK:         memref.store %c0_i32, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         %c1 = arith.constant 1 : index
-// CHECK:         %c3 = arith.constant 3 : index
-// CHECK:         memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c0_1 = arith.constant 0 : index
-// CHECK:         %c1_2 = arith.constant 1 : index
+// CHECK:         %c3_i32 = arith.constant 3 : i32
+// CHECK:         memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c0_0 = arith.constant 0 : index
+// CHECK:         %c1_1 = arith.constant 1 : index
 // CHECK:         %c9 = arith.constant 9 : index
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %1 = scf.index_switch %0 -> memref<10xi32> 
+// CHECK:         %0 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %1 = arith.index_cast %0 : i32 to index
+// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -60,8 +61,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %2 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %3 = scf.index_switch %2 -> memref<10xi32> 
+// CHECK:         %3 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %4 = arith.index_cast %3 : i32 to index
+// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -74,17 +76,18 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @add_10_i32(%3, %3, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @add_10_i32(%5, %5, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %4 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_3 = arith.constant 1 : index
-// CHECK:         %5 = arith.addi %4, %c1_3 : index
-// CHECK:         %6 = arith.remsi %5, %c2 : index
-// CHECK:         memref.store %6, %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         scf.for %arg0 = %c0_1 to %c9 step %c1_2 {
+// CHECK:         %6 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32 = arith.constant 1 : i32
+// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
+// CHECK:         %8 = arith.remsi %7, %c2_i32 : i32
+// CHECK:         memref.store %8, %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         scf.for %arg0 = %c0_0 to %c9 step %c1_1 {
 // CHECK:           aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:           %19 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:           %20 = scf.index_switch %19 -> memref<10xi32> 
+// CHECK:           %24 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:           %25 = arith.index_cast %24 : i32 to index
+// CHECK:           %26 = scf.index_switch %25 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:           }
@@ -95,8 +98,9 @@
 // CHECK:             scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:           }
 // CHECK:           aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:           %21 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %22 = scf.index_switch %21 -> memref<10xi32> 
+// CHECK:           %27 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %28 = arith.index_cast %27 : i32 to index
+// CHECK:           %29 = scf.index_switch %28 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:           }
@@ -109,8 +113,9 @@
 // CHECK:           default {
 // CHECK:             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:           }
-// CHECK:           %23 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %24 = scf.index_switch %23 -> memref<10xi32> 
+// CHECK:           %30 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %31 = arith.index_cast %30 : i32 to index
+// CHECK:           %32 = scf.index_switch %31 -> memref<10xi32> 
 // CHECK:           case 0 {
 // CHECK:             scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:           }
@@ -123,23 +128,24 @@
 // CHECK:           default {
 // CHECK:             scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:           }
-// CHECK:           func.call @add_10_i32(%22, %24, %20) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:           func.call @add_10_i32(%29, %32, %26) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:           aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-// CHECK:           %25 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:           %c1_6 = arith.constant 1 : index
-// CHECK:           %26 = arith.addi %25, %c1_6 : index
-// CHECK:           %27 = arith.remsi %26, %c3 : index
-// CHECK:           memref.store %27, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:           %33 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:           %c1_i32_4 = arith.constant 1 : i32
+// CHECK:           %34 = arith.addi %33, %c1_i32_4 : i32
+// CHECK:           %35 = arith.remsi %34, %c3_i32 : i32
+// CHECK:           memref.store %35, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:           aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:           %28 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:           %c1_7 = arith.constant 1 : index
-// CHECK:           %29 = arith.addi %28, %c1_7 : index
-// CHECK:           %30 = arith.remsi %29, %c2 : index
-// CHECK:           memref.store %30, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:           %36 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:           %c1_i32_5 = arith.constant 1 : i32
+// CHECK:           %37 = arith.addi %36, %c1_i32_5 : i32
+// CHECK:           %38 = arith.remsi %37, %c2_i32 : i32
+// CHECK:           memref.store %38, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %7 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %8 = scf.index_switch %7 -> memref<10xi32> 
+// CHECK:         %9 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %10 = arith.index_cast %9 : i32 to index
+// CHECK:         %11 = scf.index_switch %10 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -150,8 +156,9 @@
 // CHECK:           scf.yield %output_fifo_buff_0 : memref<10xi32>
 // CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %9 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %10 = scf.index_switch %9 -> memref<10xi32> 
+// CHECK:         %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %13 = arith.index_cast %12 : i32 to index
+// CHECK:         %14 = scf.index_switch %13 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
@@ -164,8 +171,9 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
 // CHECK:         }
-// CHECK:         %11 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %12 = scf.index_switch %11 -> memref<10xi32> 
+// CHECK:         %15 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %16 = arith.index_cast %15 : i32 to index
+// CHECK:         %17 = scf.index_switch %16 -> memref<10xi32> 
 // CHECK:         case 0 {
 // CHECK:           scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:         }
@@ -178,19 +186,19 @@
 // CHECK:         default {
 // CHECK:           scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
 // CHECK:         }
-// CHECK:         func.call @add_10_i32(%10, %12, %8) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @add_10_i32(%14, %17, %11) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
-// CHECK:         %13 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-// CHECK:         %c2_4 = arith.constant 2 : index
-// CHECK:         %14 = arith.addi %13, %c2_4 : index
-// CHECK:         %15 = arith.remsi %14, %c3 : index
-// CHECK:         memref.store %15, %buffer_0_2[%c1] : memref<2xindex>
+// CHECK:         %18 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+// CHECK:         %c2_i32_2 = arith.constant 2 : i32
+// CHECK:         %19 = arith.addi %18, %c2_i32_2 : i32
+// CHECK:         %20 = arith.remsi %19, %c3_i32 : i32
+// CHECK:         memref.store %20, %buffer_0_2[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-// CHECK:         %16 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-// CHECK:         %c1_5 = arith.constant 1 : index
-// CHECK:         %17 = arith.addi %16, %c1_5 : index
-// CHECK:         %18 = arith.remsi %17, %c2 : index
-// CHECK:         memref.store %18, %buffer_0_2[%c0_0] : memref<2xindex>
+// CHECK:         %21 = memref.load %buffer_0_2[%c0] : memref<2xi32>
+// CHECK:         %c1_i32_3 = arith.constant 1 : i32
+// CHECK:         %22 = arith.addi %21, %c1_i32_3 : i32
+// CHECK:         %23 = arith.remsi %22, %c2_i32 : i32
+// CHECK:         memref.store %23, %buffer_0_2[%c0] : memref<2xi32>
 // CHECK:         aie.end
 // CHECK:       }
 // CHECK:       aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)

From bf8ef6631bbe1ddc1ee53d8bc5f299625c5f370c Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 14:45:36 -0700
Subject: [PATCH 39/46] Fix dynamic_lowering_flag_test CHECK lines (flags
 were wrong in the previous commit)

---
 .../dynamic_lowering_flag_test.mlir           | 96 ++-----------------
 1 file changed, 8 insertions(+), 88 deletions(-)

diff --git a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
index bdc59cf3a4..16c028b6c3 100644
--- a/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
+++ b/test/objectFifo-stateful-transform/dynamic_lowering_flag_test.mlir
@@ -10,50 +10,6 @@
 
 // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
 
-// CHECK:   aie.device(npu1_1col) {
-// CHECK:     memref.global "public" @output_fifo2_cons : memref<10xi32>
-// CHECK:     memref.global "public" @output_fifo2 : memref<10xi32>
-// CHECK:     memref.global "public" @input_fifo2_cons : memref<10xi32>
-// CHECK:     memref.global "public" @input_fifo2 : memref<10xi32>
-// CHECK:     memref.global "public" @output_fifo_cons : memref<10xi32>
-// CHECK:     memref.global "public" @output_fifo : memref<10xi32>
-// CHECK:     memref.global "public" @input_fifo_cons : memref<10xi32>
-// CHECK:     memref.global "public" @input_fifo : memref<10xi32>
-// CHECK:     func.func @passthrough_10_i32(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
-// CHECK:       return
-// CHECK:     }
-// CHECK:     %tile_0_0 = aie.tile(0, 0)
-// CHECK:     %tile_0_2 = aie.tile(0, 2)
-// CHECK:     %tile_0_4 = aie.tile(0, 4)
-// CHECK:     %output_fifo2_cons_prod_lock = aie.lock(%tile_0_0, 6) {init = 1 : i32, sym_name = "output_fifo2_cons_prod_lock"}
-// CHECK:     %output_fifo2_cons_cons_lock = aie.lock(%tile_0_0, 7) {init = 0 : i32, sym_name = "output_fifo2_cons_cons_lock"}
-// CHECK:     %output_fifo2_buff_0 = aie.buffer(%tile_0_4) {sym_name = "output_fifo2_buff_0"} : memref<10xi32> 
-// CHECK:     %output_fifo2_buff_1 = aie.buffer(%tile_0_4) {sym_name = "output_fifo2_buff_1"} : memref<10xi32> 
-// CHECK:     %output_fifo2_prod_lock = aie.lock(%tile_0_4, 2) {init = 2 : i32, sym_name = "output_fifo2_prod_lock"}
-// CHECK:     %output_fifo2_cons_lock = aie.lock(%tile_0_4, 3) {init = 0 : i32, sym_name = "output_fifo2_cons_lock"}
-// CHECK:     %input_fifo2_cons_buff_0 = aie.buffer(%tile_0_4) {sym_name = "input_fifo2_cons_buff_0"} : memref<10xi32> 
-// CHECK:     %input_fifo2_cons_buff_1 = aie.buffer(%tile_0_4) {sym_name = "input_fifo2_cons_buff_1"} : memref<10xi32> 
-// CHECK:     %input_fifo2_cons_prod_lock = aie.lock(%tile_0_4, 0) {init = 2 : i32, sym_name = "input_fifo2_cons_prod_lock"}
-// CHECK:     %input_fifo2_cons_cons_lock = aie.lock(%tile_0_4, 1) {init = 0 : i32, sym_name = "input_fifo2_cons_cons_lock"}
-// CHECK:     %input_fifo2_prod_lock = aie.lock(%tile_0_0, 4) {init = 1 : i32, sym_name = "input_fifo2_prod_lock"}
-// CHECK:     %input_fifo2_cons_lock = aie.lock(%tile_0_0, 5) {init = 0 : i32, sym_name = "input_fifo2_cons_lock"}
-// CHECK:     %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 1 : i32, sym_name = "output_fifo_cons_prod_lock"}
-// CHECK:     %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"}
-// CHECK:     %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> 
-// CHECK:     %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> 
-// CHECK:     %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"}
-// CHECK:     %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"}
-// CHECK:     %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> 
-// CHECK:     %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> 
-// CHECK:     %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "input_fifo_cons_prod_lock"}
-// CHECK:     %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"}
-// CHECK:     %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 1 : i32, sym_name = "input_fifo_prod_lock"}
-// CHECK:     %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
-// CHECK:     aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
-// CHECK:     aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
-// CHECK:     aie.flow(%tile_0_0, DMA : 1, %tile_0_4, DMA : 0)
-// CHECK:     aie.flow(%tile_0_4, DMA : 0, %tile_0_0, DMA : 1)
-// CHECK:     %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
 // CHECK:     %core_0_2 = aie.core(%tile_0_2) {
 // CHECK:       %c0_i32 = arith.constant 0 : i32
 // CHECK:       %c0 = arith.constant 0 : index
@@ -108,58 +64,22 @@
 // CHECK:       }
 // CHECK:       aie.end
 // CHECK:     } {dynamic_objfifo_lowering = true}
-// CHECK:     %buffer_0_4 = aie.buffer(%tile_0_4) : memref<2xi32> 
 // CHECK:     %core_0_4 = aie.core(%tile_0_4) {
-// CHECK:       %c0_i32 = arith.constant 0 : i32
 // CHECK:       %c0 = arith.constant 0 : index
-// CHECK:       %c2_i32 = arith.constant 2 : i32
-// CHECK:       memref.store %c0_i32, %buffer_0_4[%c0] : memref<2xi32>
 // CHECK:       %c1 = arith.constant 1 : index
-// CHECK:       %c2_i32_0 = arith.constant 2 : i32
-// CHECK:       memref.store %c0_i32, %buffer_0_4[%c1] : memref<2xi32>
-// CHECK:       %c0_1 = arith.constant 0 : index
-// CHECK:       %c1_2 = arith.constant 1 : index
 // CHECK:       %c10 = arith.constant 10 : index
-// CHECK:       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
+// CHECK:       %c2 = arith.constant 2 : index
+// CHECK:       scf.for %arg0 = %c0 to %c10 step %c2 {
 // CHECK:         aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1)
-// CHECK:         %0 = memref.load %buffer_0_4[%c0] : memref<2xi32>
-// CHECK:         %1 = arith.index_cast %0 : i32 to index
-// CHECK:         %2 = scf.index_switch %1 -> memref<10xi32> 
-// CHECK:         case 0 {
-// CHECK:           scf.yield %output_fifo2_buff_0 : memref<10xi32>
-// CHECK:         }
-// CHECK:         case 1 {
-// CHECK:           scf.yield %output_fifo2_buff_1 : memref<10xi32>
-// CHECK:         }
-// CHECK:         default {
-// CHECK:           scf.yield %output_fifo2_buff_0 : memref<10xi32>
-// CHECK:         }
 // CHECK:         aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1)
-// CHECK:         %3 = memref.load %buffer_0_4[%c1] : memref<2xi32>
-// CHECK:         %4 = arith.index_cast %3 : i32 to index
-// CHECK:         %5 = scf.index_switch %4 -> memref<10xi32> 
-// CHECK:         case 0 {
-// CHECK:           scf.yield %input_fifo2_cons_buff_0 : memref<10xi32>
-// CHECK:         }
-// CHECK:         case 1 {
-// CHECK:           scf.yield %input_fifo2_cons_buff_1 : memref<10xi32>
-// CHECK:         }
-// CHECK:         default {
-// CHECK:           scf.yield %input_fifo2_cons_buff_0 : memref<10xi32>
-// CHECK:         }
-// CHECK:         func.call @passthrough_10_i32(%5, %2) : (memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         func.call @passthrough_10_i32(%input_fifo2_cons_buff_0, %output_fifo2_buff_0) : (memref<10xi32>, memref<10xi32>) -> ()
+// CHECK:         aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1)
+// CHECK:         aie.use_lock(%output_fifo2_cons_lock, Release, 1)
+// CHECK:         aie.use_lock(%output_fifo2_prod_lock, AcquireGreaterEqual, 1)
+// CHECK:         aie.use_lock(%input_fifo2_cons_cons_lock, AcquireGreaterEqual, 1)
+// CHECK:         func.call @passthrough_10_i32(%input_fifo2_cons_buff_1, %output_fifo2_buff_1) : (memref<10xi32>, memref<10xi32>) -> ()
 // CHECK:         aie.use_lock(%input_fifo2_cons_prod_lock, Release, 1)
-// CHECK:         %6 = memref.load %buffer_0_4[%c1] : memref<2xi32>
-// CHECK:         %c1_i32 = arith.constant 1 : i32
-// CHECK:         %7 = arith.addi %6, %c1_i32 : i32
-// CHECK:         %8 = arith.remsi %7, %c2_i32_0 : i32
-// CHECK:         memref.store %8, %buffer_0_4[%c1] : memref<2xi32>
 // CHECK:         aie.use_lock(%output_fifo2_cons_lock, Release, 1)
-// CHECK:         %9 = memref.load %buffer_0_4[%c0] : memref<2xi32>
-// CHECK:         %c1_i32_3 = arith.constant 1 : i32
-// CHECK:         %10 = arith.addi %9, %c1_i32_3 : i32
-// CHECK:         %11 = arith.remsi %10, %c2_i32 : i32
-// CHECK:         memref.store %11, %buffer_0_4[%c0] : memref<2xi32>
 // CHECK:       }
 // CHECK:       aie.end
 // CHECK:     }

From efa97dc3a4d5f88318343739d692663ecb190832 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Wed, 4 Dec 2024 14:51:06 -0700
Subject: [PATCH 40/46] Reverting changes from test.cpp for failing tests

---
 .../sliding_window/test.cpp                   | 22 +++++--------------
 .../sliding_window_conditional/test.cpp       |  2 +-
 .../two_core_sliding_window/test.cpp          |  2 +-
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
index 5c78b0e986..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/test.cpp
@@ -14,11 +14,11 @@
 #include "xrt/xrt_kernel.h"
 
 #ifndef XCLBIN
-#define XCLBIN "build/final.xclbin"
+#define XCLBIN "final.xclbin"
 #endif
 
 #ifndef INSTS_TXT
-#define INSTS_TXT "build/insts.txt"
+#define INSTS_TXT "insts.txt"
 #endif
 
 #ifndef KERNEL_NAME
@@ -28,27 +28,15 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
-std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
-  std::ifstream instr_file(instr_path);
-  std::string line;
-  std::vector<uint32_t> instr_v;
-  while (std::getline(instr_file, line)) {
-    std::istringstream iss(line);
-    uint32_t a;
-    if (!(iss >> std::hex >> a)) {
-      throw std::runtime_error("Unable to parse instruction file\n");
-    }
-    instr_v.push_back(a);
-  }
-  return instr_v;
-}
+#include "test_utils.h"
 
 int main(int argc, const char *argv[]) {
 
-  std::vector<uint32_t> instr_v = load_instr_sequence(INSTS_TXT);
+  std::vector<uint32_t> instr_v = test_utils::load_instr_sequence(INSTS_TXT);
   assert(instr_v.size() > 0);
 
   // Get a device handle
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
index c25d9358f6..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/test.cpp
@@ -28,7 +28,7 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define WIDTH 10
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
index c25d9358f6..0fb9cfa7d4 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/test.cpp
@@ -28,7 +28,7 @@
 #define INPUT_SIZE (100 * sizeof(int))  // in bytes
 #define OUTPUT_SIZE (100 * sizeof(int)) // in bytes
 #define WIDTH_SIZE (10 * sizeof(int))   // in bytes
-#define WIDTH 10
+
 #define INPUT_ROWS INPUT_SIZE / WIDTH_SIZE
 #define OUTPUT_ROWS OUTPUT_SIZE / WIDTH_SIZE
 

From 946fb84bdfbf50c53be5e230bfa767ce0d043da2 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 09:52:21 -0700
Subject: [PATCH 41/46] MLIR version for sliding_window_conditional since
 Python file fails

---
 .../sliding_window_conditional/aie.mlir       | 202 ++++++++++++++++++
 .../sliding_window_conditional/aie2.py        |   3 +-
 .../sliding_window_conditional/run.lit        |  10 +
 3 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
 create mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
new file mode 100644
index 0000000000..cfe608eed0
--- /dev/null
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
@@ -0,0 +1,202 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(npu1_1col) {
+    memref.global "public" @output_fifo_cons : memref<10xi32>
+    memref.global "public" @output_fifo : memref<10xi32>
+    memref.global "public" @input_fifo_cons : memref<10xi32>
+    memref.global "public" @input_fifo : memref<10xi32>
+    func.func private @add_10_i32(memref<10xi32>, memref<10xi32>, memref<10xi32>)
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    %output_fifo_cons_prod_lock = aie.lock(%tile_0_0, 2) {init = 0 : i32, sym_name = "output_fifo_cons_prod_lock"}
+    %output_fifo_cons_cons_lock = aie.lock(%tile_0_0, 3) {init = 0 : i32, sym_name = "output_fifo_cons_cons_lock"}
+    %output_fifo_buff_0 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_0"} : memref<10xi32> 
+    %output_fifo_buff_1 = aie.buffer(%tile_0_2) {sym_name = "output_fifo_buff_1"} : memref<10xi32> 
+    %output_fifo_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "output_fifo_prod_lock"}
+    %output_fifo_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "output_fifo_cons_lock"}
+    %input_fifo_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_0"} : memref<10xi32> 
+    %input_fifo_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_1"} : memref<10xi32> 
+    %input_fifo_cons_buff_2 = aie.buffer(%tile_0_2) {sym_name = "input_fifo_cons_buff_2"} : memref<10xi32> 
+    %input_fifo_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 3 : i32, sym_name = "input_fifo_cons_prod_lock"}
+    %input_fifo_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "input_fifo_cons_cons_lock"}
+    %input_fifo_prod_lock = aie.lock(%tile_0_0, 0) {init = 0 : i32, sym_name = "input_fifo_prod_lock"}
+    %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
+    aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
+    %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c0_0 = arith.constant 0 : index
+      %c2 = arith.constant 2 : index
+      memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+      %c1 = arith.constant 1 : index
+      %c3 = arith.constant 3 : index
+      memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
+      %c0_1 = arith.constant 0 : index
+      %c10 = arith.constant 10 : index
+      %c1_2 = arith.constant 1 : index
+      scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
+        aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
+        %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
+        %1 = scf.index_switch %0 -> memref<10xi32> 
+        case 0 {
+          scf.yield %output_fifo_buff_0 : memref<10xi32>
+        }
+        case 1 {
+          scf.yield %output_fifo_buff_1 : memref<10xi32>
+        }
+        default {
+          scf.yield %output_fifo_buff_0 : memref<10xi32>
+        }
+        %2 = arith.cmpi eq, %arg0, %c0_1 : index
+        %3 = arith.subi %c10, %c1_2 : index
+        %4 = arith.cmpi eq, %arg0, %3 : index
+        scf.if %2 {
+          aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
+          %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+          %9 = scf.index_switch %8 -> memref<10xi32> 
+          case 0 {
+            scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+          }
+          case 1 {
+            scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+          }
+          case 2 {
+            scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+          }
+          default {
+            scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+          }
+          func.call @add_10_i32(%9, %9, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+        } else {
+          scf.if %4 {
+            aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2)
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %9 = scf.index_switch %8 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            %10 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %11 = scf.index_switch %10 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
+            %12 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %c2_4 = arith.constant 2 : index
+            %13 = arith.addi %12, %c2_4 : index
+            %14 = arith.remsi %13, %c3 : index
+            memref.store %14, %buffer_0_2[%c1] : memref<2xindex>
+          } else {
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %9 = scf.index_switch %8 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            %10 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %11 = scf.index_switch %10 -> memref<10xi32> 
+            case 0 {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            case 1 {
+              scf.yield %input_fifo_cons_buff_2 : memref<10xi32>
+            }
+            case 2 {
+              scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
+            }
+            default {
+              scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
+            }
+            func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
+            %12 = memref.load %buffer_0_2[%c1] : memref<2xindex>
+            %c1_4 = arith.constant 1 : index
+            %13 = arith.addi %12, %c1_4 : index
+            %14 = arith.remsi %13, %c3 : index
+            memref.store %14, %buffer_0_2[%c1] : memref<2xindex>
+          }
+        }
+        aie.use_lock(%output_fifo_cons_lock, Release, 1)
+        %5 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
+        %c1_3 = arith.constant 1 : index
+        %6 = arith.addi %5, %c1_3 : index
+        %7 = arith.remsi %6, %c2 : index
+        memref.store %7, %buffer_0_2[%c0_0] : memref<2xindex>
+      }
+      aie.end
+    } {link_with = "kernel.o"}
+    aie.shim_dma_allocation @input_fifo(MM2S, 0, 0)
+    aiex.runtime_sequence(%arg0: memref<10xi32>, %arg1: memref<10xi32>) {
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 0 : i64, metadata = @input_fifo} : memref<10xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 100][0, 0, 0, 1]) {id = 2 : i64, metadata = @output_fifo} : memref<10xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+    }
+    aie.shim_dma_allocation @output_fifo(S2MM, 0, 0)
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb4)
+    ^bb1:  // 2 preds: ^bb0, ^bb3
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_0 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_1 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb3
+    ^bb3:  // pred: ^bb2
+      aie.use_lock(%input_fifo_cons_prod_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%input_fifo_cons_buff_2 : memref<10xi32>, 0, 10)
+      aie.use_lock(%input_fifo_cons_cons_lock, Release, 1)
+      aie.next_bd ^bb1
+    ^bb4:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 0, ^bb5, ^bb7)
+    ^bb5:  // 2 preds: ^bb4, ^bb6
+      aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_fifo_buff_0 : memref<10xi32>, 0, 10)
+      aie.use_lock(%output_fifo_prod_lock, Release, 1)
+      aie.next_bd ^bb6
+    ^bb6:  // pred: ^bb5
+      aie.use_lock(%output_fifo_cons_lock, AcquireGreaterEqual, 1)
+      aie.dma_bd(%output_fifo_buff_1 : memref<10xi32>, 0, 10)
+      aie.use_lock(%output_fifo_prod_lock, Release, 1)
+      aie.next_bd ^bb5
+    ^bb7:  // pred: ^bb4
+      aie.end
+    }
+  }
+}
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index 366552907b..fc12961596 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
+
 import numpy as np
 
 from aie.dialects.aie import *
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
new file mode 100644
index 0000000000..51904bb335
--- /dev/null
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, valid_xchess_license
+//
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!

From 012d492ce3f95daeb6fd4dc4b800549bc2af69dc Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 10:13:04 -0700
Subject: [PATCH 42/46] Use only MLIR for testing

---
 .../sliding_window_conditional/aie2.py                    | 8 --------
 .../sliding_window_conditional/run.lit                    | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index fc12961596..b4417d2a2e 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -5,14 +5,6 @@
 #
 # (c) Copyright 2024 AMD Inc.
 
-# REQUIRES: ryzen_ai, valid_xchess_license
-#
-# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
-# RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
-# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %run_on_npu ./test.exe | FileCheck %s
-# CHECK: PASS!
 
 import numpy as np
 
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
index 51904bb335..68033f00d2 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, valid_xchess_license
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
-// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 // RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!

From 1052567abeef9681138c0cc7271386aa517e7602 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 13:46:40 -0700
Subject: [PATCH 43/46] Correct MLIR code

---
 .../sliding_window_conditional/aie.mlir       | 79 +++++++++----------
 .../sliding_window_conditional/aie2.py        | 73 -----------------
 2 files changed, 39 insertions(+), 113 deletions(-)
 delete mode 100644 test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
index cfe608eed0..5ee3d1aa0d 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
@@ -1,10 +1,3 @@
-//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
-//
-// Copyright (C) 2024, Advanced Micro Devices, Inc.
-// SPDX-License-Identifier: MIT
-//
-//===----------------------------------------------------------------------===//
-
 module {
   aie.device(npu1_1col) {
     memref.global "public" @output_fifo_cons : memref<10xi32>
@@ -29,22 +22,23 @@ module {
     %input_fifo_cons_lock = aie.lock(%tile_0_0, 1) {init = 0 : i32, sym_name = "input_fifo_cons_lock"}
     aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0)
     aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0)
-    %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xindex> 
+    %buffer_0_2 = aie.buffer(%tile_0_2) : memref<2xi32> 
     %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
+      %c0_i32 = arith.constant 0 : i32
       %c0_0 = arith.constant 0 : index
-      %c2 = arith.constant 2 : index
-      memref.store %c0, %buffer_0_2[%c0_0] : memref<2xindex>
+      %c2_i32 = arith.constant 2 : i32
+      memref.store %c0_i32, %buffer_0_2[%c0_0] : memref<2xi32>
       %c1 = arith.constant 1 : index
-      %c3 = arith.constant 3 : index
-      memref.store %c0, %buffer_0_2[%c1] : memref<2xindex>
+      %c3_i32 = arith.constant 3 : i32
+      memref.store %c0_i32, %buffer_0_2[%c1] : memref<2xi32>
       %c0_1 = arith.constant 0 : index
       %c10 = arith.constant 10 : index
       %c1_2 = arith.constant 1 : index
       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
-        %0 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-        %1 = scf.index_switch %0 -> memref<10xi32> 
+        %0 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
+        %100 = arith.index_cast %0 : i32 to index
+        %1 = scf.index_switch %100 -> memref<10xi32> 
         case 0 {
           scf.yield %output_fifo_buff_0 : memref<10xi32>
         }
@@ -59,8 +53,9 @@ module {
         %4 = arith.cmpi eq, %arg0, %3 : index
         scf.if %2 {
           aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
-          %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-          %9 = scf.index_switch %8 -> memref<10xi32> 
+          %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+          %800 = arith.index_cast %8 : i32 to index
+          %9 = scf.index_switch %800 -> memref<10xi32> 
           case 0 {
             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
           }
@@ -77,8 +72,9 @@ module {
         } else {
           scf.if %4 {
             aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2)
-            %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %9 = scf.index_switch %8 -> memref<10xi32> 
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %800 = arith.index_cast %8 : i32 to index
+            %9 = scf.index_switch %800 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
@@ -91,8 +87,9 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
-            %10 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %11 = scf.index_switch %10 -> memref<10xi32> 
+            %10 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %1000 = arith.index_cast %10 : i32 to index
+            %11 = scf.index_switch %1000 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
@@ -107,14 +104,15 @@ module {
             }
             func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
             aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
-            %12 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %c2_4 = arith.constant 2 : index
-            %13 = arith.addi %12, %c2_4 : index
-            %14 = arith.remsi %13, %c3 : index
-            memref.store %14, %buffer_0_2[%c1] : memref<2xindex>
+            %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %c2_4 = arith.constant 2 : i32
+            %13 = arith.addi %12, %c2_4 : i32
+            %14 = arith.remsi %13, %c3_i32 : i32
+            memref.store %14, %buffer_0_2[%c1] : memref<2xi32>
           } else {
-            %8 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %9 = scf.index_switch %8 -> memref<10xi32> 
+            %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %800 = arith.index_cast %8 : i32 to index
+            %9 = scf.index_switch %800 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
@@ -127,8 +125,9 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
-            %10 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %11 = scf.index_switch %10 -> memref<10xi32> 
+            %10 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %1000 = arith.index_cast %10 : i32 to index
+            %11 = scf.index_switch %1000 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
@@ -143,19 +142,19 @@ module {
             }
             func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
             aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-            %12 = memref.load %buffer_0_2[%c1] : memref<2xindex>
-            %c1_4 = arith.constant 1 : index
-            %13 = arith.addi %12, %c1_4 : index
-            %14 = arith.remsi %13, %c3 : index
-            memref.store %14, %buffer_0_2[%c1] : memref<2xindex>
+            %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %c1_4 = arith.constant 1 : i32
+            %13 = arith.addi %12, %c1_4 : i32
+            %14 = arith.remsi %13, %c3_i32 : i32
+            memref.store %14, %buffer_0_2[%c1] : memref<2xi32>
           }
         }
         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-        %5 = memref.load %buffer_0_2[%c0_0] : memref<2xindex>
-        %c1_3 = arith.constant 1 : index
-        %6 = arith.addi %5, %c1_3 : index
-        %7 = arith.remsi %6, %c2 : index
-        memref.store %7, %buffer_0_2[%c0_0] : memref<2xindex>
+        %5 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
+        %c1_3 = arith.constant 1 : i32
+        %6 = arith.addi %5, %c1_3 : i32
+        %7 = arith.remsi %6, %c2_i32 : i32
+        memref.store %7, %buffer_0_2[%c0_0] : memref<2xi32>
       }
       aie.end
     } {link_with = "kernel.o"}
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
deleted file mode 100644
index b4417d2a2e..0000000000
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 AMD Inc.
-
-
-import numpy as np
-
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.helpers.dialects.ext.scf import _for as range_
-from aie.extras.context import mlir_mod_ctx
-
-N = 100
-n_rows = 10
-dev = AIEDevice.npu1_1col
-col = 0
-
-
-def sliding_window():
-    with mlir_mod_ctx() as ctx:
-
-        @device(dev)
-        def device_body():
-            subtensor_ty = np.ndarray[(N // n_rows,), np.dtype[np.int32]]
-
-            # Tile declarations
-            ShimTile = tile(col, 0)
-            ComputeTile = tile(col, 2)
-
-            # AIE-array data movement with object fifos
-            of_in = object_fifo("in", ShimTile, ComputeTile, 3, subtensor_ty)
-            of_out = object_fifo("out", ComputeTile, ShimTile, 2, subtensor_ty)
-
-            # AIE Core Function declarations
-            add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
-            )
-
-            # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
-            def core_body():
-                for i in range_(10):
-                    elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
-                    if i == 0:
-                        elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
-                        add_10_i32(elemInPre, elemInPre, elemOut)
-                    elif i == 9:
-                        elemsInPost = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsInPost[0], elemsInPost[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 2)
-                    else:
-                        elemsIn = of_in.acquire(ObjectFifoPort.Consume, 2)
-                        add_10_i32(elemsIn[0], elemsIn[1], elemOut)
-                        of_in.release(ObjectFifoPort.Consume, 1)
-
-                of_out.release(ObjectFifoPort.Produce, 1)
-
-            # To/from AIE-array data movement
-            tensor_ty = np.ndarray[(N,), np.dtype[np.int32]]
-
-            @runtime_sequence(tensor_ty, tensor_ty)
-            def sequence(A, C):
-                npu_dma_memcpy_nd(metadata=of_in, bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_dma_memcpy_nd(metadata=of_out, bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                dma_wait(of_out)
-
-    print(ctx.module)
-
-
-sliding_window()

From 5a664acd2f3d7d4300403255b0c3767577ccf55f Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 13:59:11 -0700
Subject: [PATCH 44/46] Rename xclbin to final.xclbin in run.lit

---
 .../dynamic_object_fifo/sliding_window_conditional/run.lit    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
index 68033f00d2..6220c2ec10 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, valid_xchess_license
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
-// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!

From 16477b91f3fc4569e077f3ef740e70031efaf132 Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 14:30:02 -0700
Subject: [PATCH 45/46] Renumber SSA values in aie.mlir

---
 .../sliding_window_conditional/aie.mlir       | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
index 5ee3d1aa0d..87197925b1 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie.mlir
@@ -37,8 +37,8 @@ module {
       scf.for %arg0 = %c0_1 to %c10 step %c1_2 {
         aie.use_lock(%output_fifo_prod_lock, AcquireGreaterEqual, 1)
         %0 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
-        %100 = arith.index_cast %0 : i32 to index
-        %1 = scf.index_switch %100 -> memref<10xi32> 
+        %1 = arith.index_cast %0 : i32 to index
+        %2 = scf.index_switch %1 -> memref<10xi32> 
         case 0 {
           scf.yield %output_fifo_buff_0 : memref<10xi32>
         }
@@ -48,14 +48,14 @@ module {
         default {
           scf.yield %output_fifo_buff_0 : memref<10xi32>
         }
-        %2 = arith.cmpi eq, %arg0, %c0_1 : index
-        %3 = arith.subi %c10, %c1_2 : index
-        %4 = arith.cmpi eq, %arg0, %3 : index
-        scf.if %2 {
+        %3 = arith.cmpi eq, %arg0, %c0_1 : index
+        %4 = arith.subi %c10, %c1_2 : index
+        %5 = arith.cmpi eq, %arg0, %4 : index
+        scf.if %3 {
           aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 1)
           %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
-          %800 = arith.index_cast %8 : i32 to index
-          %9 = scf.index_switch %800 -> memref<10xi32> 
+          %9 = arith.index_cast %8 : i32 to index
+          %10 = scf.index_switch %9 -> memref<10xi32> 
           case 0 {
             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
           }
@@ -68,13 +68,13 @@ module {
           default {
             scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
           }
-          func.call @add_10_i32(%9, %9, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+          func.call @add_10_i32(%10, %10, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
         } else {
-          scf.if %4 {
+          scf.if %5 {
             aie.use_lock(%input_fifo_cons_cons_lock, AcquireGreaterEqual, 2)
             %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
-            %800 = arith.index_cast %8 : i32 to index
-            %9 = scf.index_switch %800 -> memref<10xi32> 
+            %9 = arith.index_cast %8 : i32 to index
+            %10 = scf.index_switch %9 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
@@ -87,9 +87,9 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
-            %10 = memref.load %buffer_0_2[%c1] : memref<2xi32>
-            %1000 = arith.index_cast %10 : i32 to index
-            %11 = scf.index_switch %1000 -> memref<10xi32> 
+            %11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %12 = arith.index_cast %11 : i32 to index
+            %13 = scf.index_switch %12 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
@@ -102,17 +102,17 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
-            func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
             aie.use_lock(%input_fifo_cons_prod_lock, Release, 2)
-            %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
             %c2_4 = arith.constant 2 : i32
-            %13 = arith.addi %12, %c2_4 : i32
-            %14 = arith.remsi %13, %c3_i32 : i32
-            memref.store %14, %buffer_0_2[%c1] : memref<2xi32>
+            %15 = arith.addi %14, %c2_4 : i32
+            %16 = arith.remsi %15, %c3_i32 : i32
+            memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
           } else {
             %8 = memref.load %buffer_0_2[%c1] : memref<2xi32>
-            %800 = arith.index_cast %8 : i32 to index
-            %9 = scf.index_switch %800 -> memref<10xi32> 
+            %9 = arith.index_cast %8 : i32 to index
+            %10 = scf.index_switch %9 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
@@ -125,9 +125,9 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_0 : memref<10xi32>
             }
-            %10 = memref.load %buffer_0_2[%c1] : memref<2xi32>
-            %1000 = arith.index_cast %10 : i32 to index
-            %11 = scf.index_switch %1000 -> memref<10xi32> 
+            %11 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %12 = arith.index_cast %11 : i32 to index
+            %13 = scf.index_switch %12 -> memref<10xi32> 
             case 0 {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
@@ -140,21 +140,21 @@ module {
             default {
               scf.yield %input_fifo_cons_buff_1 : memref<10xi32>
             }
-            func.call @add_10_i32(%9, %11, %1) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
+            func.call @add_10_i32(%10, %13, %2) : (memref<10xi32>, memref<10xi32>, memref<10xi32>) -> ()
             aie.use_lock(%input_fifo_cons_prod_lock, Release, 1)
-            %12 = memref.load %buffer_0_2[%c1] : memref<2xi32>
+            %14 = memref.load %buffer_0_2[%c1] : memref<2xi32>
             %c1_4 = arith.constant 1 : i32
-            %13 = arith.addi %12, %c1_4 : i32
-            %14 = arith.remsi %13, %c3_i32 : i32
-            memref.store %14, %buffer_0_2[%c1] : memref<2xi32>
+            %15 = arith.addi %14, %c1_4 : i32
+            %16 = arith.remsi %15, %c3_i32 : i32
+            memref.store %16, %buffer_0_2[%c1] : memref<2xi32>
           }
         }
         aie.use_lock(%output_fifo_cons_lock, Release, 1)
-        %5 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
+        %6 = memref.load %buffer_0_2[%c0_0] : memref<2xi32>
         %c1_3 = arith.constant 1 : i32
-        %6 = arith.addi %5, %c1_3 : i32
-        %7 = arith.remsi %6, %c2_i32 : i32
-        memref.store %7, %buffer_0_2[%c0_0] : memref<2xi32>
+        %7 = arith.addi %6, %c1_3 : i32
+        %8 = arith.remsi %7, %c2_i32 : i32
+        memref.store %8, %buffer_0_2[%c0_0] : memref<2xi32>
       }
       aie.end
     } {link_with = "kernel.o"}

From 229378efb54643e2d06f94d32cce00db1a908c3d Mon Sep 17 00:00:00 2001
From: Pranathi Vasireddy <pvasired@amd.com>
Date: Thu, 5 Dec 2024 14:41:52 -0700
Subject: [PATCH 46/46] Check whether flag positions have an impact

---
 test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py          | 2 +-
 test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py             | 2 +-
 test/npu-xrt/dynamic_object_fifo/reduction/aie2.py             | 2 +-
 test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py        | 3 ++-
 .../dynamic_object_fifo/two_core_sliding_window/aie2.py        | 3 ++-
 5 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index e9dc107466..7c2b664a65 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -10,7 +10,7 @@
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: %run_on_npu ./test.exe -x final.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 # CHECK: PASS!
 
diff --git a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
index 0a8c1112d8..19dc7c6e3c 100644
--- a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
index eb5440e4cd..4814d27dae 100644
--- a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
@@ -9,7 +9,7 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 37222b8a78..129b69eae5 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
+
 from aie.dialects.aie import *
 from aie.dialects.aiex import *
 from aie.helpers.dialects.ext.scf import _for as range_
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index d0b0f53d36..a48d6149ba 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -9,10 +9,11 @@
 #
 # RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o
 # RUN: %python %S/aie2.py > ./aie2.mlir
-# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --no-compile-host --aie-generate-xclbin --xclbin-name=final.xclbin --dynamic-objFifos --aie-generate-npu --npu-insts-name=insts.txt ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --dynamic-objFifos --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir
 # RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
 # RUN: %run_on_npu ./test.exe | FileCheck %s
 # CHECK: PASS!
+
 import numpy as np
 
 from aie.dialects.aie import *