diff --git a/.gitignore b/.gitignore index b59e080f32..2bd4e44e9c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,11 @@ .vscode __pycache__ .DS_Store +**/CMakeFiles +**.cmake +include/**/Makefile +lib/**/Makefile + /platforms/vck190_bare/petalinux/build /platforms/vck190_bare/petalinux/components diff --git a/aie_kernels/aie2/mm.cc b/aie_kernels/aie2/mm.cc index 0444fa6018..e78bab49b3 100644 --- a/aie_kernels/aie2/mm.cc +++ b/aie_kernels/aie2/mm.cc @@ -366,6 +366,23 @@ void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA, pC); } +template +void matmul_vectorized_4x4x4_i16_i32(const int16 *__restrict pA, + const int16 *__restrict pB, + int32 *__restrict pC) { + // matmul_vectorized operates on two 4x4 input blocks of A, and two 4x4 input + // blocks of B in each iteration. Make sure we have at least 2 blocks in each + // dimension, and that our input matrix is evenly divisible. + constexpr int r = 4; + constexpr int s = 4; + constexpr int t = 4; + static_assert(m % (2 * r) == 0 && m / (2 * r) > 0); + static_assert(k % (2 * s) == 0 && k / (2 * s) > 0); + static_assert(n % (2 * t) == 0 && n / (2 * t) > 0); + return matmul_vectorized(pA, pB, + pC); +} + template void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA, const bfloat16 *__restrict pB, @@ -416,6 +433,7 @@ extern "C" { #define combos(X) \ X(int16, i16, int16, i16, 4, 4, 4) \ + X(int16, i16, int32, i32, 4, 4, 4) \ X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \ X(bfloat16, bf16, float, f32, 4, 8, 4) diff --git a/docs/AIEVectorization.md b/docs/AIEVectorization.md index bd1729dfbf..84a36faa50 100644 --- a/docs/AIEVectorization.md +++ b/docs/AIEVectorization.md @@ -57,7 +57,7 @@ aie-opt -affine-super-vectorize="virtual-vector-size=8 vectorize-reductions" --a %0 = aievec.upd %arg0[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xf32>, vector<8xf32> %1 = aievec.upd %arg1[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xf32>, vector<8xf32> %2 = aievec.concat %0, %0 : 
vector<8xf32>, vector<16xf32> - %3 = aievec.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32> + %3 = aievec_aie1.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32> %4 = aievec.srs %3 {shift = 0 : i8} : !aievec.acc<8xf32>, vector<8xf32> vector.transfer_write %4, %arg2[%arg3] {in_bounds = [true]} : vector<8xf32>, memref<2048xf32> } @@ -114,7 +114,7 @@ Results in: scf.for %arg3 = %c0 to %c2048 step %c16 { %0 = aievec.upd %arg0[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xi16>, vector<16xi16> %1 = aievec.upd %arg1[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xi16>, vector<16xi16> - %2 = aievec.mul %0, %1 : vector<16xi16>, vector<16xi16>, !aievec.acc<16xi48> + %2 = aievec_aie1.mul %0, %1 : vector<16xi16>, vector<16xi16>, !aievec.acc<16xi48> %3 = aievec.srs %2 {shift = 0 : i8} : !aievec.acc<16xi48>, vector<16xi16> vector.transfer_write %3, %arg2[%arg3] {in_bounds = [true]} : vector<16xi16>, memref<2048xi16> } @@ -150,22 +150,22 @@ aie-opt --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicaliz %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2046x2046xi32>, vector<8xi32> %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2048x2048xi32>, vector<16xi32> %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, !aievec.acc<8xi80> - %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %c1_5 = arith.constant 1 : index %8 = arith.addi %arg4, %c1_5 : index %9 = aievec.upd %arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : si32} : 
memref<2048x2048xi32>, vector<16xi32> - %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> - %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2048x2048xi32>, vector<16xi32> - %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : si32} : memref<2048x2048xi32>, vector<16xi32> - %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> - %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : si32} : 
memref<2048x2048xi32>, vector<16xi32> - %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : si32} : memref<2048x2048xi32>, vector<16xi32> - %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> - %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %20 = aievec_aie1.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> + %21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80> %22 = aievec.srs %21 {shift = 0 : i8} : !aievec.acc<8xi80>, vector<8xi32> vector.transfer_write %22, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> } diff --git a/docs/conferenceDescriptions/micro24TutorialDescription.md b/docs/conferenceDescriptions/micro24TutorialDescription.md new file mode 100644 index 0000000000..1ebf26fbff --- /dev/null +++ b/docs/conferenceDescriptions/micro24TutorialDescription.md @@ -0,0 +1,50 @@ +# MICRO 2024 Tutorial: Leveraging the IRON AI Engine API to program the Ryzen™ AI NPU + +## Introduction + +The NPU of AMD Ryzen™ AI devices includes an AI Engine array comprised of a set of VLIW vector processors, data movement accelerators (DMAs) and adaptable interconnect. 
This tutorial is targeted at performance engineers who are looking to develop designs targeting the NPU with open source design tools. We provide a close-to-metal Python API: Interface Representation for hands-ON (IRON) AIE-array programming. IRON is an open access toolkit enabling performance engineers to build fast and efficient, often specialized, designs through a set of Python language bindings around the mlir-aie dialect. Participants will first get insight into the AI Engine compute and data movement capabilities. Through small design examples expressed in the IRON API and executed on a Ryzen™ AI device, participants will leverage AI Engine features for optimizing performance of increasingly complex designs. The labs will be done on Ryzen™ AI-enabled mini-PCs, giving participants the ability to execute their own designs on real hardware. + +This tutorial will cover the following key topics: +1. AI Engine architecture introduction +1. AIE core, array configuration, and host application code compilation +1. Data movement and communication abstraction layers +1. Tracing for performance monitoring +1. Putting it all together on larger examples: matrix multiplication, convolutions as building blocks for ML and computer vision examples + +## Agenda + +Date: Sunday, November 3rd, 2024 (morning) +Location: Austin, Texas, USA (with MICRO-57) +Prerequisite: Please bring your laptop so that you can SSH into our Ryzen™ AI-enabled miniPCs for the hands-on exercises. 
+ +### Contents and Timeline (tentative) + +| Time | Topic | Presenter | Slides or Code | +|------|-------|-----------|----------------| +| 08:30am | Intro to spatial compute and explicit data movement | Kristof | [Programming Guide](../../programming_guide/) | +| 08:45am | "Hello World" from Ryzen™ AI | Joe | [AI Engine Basic Building Blocks](../../programming_guide/section-1/) | +| 09:00am | Data movement on Ryzen™ AI with objectFIFOs | Joe | [Data Movement](../../programming_guide/section-2/) | +| 09:30am | Your First Program | Kristof | [My First Program](../../programming_guide/section-3) | +| 09:50am | Exercise 1: Build and run your first program | All | [Passthrough](../../programming_examples/basic/passthrough_kernel/) | +| 10:00am | Break | | | +| 10:30am | Exercise 2: Vector-Scalar Mul | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) | +| 10:40am | Tracing and performance analysis | Jack | [Timers](../../programming_guide/section-4/section-4a/) and [Tracing](../../programming_guide/section-4/section-4b/) | +| 11:10am | Exercise 3: Tracing vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) | +| 11:30am | Vectorizing on AIE | Jack | [Kernel Vectorization](../../programming_guide/section-4/section-4c/) | +| 11:40am | Exercise 4: Vectorized vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) | +| 12:00pm | Dataflow and larger designs | Joe | [Example Vector Designs](../../programming_guide/section-5/) and [Large Example Designs](../../programming_guide/section-6/) | +| 12:15pm | Exercises | All | [Programming Examples](../../programming_examples/) | +| 12:30pm | Close Tutorial | All | | + + +## Organizers + +*Jack Lo* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. 
At AMD, he is focused on developing tool frameworks and optimizing applications for current and future AMD devices, particularly in the area of adaptive computing and AI processing. + +*Joseph Melber* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. At AMD, he is working on hardware architectures and compiler technologies for current and future AMD devices. He received a BS in electrical engineering from the University at Buffalo, as well as MS and PhD degrees from the electrical and computer engineering department at Carnegie Mellon University. His research interests include runtime systems, compiler abstractions for data movement, and hardware prototypes for future adaptive heterogeneous computing architectures. + +*Andrew Schmidt* is a Senior Member of Technical Staff in the AMD University Program. At AMD, he provides tutorials, training workshops and engages with universities across undergraduate and graduate curriculum, as well as research projects. The AMD University Program offers researchers access to state-of-the-art hardware through various programs including the HPC Fund, HACC program, and donation program and offers professors and lecturers free software licenses and educational resources to support classroom teaching. He has extensive background in adaptive computing acceleration with heterogeneous architectures, previously working at the University of Southern California’s Information Sciences Institute where his focus was on reconfigurable computing, computer architecture, and hardware assurance. He received his BS and MS in Computer Engineering from the University of Kansas and his PhD in Electrical Engineering from the University of North Carolina at Charlotte where his focus was on efficient utilization of heterogeneous resources for High Performance Reconfigurable Computing. 
+ +*Kristof Denolf* is a Fellow in AMD's Research and Advanced Development group where he is working on energy-efficient computer vision and video processing applications to shape future AMD devices. He earned an M.Eng. in electronics from the Katholieke Hogeschool Brugge-Oostende (1998), now part of KULeuven, an M.Sc. in electronic system design from Leeds Beckett University (2000), and a Ph.D. from the Technical University Eindhoven (2007). He has over 25 years of combined research and industry experience at IMEC, Philips, Barco, Apple, Xilinx, and AMD. His main research interests are all aspects of the cost-efficient and dataflow-oriented design of video, vision, and graphics systems. + +*Phil James-Roxby* is a Senior Fellow in AMD’s Research and Advanced Development group, working on compilers and runtimes to support current and future AMD devices, particularly in the domain of AI processing. In the past, he has been responsible for a number of software enablement activities for hardware devices, including SDNet and SDAccel at Xilinx, and the original development environment for the AI Engines. He holds a PhD from the University of Manchester on hardware acceleration of embedded machine learning applications, and his main research interest continues to be how to enable users to efficiently use diverse hardware in heterogeneous systems. 
diff --git a/include/aie/Dialect/AIE/IR/AIEAttrs.td b/include/aie/Dialect/AIE/IR/AIEAttrs.td index d5bbab0b6b..903713991b 100644 --- a/include/aie/Dialect/AIE/IR/AIEAttrs.td +++ b/include/aie/Dialect/AIE/IR/AIEAttrs.td @@ -148,15 +148,37 @@ def BDDimLayoutAttr : AttrDef { def BDDimLayoutArrayAttr : ArrayOfAttr< /*dialect*/AIE_Dialect, /*attrName*/"BDDimLayoutArray", - /*attrMnemonic*/"bd_dim_layout_arr", + /*attrMnemonic*/"bd_dim_layout_array", /*eltName*/BDDimLayoutAttr.cppClassName >; def BDDimLayoutArrayArrayAttr : ArrayOfAttr< /*dialect*/AIE_Dialect, /*attrName*/"BDDimLayoutArrayArray", - /*attrMnemonic*/"bd_dim_layout_arr_arr", + /*attrMnemonic*/"bd_dim_layout_array_array", /*eltName*/BDDimLayoutArrayAttr.cppClassName >; +def BDPadLayoutAttr : AttrDef { + let mnemonic = "bd_pad_layout"; + let summary = [{ + Tuple encoding number of zeros before and after on that dimension in an AIE2 + n-dimensional buffer descriptor; + }]; + + let parameters = (ins + "uint16_t" : $const_pad_before, + "uint16_t" : $const_pad_after + ); + + let assemblyFormat = "`<` struct(params) `>`"; +} + +def BDPadLayoutArrayAttr : ArrayOfAttr< + /*dialect*/AIE_Dialect, + /*attrName*/"BDPadLayoutArray", + /*attrMnemonic*/"bd_pad_layout_array", + /*eltName*/BDPadLayoutAttr.cppClassName +>; + #endif // AIE_ATTRS \ No newline at end of file diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index b4101b4406..e876a9b4df 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -758,6 +758,7 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ 2. the transfer length; 3. the sizes and strides for n-d tensor addressing (described below); 4. the "bd_id" with which to associate the buffer descriptor (most often left empty). + 5. 
the number of zeros to pad before and after every dimension of an n-d tensor (described below); `offset`, `len`, `size`s and `stride`s are all denominated in element width; e.g., transferring the whole of `memref<512xi32>` means `len == 512`, and also while transferring the whole of `memref<512xi16>`, `len == 512`. @@ -792,13 +793,13 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ aie.dma_bd(<$buf2 : memref<64xi32>, 0, 64) ``` - ## Background/context: + #### Background/context A DMA channel in a Memory Module can process one buffer descriptor after another by chaining them. There are 16 buffer descriptors per Core memory module and 48 buffer descriptors per Memtile memory module. They are shared by four DMA channels (or 12). - ## DMA Data Layout Transformations on AIE-ML Devices + #### DMA Data Layout Transformations on AIE-ML Devices AIE-ML devices can apply data layout transformations at the buffer descriptor level. These transformation are described by strides and sizes in up to three dimensions (four @@ -843,10 +844,18 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ // access/store element at/to index (i * 16 /*stride_2*/ + j * 1 /*stride_1*/ + k * 2 /*stride_0*/) ``` - ## Important gotcha regarding strides + #### Important gotcha regarding strides All strides are expressed in multiples of the element width (just like `len` and `offset`) **with the caveat that the inner-most dimension's stride must be 1**. + + ## DMA constant padding on AIE-ML Devices + + AIE-ML devices can apply constant padding at the buffer descriptor level, described with pairs of padding + counts before and after a dimension, to all dimensions in the data layout transformations. The padding + counts can be supplied to the `dma_bd` through an optional argument, an array of "tuple-like" attributes + `bd_pad_layout`, followed by an optional argument `const_val` (default + is 0). All counts are expressed in multiples of the element width. 
}]; let arguments = ( @@ -856,6 +865,8 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ // in multiples of element width (not bytes) OptionalAttr:$len, OptionalAttr:$dimensions, + OptionalAttr:$pad_dimensions, + DefaultValuedOptionalAttr:$pad_value, OptionalAttr:$bd_id, // should never be assigned by user... OptionalAttr:$next_bd_id @@ -864,7 +875,7 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ let hasVerifier = 1; let assemblyFormat = [{ - `(` $buffer `:` type($buffer) (`,` $offset^)? (`,` $len^)? (`,` $dimensions^)? `)` attr-dict + `(` $buffer `:` type($buffer) (`,` $offset^)? (`,` $len^)? (`,` $dimensions^)? (`,` $pad_dimensions^)? (`,` `pad_value` `=` $pad_value^)? `)` attr-dict }]; let extraClassDeclaration = [{ @@ -893,6 +904,19 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", [ $_state.addAttribute("offset", $_builder.getI32IntegerAttr(offset)); $_state.addAttribute("len", $_builder.getI32IntegerAttr(len)); $_state.addAttribute("dimensions", dims); + }]>, + OpBuilder<(ins "mlir::Value":$buffer, "int":$offset, "int":$len, "BDPadLayoutArrayAttr":$paddims), [{ + $_state.addOperands(buffer); + $_state.addAttribute("offset", $_builder.getI32IntegerAttr(offset)); + $_state.addAttribute("len", $_builder.getI32IntegerAttr(len)); + $_state.addAttribute("pad_dimensions", paddims); + }]>, + OpBuilder<(ins "mlir::Value":$buffer, "int":$offset, "int":$len, "BDDimLayoutArrayAttr":$dims, "BDPadLayoutArrayAttr":$paddims), [{ + $_state.addOperands(buffer); + $_state.addAttribute("offset", $_builder.getI32IntegerAttr(offset)); + $_state.addAttribute("len", $_builder.getI32IntegerAttr(len)); + $_state.addAttribute("dimensions", dims); + $_state.addAttribute("pad_dimensions", paddims); }]> ]; } @@ -1514,7 +1538,9 @@ def AIE_ShimDMAAllocationOp : AIE_Op<"shim_dma_allocation", [HasParent<"DeviceOp ins FlatSymbolRefAttr:$sym_name, DMAChannelDir:$channel_dir, AIEI64Attr:$channel_index, - AIEI64Attr:$col + AIEI64Attr:$col, + // If this is set we are using the PLIO in this ShimTile + DefaultValuedAttr:$plio ); 
let results = (outs); @@ -1566,7 +1592,7 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] at each tile are respectively 2, 3 and 4 for tiles `%tile12`, `%tile13` and `%tile23`. This overrides the depth analysis specified in the first example. - ## Data Layout Transformations on AIE-ML devices + #### Data Layout Transformations on AIE-ML devices On AIE-ML devices, objectFifos can also apply data layout transformations by using the DMAs n-dimensional address generation scheme. Two transformations @@ -1610,7 +1636,8 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol] TypeAttrOf:$elemType, BDDimLayoutArrayAttr:$dimensionsToStream, BDDimLayoutArrayArrayAttr:$dimensionsFromStreamPerConsumer, - DefaultValuedAttr:$via_DMA + DefaultValuedAttr:$via_DMA, + DefaultValuedAttr:$plio ); let assemblyFormat = [{ diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.h b/include/aie/Dialect/AIE/Transforms/AIEPasses.h index 64dd858136..22642a3b9a 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPasses.h +++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.h @@ -82,11 +82,6 @@ struct AIEPathfinderPass : AIERoutePathfinderFlowsBase { WireBundle currDestBundle, int currDestChannel, TileID finalTile, WireBundle finalDestBundle, int finalDestChannel); - bool attemptFixupMemTileRouting(DeviceOp &d, SwConnection &problemConnect); - - bool checkChannelEmpty(SwitchboxOp swOp, WireBundle bundle, int channel); - void replaceRoutingChannel(SwitchboxOp &swOp, WireBundle Bundle, - int oldChannel, int newChannel); SwitchboxOp getSwitchbox(DeviceOp &d, int col, int row); diff --git a/include/aie/Dialect/AIE/Transforms/AIEPathFinder.h b/include/aie/Dialect/AIE/Transforms/AIEPathFinder.h index 6207fde071..f53ddc6eea 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPathFinder.h +++ b/include/aie/Dialect/AIE/Transforms/AIEPathFinder.h @@ -14,9 +14,6 @@ #include "aie/Dialect/AIE/IR/AIEDialect.h" #include 
"aie/Dialect/AIE/IR/AIETargetModel.h" -#include "llvm/ADT/DirectedGraph.h" -#include "llvm/ADT/GraphTraits.h" - #include #include #include @@ -24,87 +21,270 @@ namespace xilinx::AIE { -using Switchbox = struct Switchbox : TileID { - // Necessary for initializer construction? - Switchbox(TileID t) : TileID(t) {} - Switchbox(int col, int row) : TileID{col, row} {} - friend std::ostream &operator<<(std::ostream &os, const Switchbox &s) { - os << "Switchbox(" << s.col << ", " << s.row << ")"; - return os; +enum class Connectivity { INVALID = -1, AVAILABLE = 0, OCCUPIED = 1 }; + +using SwitchboxNode = struct SwitchboxNode { + + SwitchboxNode(int col, int row, int id, int maxCol, int maxRow, + const AIETargetModel &targetModel) + : col{col}, row{row}, id{id} { + + std::vector bundles = { + WireBundle::Core, WireBundle::DMA, WireBundle::FIFO, + WireBundle::South, WireBundle::West, WireBundle::North, + WireBundle::East, WireBundle::PLIO, WireBundle::NOC, + WireBundle::Trace, WireBundle::Ctrl}; + + for (WireBundle bundle : bundles) { + int maxCapacity = + targetModel.getNumSourceSwitchboxConnections(col, row, bundle); + if (targetModel.isShimNOCorPLTile(col, row) && maxCapacity == 0) { + // wordaround for shimMux, todo: integrate shimMux into routable grid + maxCapacity = + targetModel.getNumSourceShimMuxConnections(col, row, bundle); + } + + for (int channel = 0; channel < maxCapacity; channel++) { + Port inPort = {bundle, channel}; + inPortToId[inPort] = inPortId; + inPortId++; + } + + maxCapacity = + targetModel.getNumDestSwitchboxConnections(col, row, bundle); + if (targetModel.isShimNOCorPLTile(col, row) && maxCapacity == 0) { + // wordaround for shimMux, todo: integrate shimMux into routable grid + maxCapacity = + targetModel.getNumDestShimMuxConnections(col, row, bundle); + } + for (int channel = 0; channel < maxCapacity; channel++) { + Port outPort = {bundle, channel}; + outPortToId[outPort] = outPortId; + outPortId++; + } + } + + connectionMatrix.resize(inPortId, 
std::vector( + outPortId, Connectivity::AVAILABLE)); + + // illegal connection + for (const auto &[inPort, inId] : inPortToId) { + for (const auto &[outPort, outId] : outPortToId) { + if (!targetModel.isLegalTileConnection(col, row, inPort.bundle, + inPort.channel, outPort.bundle, + outPort.channel)) + connectionMatrix[inId][outId] = Connectivity::INVALID; + + if (targetModel.isShimNOCorPLTile(col, row)) { + // wordaround for shimMux, todo: integrate shimMux into routable grid + auto isBundleInList = [](WireBundle bundle, + std::vector bundles) { + return std::find(bundles.begin(), bundles.end(), bundle) != + bundles.end(); + }; + std::vector bundles = {WireBundle::DMA, WireBundle::NOC, + WireBundle::PLIO}; + if (isBundleInList(inPort.bundle, bundles) || + isBundleInList(outPort.bundle, bundles)) + connectionMatrix[inId][outId] = Connectivity::AVAILABLE; + } + } + } } - GENERATE_TO_STRING(Switchbox); + // given a outPort, find availble input channel + std::vector findAvailableChannelIn(WireBundle inBundle, Port outPort, + bool isPkt) { + std::vector availableChannels; + if (outPortToId.count(outPort) > 0) { + int outId = outPortToId[outPort]; + if (isPkt) { + for (const auto &[inPort, inId] : inPortToId) { + if (inPort.bundle == inBundle && + connectionMatrix[inId][outId] != Connectivity::INVALID) { + bool available = true; + if (inPortPktCount.count(inPort) == 0) { + for (const auto &[outPort, outId] : outPortToId) { + if (connectionMatrix[inId][outId] == Connectivity::OCCUPIED) { + // occupied by others as circuit-switched + available = false; + break; + } + } + } else { + if (inPortPktCount[inPort] >= maxPktStream) { + // occupied by others as packet-switched but exceed max packet + // stream capacity + available = false; + } + } + if (available) + availableChannels.push_back(inPort.channel); + } + } + } else { + for (const auto &[inPort, inId] : inPortToId) { + if (inPort.bundle == inBundle && + connectionMatrix[inId][outId] == Connectivity::AVAILABLE) { + bool 
available = true; + for (const auto &[outPort, outId] : outPortToId) { + if (connectionMatrix[inId][outId] == Connectivity::OCCUPIED) { + available = false; + break; + } + } + if (available) + availableChannels.push_back(inPort.channel); + } + } + } + } + return availableChannels; + } - bool operator==(const Switchbox &rhs) const { - return static_cast(*this) == rhs; + bool allocate(Port inPort, Port outPort, bool isPkt) { + // invalid port + if (outPortToId.count(outPort) == 0 || inPortToId.count(inPort) == 0) + return false; + + int inId = inPortToId[inPort]; + int outId = outPortToId[outPort]; + + // invalid connection + if (connectionMatrix[inId][outId] == Connectivity::INVALID) + return false; + + if (isPkt) { + // a packet-switched stream to be allocated + if (inPortPktCount.count(inPort) == 0) { + for (const auto &[outPort, outId] : outPortToId) { + if (connectionMatrix[inId][outId] == Connectivity::OCCUPIED) { + // occupied by others as circuit-switched, allocation fail! + return false; + } + } + // empty channel, allocation succeed! + inPortPktCount[inPort] = 1; + connectionMatrix[inId][outId] = Connectivity::OCCUPIED; + return true; + } else { + if (inPortPktCount[inPort] >= maxPktStream) { + // occupied by others as packet-switched but exceed max packet stream + // capacity, allocation fail! + return false; + } else { + // valid packet-switched, allocation succeed! + inPortPktCount[inPort]++; + return true; + } + } + } else { + // a circuit-switched stream to be allocated + if (connectionMatrix[inId][outId] == Connectivity::AVAILABLE) { + // empty channel, allocation succeed! + connectionMatrix[inId][outId] = Connectivity::OCCUPIED; + return true; + } else { + // occupied by others, allocation fail! 
+ return false; + } + } } -}; -using Channel = struct Channel { - Channel(Switchbox &src, Switchbox &target, WireBundle bundle, int maxCapacity) - : src(src), target(target), bundle(bundle), maxCapacity(maxCapacity) {} + void clearAllocation() { + for (int inId = 0; inId < inPortId; inId++) { + for (int outId = 0; outId < outPortId; outId++) { + if (connectionMatrix[inId][outId] != Connectivity::INVALID) { + connectionMatrix[inId][outId] = Connectivity::AVAILABLE; + } + } + } + inPortPktCount.clear(); + } - friend std::ostream &operator<<(std::ostream &os, const Channel &c) { - os << "Channel(src=" << c.src << ", dst=" << c.target << ")"; + friend std::ostream &operator<<(std::ostream &os, const SwitchboxNode &s) { + os << "Switchbox(" << s.col << ", " << s.row << ")"; return os; } - GENERATE_TO_STRING(Channel) + GENERATE_TO_STRING(SwitchboxNode); friend llvm::raw_ostream &operator<<(llvm::raw_ostream &os, - const Channel &c) { - os << to_string(c); + const SwitchboxNode &s) { + os << to_string(s); return os; } - Switchbox &src; - Switchbox ⌖ - WireBundle bundle; - int maxCapacity = 0; // maximum number of routing resources - double demand = 0.0; // indicates how many flows want to use this Channel - int usedCapacity = 0; // how many flows are actually using this Channel - std::set fixedCapacity; // channels not available to the algorithm - int overCapacityCount = 0; // history of Channel being over capacity - int packetFlowCount = 0; // up to 32 packet strams flow through a port -}; + bool operator<(const SwitchboxNode &rhs) const { + return std::tie(col, row) < std::tie(rhs.col, rhs.row); + } + + bool operator==(const SwitchboxNode &rhs) const { + return std::tie(col, row) == std::tie(rhs.col, rhs.row); + } + + int col, row, id; + int inPortId = 0, outPortId = 0; + std::map inPortToId, outPortToId; -struct SwitchboxNode; -struct ChannelEdge; -using SwitchboxNodeBase = llvm::DGNode; -using ChannelEdgeBase = llvm::DGEdge; -using SwitchboxGraphBase = 
llvm::DirectedGraph; + // tenary representation of switchbox connectivity + // -1: invalid in arch, 0: empty and available, 1: occupued + std::vector> connectionMatrix; -using SwitchboxNode = struct SwitchboxNode : SwitchboxNodeBase, Switchbox { - using Switchbox::Switchbox; - SwitchboxNode(int col, int row, int id) : Switchbox{col, row}, id{id} {} - int id; + // input ports with incoming packet-switched streams + std::map inPortPktCount; + + // up to 32 packet-switched stram through a port + const int maxPktStream = 32; }; -// warning: 'xilinx::AIE::ChannelEdge::src' will be initialized after -// SwitchboxNode &src; [-Wreorder] -using ChannelEdge = struct ChannelEdge : ChannelEdgeBase, Channel { - using Channel::Channel; +using ChannelEdge = struct ChannelEdge { + ChannelEdge(SwitchboxNode *src, SwitchboxNode *target) + : src(src), target(target) { + + // get bundle from src to target coordinates + if (src->col == target->col) { + if (src->row > target->row) + bundle = WireBundle::South; + else + bundle = WireBundle::North; + } else { + if (src->col > target->col) + bundle = WireBundle::West; + else + bundle = WireBundle::East; + } + + // maximum number of routing resources + maxCapacity = 0; + for (auto &[outPort, _] : src->outPortToId) { + if (outPort.bundle == bundle) { + maxCapacity++; + } + } + } + + friend std::ostream &operator<<(std::ostream &os, const ChannelEdge &c) { + os << "Channel(src=" << c.src << ", dst=" << c.target << ")"; + return os; + } - explicit ChannelEdge(SwitchboxNode &target) = delete; - ChannelEdge(SwitchboxNode &src, SwitchboxNode &target, WireBundle bundle, - int maxCapacity) - : ChannelEdgeBase(target), Channel(src, target, bundle, maxCapacity), - src(src) {} + GENERATE_TO_STRING(ChannelEdge) - // This class isn't designed to copied or moved. 
- ChannelEdge(const ChannelEdge &E) = delete; - ChannelEdge &operator=(ChannelEdge &&E) = delete; + friend llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + const ChannelEdge &c) { + os << to_string(c); + return os; + } - SwitchboxNode &src; -}; + SwitchboxNode *src; + SwitchboxNode *target; -class SwitchboxGraph : public SwitchboxGraphBase { -public: - SwitchboxGraph() = default; - ~SwitchboxGraph() = default; + int maxCapacity; + WireBundle bundle; }; -// A SwitchSetting defines the required settings for a Switchbox for a flow +// A SwitchSetting defines the required settings for a SwitchboxNode for a flow // SwitchSetting.src is the incoming signal // SwitchSetting.dsts is the fanout using SwitchSetting = struct SwitchSetting { @@ -143,12 +323,12 @@ using SwitchSetting = struct SwitchSetting { bool operator<(const SwitchSetting &rhs) const { return src < rhs.src; } }; -using SwitchSettings = std::map; +using SwitchSettings = std::map; // A Flow defines source and destination vertices // Only one source, but any number of destinations (fanout) using PathEndPoint = struct PathEndPoint { - Switchbox sb; + SwitchboxNode sb; Port port; friend std::ostream &operator<<(std::ostream &os, const PathEndPoint &s) { @@ -198,10 +378,10 @@ class Router { const AIETargetModel &targetModel) = 0; virtual void addFlow(TileID srcCoords, Port srcPort, TileID dstCoords, Port dstPort, bool isPacketFlow) = 0; - virtual bool addFixedConnection(ConnectOp connectOp) = 0; + virtual bool addFixedConnection(SwitchboxOp switchboxOp) = 0; virtual std::optional> findPaths(int maxIterations) = 0; - virtual Switchbox *getSwitchbox(TileID coords) = 0; + virtual SwitchboxNode getSwitchboxNode(TileID coords) = 0; }; class Pathfinder : public Router { @@ -211,25 +391,37 @@ class Pathfinder : public Router { const AIETargetModel &targetModel) override; void addFlow(TileID srcCoords, Port srcPort, TileID dstCoords, Port dstPort, bool isPacketFlow) override; - bool addFixedConnection(ConnectOp 
connectOp) override; + bool addFixedConnection(SwitchboxOp switchboxOp) override; std::optional> findPaths(int maxIterations) override; - Switchbox *getSwitchbox(TileID coords) override { - auto *sb = std::find_if(graph.begin(), graph.end(), [&](SwitchboxNode *sb) { - return sb->col == coords.col && sb->row == coords.row; - }); - assert(sb != graph.end() && "couldn't find sb"); - return *sb; + std::map + dijkstraShortestPaths(SwitchboxNode *src); + + SwitchboxNode getSwitchboxNode(TileID coords) override { + return grid.at(coords); } private: - SwitchboxGraph graph; + // Flows to be routed std::vector flows; + + // Grid of switchboxes available std::map grid; + // Use a list instead of a vector because nodes have an edge list of raw // pointers to edges (so growing a vector would invalidate the pointers). std::list edges; + + // Use Dijkstra's shortest path to find routes, and use "demand" as the + // weights. + std::map demand; + + // History of Channel being over capacity + std::map overCapacity; + + // how many flows are actually using this Channel + std::map usedCapacity; }; // DynamicTileAnalysis integrates the Pathfinder class into the MLIR @@ -267,71 +459,8 @@ class DynamicTileAnalysis { } // namespace xilinx::AIE -// For some mysterious reason, the only way to get the priorityQueue(cmp) -// comparison in dijkstraShortestPaths to work correctly is to define -// this template specialization for the pointers. Overloading operator -// will not work. Furthermore, if you try to move this into AIEPathFinder.cpp -// you'll get a compile error about -// "specialization of ‘std::less’ after instantiation" -// because one of the graph traits below is doing the comparison internally -// (try moving this below the llvm namespace...) 
-namespace std { -template <> -struct less { - bool operator()(const xilinx::AIE::Switchbox *a, - const xilinx::AIE::Switchbox *b) const { - return *a < *b; - } -}; -} // namespace std - namespace llvm { -template <> -struct GraphTraits { - using NodeRef = xilinx::AIE::SwitchboxNode *; - - static xilinx::AIE::SwitchboxNode *SwitchboxGraphGetSwitchbox( - DGEdge *P) { - return &P->getTargetNode(); - } - - // Provide a mapped iterator so that the GraphTrait-based implementations can - // find the target nodes without having to explicitly go through the edges. - using ChildIteratorType = - mapped_iterator; - using ChildEdgeIteratorType = xilinx::AIE::SwitchboxNode::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - static ChildIteratorType child_begin(NodeRef N) { - return {N->begin(), &SwitchboxGraphGetSwitchbox}; - } - static ChildIteratorType child_end(NodeRef N) { - return {N->end(), &SwitchboxGraphGetSwitchbox}; - } - - static ChildEdgeIteratorType child_edge_begin(NodeRef N) { - return N->begin(); - } - static ChildEdgeIteratorType child_edge_end(NodeRef N) { return N->end(); } -}; - -template <> -struct GraphTraits - : GraphTraits { - using nodes_iterator = xilinx::AIE::SwitchboxGraph::iterator; - static NodeRef getEntryNode(xilinx::AIE::SwitchboxGraph *DG) { - return *DG->begin(); - } - static nodes_iterator nodes_begin(xilinx::AIE::SwitchboxGraph *DG) { - return DG->begin(); - } - static nodes_iterator nodes_end(xilinx::AIE::SwitchboxGraph *DG) { - return DG->end(); - } -}; - inline raw_ostream &operator<<(raw_ostream &os, const xilinx::AIE::SwitchSettings &ss) { std::stringstream s; @@ -347,9 +476,9 @@ inline raw_ostream &operator<<(raw_ostream &os, } // namespace llvm template <> -struct std::hash { - std::size_t operator()(const xilinx::AIE::Switchbox &s) const noexcept { - return std::hash{}(s); +struct std::hash { + std::size_t operator()(const xilinx::AIE::SwitchboxNode &s) const noexcept { + return std::hash{}({s.col, s.row}); } }; @@ 
-357,7 +486,7 @@ template <> struct std::hash { std::size_t operator()(const xilinx::AIE::PathEndPoint &pe) const noexcept { std::size_t h1 = std::hash{}(pe.port); - std::size_t h2 = std::hash{}(pe.sb); + std::size_t h2 = std::hash{}(pe.sb); return h1 ^ (h2 << 1); } }; diff --git a/include/aie/Dialect/AIEVec/AIE1/CMakeLists.txt b/include/aie/Dialect/AIEVec/AIE1/CMakeLists.txt new file mode 100644 index 0000000000..d6c23eb655 --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/CMakeLists.txt @@ -0,0 +1,8 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Xilinx Inc. + +add_subdirectory(IR) diff --git a/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.h b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.h new file mode 100644 index 0000000000..65fb554082 --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.h @@ -0,0 +1,29 @@ +//===- AIEVecAIE1Dialect.h - AIE1 Vector Dialect ----------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Xilinx Inc. +// +//===----------------------------------------------------------------------===// +// This file defines the AIE1 vector dialect. 
+//===----------------------------------------------------------------------===// + +#ifndef AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1DIALECT_H +#define AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1DIALECT_H + +namespace xilinx { +namespace aievec { +namespace aie1 { + +class AIEVecAIE1Dialect; + +} // end namespace aie1 +} // end namespace aievec +} // end namespace xilinx + +#define GET_OP_CLASSES +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1OpsDialect.h.inc" + +#endif // AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1DIALECT_H diff --git a/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.td b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.td new file mode 100644 index 0000000000..3fd2c4131a --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.td @@ -0,0 +1,24 @@ +//===- AIEVecAIE1Dialect.td - AIE1 vector op definitions---*- tablegen -*-====// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// Defines AIE1 vector dialect. 
+//===----------------------------------------------------------------------===// + +#ifndef AIEVEC_AIE1_DIALECT +#define AIEVEC_AIE1_DIALECT + +include "mlir/IR/OpBase.td" + +def AIEVec_AIE1_Dialect : Dialect { + let name = "aievec_aie1"; + let summary = "Types and operations for AIE1 vector dialect"; + let cppNamespace = "::xilinx::aievec::aie1"; +} + +#endif // AIEVEC_AIE1_DIALECT diff --git a/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h new file mode 100644 index 0000000000..aacf88010c --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h @@ -0,0 +1,25 @@ +//===- AIEVecAIE1Ops.h - AIE1 Vector Dialect and Operations -----*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This file defines the AIE1 vector dialect and the operations. 
+//===----------------------------------------------------------------------===// + +#ifndef AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1OPS_H +#define AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1OPS_H + +#include "mlir/Bytecode/BytecodeOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +#include "AIEVecAIE1Dialect.h" + +#define GET_OP_CLASSES +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h.inc" + +#endif // AIE_DIALECT_AIEVEC_AIE1_IR_AIEVECAIE1OPS_H diff --git a/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.td b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.td new file mode 100644 index 0000000000..33a2e95996 --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.td @@ -0,0 +1,253 @@ +//===- AIEVecAIE1Ops.td - AIE1 vector op definitions ------*- tablegen -*-====// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// Defines AIE1 vector operations. +//===----------------------------------------------------------------------===// + +#ifndef AIEVEC_AIE1_OPS +#define AIEVEC_AIE1_OPS + +include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.td" + +include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +// Base class for AIE dialect ops. +class AIEVecAIE1_Op traits = []> : + Op { + // For every AIE vector op, there needs to be a: + // * void ${C++ class of Op}::print(OpAsmPrinter &p) + // * LogicalResult ${C++ class of Op}::verify() + // * ParseResult ${C++ class of Op}::parse(OpAsmParser &parser, + // OperationState &result) + // functions. 
+ let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + +def AIEVecAIE1_AddOp: + AIEVecAIE1_Op<"add", [ + Pure + ]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, + DefaultValuedStrAttr:$xstart, + DefaultValuedStrAttr:$xoffsets, + DefaultValuedStrAttr:$xoffsets_hi, + DefaultValuedStrAttr:$xsquare, + DefaultValuedStrAttr:$zstart, + DefaultValuedStrAttr:$zoffsets, + DefaultValuedStrAttr:$zoffsets_hi, + DefaultValuedStrAttr:$zsquare)>, + Results<(outs AnyVector:$result)> { + let summary = "AIE1 vector add"; + let description = [{ + AMD-specific advanced add operation that adds two 1-D vectors + with lane selection. The vector sizes are at least 256 bits. + `$result = `$lhs + $rhs`. + }]; + let extraClassDeclaration = [{ + // Get the attributes + llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXstart() : getZstart(); } + llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsets() : getZoffsets(); } + llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } + llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXsquare() : getZsquare(); } + // Get the attribute names + llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstart" : "zstart"; } + llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets" : "zoffsets"; } + llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } + llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? 
"xsquare" : "zsquare"; } + }]; +} + +def AIEVecAIE1_SubOp: + AIEVecAIE1_Op<"sub", [ + Pure + ]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, + DefaultValuedStrAttr:$xstart, + DefaultValuedStrAttr:$xoffsets, + DefaultValuedStrAttr:$xoffsets_hi, + DefaultValuedStrAttr:$xsquare, + DefaultValuedStrAttr:$zstart, + DefaultValuedStrAttr:$zoffsets, + DefaultValuedStrAttr:$zoffsets_hi, + DefaultValuedStrAttr:$zsquare)>, + Results<(outs AnyVector:$result)> { + let summary = "AIE1 vector subtract"; + let description = [{ + AMD-specific advanced sub operation that subtracts two 1-D vectors + with lane selection. The vector sizes are at least 256 bits. + `$result = `$lhs - $rhs`. + }]; + let extraClassDeclaration = [{ + // Get the attributes + llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXstart() : getZstart(); } + llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsets() : getZoffsets(); } + llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } + llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXsquare() : getZsquare(); } + // Get the attribute names + llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstart" : "zstart"; } + llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets" : "zoffsets"; } + llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } + llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? 
"xsquare" : "zsquare"; } + }]; +} + +def AIEVecAIE1_MulOp: + AIEVecAIE1_Op<"mul", [ + Pure + ]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, + DefaultValuedStrAttr:$xstart, + DefaultValuedStrAttr:$xoffsets, + DefaultValuedStrAttr:$xoffsets_hi, + DefaultValuedStrAttr:$xstep, + DefaultValuedStrAttr:$xsquare, + DefaultValuedStrAttr:$zstart, + DefaultValuedStrAttr:$zoffsets, + DefaultValuedStrAttr:$zoffsets_hi, + DefaultValuedStrAttr:$zstep, + DefaultValuedStrAttr:$zsquare)>, + Results<(outs AnyVector:$result)> { + let summary = "AIE vector multiply"; + let description = [{ + AMD-specific multiply operation that multiplies two 1-D vectors. + The vector sizes are at least 256 bits, and the left operand vector + is at least twice the size of right operand vector. For integers, the + lhs and rhs are 8/16/32 bits, and result is a 48-bit or 80-bit accumulator. + `$result = `$lhs * $rhs`. + }]; + let builders = [ + OpBuilder<(ins "mlir::Value":$lhs, "mlir::Value":$rhs, "mlir::Type":$accType, + "llvm::StringRef":$xstart, + "llvm::StringRef":$xoffsets, "llvm::StringRef":$xoffsets_hi, + "llvm::StringRef":$xstep, "llvm::StringRef":$xsquare, + "llvm::StringRef":$zstart, + "llvm::StringRef":$zoffsets, "llvm::StringRef":$zoffsets_hi, + "llvm::StringRef":$zstep, "llvm::StringRef":$zsquare), + [{build($_builder, $_state, accType, lhs, rhs, + xstart, xoffsets, xoffsets_hi, + xstep, xsquare, + zstart, zoffsets, zoffsets_hi, + zstep, zsquare);}]> + ]; + let extraClassDeclaration = [{ + // Get the attributes + llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXstart() : getZstart(); } + llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsets() : getZoffsets(); } + llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } + llvm::StringRef getStep(int idx) { assert(idx==0 || idx==1); + return idx==0 ? 
getXstep() : getZstep(); } + llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXsquare() : getZsquare(); } + // Get the attribute names + llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstart" : "zstart"; } + llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets" : "zoffsets"; } + llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } + llvm::StringRef getStepAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstep" : "zstep"; } + llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xsquare" : "zsquare"; } + }]; +} + +def AIEVecAIE1_FMAOp : + AIEVecAIE1_Op<"mac", [ + Pure + ]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc, + DefaultValuedStrAttr:$xstart, + DefaultValuedStrAttr:$xoffsets, + DefaultValuedStrAttr:$xoffsets_hi, + DefaultValuedStrAttr:$xstep, + DefaultValuedStrAttr:$xsquare, + DefaultValuedStrAttr:$zstart, + DefaultValuedStrAttr:$zoffsets, + DefaultValuedStrAttr:$zoffsets_hi, + DefaultValuedStrAttr:$zstep, + DefaultValuedStrAttr:$zsquare, + DefaultValuedAttr:$fmsub)>, + Results<(outs AnyVector:$result)> { + let summary = "AIE vector fused multiply-add"; + let description = [{ + AMD-specific multiply-add operation. It multiplies two 1-D vectors, + and adds the result to an accumulator. The vector sizes are at least + 256 bits, and the left operand vector is at least twice the size of + right operand vector. For integers, the lhs and rhs are 8/16/32 bits; + the result and acc are 48-bit or 80-bit accumulator. + `$result = `$lhs * $rhs + $acc`. + Note: the same operator can be used as fmsub operator by setting the + 'fmsub' bool to true. 
+ }]; + let builders = [ + OpBuilder<(ins "mlir::Value":$lhs, "mlir::Value":$rhs, "mlir::Value":$acc, + "llvm::StringRef":$xstart, + "llvm::StringRef":$xoffsets, "llvm::StringRef":$xoffsets_hi, + "llvm::StringRef":$xstep, "llvm::StringRef":$xsquare, + "llvm::StringRef":$zstart, + "llvm::StringRef":$zoffsets, "llvm::StringRef":$zoffsets_hi, + "llvm::StringRef":$zstep, "llvm::StringRef":$zsquare, + "bool":$fmsub), + [{build($_builder, $_state, acc.getType(), lhs, rhs, acc, + xstart, xoffsets, xoffsets_hi, + xstep, xsquare, + zstart, zoffsets, zoffsets_hi, + zstep, zsquare, + fmsub);}]> + ]; + let extraClassDeclaration = [{ + // Get the attributes + llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXstart() : getZstart(); } + llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsets() : getZoffsets(); } + llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } + llvm::StringRef getStep(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXstep() : getZstep(); } + llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); + return idx==0 ? getXsquare() : getZsquare(); } + // Get the attribute names + llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstart" : "zstart"; } + llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets" : "zoffsets"; } + llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } + llvm::StringRef getStepAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? "xstep" : "zstep"; } + llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); + return idx==0 ? 
"xsquare" : "zsquare"; } + llvm::StringRef getSubAttrName() { return "fmsub"; } + }]; +} + +#endif // AIEVEC_AIE1_OPS diff --git a/include/aie/Dialect/AIEVec/AIE1/IR/CMakeLists.txt b/include/aie/Dialect/AIEVec/AIE1/IR/CMakeLists.txt new file mode 100644 index 0000000000..35c2f5c863 --- /dev/null +++ b/include/aie/Dialect/AIEVec/AIE1/IR/CMakeLists.txt @@ -0,0 +1,9 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates + +add_mlir_dialect(AIEVecAIE1Ops aievec_aie1) +add_mlir_doc(AIEVecAIE1Ops AIEVecAIE1Dialect ./ -gen-dialect-doc -dialect=aievec_aie1) diff --git a/include/aie/Dialect/AIEVec/AIEVecUtils.h b/include/aie/Dialect/AIEVec/AIEVecUtils.h index c3ed10d36a..2a6984560f 100644 --- a/include/aie/Dialect/AIEVec/AIEVecUtils.h +++ b/include/aie/Dialect/AIEVec/AIEVecUtils.h @@ -13,6 +13,7 @@ #ifndef AIE_DIALECT_AIEVEC_AIEVECUTILS_H #define AIE_DIALECT_AIEVEC_AIEVECUTILS_H +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" #include "aie/Dialect/AIEVec/IR/AIEVecTypes.h" @@ -70,7 +71,8 @@ inline int32_t getVectorSizeInBits(mlir::VectorType type) { // Return true if this is an operation defined in AIE dialect inline bool isAIEOp(mlir::Operation *op) { - return llvm::isa(op->getDialect()); + return llvm::isa( + op->getDialect()); } // Determine the output type for a vector operation based on whether diff --git a/include/aie/Dialect/AIEVec/CMakeLists.txt b/include/aie/Dialect/AIEVec/CMakeLists.txt index e24d5879ec..c2c27d9eac 100644 --- a/include/aie/Dialect/AIEVec/CMakeLists.txt +++ b/include/aie/Dialect/AIEVec/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(Analysis) add_subdirectory(IR) add_subdirectory(Transforms) add_subdirectory(TransformOps) 
+add_subdirectory(AIE1) diff --git a/include/aie/Dialect/AIEVec/IR/AIEVecOps.td b/include/aie/Dialect/AIEVec/IR/AIEVecOps.td index 43395b47e0..b2f2a38d95 100644 --- a/include/aie/Dialect/AIEVec/IR/AIEVecOps.td +++ b/include/aie/Dialect/AIEVec/IR/AIEVecOps.td @@ -34,47 +34,6 @@ class AIEVec_Op traits = []> : let hasVerifier = 1; } -def AIEVec_AddOp: - AIEVec_Op<"add", [ - Pure - ]>, - Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, - DefaultValuedStrAttr:$xstart, - DefaultValuedStrAttr:$xoffsets, - DefaultValuedStrAttr:$xoffsets_hi, - DefaultValuedStrAttr:$xsquare, - DefaultValuedStrAttr:$zstart, - DefaultValuedStrAttr:$zoffsets, - DefaultValuedStrAttr:$zoffsets_hi, - DefaultValuedStrAttr:$zsquare)>, - Results<(outs AnyVector:$result)> { - let summary = "AIE vector add"; - let description = [{ - AMD-specific advanced add operation that adds two 1-D vectors - with lane selection. The vector sizes are at least 256 bits. - `$result = `$lhs + $rhs`. - }]; - let extraClassDeclaration = [{ - // Get the attributes - llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXstart() : getZstart(); } - llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsets() : getZoffsets(); } - llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } - llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXsquare() : getZsquare(); } - // Get the attribute names - llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstart" : "zstart"; } - llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets" : "zoffsets"; } - llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } - llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? 
"xsquare" : "zsquare"; } - }]; -} def AIEVec_AddElemOp: AIEVec_Op<"add_elem", [ @@ -92,47 +51,6 @@ def AIEVec_AddElemOp: let hasVerifier = 0; } -def AIEVec_SubOp: - AIEVec_Op<"sub", [ - Pure - ]>, - Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, - DefaultValuedStrAttr:$xstart, - DefaultValuedStrAttr:$xoffsets, - DefaultValuedStrAttr:$xoffsets_hi, - DefaultValuedStrAttr:$xsquare, - DefaultValuedStrAttr:$zstart, - DefaultValuedStrAttr:$zoffsets, - DefaultValuedStrAttr:$zoffsets_hi, - DefaultValuedStrAttr:$zsquare)>, - Results<(outs AnyVector:$result)> { - let summary = "AIE vector subtract"; - let description = [{ - AMD-specific advanced sub operation that subtracts two 1-D vectors - with lane selection. The vector sizes are at least 256 bits. - `$result = `$lhs - $rhs`. - }]; - let extraClassDeclaration = [{ - // Get the attributes - llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXstart() : getZstart(); } - llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsets() : getZoffsets(); } - llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } - llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXsquare() : getZsquare(); } - // Get the attribute names - llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstart" : "zstart"; } - llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets" : "zoffsets"; } - llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } - llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? 
"xsquare" : "zsquare"; } - }]; -} def AIEVec_SubElemOp: AIEVec_Op<"sub_elem", [ @@ -150,76 +68,6 @@ def AIEVec_SubElemOp: let hasVerifier = 0; } -def AIEVec_FMAOp : - AIEVec_Op<"mac", [ - Pure - ]>, - Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc, - DefaultValuedStrAttr:$xstart, - DefaultValuedStrAttr:$xoffsets, - DefaultValuedStrAttr:$xoffsets_hi, - DefaultValuedStrAttr:$xstep, - DefaultValuedStrAttr:$xsquare, - DefaultValuedStrAttr:$zstart, - DefaultValuedStrAttr:$zoffsets, - DefaultValuedStrAttr:$zoffsets_hi, - DefaultValuedStrAttr:$zstep, - DefaultValuedStrAttr:$zsquare, - DefaultValuedAttr:$fmsub)>, - Results<(outs AnyVector:$result)> { - let summary = "AIE vector fused multiply-add"; - let description = [{ - AMD-specific multiply-add operation. It multiplies two 1-D vectors, - and adds the result to an accumulator. The vector sizes are at least - 256 bits, and the left operand vector is at least twice the size of - right operand vector. For integers, the lhs and rhs are 8/16/32 bits; - the result and acc are 48-bit or 80-bit accumulator. - `$result = `$lhs * $rhs + $acc`. - Note: the same operator can be used as fmsub operator by setting the - 'fmsub' bool to true. - }]; - let builders = [ - OpBuilder<(ins "mlir::Value":$lhs, "mlir::Value":$rhs, "mlir::Value":$acc, - "llvm::StringRef":$xstart, - "llvm::StringRef":$xoffsets, "llvm::StringRef":$xoffsets_hi, - "llvm::StringRef":$xstep, "llvm::StringRef":$xsquare, - "llvm::StringRef":$zstart, - "llvm::StringRef":$zoffsets, "llvm::StringRef":$zoffsets_hi, - "llvm::StringRef":$zstep, "llvm::StringRef":$zsquare, - "bool":$fmsub), - [{build($_builder, $_state, acc.getType(), lhs, rhs, acc, - xstart, xoffsets, xoffsets_hi, - xstep, xsquare, - zstart, zoffsets, zoffsets_hi, - zstep, zsquare, - fmsub);}]> - ]; - let extraClassDeclaration = [{ - // Get the attributes - llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); - return idx==0 ? 
getXstart() : getZstart(); } - llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsets() : getZoffsets(); } - llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } - llvm::StringRef getStep(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXstep() : getZstep(); } - llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXsquare() : getZsquare(); } - // Get the attribute names - llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstart" : "zstart"; } - llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets" : "zoffsets"; } - llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } - llvm::StringRef getStepAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstep" : "zstep"; } - llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xsquare" : "zsquare"; } - llvm::StringRef getSubAttrName() { return "fmsub"; } - }]; -} def AIEVec_FMAElemOp : AIEVec_Op<"mac_elem", [ @@ -248,69 +96,6 @@ def AIEVec_FMAElemOp : }]; } -def AIEVec_MulOp: - AIEVec_Op<"mul", [ - Pure - ]>, - Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, - DefaultValuedStrAttr:$xstart, - DefaultValuedStrAttr:$xoffsets, - DefaultValuedStrAttr:$xoffsets_hi, - DefaultValuedStrAttr:$xstep, - DefaultValuedStrAttr:$xsquare, - DefaultValuedStrAttr:$zstart, - DefaultValuedStrAttr:$zoffsets, - DefaultValuedStrAttr:$zoffsets_hi, - DefaultValuedStrAttr:$zstep, - DefaultValuedStrAttr:$zsquare)>, - Results<(outs AnyVector:$result)> { - let summary = "AIE vector multiply"; - let description = [{ - AMD-specific multiply operation that multiplies two 1-D vectors. - The vector sizes are at least 256 bits, and the left operand vector - is at least twice the size of right operand vector. 
For integers, the - lhs and rhs are 8/16/32 bits, and result is a 48-bit or 80-bit accumulator. - `$result = `$lhs * $rhs`. - }]; - let builders = [ - OpBuilder<(ins "mlir::Value":$lhs, "mlir::Value":$rhs, "mlir::Type":$accType, - "llvm::StringRef":$xstart, - "llvm::StringRef":$xoffsets, "llvm::StringRef":$xoffsets_hi, - "llvm::StringRef":$xstep, "llvm::StringRef":$xsquare, - "llvm::StringRef":$zstart, - "llvm::StringRef":$zoffsets, "llvm::StringRef":$zoffsets_hi, - "llvm::StringRef":$zstep, "llvm::StringRef":$zsquare), - [{build($_builder, $_state, accType, lhs, rhs, - xstart, xoffsets, xoffsets_hi, - xstep, xsquare, - zstart, zoffsets, zoffsets_hi, - zstep, zsquare);}]> - ]; - let extraClassDeclaration = [{ - // Get the attributes - llvm::StringRef getStart(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXstart() : getZstart(); } - llvm::StringRef getOffset(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsets() : getZoffsets(); } - llvm::StringRef getOffsetHi(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXoffsetsHi() : getZoffsetsHi(); } - llvm::StringRef getStep(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXstep() : getZstep(); } - llvm::StringRef getSquare(int idx) { assert(idx==0 || idx==1); - return idx==0 ? getXsquare() : getZsquare(); } - // Get the attribute names - llvm::StringRef getStartAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstart" : "zstart"; } - llvm::StringRef getOffsetAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets" : "zoffsets"; } - llvm::StringRef getOffsetHiAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xoffsets_hi" : "zoffsets_hi"; } - llvm::StringRef getStepAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? "xstep" : "zstep"; } - llvm::StringRef getSquareAttrName(int idx) { assert(idx==0 || idx==1); - return idx==0 ? 
"xsquare" : "zsquare"; } - }]; -} def AIEVec_MulElemOp: AIEVec_Op<"mul_elem", [ diff --git a/include/aie/Dialect/AIEVec/Transforms/Passes.h b/include/aie/Dialect/AIEVec/Transforms/Passes.h index 8e18fd7192..2b363b23c2 100644 --- a/include/aie/Dialect/AIEVec/Transforms/Passes.h +++ b/include/aie/Dialect/AIEVec/Transforms/Passes.h @@ -14,6 +14,7 @@ #ifndef AIE_DIALECT_AIEVEC_TRANSFORMS_PASSES_H #define AIE_DIALECT_AIEVEC_TRANSFORMS_PASSES_H +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.h" #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h" #include "mlir/Pass/Pass.h" diff --git a/include/aie/Dialect/AIEVec/Transforms/Passes.td b/include/aie/Dialect/AIEVec/Transforms/Passes.td index fdca06ea22..e0e0f3c52e 100644 --- a/include/aie/Dialect/AIEVec/Transforms/Passes.td +++ b/include/aie/Dialect/AIEVec/Transforms/Passes.td @@ -22,6 +22,7 @@ def AIEVectorize : Pass<"aie-vectorize", "mlir::ModuleOp"> { let dependentDialects = [ "mlir::affine::AffineDialect", "xilinx::aievec::AIEVecDialect", + "xilinx::aievec::aie1::AIEVecAIE1Dialect", "mlir::arith::ArithDialect", "mlir::memref::MemRefDialect", "mlir::scf::SCFDialect", diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 88eb70927f..29077926cf 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -20,6 +20,7 @@ include "mlir/IR/EnumAttr.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/Interfaces/CallInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/CommonAttrConstraints.td" def AIEX_Dialect : Dialect { let name = "aiex"; @@ -463,14 +464,33 @@ def AIE_SelectOp: AIEX_Op<"select", []>, Results<(outs Index)> { ]; } +def AIE_RuntimeSequenceOp : AIEX_Op<"runtime_sequence", [NoTerminator, HasParent<"AIE::DeviceOp">]> { + let summary = "Program the configuration co-processor of the AI Engine array"; + let description = [{ + Instructions in this operation allow for runtime (re-)configuration of the AI Engine 
array, such as configuring data movement buffer descriptors. + These instructions will execute on the configuration co-processor of the AI Engine array. + + Typically, these instructions include configuring the data transfers between host and AIE array on the shims. + The input arguments are arguments passed in from the host at kernel invocation time. This may include buffers on the host. + }]; + let arguments = ( + ins OptionalAttr:$sym_name + ); + let regions = (region + AnyRegion:$body + ); + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; +} + def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ AttrSizedOperandSegments, MyOffsetSizeAndStrideOpInterface ]> { - let summary = "half dma operator"; + let summary = "half DMA operator"; let description = [{ - An nd half dma operator. + An n-dimensional half DMA operator. Programs a DMA on coordinates (`x`, `y`) to access a memory `memref` with an access pattern specified by `offsets`, `sizes` and `strides` or `static_offsets`, `static_sizes` @@ -480,6 +500,53 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ operation should issue a token which can be received and read for synchronization purposes. This `issue_token` attribute is set to `false` by default for `MM2S` for backward compatibility and **is always set to true for** `S2MM` channels. + + #### `metadata` -- Specifying Tile, Channel, Direction and Linking a `dma_memcpy_nd` to its Other Half + + The `metadata` attribute must point to a symbol referencing a + [`aie.shim_dma_allocation` operation](AIE.html#aiedma_bd-xilinxaiedmabdop). + The tile coordinates of the DMA to configure, the channel number and the direction (`MM2S` or `S2MM`) are taken from this operation. + + To connect the DMA to its other half (i.e. a `MM2S` DMA to its receiving end and a `S2MM` to the sending end), + the user must configure a flow (`aie.flow`) between the tile and channel referenced in the `aie.shim_dma_allocation` and the corresponding other end. 
+ + When using ObjectFIFOs, the `aie.shim_dma_allocation` operations and the `aie.flows` are generated automatically. + The symbol of the `aie.objectfifo` (create) operation can be used directly in `metadata` in this case. + + #### Notes on Synchronization and Reusing Buffer Descriptor IDs + + When the `dma_memcpy_nd` operation executes, it immediately reprograms the buffer descriptor with ID `bd_id` on tile (`x`, `y`), even if that buffer descriptor is currently executing. + Without proper synchronization, this inevitably leads to nondeterministic results. + + Programming a buffer descriptor that is not currently executing is harmless. + Thus, the first `dma_memcpy_nd` call for each `bd_id` requires no synchronization. + + However, if you wish to later re-use a `bd_id` on the same tile, you must wait for the previous buffer descriptor to complete. + The `sync` or `dma_wait` operations can be used for this. + + `sync` blocks until it receives a _task completion token_ (TCT). + To properly synchronize, you must thus configure your BD to issue a TCT using the `issue_token` attribute, then wait on that token before reusing the BD. + + `dma_wait` is a convenience operation that lowers to the corresponding `sync` operation for the referenced symbol. + + Note that if you have multiple concurrently running BDs and you can reason one BD will always complete after all others, it is not strictly necessary to issue and wait on the TC token for every BD. + For example, if you have input and output BDs on the shim, and you know the cores will only push output onto the output BD after the input BDs have completed, it may be sufficient to synchronize only on the output BD before reusing input BDs. + + #### Data Layout Transformations + + The `sizes` and `strides` attributes describe a data layout transformation to be performed by the DMA. 
+ These transformations are described in more depth in the documentation for the + [`aie.dma_bd` operation](AIE.html#aiedma_bd-xilinxaiedmabdop). + Note that the syntax here differs from that of the `dma_bd` operation: + offsets and strides are given as separate arrays instead of tuples. + + The `offsets` array is used to calculate a static offset into the memref. + Each offset in the array is understood in relation to the shape of the memref; + the lowest-dimension `offset` is a direct offset in units of memref element type, and the higher dimensions are multiplied by the size of the memref in those dimensions. + Note that this is for convenience of the user only. + The hardware only supports a single static offset, and this offset is calculated at compile time. + Thus, all offsets can be equivalently expressed with the lowest dimension only. + }]; let arguments = ( @@ -549,12 +616,16 @@ def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> { let summary = "Blocking operation to wait for a DMA to complete execution."; let description = [{ The NpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution - and issues a task-complete-token. + and issues a task-complete-token (TCT). + + `symbol` is a reference to a `aie.shim_dma_allocation`, which contains information about the column, channel and channel direction on which to wait for a TCT. + The `aie.shim_dma_allocation` may be generated from an ObjectFIFO, in which case you can directly pass the ObjectFIFO symbol reference. + `npu.dma_wait` will be lowered to the corresponding `npu.sync` operation using the information from `symbol`. Example: ```mlir ... - aie.objectfifo @out0(%tile_0_1, {{ "{%tile_0_0}" }}, 4 : i32) : !aie.objectfifo> + aie.objectfifo @out0(%tile_0_1, {% raw %}{%tile_0_0}{% endraw %}, 4 : i32) : !aie.objectfifo> ... aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32> ... 
@@ -632,6 +703,22 @@ def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { }]; } +// BLOCKWRITE +def AIE_NpuBlockWriteOp: AIEX_Op<"npu.blockwrite", []> { + let summary = "blockwrite operator"; + let arguments = ( + ins AnyMemRef:$data, + UI32Attr:$address + ); + let results = (outs ); + let assemblyFormat = [{ + `(` $data `)` attr-dict `:` type($data) + }]; + let description = [{ + blockwrite operator + }]; +} + // OP_SYNC def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { let summary = "sync operator"; @@ -648,7 +735,12 @@ def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { attr-dict }]; let description = [{ - tct sync operator + The sync operation blocks execution of the instruction stream until a task-complete token (TCT) is received on `column`, `row`, channel `channel`, direction `direction` (where `0` is `S2MM` and `1` is `MM2S`). + + #### Troubleshooting + + If this operation appears to deadlock, ensure that at least one buffer descriptor is configured to issue a TCT on the channel you expect. + By default, `dma_memcpy_nd` operations only issue tokens for `S2MM` channels, and `issue_token` must be set to `true` to issue tokens for `MM2S` channels. 
}]; } diff --git a/include/aie/InitialAllDialect.h b/include/aie/InitialAllDialect.h index 0338cd0645..67c91efb53 100644 --- a/include/aie/InitialAllDialect.h +++ b/include/aie/InitialAllDialect.h @@ -16,6 +16,7 @@ #include "aie/Dialect/ADF/ADFDialect.h" #include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Dialect.h" #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h" #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Dialect/XLLVM/XLLVMDialect.h" @@ -31,6 +32,7 @@ inline void registerAllDialects(mlir::DialectRegistry ®istry) { ADF::ADFDialect, AIE::AIEDialect, aievec::AIEVecDialect, + aievec::aie1::AIEVecAIE1Dialect, AIEX::AIEXDialect, xllvm::XLLVMDialect >(); diff --git a/lib/CAPI/CMakeLists.txt b/lib/CAPI/CMakeLists.txt index 0d938dddee..342ea0ac19 100644 --- a/lib/CAPI/CMakeLists.txt +++ b/lib/CAPI/CMakeLists.txt @@ -16,6 +16,7 @@ add_mlir_public_c_api_library(AIECAPI AIEXTransforms AIEXUtils MLIRAIEVecDialect + MLIRAIEVecAIE1Dialect MLIRAIEVecToLLVM MLIRAIEVecTransforms MLIRAIEVecUtils diff --git a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp index 95a329d974..2f93a04702 100644 --- a/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp +++ b/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp @@ -12,6 +12,7 @@ #include "../PassDetail.h" #include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h" +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/AIEVecUtils.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" #include "aie/Dialect/XLLVM/XLLVMDialect.h" @@ -117,6 +118,42 @@ struct BufferParams { uint32_t square; }; +static VectorType getFlattenedVectorType(VectorType vecTy) { + if (vecTy.getRank() == 1) + return vecTy; + auto shape = vecTy.getShape(); + return VectorType::get( + {std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>())}, + vecTy.getElementType()); +} + +// sgn_x: Sign mask of matrix X. 
If it is one matrix X is interpreted as +// signed, else it is treated as unsigned. +// sgn_y: Sign mask of matrix Y. If it is one matrix Y is interpreted as +// signed, else it is treated as unsigned. +// amode/bmode/variant: config acc width, mul precision, and mul mode +// zero_acc: Zeroing of acc1. If it is one then acc1 is zeroed. +// shift16: Shift mask of acc1. If a bit is set the <<16 operation will be +// executed on acc1. +// sub_mul: Negation mask of the matrix multiplication result. If it is +// one the result of the operation will be negated. +// sub_acc1: Negation mask of acc1. If it is one acc1 will be negated. +// sub_acc2: Negation mask of acc2. If it is one acc2 will be negated. +// sub_mask: Negation mask of complex multiplications. Negates a term of a +// complex multiplication. +static inline int aiev2_vmac_compute_control(int sgn_x, int sgn_y, int amode, + int bmode, int variant, + int zero_acc, int shift16, + int sub_mul, int sub_acc1, + int sub_acc2, int sub_mask) { + return ((unsigned)sub_mask << 16) | ((unsigned)shift16 << 10) | + ((unsigned)sub_mul << 11) | ((unsigned)sub_acc1 << 12) | + ((unsigned)sub_acc2 << 13) | ((unsigned)amode << 1) | + ((unsigned)bmode << 3) | ((unsigned)variant << 5) | + (((unsigned)sgn_x << 9) | ((unsigned)sgn_y << 8)) | + ((unsigned)zero_acc << 0); +} + std::string getVectorTypeString(VectorType type, bool abbrev = false, bool acc = false) { std::stringstream ss; @@ -133,11 +170,11 @@ std::string getVectorTypeString(VectorType type, bool abbrev = false, std::string getMulOrFMAIntrinsicName(Operation *op) { std::string baseName; Value lhs, result; - if (auto mulOp = dyn_cast(op)) { + if (auto mulOp = dyn_cast(op)) { baseName = "mul"; lhs = mulOp.getLhs(); result = mulOp.getResult(); - } else if (auto fmaOp = dyn_cast(op)) { + } else if (auto fmaOp = dyn_cast(op)) { baseName = "mac"; lhs = fmaOp.getLhs(); result = fmaOp.getResult(); @@ -176,36 +213,39 @@ void encodeConf(uint32_t conf[2], const BufferParams &x, const 
BufferParams &z, conf[1] |= sub << 17; } -class AddOpConversion : public mlir::ConvertOpToLLVMPattern { +class AddOpConversion + : public mlir::ConvertOpToLLVMPattern { public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(aievec::AddOp op, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::AddOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { op.emitWarning() << "aie.add conversion is not implemented\n"; return failure(); } }; -class SubOpConversion : public mlir::ConvertOpToLLVMPattern { +class SubOpConversion + : public mlir::ConvertOpToLLVMPattern { public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(aievec::SubOp op, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::SubOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { op.emitWarning() << "aie.sub conversion is not implemented\n"; return failure(); } }; -class FMAOpConversion : public mlir::ConvertOpToLLVMPattern { +class FMAOpConversion + : public mlir::ConvertOpToLLVMPattern { public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(aievec::FMAOp op, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::FMAOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto module = op->getParentOfType(); MLIRContext *context = rewriter.getContext(); @@ -277,12 +317,13 @@ class FMAOpConversion : public mlir::ConvertOpToLLVMPattern { } }; -class MulOpConversion : public mlir::ConvertOpToLLVMPattern { +class MulOpConversion + : public mlir::ConvertOpToLLVMPattern { public: - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(aievec::MulOp op, OpAdaptor adaptor, + 
matchAndRewrite(aievec::aie1::MulOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto module = op->getParentOfType(); MLIRContext *context = rewriter.getContext(); @@ -382,33 +423,6 @@ class MulElemOpConversion int conf; }; - // sgn_x: Sign mask of matrix X. If it is one matrix X is interpreted as - // signed, else it treated as unsigned. - // sgn_y: Sign mask of matrix Y. If it is one matrix Y is interpreted as - // signed, else it treated as unsigned. - // amode/bmode/variant: config acc width, mul precision, and mul mode - // zero_acc: Zeroing of acc1. If it is one then acc1 is zeroed. - // shift16: Shift mask of acc1. If a bit is set the <<16 operation will be - // executed on acc1. - // sub_mul: Negation mask of the matrix multiplication result. If it is - // one the result of the operation will be negated. - // sub_acc1: Negation mask of acc1. If it is one acc1 will be negated. - // sub_acc2: Negation mask of acc2. If it is one acc2 will be negated. - // sub_mask: Negation mask of complex multiplications. Negates a term of a - // complex multiplication. 
- static int aiev2_mul_mac_compute_control(int sgn_x, int sgn_y, int amode, - int bmode, int variant, int zero_acc, - int shift16, int sub_mul, - int sub_acc1, int sub_acc2, - int sub_mask) { - return ((unsigned)sub_mask << 16) | ((unsigned)shift16 << 10) | - ((unsigned)sub_mul << 11) | ((unsigned)sub_acc1 << 12) | - ((unsigned)sub_acc2 << 13) | ((unsigned)amode << 1) | - ((unsigned)bmode << 3) | ((unsigned)variant << 5) | - (((unsigned)sgn_x << 9) | ((unsigned)sgn_y << 8)) | - ((unsigned)zero_acc << 0); - } - static DecodedMulElemOp decodeMulElemOp(OpAdaptor op) { auto lhs = op.getLhs(); auto lhsVecTy = cast(lhs.getType()); @@ -419,14 +433,14 @@ class MulElemOpConversion if (llvm::isa(lhsScaTy)) { if (lhsBitWidth == 8) { return {DecodedMulElemOp::Kind::I8_I8_I32_32x1x2x1, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/1, /*sgn_y=*/1, /*amode=*/0, /*bmode=*/1, /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0)}; } else if (lhsBitWidth == 16) { return {DecodedMulElemOp::Kind::I16_I16_I32_32x1x1x1, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/1, /*sgn_y=*/1, /*amode=*/0, /*bmode=*/3, /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, @@ -439,7 +453,7 @@ class MulElemOpConversion // Float types if (lhsBitWidth == 16) { return {DecodedMulElemOp::Kind::BF16_BF16_FP32_16x1x2x1, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/0, /*sgn_y=*/0, /*amode=*/2, /*bmode=*/3, /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, @@ -517,7 +531,7 @@ class MulElemOpConversion // MUL + 3 * MAC auto mulConfCst = rewriter.create( loc, rewriter.getI32Type(), - rewriter.getI32IntegerAttr(aiev2_mul_mac_compute_control( + rewriter.getI32IntegerAttr(aiev2_vmac_compute_control( /*sgn_x=*/1, /*sgn_y=*/1, /*amode=*/1, /*bmode=*/3, /*variant=*/2, /*zero_acc=*/0, /*shift16=*/0, 
/*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0))); @@ -551,19 +565,19 @@ class MulElemOpConversion auto acc64Val = mulConfOp.getResult(); acc64Val = createMacConfOp( SmallVector{a_hi, b_lo, acc64Val}, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/1, /*sgn_y=*/0, /*amode=*/1, /*bmode=*/3, /*variant=*/2, /*zero_acc=*/0, /*shift16=*/1, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0)); acc64Val = createMacConfOp( SmallVector{a_lo, b_hi, acc64Val}, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/0, /*sgn_y=*/1, /*amode=*/1, /*bmode=*/3, /*variant=*/2, /*zero_acc=*/0, /*shift16=*/0, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0)); acc64Val = createMacConfOp( SmallVector{a_lo, b_lo, acc64Val}, - aiev2_mul_mac_compute_control( + aiev2_vmac_compute_control( /*sgn_x=*/0, /*sgn_y=*/0, /*amode=*/1, /*bmode=*/3, /*variant=*/2, /*zero_acc=*/0, /*shift16=*/1, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0)); @@ -638,7 +652,7 @@ class MulElemOpConversion loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(0)); auto mscMacMulConfCst = rewriter.create( loc, rewriter.getI32Type(), - rewriter.getI32IntegerAttr(aiev2_mul_mac_compute_control( + rewriter.getI32IntegerAttr(aiev2_vmac_compute_control( /*sgn_x=*/0, /*sgn_y=*/0, /*amode=*/2, /*bmode=*/3, /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, /*sub_mask=*/0))); @@ -1972,10 +1986,53 @@ class FMAElemOpConversion using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(aievec::FMAElemOp op, OpAdaptor adaptor, + matchAndRewrite(aievec::FMAElemOp fmaOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - op.emitWarning() << "aie.mac_elem conversion is not implemented\n"; - return failure(); + auto loc = fmaOp.getLoc(); + auto lhs = adaptor.getLhs(); + auto rhs = adaptor.getRhs(); + auto acc = adaptor.getAcc(); + auto lhsTy = 
cast(lhs.getType()); + auto rhsTy = cast(rhs.getType()); + auto accTy = cast(acc.getType()); + auto flatLhsTy = getFlattenedVectorType(lhsTy); + auto flatRhsTy = getFlattenedVectorType(rhsTy); + auto flatAccTy = getFlattenedVectorType(accTy); + + // Flatten operands, if needed + if (lhsTy != flatLhsTy) + lhs = rewriter.create(loc, flatLhsTy, lhs); + if (rhsTy != flatRhsTy) + rhs = rewriter.create(loc, flatRhsTy, rhs); + if (accTy != flatAccTy) + acc = rewriter.create(loc, flatAccTy, acc); + + // Build vmac configuration constant + Type i32ty = rewriter.getI32Type(); + auto confCst = rewriter.create( + loc, i32ty, + rewriter.getI32IntegerAttr(aiev2_vmac_compute_control( + /*sgn_x=*/0, /*sgn_y=*/0, /*amode=*/2, /*bmode=*/3, + /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0))); + + // Insert vmac intrinsic + auto v32bf16Ty = VectorType::get({32}, rewriter.getBF16Type()); + auto v8i64Ty = VectorType::get({8}, rewriter.getI64Type()); + auto macIntrOp = rewriter.create( + loc, v8i64Ty, + forceCastOperandsToSignature(rewriter, loc, {lhs, rhs, acc, confCst}, + {v32bf16Ty, v32bf16Ty, v8i64Ty, i32ty})); + + // Recast/Reshape result + auto resVal = + forceCastValueToType(rewriter, loc, macIntrOp.getResult(), flatAccTy); + if (flatAccTy != accTy) + resVal = rewriter.create(loc, accTy, resVal); + + rewriter.replaceOp(fmaOp, resVal); + return success(); } }; @@ -2000,16 +2057,21 @@ class MatMulOpConversion auto accVecTy = cast(acc.getType()); if (isa(accVecTy.getElementType())) // <4x8xbf16> x <8x4xbf16> + <4x4xf32> - return {DecodedMatMulOp::Kind::BF16, lhs, rhs, acc, 28}; - - int signConf = 0; + return {DecodedMatMulOp::Kind::BF16, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/0, /*sgn_y=*/0, /*amode=*/2, /*bmode=*/3, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; + + int signX = 0, signY = 0; auto lhsVecTy = cast(lhs.getType()); auto 
lhsScaTy = cast(lhsVecTy.getElementType()); if (auto extSIOp = lhs.getDefiningOp()) { lhs = extSIOp.getIn(); lhsVecTy = cast(lhs.getType()); lhsScaTy = cast(lhsVecTy.getElementType()); - signConf |= (1 << 9); + signX = 1; } else if (auto extUIOp = lhs.getDefiningOp()) { lhs = extUIOp.getIn(); lhsVecTy = cast(lhs.getType()); @@ -2017,7 +2079,7 @@ class MatMulOpConversion } else { // NOTE: We're choosing 'signed' by default if (!lhsScaTy.isUnsigned()) - signConf |= (1 << 9); + signX = 1; } auto lhsShape = lhsVecTy.getShape(); @@ -2027,7 +2089,7 @@ class MatMulOpConversion rhs = extSIOp.getIn(); rhsVecTy = cast(rhs.getType()); rhsScaTy = cast(rhsVecTy.getElementType()); - signConf |= (1 << 8); + signY = 1; } else if (auto extUIOp = rhs.getDefiningOp()) { rhs = extUIOp.getIn(); rhsVecTy = cast(rhs.getType()); @@ -2035,7 +2097,7 @@ class MatMulOpConversion } else { // NOTE: We're choosing 'signed' by default if (!rhsScaTy.isUnsigned()) - signConf |= (1 << 8); + signY = 1; } unsigned lhsBitWidth = lhsScaTy.getWidth(); @@ -2046,18 +2108,42 @@ class MatMulOpConversion if (lhsBitWidth == 8) { if (rhsBitWidth == 4) { // <4x16xi8> x <16x8xi4> + <4x8xi32> - return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, signConf}; + return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/0, + /*bmode=*/0, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } else { // <4x8xi8> x <8x8xi8> + <4x8xi32> - return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, signConf | 8}; + return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/0, + /*bmode=*/1, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } } else { if (rhsBitWidth == 8) { // <4x4xi16> x <4x8xi8> + <4x8xi32> - return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, signConf 
| 16}; + return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/0, + /*bmode=*/2, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } else { // <4x2xi16> x <2x8xi16> + <4x8xi32> - return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, signConf | 2}; + return {DecodedMatMulOp::Kind::I32, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/0, + /*bmode=*/3, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } } } @@ -2066,29 +2152,46 @@ class MatMulOpConversion if (rhsBitWidth == 8) { if (lhsShape == ArrayRef({2, 8})) { // <2x8xi16> x <8x8xi8> + <2x8xi64> - return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, signConf | 18}; + return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/1, + /*bmode=*/2, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } // <4x8xi16> x <8x4xi8> + <4x4xi64> - return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, signConf | 50}; + return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/1, /*bmode=*/2, + /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } if (lhsShape == ArrayRef({2, 4})) { // <2x4xi16> x <4x8xi16> + <2x8xi64> - return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, signConf | 26}; + return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/1, /*bmode=*/3, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } // <4x4xi16> x <4x4xi16> + <4x4xi64> - return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, 
signConf | 58}; + return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/1, /*bmode=*/3, + /*variant=*/1, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } // <4x2xi32> x <2x4xi16> + <4x4xi64> - return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, signConf | 2}; - } - - static VectorType getFlattenedVectorType(VectorType vecTy) { - if (vecTy.getRank() == 1) - return vecTy; - auto shape = vecTy.getShape(); - return VectorType::get( - {std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>())}, - vecTy.getElementType()); + return {DecodedMatMulOp::Kind::I64, lhs, rhs, acc, + aiev2_vmac_compute_control( + /*sgn_x=*/signX, /*sgn_y=*/signY, /*amode=*/1, /*bmode=*/0, + /*variant=*/0, /*zero_acc=*/0, /*shift16=*/0, + /*sub_mul=*/0, /*sub_acc1=*/0, /*sub_acc2=*/0, + /*sub_mask=*/0)}; } LogicalResult @@ -2259,7 +2362,8 @@ struct ConvertAIEVecToLLVMPass aie2Fp32Emulation); LLVMConversionTarget target(getContext()); - target.addIllegalDialect(); + target.addIllegalDialect(); target.addLegalDialect(); if (failed(applyPartialConversion(getOperation(), target, diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 88fba24e49..db7d460ddf 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -1618,6 +1618,36 @@ LogicalResult DMABDOp::verify() { return emitOpError( "For <32b width datatypes, inner-most dim stride must be 1"); } + if (auto paddims = getPadDimensions(); paddims.has_value()) { + auto dims = getDimensions(); + if (!dims.has_value()) + return emitOpError() << "Padding requires n-d data layouts expressed as" + << " wrap(s) and stride(s)."; + if (dims->size() != paddims->size()) + return emitOpError() << "Mismatch number of dimensions between padding(s)" + << " and wrap(s) and stride(s)."; + if (!targetModel.isMemTile(parentTileId.col, parentTileId.row)) + return 
emitOpError() << "Padding is only supported by memtile dma bds."; + int actuallen = 1; + for (unsigned i = 0; i < paddims->size(); i++) { + auto dim = (*dims)[i]; + auto paddim = (*paddims)[i]; + actuallen *= paddim.getConstPadBefore() + paddim.getConstPadAfter() + + dim.getSize(); + if (actuallen > getLen()) + return emitOpError() << "Data exceeds len after padding."; + } + if ((paddims->back().getConstPadBefore() * + getBufferElementTypeWidthInBytes()) % + 4) + return emitOpError() << "Inner-most padding-before count must result in" + << " padding in 32-bit words."; + if ((paddims->back().getConstPadAfter() * + getBufferElementTypeWidthInBytes()) % + 4) + return emitOpError() << "Inner-most padding-after count must result in" + << " padding in 32-bit words."; + } if (targetModel.isMemTile(parentTileId.col, parentTileId.row) || targetModel.isCoreTile(parentTileId.col, parentTileId.row)) { if (auto baseAddr = getBufferOp().getAddress(); baseAddr.has_value()) { @@ -1761,8 +1791,9 @@ LogicalResult SwitchboxOp::verify() { for (auto m : mstrs) { for (auto s : slvs) { // Stream switch connection constraints - if (!isLegalTileConnection(tile, targetModel, m, s)) + if (!isLegalTileConnection(tile, targetModel, m, s)) { return amselOp->emitOpError("illegal stream switch connection"); + } } } } else if (isa(ops)) { diff --git a/lib/Dialect/AIE/IR/AIETargetModel.cpp b/lib/Dialect/AIE/IR/AIETargetModel.cpp index e8ec49b955..30f68d6a8f 100644 --- a/lib/Dialect/AIE/IR/AIETargetModel.cpp +++ b/lib/Dialect/AIE/IR/AIETargetModel.cpp @@ -242,15 +242,16 @@ AIE1TargetModel::getNumDestShimMuxConnections(int col, int row, uint32_t AIE1TargetModel::getNumSourceShimMuxConnections(int col, int row, WireBundle bundle) const { - if (isShimNOCTile(col, row)) + if (isShimNOCorPLTile(col, row)) switch (bundle) { case WireBundle::DMA: return 2; case WireBundle::NOC: return 4; case WireBundle::PLIO: + return 8; case WireBundle::South: - return 6; + return 6; // Connection to the south port of 
the stream switch default: return 0; } @@ -527,7 +528,7 @@ AIE2TargetModel::getNumDestShimMuxConnections(int col, int row, case WireBundle::PLIO: return 6; case WireBundle::South: - return 8; + return 8; // Connection to the south port of the stream switch default: return 0; } @@ -538,15 +539,16 @@ AIE2TargetModel::getNumDestShimMuxConnections(int col, int row, uint32_t AIE2TargetModel::getNumSourceShimMuxConnections(int col, int row, WireBundle bundle) const { - if (isShimNOCTile(col, row)) + if (isShimNOCorPLTile(col, row)) switch (bundle) { case WireBundle::DMA: return 2; case WireBundle::NOC: return 4; case WireBundle::PLIO: + return 8; case WireBundle::South: - return 6; + return 6; // Connection to the south port of the stream switch default: return 0; } diff --git a/lib/Dialect/AIE/Transforms/AIECoreToStandard.cpp b/lib/Dialect/AIE/Transforms/AIECoreToStandard.cpp index 7c1c079be6..d93f4d9f4e 100644 --- a/lib/Dialect/AIE/Transforms/AIECoreToStandard.cpp +++ b/lib/Dialect/AIE/Transforms/AIECoreToStandard.cpp @@ -15,6 +15,7 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" @@ -554,6 +555,7 @@ struct AIECoreToStandardPass : AIECoreToStandardBase { target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); + target.addLegalDialect(); target.addLegalOp(); RewritePatternSet patterns(&getContext()); diff --git a/lib/Dialect/AIE/Transforms/AIECreatePathFindFlows.cpp b/lib/Dialect/AIE/Transforms/AIECreatePathFindFlows.cpp index c008620a9c..a5118190e2 100644 --- a/lib/Dialect/AIE/Transforms/AIECreatePathFindFlows.cpp +++ b/lib/Dialect/AIE/Transforms/AIECreatePathFindFlows.cpp @@ -95,7 +95,8 @@ struct ConvertFlowsToInterconnect : OpConversionPattern { // if the flow 
(aka "net") for this FlowOp hasn't been processed yet, // add all switchbox connections to implement the flow - Switchbox srcSB = {srcCoords.col, srcCoords.row}; + SwitchboxNode srcSB = + analyzer.pathfinder->getSwitchboxNode({srcCoords.col, srcCoords.row}); if (PathEndPoint srcPoint = {srcSB, srcPort}; !analyzer.processedFlows[srcPoint]) { SwitchSettings settings = analyzer.flowSolutions[srcPoint]; @@ -104,8 +105,8 @@ struct ConvertFlowsToInterconnect : OpConversionPattern { SwitchboxOp swOp = analyzer.getSwitchbox(rewriter, curr.col, curr.row); int shimCh = srcChannel; // TODO: must reserve N3, N7, S2, S3 for DMA connections - if (curr == srcSB && - analyzer.getTile(rewriter, srcSB.col, srcSB.row).isShimNOCTile()) { + if (curr == srcSB && analyzer.getTile(rewriter, srcSB.col, srcSB.row) + .isShimNOCorPLTile()) { // shim DMAs at start of flows if (srcBundle == WireBundle::DMA) { shimCh = srcChannel == 0 @@ -125,13 +126,10 @@ struct ConvertFlowsToInterconnect : OpConversionPattern { srcBundle, srcChannel, WireBundle::North, shimCh); } else if (srcBundle == WireBundle::PLIO) { // PLIO at start of flows with mux - if (srcChannel == 2 || srcChannel == 3 || srcChannel == 6 || - srcChannel == 7) { // Only some PLIO requrie mux - ShimMuxOp shimMuxOp = analyzer.getShimMux(rewriter, srcSB.col); - addConnection( - rewriter, cast(shimMuxOp.getOperation()), - flowOp, srcBundle, srcChannel, WireBundle::North, shimCh); - } + ShimMuxOp shimMuxOp = analyzer.getShimMux(rewriter, srcSB.col); + addConnection(rewriter, + cast(shimMuxOp.getOperation()), flowOp, + srcBundle, srcChannel, WireBundle::North, shimCh); } } for (const auto &[bundle, channel] : setting.dsts) { @@ -146,7 +144,7 @@ struct ConvertFlowsToInterconnect : OpConversionPattern { bundle == WireBundle::NOC)) { shimCh = channel; if (analyzer.getTile(rewriter, curr.col, curr.row) - .isShimNOCTile()) { + .isShimNOCorPLTile()) { // shim DMAs at end of flows if (bundle == WireBundle::DMA) { shimCh = channel == 0 @@ -162,8 
+160,7 @@ struct ConvertFlowsToInterconnect : OpConversionPattern { addConnection( rewriter, cast(shimMuxOp.getOperation()), flowOp, WireBundle::North, shimCh, bundle, channel); - } else if (channel >= - 2) { // must be PLIO...only PLIO >= 2 require mux + } else if (bundle == WireBundle::PLIO) { ShimMuxOp shimMuxOp = analyzer.getShimMux(rewriter, curr.col); addConnection( rewriter, cast(shimMuxOp.getOperation()), @@ -359,7 +356,8 @@ bool AIEPathfinderPass::findPathToDest(SwitchSettings settings, TileID currTile, } int neighbourSourceChannel = currDestChannel; - for (const auto &[tile, setting] : settings) { + for (const auto &[sbNode, setting] : settings) { + TileID tile = {sbNode.col, sbNode.row}; if ((tile == neighbourTile) && (setting.src.bundle == neighbourSourceBundle) && (setting.src.channel == neighbourSourceChannel)) { @@ -415,23 +413,27 @@ void AIEPathfinderPass::runOnPacketFlow(DeviceOp device, OpBuilder &builder) { if (pktFlowOp->hasAttr("keep_pkt_header")) keepPktHeaderAttr[{destTile, destPort}] = StringAttr::get(Op.getContext(), "true"); - Switchbox srcSB = {srcCoords.col, srcCoords.row}; + SwitchboxNode srcSB = analyzer.pathfinder->getSwitchboxNode( + {srcCoords.col, srcCoords.row}); if (PathEndPoint srcPoint = {srcSB, srcPort}; !analyzer.processedFlows[srcPoint]) { SwitchSettings settings = analyzer.flowSolutions[srcPoint]; // add connections for all the Switchboxes in SwitchSettings for (const auto &[curr, setting] : settings) { for (const auto &[bundle, channel] : setting.dsts) { + TileID currTile = {curr.col, curr.row}; // reject false broadcast - if (!findPathToDest(settings, curr, bundle, channel, destCoords, - destPort.bundle, destPort.channel)) + if (!findPathToDest(settings, currTile, bundle, channel, + destCoords, destPort.bundle, + destPort.channel)) continue; Connect connect = {{setting.src.bundle, setting.src.channel}, {bundle, channel}}; - if (std::find(switchboxes[curr].begin(), switchboxes[curr].end(), + if 
(std::find(switchboxes[currTile].begin(), + switchboxes[currTile].end(), std::pair{connect, flowID}) == - switchboxes[curr].end()) - switchboxes[curr].push_back({connect, flowID}); + switchboxes[currTile].end()) + switchboxes[currTile].push_back({connect, flowID}); } } } @@ -918,161 +920,6 @@ void AIEPathfinderPass::runOnOperation() { runOnFlow(d, builder); if (clRoutePacket) runOnPacketFlow(d, builder); - - // If the routing violates architecture-specific routing constraints, then - // attempt to partially reroute. - const auto &targetModel = d.getTargetModel(); - std::vector problemConnects; - d.walk([&](ConnectOp connect) { - if (auto sw = connect->getParentOfType()) { - // Constraint: memtile stream switch constraints - if (auto tile = sw.getTileOp(); - tile.isMemTile() && - !targetModel.isLegalTileConnection( - tile.colIndex(), tile.rowIndex(), connect.getSourceBundle(), - connect.getSourceChannel(), connect.getDestBundle(), - connect.getDestChannel())) { - problemConnects.push_back( - {sw, connect.sourcePort(), connect.destPort()}); - } - } - }); - - d.walk([&](AMSelOp amsel) { - if (auto sw = amsel->getParentOfType()) { - std::vector mstrs; - std::vector slvs; - for (auto *user : amsel.getResult().getUsers()) { - if (auto s = dyn_cast(user)) { - auto pktRules = dyn_cast(s->getParentOp()); - slvs.push_back(pktRules); - } else if (auto m = dyn_cast(user)) - mstrs.push_back(m); - } - for (auto m : mstrs) { - for (auto s : slvs) { - if (auto tile = sw.getTileOp(); - tile.isMemTile() && - !targetModel.isLegalTileConnection( - tile.colIndex(), tile.rowIndex(), s.sourcePort().bundle, - s.sourcePort().channel, m.destPort().bundle, - m.destPort().channel)) - problemConnects.push_back({sw, s.sourcePort(), m.destPort()}); - } - } - } - }); - - for (SwConnection swConnect : problemConnects) { - if (!attemptFixupMemTileRouting(d, swConnect)) - return signalPassFailure(); - } -} - -bool AIEPathfinderPass::attemptFixupMemTileRouting( - DeviceOp &d, SwConnection 
&problemConnect) { - int northChannel; - int southChannel; - if (problemConnect.sourcePort.bundle == WireBundle::North && - problemConnect.destPort.bundle == WireBundle::South) { - northChannel = problemConnect.sourcePort.channel; - southChannel = problemConnect.destPort.channel; - } else if (problemConnect.sourcePort.bundle == WireBundle::South && - problemConnect.destPort.bundle == WireBundle::North) { - northChannel = problemConnect.destPort.channel; - southChannel = problemConnect.sourcePort.channel; - } else - return false; // Problem is not about n-s routing - - SwitchboxOp &swBox = problemConnect.sw; - - // Attempt to reroute northern channel and neighbouring sw - if (auto neighbourSw = - getSwitchbox(d, swBox.colIndex(), swBox.rowIndex() + 1)) { - WireBundle problemBundle = WireBundle::North; - WireBundle neighbourBundle = WireBundle::South; - int problemChannel = northChannel; - int candidateChannel = southChannel; - if (checkChannelEmpty(neighbourSw, neighbourBundle, candidateChannel)) { - replaceRoutingChannel(swBox, problemBundle, problemChannel, - candidateChannel); - replaceRoutingChannel(neighbourSw, neighbourBundle, problemChannel, - candidateChannel); - return true; - } - } - // Attempt to reroute southern channel and neighbouring sw - if (auto neighbourSw = - getSwitchbox(d, swBox.colIndex(), swBox.rowIndex() - 1)) { - WireBundle problemBundle = WireBundle::South; - WireBundle neighbourBundle = WireBundle::North; - int problemChannel = southChannel; - int candidateChannel = northChannel; - if (checkChannelEmpty(neighbourSw, neighbourBundle, candidateChannel)) { - replaceRoutingChannel(swBox, problemBundle, problemChannel, - candidateChannel); - replaceRoutingChannel(neighbourSw, neighbourBundle, problemChannel, - candidateChannel); - return true; - } - } - - return false; -} - -bool AIEPathfinderPass::checkChannelEmpty(SwitchboxOp swOp, WireBundle bundle, - int channel) { - // Check circuit-switched - for (auto connect : swOp.getOps()) { - if 
(connect.getSourceBundle() == bundle && - connect.getSourceChannel() == channel) - return false; - if (connect.getDestBundle() == bundle && - connect.getDestChannel() == channel) - return false; - } - - // Check packet-switched - for (auto pktRules : swOp.getOps()) { - if (pktRules.sourcePort().bundle == bundle && - pktRules.sourcePort().channel == channel) - return false; - } - for (auto masterSet : swOp.getOps()) { - if (masterSet.destPort().bundle == bundle && - masterSet.destPort().channel == channel) - return false; - } - - return true; -} - -void AIEPathfinderPass::replaceRoutingChannel(SwitchboxOp &swOp, - WireBundle bundle, int oldChannel, - int newChannel) { - // replace any macthed circuit-switched - for (auto connect : swOp.getOps()) { - if (connect.getSourceBundle() == bundle && - connect.getSourceChannel() == oldChannel) - connect.setSourceChannel(newChannel); - if (connect.getDestBundle() == bundle && - connect.getDestChannel() == oldChannel) - connect.setDestChannel(newChannel); - } - - // replace any macthed packet-switched - std::vector newSourcePacketRules; - std::vector newDestAMSels; - for (auto pktRules : swOp.getOps()) { - if (pktRules.sourcePort().bundle == bundle && - pktRules.sourcePort().channel == oldChannel) - pktRules.setSourceChannel(newChannel); - } - for (auto masterSet : swOp.getOps()) { - if (masterSet.destPort().bundle == bundle && - masterSet.destPort().channel == oldChannel) - masterSet.setDestChannel(newChannel); - } } SwitchboxOp AIEPathfinderPass::getSwitchbox(DeviceOp &d, int col, int row) { diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index c9bb260637..8a2dba3906 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -973,11 +973,12 @@ struct AIEObjectFifoStatefulTransformPass void createObjectFifoAllocationInfo(OpBuilder &builder, 
MLIRContext *ctx, FlatSymbolRefAttr obj_fifo, int colIndex, DMAChannelDir channelDir, - int channelIndex) { + int channelIndex, bool plio) { builder.create(builder.getUnknownLoc(), obj_fifo, DMAChannelDirAttr::get(ctx, channelDir), builder.getI64IntegerAttr(channelIndex), - builder.getI64IntegerAttr(colIndex)); + builder.getI64IntegerAttr(colIndex), + builder.getBoolAttr(plio)); } void runOnOperation() override { @@ -986,6 +987,8 @@ struct AIEObjectFifoStatefulTransformPass DMAChannelAnalysis dmaAnalysis(device); OpBuilder builder = OpBuilder::atBlockEnd(device.getBody()); auto ctx = device->getContext(); + auto producerWireType = WireBundle::DMA; + auto consumerWireType = WireBundle::DMA; std::set objectFifoTiles; // track cores to check for loops during unrolling @@ -1125,13 +1128,15 @@ struct AIEObjectFifoStatefulTransformPass producerChan.channel, 0, producer.getDimensionsToStreamAttr()); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); + if (producer.getProducerTileOp().isShimTile()) createObjectFifoAllocationInfo( builder, ctx, SymbolRefAttr::get(ctx, producer.getName()), producer.getProducerTileOp().colIndex(), producerChan.direction, - producerChan.channel); + producerChan.channel, producer.getPlio()); for (auto consumer : consumers) { + // create consumer tile DMA DMAChannel consumerChan = dmaAnalysis.getSlaveDMAChannel(consumer.getProducerTile()); @@ -1141,18 +1146,32 @@ struct AIEObjectFifoStatefulTransformPass consumerChan.channel, 1, consumerDims); // generate objectFifo allocation info builder.setInsertionPoint(&device.getBody()->back()); + + // If we have PLIO then figure out the direction and make that a PLIO + if (producer.getPlio()) { + producerWireType = producer.getProducerTileOp().isShimTile() + ? WireBundle::PLIO + : WireBundle::DMA; + consumerWireType = !(producer.getProducerTileOp().isShimTile()) + ? 
WireBundle::PLIO + : WireBundle::DMA; + } else { + producerWireType = WireBundle::DMA; + consumerWireType = WireBundle::DMA; + } + if (consumer.getProducerTileOp().isShimTile()) createObjectFifoAllocationInfo( builder, ctx, SymbolRefAttr::get(ctx, producer.getName()), consumer.getProducerTileOp().colIndex(), consumerChan.direction, - consumerChan.channel); + consumerChan.channel, producer.getPlio()); // create flow builder.setInsertionPointAfter(producer); builder.create(builder.getUnknownLoc(), - producer.getProducerTile(), WireBundle::DMA, + producer.getProducerTile(), producerWireType, producerChan.channel, consumer.getProducerTile(), - WireBundle::DMA, consumerChan.channel); + consumerWireType, consumerChan.channel); } } diff --git a/lib/Dialect/AIE/Transforms/AIEPathFinder.cpp b/lib/Dialect/AIE/Transforms/AIEPathFinder.cpp index 966aa6c5e5..1bd80c96c9 100644 --- a/lib/Dialect/AIE/Transforms/AIEPathFinder.cpp +++ b/lib/Dialect/AIE/Transforms/AIEPathFinder.cpp @@ -84,10 +84,8 @@ LogicalResult DynamicTileAnalysis::runAnalysis(DeviceOp &device) { // add existing connections so Pathfinder knows which resources are // available search all existing SwitchBoxOps for exising connections for (SwitchboxOp switchboxOp : device.getOps()) { - for (ConnectOp connectOp : switchboxOp.getOps()) { - if (!pathfinder->addFixedConnection(connectOp)) - return switchboxOp.emitOpError() << "Couldn't connect " << connectOp; - } + if (!pathfinder->addFixedConnection(switchboxOp)) + return switchboxOp.emitOpError() << "Unable to add fixed connections"; } // all flows are now populated, call the congestion-aware pathfinder @@ -167,7 +165,7 @@ ShimMuxOp DynamicTileAnalysis::getShimMux(OpBuilder &builder, int col) { if (coordToShimMux.count({col, row})) { return coordToShimMux[{col, row}]; } - assert(getTile(builder, col, row).isShimNOCTile()); + assert(getTile(builder, col, row).isShimNOCorPLTile()); auto switchboxOp = builder.create(builder.getUnknownLoc(), getTile(builder, col, row)); 
SwitchboxOp::ensureTerminator(switchboxOp.getConnections(), builder, @@ -184,44 +182,36 @@ void Pathfinder::initialize(int maxCol, int maxRow, int id = 0; for (int row = 0; row <= maxRow; row++) { for (int col = 0; col <= maxCol; col++) { - auto [it, _] = grid.insert({{col, row}, SwitchboxNode{col, row, id++}}); - (void)graph.addNode(it->second); + grid.insert({{col, row}, + SwitchboxNode{col, row, id++, maxCol, maxRow, targetModel}}); SwitchboxNode &thisNode = grid.at({col, row}); if (row > 0) { // if not in row 0 add channel to North/South SwitchboxNode &southernNeighbor = grid.at({col, row - 1}); // get the number of outgoing connections on the south side - outgoing // because these correspond to rhs of a connect op - if (uint32_t maxCapacity = targetModel.getNumDestSwitchboxConnections( - col, row, WireBundle::South)) { - edges.emplace_back(thisNode, southernNeighbor, WireBundle::South, - maxCapacity); - (void)graph.connect(thisNode, southernNeighbor, edges.back()); + if (targetModel.getNumDestSwitchboxConnections(col, row, + WireBundle::South)) { + edges.emplace_back(&thisNode, &southernNeighbor); } // get the number of incoming connections on the south side - incoming // because they correspond to connections on the southside that are then // routed using internal connect ops through the switchbox (i.e., lhs of // connect ops) - if (uint32_t maxCapacity = targetModel.getNumSourceSwitchboxConnections( - col, row, WireBundle::South)) { - edges.emplace_back(southernNeighbor, thisNode, WireBundle::North, - maxCapacity); - (void)graph.connect(southernNeighbor, thisNode, edges.back()); + if (targetModel.getNumSourceSwitchboxConnections(col, row, + WireBundle::South)) { + edges.emplace_back(&southernNeighbor, &thisNode); } } if (col > 0) { // if not in col 0 add channel to East/West SwitchboxNode &westernNeighbor = grid.at({col - 1, row}); - if (uint32_t maxCapacity = targetModel.getNumDestSwitchboxConnections( - col, row, WireBundle::West)) { - 
edges.emplace_back(thisNode, westernNeighbor, WireBundle::West, - maxCapacity); - (void)graph.connect(thisNode, westernNeighbor, edges.back()); + if (targetModel.getNumDestSwitchboxConnections(col, row, + WireBundle::West)) { + edges.emplace_back(&thisNode, &westernNeighbor); } - if (uint32_t maxCapacity = targetModel.getNumSourceSwitchboxConnections( - col, row, WireBundle::West)) { - edges.emplace_back(westernNeighbor, thisNode, WireBundle::East, - maxCapacity); - (void)graph.connect(westernNeighbor, thisNode, edges.back()); + if (targetModel.getNumSourceSwitchboxConnections(col, row, + WireBundle::West)) { + edges.emplace_back(&westernNeighbor, &thisNode); } } } @@ -234,77 +224,63 @@ void Pathfinder::addFlow(TileID srcCoords, Port srcPort, TileID dstCoords, Port dstPort, bool isPacketFlow) { // check if a flow with this source already exists for (auto &[isPkt, src, dsts] : flows) { - SwitchboxNode *existingSrc = src.sb; - assert(existingSrc && "nullptr flow source"); - if (Port existingPort = src.port; existingSrc->col == srcCoords.col && - existingSrc->row == srcCoords.row && + SwitchboxNode *existingSrcPtr = src.sb; + assert(existingSrcPtr && "nullptr flow source"); + if (Port existingPort = src.port; existingSrcPtr->col == srcCoords.col && + existingSrcPtr->row == srcCoords.row && existingPort == srcPort) { // find the vertex corresponding to the destination - auto *matchingSb = std::find_if( - graph.begin(), graph.end(), [&](const SwitchboxNode *sb) { - return sb->col == dstCoords.col && sb->row == dstCoords.row; - }); - assert(matchingSb != graph.end() && "didn't find flow dest"); - dsts.emplace_back(*matchingSb, dstPort); + SwitchboxNode *matchingDstSbPtr = &grid.at(dstCoords); + dsts.emplace_back(matchingDstSbPtr, dstPort); return; } } // If no existing flow was found with this source, create a new flow. 
- auto *matchingSrcSb = - std::find_if(graph.begin(), graph.end(), [&](const SwitchboxNode *sb) { - return sb->col == srcCoords.col && sb->row == srcCoords.row; - }); - assert(matchingSrcSb != graph.end() && "didn't find flow source"); - auto *matchingDstSb = - std::find_if(graph.begin(), graph.end(), [&](const SwitchboxNode *sb) { - return sb->col == dstCoords.col && sb->row == dstCoords.row; - }); - assert(matchingDstSb != graph.end() && "didn't add flow destinations"); - flows.push_back({isPacketFlow, PathEndPointNode{*matchingSrcSb, srcPort}, - std::vector{{*matchingDstSb, dstPort}}}); + SwitchboxNode *matchingSrcSbPtr = &grid.at(srcCoords); + SwitchboxNode *matchingDstSbPtr = &grid.at(dstCoords); + flows.push_back({isPacketFlow, PathEndPointNode{matchingSrcSbPtr, srcPort}, + std::vector{{matchingDstSbPtr, dstPort}}}); } // Keep track of connections already used in the AIE; Pathfinder algorithm will // avoid using these. -bool Pathfinder::addFixedConnection(ConnectOp connectOp) { - auto sb = connectOp->getParentOfType(); - // TODO: keep track of capacity? 
- if (sb.getTileOp().isShimNOCTile()) - return true; - - TileID sbTile = sb.getTileID(); - WireBundle sourceBundle = connectOp.getSourceBundle(); - WireBundle destBundle = connectOp.getDestBundle(); - - // find the correct Channel and indicate the fixed direction - // outgoing connection - auto matchingCh = - std::find_if(edges.begin(), edges.end(), [&](ChannelEdge &ch) { - return static_cast(ch.src) == sbTile && ch.bundle == destBundle; - }); - if (matchingCh != edges.end()) - return matchingCh->fixedCapacity.insert(connectOp.getDestChannel()) - .second || - true; - - // incoming connection - matchingCh = std::find_if(edges.begin(), edges.end(), [&](ChannelEdge &ch) { - return static_cast(ch.target) == sbTile && - ch.bundle == getConnectingBundle(sourceBundle); - }); - if (matchingCh != edges.end()) - return matchingCh->fixedCapacity.insert(connectOp.getSourceChannel()) - .second || - true; - - return false; +bool Pathfinder::addFixedConnection(SwitchboxOp switchboxOp) { + int col = switchboxOp.colIndex(); + int row = switchboxOp.rowIndex(); + SwitchboxNode &sb = grid.at({col, row}); + std::set invalidInId, invalidOutId; + + for (ConnectOp connectOp : switchboxOp.getOps()) { + Port srcPort = connectOp.sourcePort(); + Port destPort = connectOp.destPort(); + if (sb.inPortToId.count(srcPort) == 0 || + sb.outPortToId.count(destPort) == 0) + return false; + int inId = sb.inPortToId.at(srcPort); + int outId = sb.outPortToId.at(destPort); + if (sb.connectionMatrix[inId][outId] != Connectivity::AVAILABLE) + return false; + invalidInId.insert(inId); + invalidOutId.insert(outId); + } + + for (const auto &[inPort, inId] : sb.inPortToId) { + for (const auto &[outPort, outId] : sb.outPortToId) { + if (invalidInId.find(inId) != invalidInId.end() || + invalidOutId.find(outId) != invalidOutId.end()) { + // mark as invalid + sb.connectionMatrix[inId][outId] = Connectivity::INVALID; + } + } + } + return true; } static constexpr double INF = std::numeric_limits::max(); std::map 
-dijkstraShortestPaths(const SwitchboxGraph &graph, SwitchboxNode *src) { +Pathfinder::dijkstraShortestPaths(SwitchboxNode *src) { // Use std::map instead of DenseMap because DenseMap doesn't let you overwrite // tombstones. auto distance = std::map(); @@ -318,20 +294,25 @@ dijkstraShortestPaths(const SwitchboxGraph &graph, SwitchboxNode *src) { MutableQueue; MutableQueue Q(distance, indexInHeap); - for (SwitchboxNode *sb : graph) - distance.emplace(sb, INF); + for (auto &[_, sb] : grid) + distance.emplace(&sb, INF); distance[src] = 0.0; - std::map> edges; + std::map> channels; enum Color { WHITE, GRAY, BLACK }; std::map colors; - for (SwitchboxNode *sb : graph) { - colors[sb] = WHITE; - edges[sb] = {sb->getEdges().begin(), sb->getEdges().end()}; - std::sort(edges[sb].begin(), edges[sb].end(), + for (auto &[_, sb] : grid) { + SwitchboxNode *sbPtr = &sb; + colors[sbPtr] = WHITE; + for (auto &e : edges) { + if (e.src == sbPtr) { + channels[sbPtr].push_back(&e); + } + } + std::sort(channels[sbPtr].begin(), channels[sbPtr].end(), [](const ChannelEdge *c1, ChannelEdge *c2) { - return c1->getTargetNode().id < c2->getTargetNode().id; + return c1->target->id < c2->target->id; }); } @@ -339,18 +320,18 @@ dijkstraShortestPaths(const SwitchboxGraph &graph, SwitchboxNode *src) { while (!Q.empty()) { src = Q.top(); Q.pop(); - for (ChannelEdge *e : edges[src]) { - SwitchboxNode *dest = &e->getTargetNode(); - bool relax = distance[src] + e->demand < distance[dest]; + for (ChannelEdge *e : channels[src]) { + SwitchboxNode *dest = e->target; + bool relax = distance[src] + demand[e] < distance[dest]; if (colors[dest] == WHITE) { if (relax) { - distance[dest] = distance[src] + e->demand; + distance[dest] = distance[src] + demand[e]; preds[dest] = src; colors[dest] = GRAY; } Q.push(dest); } else if (colors[dest] == GRAY && relax) { - distance[dest] = distance[src] + e->demand; + distance[dest] = distance[src] + demand[e]; preds[dest] = src; } } @@ -372,43 +353,21 @@ 
Pathfinder::findPaths(const int maxIterations) { std::map routingSolution; // initialize all Channel histories to 0 - for (auto &ch : edges) - ch.overCapacityCount = 0; - - // Check that every channel does not exceed max capacity. - auto isLegal = [&] { - bool legal = true; // assume legal until found otherwise - for (auto &e : edges) { - if (e.usedCapacity > e.maxCapacity) { - LLVM_DEBUG(llvm::dbgs() - << "Too much capacity on Edge (" << e.getTargetNode().col - << ", " << e.getTargetNode().row << ") . " - << stringifyWireBundle(e.bundle) << "\t: used_capacity = " - << e.usedCapacity << "\t: Demand = " << e.demand << "\n"); - e.overCapacityCount++; - LLVM_DEBUG(llvm::dbgs() - << "over_capacity_count = " << e.overCapacityCount << "\n"); - legal = false; - break; - } - } - - return legal; - }; + for (auto &ch : edges) { + overCapacity[&ch] = 0; + usedCapacity[&ch] = 0; + } + // assume legal until found otherwise + bool isLegal = true; do { LLVM_DEBUG(llvm::dbgs() << "Begin findPaths iteration #" << iterationCount << "\n"); // update demand on all channels for (auto &ch : edges) { - if (ch.fixedCapacity.size() >= - static_cast::size_type>(ch.maxCapacity)) { - ch.demand = INF; - } else { - double history = 1.0 + OVER_CAPACITY_COEFF * ch.overCapacityCount; - double congestion = 1.0 + USED_CAPACITY_COEFF * ch.usedCapacity; - ch.demand = history * congestion; - } + double history = 1.0 + OVER_CAPACITY_COEFF * overCapacity[&ch]; + double congestion = 1.0 + USED_CAPACITY_COEFF * usedCapacity[&ch]; + demand[&ch] = history * congestion; } // if reach maxIterations, throw an error since no routing can be found if (++iterationCount > maxIterations) { @@ -419,10 +378,13 @@ Pathfinder::findPaths(const int maxIterations) { return std::nullopt; } - // "rip up" all routes, i.e. 
set used capacity in each Channel to 0 + // "rip up" all routes routingSolution.clear(); + for (auto &[tileID, node] : grid) + node.clearAllocation(); for (auto &ch : edges) - ch.usedCapacity = 0; + usedCapacity[&ch] = 0; + isLegal = true; // for each flow, find the shortest path from source to destination // update used_capacity for the path between them @@ -434,7 +396,16 @@ Pathfinder::findPaths(const int maxIterations) { assert(src.sb && "nonexistent flow source"); std::set processed; std::map preds = - dijkstraShortestPaths(graph, src.sb); + dijkstraShortestPaths(src.sb); + + auto findIncomingEdge = [&](SwitchboxNode *sb) -> ChannelEdge * { + for (auto &e : edges) { + if (e.src == preds[sb] && e.target == sb) { + return &e; + } + } + return nullptr; + }; // trace the path of the flow backwards via predecessors // increment used_capacity for the associated channels @@ -442,70 +413,120 @@ Pathfinder::findPaths(const int maxIterations) { // set the input bundle for the source endpoint switchSettings[*src.sb].src = src.port; processed.insert(src.sb); + // track destination ports used by src.sb + std::vector srcDestPorts; for (const PathEndPointNode &endPoint : dsts) { SwitchboxNode *curr = endPoint.sb; assert(curr && "endpoint has no source switchbox"); // set the output bundle for this destination endpoint switchSettings[*curr].dsts.insert(endPoint.port); - + Port lastDestPort = endPoint.port; // trace backwards until a vertex already processed is reached while (!processed.count(curr)) { - // find the edge from the pred to curr by searching incident edges - SmallVector channels; - graph.findIncomingEdgesToNode(*curr, channels); - auto *matchingCh = std::find_if( - channels.begin(), channels.end(), - [&](ChannelEdge *ch) { return ch->src == *preds[curr]; }); - assert(matchingCh != channels.end() && "couldn't find ch"); - // incoming edge - ChannelEdge *ch = *matchingCh; - - // don't use fixed channels - while (ch->fixedCapacity.count(ch->usedCapacity)) - 
ch->usedCapacity++; - - // add the entrance port for this Switchbox - switchSettings[*curr].src = {getConnectingBundle(ch->bundle), - ch->usedCapacity}; - // add the current Switchbox to the map of the predecessor - switchSettings[*preds[curr]].dsts.insert( - {ch->bundle, ch->usedCapacity}); - - if (isPkt) { - ch->packetFlowCount++; - // up to 32 packet strams flow through a port - if (ch->packetFlowCount > 32) { - ch->packetFlowCount = 0; - ch->usedCapacity++; + // find the incoming edge from the pred to curr + ChannelEdge *ch = findIncomingEdge(curr); + assert(ch != nullptr && "couldn't find ch"); + int channel; + // find all available channels in + std::vector availableChannels = curr->findAvailableChannelIn( + getConnectingBundle(ch->bundle), lastDestPort, isPkt); + if (availableChannels.size() > 0) { + // if possible, choose the channel that predecessor can also use + // todo: consider all predecessors? + int bFound = false; + auto &pred = preds[curr]; + if (!processed.count(pred) && pred != src.sb) { + ChannelEdge *predCh = findIncomingEdge(pred); + assert(predCh != nullptr && "couldn't find ch"); + for (int availableCh : availableChannels) { + channel = availableCh; + std::vector availablePredChannels = + pred->findAvailableChannelIn( + getConnectingBundle(predCh->bundle), + {ch->bundle, channel}, isPkt); + if (availablePredChannels.size() > 0) { + bFound = true; + break; + } + } } + if (!bFound) + channel = availableChannels[0]; + bool succeed = + curr->allocate({getConnectingBundle(ch->bundle), channel}, + lastDestPort, isPkt); + if (!succeed) + assert(false && "invalid allocation"); + LLVM_DEBUG(llvm::dbgs() + << *curr << ", connecting: " + << stringifyWireBundle(getConnectingBundle(ch->bundle)) + << channel << " -> " + << stringifyWireBundle(lastDestPort.bundle) + << lastDestPort.channel << "\n"); } else { - ch->packetFlowCount = 0; - ch->usedCapacity++; + // if no channel available, use a virtual channel id and mark + // routing as being invalid + 
channel = usedCapacity[ch]; + if (isLegal) { + overCapacity[ch]++; + LLVM_DEBUG(llvm::dbgs() + << *curr << ", congestion: " + << stringifyWireBundle(getConnectingBundle(ch->bundle)) + << ", used_capacity = " << usedCapacity[ch] + << ", over_capacity_count = " << overCapacity[ch] + << "\n"); + } + isLegal = false; } + usedCapacity[ch]++; + + // add the entrance port for this Switchbox + Port currSourcePort = {getConnectingBundle(ch->bundle), channel}; + switchSettings[*curr].src = {currSourcePort}; + + // add the current Switchbox to the map of the predecessor + Port PredDestPort = {ch->bundle, channel}; + switchSettings[*preds[curr]].dsts.insert(PredDestPort); + lastDestPort = PredDestPort; // if at capacity, bump demand to discourage using this Channel - if (ch->usedCapacity >= ch->maxCapacity) { - LLVM_DEBUG(llvm::dbgs() << "ch over capacity: " << ch << "\n"); + if (usedCapacity[ch] >= ch->maxCapacity) { // this means the order matters! - ch->demand *= DEMAND_COEFF; + demand[ch] *= DEMAND_COEFF; + LLVM_DEBUG(llvm::dbgs() + << *curr << ", bump demand: " + << stringifyWireBundle(getConnectingBundle(ch->bundle)) + << ", demand = " << demand[ch] << "\n"); } processed.insert(curr); curr = preds[curr]; + + // allocation may fail, as we start from the dest of flow while + // src.port is not chosen by router + if (curr == src.sb && + std::find(srcDestPorts.begin(), srcDestPorts.end(), + lastDestPort) == srcDestPorts.end()) { + bool succeed = src.sb->allocate(src.port, lastDestPort, isPkt); + if (!succeed) { + isLegal = false; + overCapacity[ch]++; + LLVM_DEBUG(llvm::dbgs() + << *curr << ", unable to connect: " + << stringifyWireBundle(src.port.bundle) + << src.port.channel << " -> " + << stringifyWireBundle(lastDestPort.bundle) + << lastDestPort.channel << "\n"); + } + srcDestPorts.push_back(lastDestPort); + } } } // add this flow to the proposed solution routingSolution[src] = switchSettings; } - for (auto &ch : edges) { - if (ch.packetFlowCount > 0) { - 
ch.packetFlowCount = 0; - ch.usedCapacity++; - } - } - - } while (!isLegal()); // continue iterations until a legal routing is found + } while (!isLegal); // continue iterations until a legal routing is found return routingSolution; } diff --git a/lib/Dialect/AIEVec/IR/AIE1/AIEVecAIE1Ops.cpp b/lib/Dialect/AIEVec/IR/AIE1/AIEVecAIE1Ops.cpp new file mode 100644 index 0000000000..524f8fb409 --- /dev/null +++ b/lib/Dialect/AIEVec/IR/AIE1/AIEVecAIE1Ops.cpp @@ -0,0 +1,348 @@ +//===-- AIEVecAIE1Ops.cpp - MLIR AIE Vector Dialect Operations --*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// +// This file implements AIE1 vector op printing, pasing, and verification. +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Transforms/FoldUtils.h" +#include "llvm/ADT/TypeSwitch.h" + +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" +#include "aie/Dialect/AIEVec/AIEVecUtils.h" + +using namespace llvm; +using namespace mlir; + +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1OpsDialect.cpp.inc" + +namespace xilinx::aievec::aie1 { + +//===----------------------------------------------------------------------===// +// AIEVecAIE1Dialect +//===----------------------------------------------------------------------===// + +void AIEVecAIE1Dialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.cpp.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// 
AddOp and SubOp +//===----------------------------------------------------------------------===// + +// Print out Add and Sub op. +template +void printAddSubOp(OpAsmPrinter &p, T op) { + // Print the lhs operand + p << " " << op.getLhs(); + // Print the rhs operand + p << ", " << op.getRhs(); + + // Print the attributes, but don't print attributes that are empty strings + SmallVector elidedAttrs; + for (int idx = 0; idx < 2; ++idx) { + if (op.getStart(idx).empty()) + elidedAttrs.push_back(op.getStartAttrName(idx)); + if (op.getOffset(idx).empty()) + elidedAttrs.push_back(op.getOffsetAttrName(idx)); + if (op.getOffsetHi(idx).empty()) + elidedAttrs.push_back(op.getOffsetHiAttrName(idx)); + if (op.getSquare(idx).empty()) + elidedAttrs.push_back(op.getSquareAttrName(idx)); + } + p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); + + // And now print the types + p << " : " << op.getLhs().getType() << ", " << op.getRhs().getType(); + p << ", " << op.getResult().getType(); +} + +void AddOp::print(OpAsmPrinter &p) { printAddSubOp(p, *this); } + +void SubOp::print(OpAsmPrinter &p) { printAddSubOp(p, *this); } + +// Verify Add and Sub op. +template +LogicalResult verifyAddSubOp(T op) { + // Verify the types + auto resultType = llvm::dyn_cast(op.getResult().getType()); + auto lhsType = llvm::dyn_cast(op.getLhs().getType()); + auto rhsType = llvm::dyn_cast(op.getRhs().getType()); + + if (!lhsType || !rhsType || !resultType) + return op.emitError("requires vector type"); + + // All the vector types must match + if (lhsType != rhsType || rhsType != resultType) + return op.emitError("all vectors must be of same type"); + + return success(); +} + +LogicalResult AddOp::verify() { return verifyAddSubOp(*this); } + +LogicalResult SubOp::verify() { return verifyAddSubOp(*this); } + +// Parse Add and Sub op. 
+ParseResult parseAddSubOp(OpAsmParser &parser, OperationState &result) { + llvm::SMLoc typesLoc; + SmallVector types; + OpAsmParser::UnresolvedOperand lhs, rhs; + + // Parse the lhs and rhs + if (parser.parseOperand(lhs) || parser.parseComma() || + parser.parseOperand(rhs)) + return failure(); + + // Parse all the attributes and types + if (parser.parseOptionalAttrDict(result.attributes) || + parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) + return failure(); + + // Assert that there are three types: lhs, rhs, and result + if (types.size() != 3) + return parser.emitError(typesLoc, "requires three types"); + + // Some verification + VectorType lhsType = llvm::dyn_cast(types[0]); + if (!lhsType) + return parser.emitError(typesLoc, "requires vector type"); + VectorType rhsType = llvm::dyn_cast(types[1]); + if (!rhsType) + return parser.emitError(typesLoc, "requires vector type"); + VectorType resultType = llvm::dyn_cast(types[2]); + if (!resultType) + return parser.emitError(typesLoc, "requires vector type"); + + // Populate the lhs, rhs, and accumulator in the result + if (parser.resolveOperand(lhs, lhsType, result.operands) || + parser.resolveOperand(rhs, rhsType, result.operands)) + return failure(); + + return parser.addTypeToList(resultType, result.types); +} + +ParseResult AddOp::parse(OpAsmParser &parser, OperationState &result) { + return parseAddSubOp(parser, result); +} + +ParseResult SubOp::parse(OpAsmParser &parser, OperationState &result) { + return parseAddSubOp(parser, result); +} + +//===----------------------------------------------------------------------===// +// MulOp and FMAOp +//===----------------------------------------------------------------------===// + +// MulOp and FMAOp are structurally similar, except that FMA op has few extra +// fields (accumulator, bool flag to indicate if it is fmsub, etc.). We create +// some specializations to print those fields specifically for FMA op. 
+ +// Print the accumulator +template +void printAccumulator(OpAsmPrinter &p, T op); +template <> +inline void printAccumulator(OpAsmPrinter &p, FMAOp op) { + p << ", " << op.getAcc(); +} +template <> +inline void printAccumulator(OpAsmPrinter &p, MulOp op) {} + +// Mark fmsub indicator as elided if the FMA op is not fmsub +template +void elideFMSubAttr(T op, SmallVector &elidedAttrs); +template <> +inline void elideFMSubAttr(FMAOp op, SmallVector &elidedAttrs) { + if (!op.getFmsub()) + elidedAttrs.push_back(op.getSubAttrName()); +} +template <> +inline void elideFMSubAttr(MulOp, SmallVector &elidedAttrs) {} + +// Print out Mul and FMA op. +template +static void printMulFMAOp(OpAsmPrinter &p, T op) { + // Print the left operand + p << " " << op.getLhs(); + // Print the right operand + p << ", " << op.getRhs(); + // For fma op, print the accumulator + printAccumulator(p, op); + + // Print the attributes, but don't print attributes that are empty strings + SmallVector elidedAttrs; + for (int idx = 0; idx < 2; ++idx) { + if (op.getStart(idx).empty()) + elidedAttrs.push_back(op.getStartAttrName(idx)); + if (op.getOffset(idx).empty()) + elidedAttrs.push_back(op.getOffsetAttrName(idx)); + if (op.getOffsetHi(idx).empty()) + elidedAttrs.push_back(op.getOffsetHiAttrName(idx)); + if (op.getStep(idx).empty()) + elidedAttrs.push_back(op.getStepAttrName(idx)); + if (op.getSquare(idx).empty()) + elidedAttrs.push_back(op.getSquareAttrName(idx)); + elideFMSubAttr(op, elidedAttrs); + } + p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); + + // And now print the types + p << " : " << op.getLhs().getType() << ", " << op.getRhs().getType(); + p << ", " << op.getResult().getType(); +} + +void MulOp::print(OpAsmPrinter &p) { printMulFMAOp(p, *this); } + +void FMAOp::print(OpAsmPrinter &p) { printMulFMAOp(p, *this); } + +// Verify Mul and FMA op. 
+template +LogicalResult verifyMulFMAOp(T op) { + // Verify the types + auto lhsType = llvm::dyn_cast(op.getLhs().getType()); + auto rhsType = llvm::dyn_cast(op.getRhs().getType()); + + if (!lhsType || !rhsType) + return op.emitError("requires vector type"); + + auto resultType = llvm::dyn_cast(op.getResult().getType()); + if (!resultType) + return op.emitError("requires vector type"); + + // Additional checks for FMA op + // Get the width of the underlying scalars of all the vectors + Type ltype = lhsType.getElementType(); + Type rtype = rhsType.getElementType(); + Type atype = resultType.getElementType(); + unsigned ltypeWidth = ltype.getIntOrFloatBitWidth(); + unsigned rtypeWidth = rtype.getIntOrFloatBitWidth(); + unsigned atypeWidth = atype.getIntOrFloatBitWidth(); + + // Checks on the number of lanes + unsigned accLanes = getVectorLaneSize(resultType); + unsigned rhsLanes = getVectorLaneSize(rhsType); + unsigned lhsLanes = getVectorLaneSize(lhsType); + + // If this is not a simple scheme, perform complex checks + if (accLanes != rhsLanes || accLanes != lhsLanes) { + if (rhsLanes != 256 / rtypeWidth) + return op.emitError("incorrect rhs operand vector lanes"); + if (lhsLanes < 2 * rhsLanes) + return op.emitError("The number of lanes in lhs operand " + "must be at least twice that of rhs operand"); + if (accLanes > rhsLanes) + return op.emitError("The number of lanes in accumulator " + "must be less than that of rhs operand"); + } + + // lhs and rhs vector's element type must match + if (ltype != rtype) + return op.emitError("The element type of lhs and rhs " + "operand vectors must match"); + + // The datatype of accumulator must always be greater width + if (isa(atype)) { + if (!isa(ltype)) + return op.emitError("Integer result must have integer operands"); + + if (ltypeWidth >= atypeWidth || rtypeWidth >= atypeWidth) + return op.emitError("the element type of accumulator must have " + "wider width than that of the operand vectors"); + } else if (isa(atype)) { 
+ if (!isa(ltype)) + return op.emitError("Floating point result must have " + "floating point operands"); + + if (ltypeWidth != atypeWidth || rtypeWidth != atypeWidth) + return op.emitError("the element type of accumulator must be " + "same width as the operand vectors"); + } + + return success(); +} + +LogicalResult MulOp::verify() { return verifyMulFMAOp(*this); } + +LogicalResult FMAOp::verify() { return verifyMulFMAOp(*this); } + +// Parse Mul and FMA op. +ParseResult parseMulFMAOp(OpAsmParser &parser, OperationState &result, + bool isFMAOp = true) { + llvm::SMLoc typesLoc; + SmallVector types; + OpAsmParser::UnresolvedOperand lhs, rhs, acc; + + // Parse the lhs and rhs + if (parser.parseOperand(lhs) || parser.parseComma() || + parser.parseOperand(rhs)) + return failure(); + + // Parse the acc for FMA op + if (isFMAOp) { + if (parser.parseComma() || parser.parseOperand(acc)) + return failure(); + } + + // Parse all the attributes and types + if (parser.parseOptionalAttrDict(result.attributes) || + parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) + return failure(); + + // Assert that there are three types: lhs, rhs, and acc + if (types.size() != 3) + return parser.emitError(typesLoc, "requires three types"); + + // Some verification + VectorType lhsType = llvm::dyn_cast(types[0]); + if (!lhsType) + return parser.emitError(typesLoc, "requires vector type"); + VectorType rhsType = llvm::dyn_cast(types[1]); + if (!rhsType) + return parser.emitError(typesLoc, "requires vector type"); + + // Int ops use the accumulator while float ops use normal vector registers + VectorType accType = llvm::dyn_cast(types[2]); + if (!accType) + return parser.emitError(typesLoc, "requires vector type"); + + // Populate the lhs and rhs operands, and result + if (parser.resolveOperand(lhs, lhsType, result.operands) || + parser.resolveOperand(rhs, rhsType, result.operands)) + return failure(); + + // Populate acc operand for FMA op + if (isFMAOp) { + if 
(parser.resolveOperand(acc, accType, result.operands)) + return failure(); + } + + return parser.addTypeToList(accType, result.types); +} + +ParseResult MulOp::parse(OpAsmParser &parser, OperationState &result) { + return parseMulFMAOp(parser, result, false); +} + +ParseResult FMAOp::parse(OpAsmParser &parser, OperationState &result) { + return parseMulFMAOp(parser, result, true); +} + +} // namespace xilinx::aievec::aie1 + +// #define GET_ATTRDEF_CLASSES +// #include "aie/Dialect/AIEVec/IR/AIEVecAttributes.cpp.inc" + +#define GET_OP_CLASSES +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.cpp.inc" diff --git a/lib/Dialect/AIEVec/IR/AIE1/CMakeLists.txt b/lib/Dialect/AIEVec/IR/AIE1/CMakeLists.txt new file mode 100644 index 0000000000..a0c9f093d6 --- /dev/null +++ b/lib/Dialect/AIEVec/IR/AIE1/CMakeLists.txt @@ -0,0 +1,20 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2022 Xilinx Inc. + +add_mlir_dialect_library(MLIRAIEVecAIE1Dialect + AIEVecAIE1Ops.cpp + + ADDITIONAL_HEADER_DIRS + ${CMAKE_CURRENT_SOURCE_DIR}/../../../../include/aie/Dialect/AIEVec/ + + DEPENDS + MLIRAIEVecAIE1OpsIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + ) diff --git a/lib/Dialect/AIEVec/IR/AIEVecOps.cpp b/lib/Dialect/AIEVec/IR/AIEVecOps.cpp index 572f1f7b61..49b6ac3336 100644 --- a/lib/Dialect/AIEVec/IR/AIEVecOps.cpp +++ b/lib/Dialect/AIEVec/IR/AIEVecOps.cpp @@ -593,211 +593,6 @@ ParseResult BroadcastScalarOp::parse(OpAsmParser &parser, return parser.addTypeToList(resType, result.types); } -//===----------------------------------------------------------------------===// -// MulOp and FMAOp -//===----------------------------------------------------------------------===// - -// MulOp and FMAOp are structurally similar, except that FMA op has few extra -// fields (accumulator, bool flag to indicate if it is fmsub, etc.). 
We create -// some specializations to print those fields specifically for FMA op. - -// Print the accumulator -template -void printAccumulator(OpAsmPrinter &p, T op); -template <> -inline void printAccumulator(OpAsmPrinter &p, aievec::FMAOp op) { - p << ", " << op.getAcc(); -} -template <> -inline void printAccumulator(OpAsmPrinter &p, aievec::MulOp op) {} - -// Mark fmsub indicator as elided if the FMA op is not fmsub -template -void elideFMSubAttr(T op, SmallVector &elidedAttrs); -template <> -inline void elideFMSubAttr(aievec::FMAOp op, - SmallVector &elidedAttrs) { - if (!op.getFmsub()) - elidedAttrs.push_back(op.getSubAttrName()); -} -template <> -inline void elideFMSubAttr(aievec::MulOp, - SmallVector &elidedAttrs) {} - -// Print out Mul and FMA op. -template -static void printMulFMAOp(OpAsmPrinter &p, T op) { - // Print the left operand - p << " " << op.getLhs(); - // Print the right operand - p << ", " << op.getRhs(); - // For fma op, print the accumulator - printAccumulator(p, op); - - // Print the attributes, but don't print attributes that are empty strings - SmallVector elidedAttrs; - for (int idx = 0; idx < 2; ++idx) { - if (op.getStart(idx).empty()) - elidedAttrs.push_back(op.getStartAttrName(idx)); - if (op.getOffset(idx).empty()) - elidedAttrs.push_back(op.getOffsetAttrName(idx)); - if (op.getOffsetHi(idx).empty()) - elidedAttrs.push_back(op.getOffsetHiAttrName(idx)); - if (op.getStep(idx).empty()) - elidedAttrs.push_back(op.getStepAttrName(idx)); - if (op.getSquare(idx).empty()) - elidedAttrs.push_back(op.getSquareAttrName(idx)); - elideFMSubAttr(op, elidedAttrs); - } - p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); - - // And now print the types - p << " : " << op.getLhs().getType() << ", " << op.getRhs().getType(); - p << ", " << op.getResult().getType(); -} - -void MulOp::print(OpAsmPrinter &p) { printMulFMAOp(p, *this); } - -void aievec::FMAOp::print(OpAsmPrinter &p) { - printMulFMAOp(p, *this); -} - -// Verify Mul and FMA op. 
-template -LogicalResult verifyMulFMAOp(T op) { - // Verify the types - auto lhsType = llvm::dyn_cast(op.getLhs().getType()); - auto rhsType = llvm::dyn_cast(op.getRhs().getType()); - - if (!lhsType || !rhsType) - return op.emitError("requires vector type"); - - auto resultType = llvm::dyn_cast(op.getResult().getType()); - if (!resultType) - return op.emitError("requires vector type"); - - // Additional checks for FMA op - // Get the width of the underlying scalars of all the vectors - Type ltype = lhsType.getElementType(); - Type rtype = rhsType.getElementType(); - Type atype = resultType.getElementType(); - unsigned ltypeWidth = ltype.getIntOrFloatBitWidth(); - unsigned rtypeWidth = rtype.getIntOrFloatBitWidth(); - unsigned atypeWidth = atype.getIntOrFloatBitWidth(); - - // Checks on the number of lanes - unsigned accLanes = getVectorLaneSize(resultType); - unsigned rhsLanes = getVectorLaneSize(rhsType); - unsigned lhsLanes = getVectorLaneSize(lhsType); - - // If this is not a simple scheme, perform complex checks - if (accLanes != rhsLanes || accLanes != lhsLanes) { - if (rhsLanes != 256 / rtypeWidth) - return op.emitError("incorrect rhs operand vector lanes"); - if (lhsLanes < 2 * rhsLanes) - return op.emitError("The number of lanes in lhs operand " - "must be at least twice that of rhs operand"); - if (accLanes > rhsLanes) - return op.emitError("The number of lanes in accumulator " - "must be less than that of rhs operand"); - } - - // lhs and rhs vector's element type must match - if (ltype != rtype) - return op.emitError("The element type of lhs and rhs " - "operand vectors must match"); - - // The datatype of accumulator must always be greater width - if (isa(atype)) { - if (!isa(ltype)) - return op.emitError("Integer result must have integer operands"); - - if (ltypeWidth >= atypeWidth || rtypeWidth >= atypeWidth) - return op.emitError("the element type of accumulator must have " - "wider width than that of the operand vectors"); - } else if (isa(atype)) { 
- if (!isa(ltype)) - return op.emitError("Floating point result must have " - "floating point operands"); - - if (ltypeWidth != atypeWidth || rtypeWidth != atypeWidth) - return op.emitError("the element type of accumulator must be " - "same width as the operand vectors"); - } - - return success(); -} - -LogicalResult aievec::MulOp::verify() { - return verifyMulFMAOp(*this); -} - -LogicalResult aievec::FMAOp::verify() { - return verifyMulFMAOp(*this); -} - -// Parse Mul and FMA op. -ParseResult parseMulFMAOp(OpAsmParser &parser, OperationState &result, - bool isFMAOp = true) { - llvm::SMLoc typesLoc; - SmallVector types; - OpAsmParser::UnresolvedOperand lhs, rhs, acc; - - // Parse the lhs and rhs - if (parser.parseOperand(lhs) || parser.parseComma() || - parser.parseOperand(rhs)) - return failure(); - - // Parse the acc for FMA op - if (isFMAOp) { - if (parser.parseComma() || parser.parseOperand(acc)) - return failure(); - } - - // Parse all the attributes and types - if (parser.parseOptionalAttrDict(result.attributes) || - parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) - return failure(); - - // Assert that there are three types: lhs, rhs, and acc - if (types.size() != 3) - return parser.emitError(typesLoc, "requires three types"); - - // Some verification - VectorType lhsType = llvm::dyn_cast(types[0]); - if (!lhsType) - return parser.emitError(typesLoc, "requires vector type"); - VectorType rhsType = llvm::dyn_cast(types[1]); - if (!rhsType) - return parser.emitError(typesLoc, "requires vector type"); - - // Int ops use the accumulator while float ops use normal vector registers - VectorType accType = llvm::dyn_cast(types[2]); - if (!accType) - return parser.emitError(typesLoc, "requires vector type"); - - // Populate the lhs and rhs operands, and result - if (parser.resolveOperand(lhs, lhsType, result.operands) || - parser.resolveOperand(rhs, rhsType, result.operands)) - return failure(); - - // Populate acc operand for FMA op - if 
(isFMAOp) { - if (parser.resolveOperand(acc, accType, result.operands)) - return failure(); - } - - return parser.addTypeToList(accType, result.types); -} - -ParseResult MulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseMulFMAOp(parser, result, false); -} - -ParseResult FMAOp::parse(OpAsmParser &parser, OperationState &result) { - return parseMulFMAOp(parser, result, true); -} - //===----------------------------------------------------------------------===// // MulElemOp and FMAElemOp //===----------------------------------------------------------------------===// @@ -987,118 +782,6 @@ ParseResult FMAElemOp::parse(OpAsmParser &parser, OperationState &result) { return parseMulFMAElemOp(parser, result, true); } -//===----------------------------------------------------------------------===// -// AddOp and SubOp -//===----------------------------------------------------------------------===// - -// Print out Add and Sub op. -template -void printAddSubOp(OpAsmPrinter &p, T op) { - // Print the lhs operand - p << " " << op.getLhs(); - // Print the rhs operand - p << ", " << op.getRhs(); - - // Print the attributes, but don't print attributes that are empty strings - SmallVector elidedAttrs; - for (int idx = 0; idx < 2; ++idx) { - if (op.getStart(idx).empty()) - elidedAttrs.push_back(op.getStartAttrName(idx)); - if (op.getOffset(idx).empty()) - elidedAttrs.push_back(op.getOffsetAttrName(idx)); - if (op.getOffsetHi(idx).empty()) - elidedAttrs.push_back(op.getOffsetHiAttrName(idx)); - if (op.getSquare(idx).empty()) - elidedAttrs.push_back(op.getSquareAttrName(idx)); - } - p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); - - // And now print the types - p << " : " << op.getLhs().getType() << ", " << op.getRhs().getType(); - p << ", " << op.getResult().getType(); -} - -void aievec::AddOp::print(OpAsmPrinter &p) { - printAddSubOp(p, *this); -} - -void aievec::SubOp::print(OpAsmPrinter &p) { - printAddSubOp(p, *this); -} - -// Verify Add and Sub 
op. -template -LogicalResult verifyAddSubOp(T op) { - // Verify the types - auto resultType = llvm::dyn_cast(op.getResult().getType()); - auto lhsType = llvm::dyn_cast(op.getLhs().getType()); - auto rhsType = llvm::dyn_cast(op.getRhs().getType()); - - if (!lhsType || !rhsType || !resultType) - return op.emitError("requires vector type"); - - // All the vector types must match - if (lhsType != rhsType || rhsType != resultType) - return op.emitError("all vectors must be of same type"); - - return success(); -} - -LogicalResult aievec::AddOp::verify() { - return verifyAddSubOp(*this); -} - -LogicalResult aievec::SubOp::verify() { - return verifyAddSubOp(*this); -} - -// Parse Add and Sub op. -ParseResult parseAddSubOp(OpAsmParser &parser, OperationState &result) { - llvm::SMLoc typesLoc; - SmallVector types; - OpAsmParser::UnresolvedOperand lhs, rhs; - - // Parse the lhs and rhs - if (parser.parseOperand(lhs) || parser.parseComma() || - parser.parseOperand(rhs)) - return failure(); - - // Parse all the attributes and types - if (parser.parseOptionalAttrDict(result.attributes) || - parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) - return failure(); - - // Assert that there are three types: lhs, rhs, and result - if (types.size() != 3) - return parser.emitError(typesLoc, "requires three types"); - - // Some verification - VectorType lhsType = llvm::dyn_cast(types[0]); - if (!lhsType) - return parser.emitError(typesLoc, "requires vector type"); - VectorType rhsType = llvm::dyn_cast(types[1]); - if (!rhsType) - return parser.emitError(typesLoc, "requires vector type"); - VectorType resultType = llvm::dyn_cast(types[2]); - if (!resultType) - return parser.emitError(typesLoc, "requires vector type"); - - // Populate the lhs, rhs, and accumulator in the result - if (parser.resolveOperand(lhs, lhsType, result.operands) || - parser.resolveOperand(rhs, rhsType, result.operands)) - return failure(); - - return parser.addTypeToList(resultType, 
result.types); -} - -ParseResult AddOp::parse(OpAsmParser &parser, OperationState &result) { - return parseAddSubOp(parser, result); -} - -ParseResult SubOp::parse(OpAsmParser &parser, OperationState &result) { - return parseAddSubOp(parser, result); -} - //===----------------------------------------------------------------------===// // ConcatOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/AIEVec/IR/CMakeLists.txt b/lib/Dialect/AIEVec/IR/CMakeLists.txt index dc10d46704..04e818a5ed 100644 --- a/lib/Dialect/AIEVec/IR/CMakeLists.txt +++ b/lib/Dialect/AIEVec/IR/CMakeLists.txt @@ -20,3 +20,5 @@ add_mlir_dialect_library(MLIRAIEVecDialect MLIRIR MLIRPass ) + + add_subdirectory(AIE1) diff --git a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp index 5d92118836..d96b3d1cf3 100644 --- a/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp +++ b/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp @@ -13,6 +13,7 @@ #include "FoldMulAddChainToConvOp.h" +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/AIEVecUtils.h" #include "aie/Dialect/AIEVec/Analysis/Passes.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" @@ -43,9 +44,9 @@ using namespace xilinx::aievec; namespace xilinx { namespace aievec { -SmallVector buildFMAOpSplatAttrForElemTy(aievec::FMAOp fmaOp, - int64_t bcastPos, - int64_t step = 1); +SmallVector +buildFMAOpSplatAttrForElemTy(aievec::aie1::FMAOp fmaOp, int64_t bcastPos, + int64_t step = 1); } // namespace aievec } // namespace xilinx @@ -76,8 +77,8 @@ static bool canFoldAIEShiftAndBroadcast(aievec::BroadcastOp op, template || - std::is_same_v>> + std::is_same_v || + std::is_same_v>> static bool isSingleColumnInt16VectorTimesScalarMac(AIEv1MACLikeOp fmaOp) { // lhs is a 32xi16 vector VectorType lhsVTy = cast(fmaOp.getLhs().getType()); @@ -111,11 +112,11 @@ static bool 
isSingleColumnInt16VectorTimesScalarMac(AIEv1MACLikeOp fmaOp) { return llvm::all_of(cstDense, [](const APInt &val) { return val == 0; }); } -static bool singleColumnFMAOpCanFold(aievec::FMAOp fmaOp) { +static bool singleColumnFMAOpCanFold(aievec::aie1::FMAOp fmaOp) { auto accProdOp = fmaOp.getAcc().getDefiningOp(); if (!accProdOp) return false; - auto accFmaOp = dyn_cast(accProdOp); + auto accFmaOp = dyn_cast(accProdOp); if (!accFmaOp) return false; if (!isSingleColumnInt16VectorTimesScalarMac(accFmaOp)) @@ -128,18 +129,18 @@ static bool singleColumnFMAOpCanFold(aievec::FMAOp fmaOp) { // Lowering patterns //===----------------------------------------------------------------------===// struct MergeSingleColumnI16FMAOpPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(aievec::FMAOp fmaOp, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::FMAOp fmaOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { if (!isSingleColumnInt16VectorTimesScalarMac(adaptor)) return failure(); auto accProdOp = adaptor.getAcc().getDefiningOp(); if (!accProdOp) return failure(); - auto accFmaOp = dyn_cast(accProdOp); + auto accFmaOp = dyn_cast(accProdOp); if (!accFmaOp) return failure(); if (!isSingleColumnInt16VectorTimesScalarMac(accFmaOp)) @@ -163,7 +164,7 @@ struct MergeSingleColumnI16FMAOpPattern fmaOp.getLoc(), adaptor.getLhs().getType(), SmallVector({lowV, hiV})); auto newFmaOpAttr = buildFMAOpSplatAttrForElemTy(fmaOp, start, step); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( fmaOp, TypeRange({fmaOp.getResult().getType()}), ValueRange({newConcatOp, adaptor.getRhs(), accFmaOp.getAcc()}), newFmaOpAttr); @@ -214,12 +215,14 @@ static void populateAIEVecV2TransformationPatterns(RewritePatternSet &patterns, static void configureAIEVecV1TransformationLegalizations(ConversionTarget &target, 
TargetBackend backend) { - target.addLegalDialect(); - target.addDynamicallyLegalOp([](aievec::FMAOp fmaOp) { - if (isSingleColumnInt16VectorTimesScalarMac(fmaOp)) - return !singleColumnFMAOpCanFold(fmaOp); - return true; - }); + target.addLegalDialect(); + target.addDynamicallyLegalOp( + [](aievec::aie1::FMAOp fmaOp) { + if (isSingleColumnInt16VectorTimesScalarMac(fmaOp)) + return !singleColumnFMAOpCanFold(fmaOp); + return true; + }); } static void @@ -259,7 +262,8 @@ struct AIEVecTransformationPass void getDependentDialects(DialectRegistry ®istry) const override { // TODO: Review list of dependent dialects. registry.insert(); } @@ -355,7 +359,8 @@ struct AIEVecConvOpTransformationPass void getDependentDialects(DialectRegistry ®istry) const override { // TODO: Review list of dependent dialects. registry.insert(); } diff --git a/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp b/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp index 07c51cf5cb..4b835811d1 100644 --- a/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp +++ b/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp @@ -12,6 +12,7 @@ // AIE vector abstraction. //===----------------------------------------------------------------------===// +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/AIEVecUtils.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" #include "aie/Dialect/AIEVec/Transforms/IntervalReuse.h" @@ -361,10 +362,10 @@ static bool writesToAccumulator(Operation *op) { // Integer muls and FMAs write to accumulator if (!isAIEOp(op)) return false; - if (auto mulOp = dyn_cast(op)) + if (auto mulOp = dyn_cast(op)) return isa( cast(mulOp.getResult().getType()).getElementType()); - if (auto fmaOp = dyn_cast(op)) + if (auto fmaOp = dyn_cast(op)) return isa( cast(fmaOp.getResult().getType()).getElementType()); @@ -682,13 +683,13 @@ static aievec::PackOp generatePackOp(Value source, VectState *state, } // Generate and return an Add op. 
-static aievec::AddOp generateAddOp(Operation *Op, AIEOpAttributes &opAttr, - VectState *state) { +static aievec::aie1::AddOp generateAddOp(Operation *Op, AIEOpAttributes &opAttr, + VectState *state) { // Assert that we computed the attributes for both the operands assert(opAttr.start.size() == opAttr.offset.size() && opAttr.start.size() == 2); - auto addOp = state->builder.create( + auto addOp = state->builder.create( Op->getLoc(), Op->getResult(0).getType(), Op->getOperand(0), Op->getOperand(1), opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], @@ -697,13 +698,13 @@ static aievec::AddOp generateAddOp(Operation *Op, AIEOpAttributes &opAttr, } // Generate and return a Sub op. -static aievec::SubOp generateSubOp(Operation *Op, AIEOpAttributes &opAttr, - VectState *state) { +static aievec::aie1::SubOp generateSubOp(Operation *Op, AIEOpAttributes &opAttr, + VectState *state) { // Assert that we computed the attributes for both the operands assert(opAttr.start.size() == opAttr.offset.size() && opAttr.start.size() == 2); - auto subOp = state->builder.create( + auto subOp = state->builder.create( Op->getLoc(), Op->getResult(0).getType(), Op->getOperand(0), Op->getOperand(1), opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], @@ -906,7 +907,7 @@ static Operation *generateFMAOp(vector::FMAOp fmaOp, AIEOpAttributes &opAttr, } } // Create AIE dialect fma/msc op - xfmaOp = state->builder.create( + xfmaOp = state->builder.create( fmaOp->getLoc(), lhs, rhs, acc, opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1], opAttr.square[1], @@ -950,7 +951,7 @@ static Operation *generateMulOp(T mulOp, AIEOpAttributes &opAttr, } // Create AIE dialect mul op - Operation *xmulOp = state->builder.create( + Operation *xmulOp = 
state->builder.create( mulOp->getLoc(), lhs, rhs, opType, opAttr.start[0], opAttr.offset[0], opAttr.offset_hi[0], opAttr.step[0], opAttr.square[0], opAttr.start[1], opAttr.offset[1], opAttr.offset_hi[1], opAttr.step[1], opAttr.square[1]); @@ -1452,13 +1453,13 @@ static void generateMulOrFMAOp(Operation *Op, Scheme &scheme, auto genOp = [&](Operation *Op, AIEOpAttributes &opAttr, VectState *state, bool i8xi8_pairedOp = false) { Operation *repOp; - // Create aievec::FMAOp corresponding to the vector::FMAOp + // Create aievec::aie1::FMAOp corresponding to the vector::FMAOp if (auto fmaOp = dyn_cast(Op)) repOp = generateFMAOp(fmaOp, opAttr, state, i8xi8_pairedOp); - // Create aievec::MulOp corresponding to the vector::MulIOp + // Create aievec::aie1::MulOp corresponding to the vector::MulIOp else if (auto mulOp = dyn_cast(Op)) repOp = generateMulOp(mulOp, opAttr, state); - // Create aievec::MulOp corresponding to the vector::MulFOp + // Create aievec::aie1::MulOp corresponding to the vector::MulFOp else if (auto mulOp = dyn_cast(Op)) repOp = generateMulOp(mulOp, opAttr, state); else @@ -2059,26 +2060,27 @@ static bool matchAttributesAndDistanceForFusion(T1 curOp, T2 defOp) { // the acc of fma is a mul/fma operation which uses the same operands as fma. // the def of two operands are upd operations. 
// Transform - -// %5 = aievec.mul %4, %0 {xoffsets = "[[Xo:.*]]", xoffsets_hi = "[[Xh:.*]]", -// xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]", zoffsets_hi = +// %5 = aievec_aie1.mul %4, %0 {xoffsets = "[[Xo:.*]]", xoffsets_hi = +// "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]", +// zoffsets_hi = // "[[Zh:.*]]", zstart = "0", zstep = "[[Zs:.*]]"} // -// %6 = aievec.mac %4, %0, %5 {xoffsets = "[[Xo:.*]]", +// %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "[[Xo:.*]]", // xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2", zoffsets = // "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "2", zstep = "[[Zs:.*]]"} // // to- // -// %7 = aievec.mul_conv %6, %1 {M = 16 : si32, N = 4 : si32} +// %7 = aievec_aie1.mul_conv %6, %1 {M = 16 : si32, N = 4 : si32} // // or transform the pattern like this- // -// %9 = aievec.mac %8, %0, %6 {xoffsets = "[[Xo:.*]]", xoffsets_hi = +// %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "[[Xo:.*]]", xoffsets_hi = // "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "0", zoffsets = "[[Zo:.*]]", // zoffsets_hi = // "[[Zh:.*]]", zstart = "4", zstep = "[[Zs:.*]]"} // -// %10 = aievec.mac %8, %0, %9 {xoffsets = +// %10 = aievec_aie1.mac %8, %0, %9 {xoffsets = // "[[Xo:.*]]", xoffsets_hi = "[[Xh:.*]]", xsquare = "[[Sq:.*]]", xstart = "2", // zoffsets = "[[Zo:.*]]", zoffsets_hi = "[[Zh:.*]]", zstart = "6", zstep = // "[[Zs:.*]]"} @@ -2091,8 +2093,8 @@ static bool matchAttributesAndDistanceForFusion(T1 curOp, T2 defOp) { // int16 type of AIE-ML architecture. static bool canFuseMulFMAOpsForInt16(Operation *Op) { // Check 1. This should be an aievec fma operation - assert(isa(Op) && "operation must be an aievec fma op"); - auto curOp = cast(Op); + assert(isa(Op) && "operation must be an aievec fma op"); + auto curOp = cast(Op); // Check 2. Element type should be int16 auto vType = cast(Op->getOperand(1).getType()); @@ -2108,7 +2110,7 @@ static bool canFuseMulFMAOpsForInt16(Operation *Op) { // Check 3. 
acc operand of the Op should be a mul op or fma op Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp(); - if (!isa(mulOrFMAOp)) + if (!isa(mulOrFMAOp)) return false; // Check 4. mulOrFMAOp must have one use @@ -2127,14 +2129,14 @@ static bool canFuseMulFMAOpsForInt16(Operation *Op) { // If the acc operand is a mul op, we will try to generate mul_conv operation // If the acc operand is a fma op, we will try to generate fma_conv operation - if (auto mulOp = dyn_cast(mulOrFMAOp)) { + if (auto mulOp = dyn_cast(mulOrFMAOp)) { isMulOp = true; // Determine the lhs and rhs values for the mul_conv lhs = mulOp->getOperand(0); rhs = mulOp->getOperand(1); } else { - auto fmaOp = cast(mulOrFMAOp); + auto fmaOp = cast(mulOrFMAOp); // Determine the lhs, rhs and acc values for the fma_conv lhs = fmaOp->getOperand(0); @@ -2161,14 +2163,14 @@ static bool canFuseMulFMAOpsForInt16(Operation *Op) { // Check 8. xstart and zstart distance between two operations should be // 2. offsets, offsets_hi, square and step of two operations should be same. return (isMulOp && matchAttributesAndDistanceForFusion( - curOp, cast(mulOrFMAOp))) || - matchAttributesAndDistanceForFusion(curOp, - cast(mulOrFMAOp)); + curOp, cast(mulOrFMAOp))) || + matchAttributesAndDistanceForFusion( + curOp, cast(mulOrFMAOp)); } // Rewrite a mul/fma and fma op as a aievec MUL_conv or FMA_Conv op static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) { - auto curOp = cast(Op); + auto curOp = cast(Op); Value lhs = curOp->getOperand(0); @@ -2202,15 +2204,15 @@ static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) { // Get the def op of acc. It is either a mul op or a fma op. 
Operation *convOp = nullptr; Operation *mulOrFMAOp = Op->getOperand(2).getDefiningOp(); - auto mulOp = dyn_cast(mulOrFMAOp); - auto fmaOp = dyn_cast(mulOrFMAOp); + auto mulOp = dyn_cast(mulOrFMAOp); + auto fmaOp = dyn_cast(mulOrFMAOp); int32_t zStart; if (mulOp) { - aievec::MulOp defOp = mulOp; + aievec::aie1::MulOp defOp = mulOp; zStart = stoi(static_cast(defOp.getStart(1))); } else { - aievec::FMAOp defOp = fmaOp; + aievec::aie1::FMAOp defOp = fmaOp; zStart = stoi(static_cast(defOp.getStart(1))); } @@ -2256,7 +2258,7 @@ static void fuseMulFMAOpsForInt16(Operation *Op, VectState *state) { static void fuseMulFMAOpsByMulFMAConv(func::FuncOp func, VectState *state) { func.walk([&](Operation *Op) { - if (isa(Op) && canFuseMulFMAOpsForInt16(Op)) + if (isa(Op) && canFuseMulFMAOpsForInt16(Op)) fuseMulFMAOpsForInt16(Op, state); }); } diff --git a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp index 1920b258ea..438a0350ca 100644 --- a/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp +++ b/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp @@ -12,6 +12,7 @@ // to ops that can be translated to a sequence of valid AIEVec ops. 
//===----------------------------------------------------------------------===// +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/AIEVecUtils.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" #include "aie/Dialect/AIEVec/Pipelines/Passes.h" @@ -29,7 +30,6 @@ #include "mlir/Transforms/Passes.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" - #include "llvm/Support/raw_ostream.h" #include #include @@ -111,16 +111,16 @@ extractMACOperandsFromAddOperands(Value addLhs, Value addRhs) { if (mulOp) return std::make_tuple(mulOp.getLhs(), mulOp.getRhs(), acc); - // If the MulIOp has been already translated to aievec::MulOp: + // If the MulIOp has been already translated to aievec::aie1::MulOp: auto lhsSrsOp = addLhs.getDefiningOp(); auto rhsSrsOp = addRhs.getDefiningOp(); - aievec::MulOp aieMulOp = nullptr; + aievec::aie1::MulOp aieMulOp = nullptr; if (lhsSrsOp) { - aieMulOp = lhsSrsOp.getSource().getDefiningOp(); + aieMulOp = lhsSrsOp.getSource().getDefiningOp(); acc = addRhs; } if (!aieMulOp && rhsSrsOp) { - aieMulOp = rhsSrsOp.getSource().getDefiningOp(); + aieMulOp = rhsSrsOp.getSource().getDefiningOp(); acc = addLhs; } if (aieMulOp) @@ -275,9 +275,9 @@ buildAttributeListForRotationSelectOp(PatternRewriter &rewriter, VectorType vTy, namespace xilinx::aievec { -SmallVector buildFMAOpSplatAttrForElemTy(aievec::FMAOp fmaOp, - int64_t bcastPos, - int64_t step = 1) { +SmallVector +buildFMAOpSplatAttrForElemTy(aievec::aie1::FMAOp fmaOp, int64_t bcastPos, + int64_t step = 1) { unsigned width = 0; auto elemTy = fmaOp.getLhs().getType().getElementType(); if (auto intTy = dyn_cast(elemTy)) @@ -628,6 +628,70 @@ struct ConvertMulAddToAIEVecFMAElemOpPattern unsigned shiftParam; }; +// Convert `vector.fma` to `aievec.mac_elem`. Only `vector<16xf32>` and +// `vector<16xbf16>` operand types are supported. 
In the case of vectors with +// `f32` elemental type, this pattern will try to match `bf16` to `f32` +// widening ops in the `lhs` and `rhs` operands, or fail otherwise. +// TODO: When sign extensions are not found, a conversion from `f32` to `bf16` +// TODO: can be inserted to emulate `f32` fma with `bf16` logic. +struct ConvertVectorFMAOpToAIEVecFMAElemOpPattern + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + ConvertVectorFMAOpToAIEVecFMAElemOpPattern(MLIRContext *context, + unsigned shiftParam = 0) + : OpConversionPattern(context), shiftParam(shiftParam) {} + + LogicalResult + matchAndRewrite(vector::FMAOp fmaOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Verify the vector type is supported by AIE2 + auto resVecTy = cast(fmaOp.getType()); + auto resElemTy = resVecTy.getElementType(); + unsigned numElems = getVectorLaneSize(resVecTy); + + if (numElems != 16 || (!resElemTy.isF32() && !resElemTy.isBF16())) + return rewriter.notifyMatchFailure( + fmaOp, "Unsupported operand types in vector.fma lowering."); + + Value lhs = adaptor.getLhs(); + Value rhs = adaptor.getRhs(); + Value acc = adaptor.getAcc(); + if (resElemTy.isBF16()) + acc = rewriter.create( + fmaOp.getLoc(), VectorType::get({16}, rewriter.getF32Type()), acc, + shiftParam); + else { + lhs = getSourceOfWideningOp(lhs).value_or(nullptr); + rhs = getSourceOfWideningOp(rhs).value_or(nullptr); + if (!lhs || !rhs) + return rewriter.notifyMatchFailure( + fmaOp, "vector.fma operands are f32, and they don't come from " + "arith.extf on bf16; can't lower to aievec."); + if (!cast(lhs.getType()).getElementType().isBF16() || + !cast(rhs.getType()).getElementType().isBF16()) + return rewriter.notifyMatchFailure( + fmaOp, "vector.fma operands come from arith.extf, but the source " + "of the widening op is not bf16; can't lower to aievec."); + } + Value newOp = rewriter.create( + fmaOp.getLoc(), acc.getType(), lhs, rhs, acc, /*fmsub=*/false); + + if 
(resElemTy.isBF16()) { + auto shiftParamOp = rewriter.create( + fmaOp.getLoc(), rewriter.getI32IntegerAttr(shiftParam)); + newOp = rewriter.create(fmaOp.getLoc(), resVecTy, newOp, + shiftParamOp); + } + + rewriter.replaceOp(fmaOp, newOp); + + return success(); + } + + unsigned shiftParam; +}; + // This pattern replaces `arith.mulf` on vectors with // `aievec.mul_elem`. This pattern works for AIE2. struct ConvertMulFToAIEVecMulElemOpPattern @@ -838,13 +902,13 @@ struct ConvertMulIToAIEVecMulElemOpPattern unsigned shiftParam; }; -// This pattern folds an extract + broadcast feeding into an `aievec::FMAOp` -// into the op, using the shuffle attributes. -struct FoldBroadcastToFMAOp : OpConversionPattern { +// This pattern folds an extract + broadcast feeding into an +// `aievec::aie1::FMAOp` into the op, using the shuffle attributes. +struct FoldBroadcastToFMAOp : OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(aievec::FMAOp fmaOp, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::FMAOp fmaOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto concatOp = dyn_cast(adaptor.getLhs().getDefiningOp()); @@ -879,7 +943,7 @@ struct FoldBroadcastToFMAOp : OpConversionPattern { auto pos = extOp.getStaticPosition(); int64_t zstart = pos[0]; auto fmaOpAttr = buildFMAOpSplatAttrForElemTy(fmaOp, zstart); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( fmaOp, TypeRange({fmaOp.getResult().getType()}), ValueRange({lhsX2, rhs, adaptor.getAcc()}), fmaOpAttr); @@ -887,11 +951,12 @@ struct FoldBroadcastToFMAOp : OpConversionPattern { } }; -struct ConvertMulAddToAIEVecFMAOpPattern : OpConversionPattern { +struct ConvertMulAddToAIEVecFMAOpPattern + : OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(aievec::AddOp addOp, OpAdaptor adaptor, + matchAndRewrite(aievec::aie1::AddOp addOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const 
override { auto vecType = cast(addOp.getType()); @@ -913,7 +978,7 @@ struct ConvertMulAddToAIEVecFMAOpPattern : OpConversionPattern { SmallVector(2, lhs)) .getResult(); auto upsOp = rewriter.create(addOp.getLoc(), accType, acc); - auto fmaOp = rewriter.create( + auto fmaOp = rewriter.create( addOp.getLoc(), accType, lhsX2, rhs, upsOp.getResult(), /*xstart=*/"", /*xoffsets=*/"", /*xoffsets_hi=*/"", /*xstep=*/"", /*xsquare=*/"", /*zstart=*/"", /*zoffsets=*/"", /*zoffsets_hi=*/"", @@ -1031,7 +1096,7 @@ struct LowerVectorAddIOpToAIEVecAddOp : OpConversionPattern { (rhsDefOp && isa(rhsDefOp))) return failure(); - rewriter.replaceOpWithNewOp( + rewriter.replaceOpWithNewOp( addOp, resType, lhs, rhs, /*xstart=*/"", /*xoffsets=*/"", /*xoffsets_hi=*/"", /*xsquare=*/"", /*zstart=*/"", /*zoffsets=*/"", /*zoffsets_hi=*/"", /*zsquare=*/""); @@ -1040,13 +1105,13 @@ struct LowerVectorAddIOpToAIEVecAddOp : OpConversionPattern { }; using LowerVectorAddFOpToAIEVecAddOp = - OneToOneVectorOpToAIEVecOpPattern; + OneToOneVectorOpToAIEVecOpPattern; using LowerVectorMulFOpToAIEVecMulOp = - OneToOneVectorOpToAIEVecOpPattern; + OneToOneVectorOpToAIEVecOpPattern; using LowerVectorSubIOpToAIEVecSubOp = - OneToOneVectorOpToAIEVecOpPattern; + OneToOneVectorOpToAIEVecOpPattern; using LowerVectorSubFOpToAIEVecSubOp = - OneToOneVectorOpToAIEVecOpPattern; + OneToOneVectorOpToAIEVecOpPattern; struct LowerVectorMulIOpToAIEVecMulOp : OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -1057,7 +1122,7 @@ struct LowerVectorMulIOpToAIEVecMulOp : OpConversionPattern { if (!resTy) return failure(); auto accTy = getVectorOpDestType(resTy, /*AIE2 =*/false); - auto newMulOp = rewriter.create( + auto newMulOp = rewriter.create( mulOp.getLoc(), accTy, adaptor.getLhs(), adaptor.getRhs()); auto shiftParamOp = rewriter.create( mulOp.getLoc(), rewriter.getI32IntegerAttr(0)); @@ -2994,6 +3059,7 @@ static void populateAIEVecV2ConversionPatterns(RewritePatternSet &patterns, 
FoldVectorExtractAndBroadcastToAIEBroadcast, ConvertBroadcastToAIEBroadcast, ConvertMulAddToAIEVecFMAElemOpPattern, + ConvertVectorFMAOpToAIEVecFMAElemOpPattern, LowerVectorExtractStridedSliceOpAIE2Pattern, LowerVectorTransposeOpToAIEVecShuffleOpPattern >(patterns.getContext()); @@ -3065,7 +3131,8 @@ static bool isInSigmoidOperationChain(math::ExpOp expOp) { static void configureAIEVecCommonLegalizations(ConversionTarget &target, TargetBackend backend) { - target.addLegalDialect(); if (backend == TargetBackend::CPP) { target.addIllegalOp(); @@ -3423,36 +3490,39 @@ static void configureAIEVecV1Legalizations(ConversionTarget &target, [](arith::MulIOp op) { return !isa(op.getType()); }); target.addDynamicallyLegalOp( [](arith::MulFOp op) { return !isa(op.getType()); }); - target.addDynamicallyLegalOp([](xilinx::aievec::FMAOp op) { - auto lhsDefOp = op.getLhs().getDefiningOp(); - aievec::ConcatOp concatOp = nullptr; - if (lhsDefOp) - concatOp = dyn_cast(op.getLhs().getDefiningOp()); - if (!concatOp) - return true; + target.addDynamicallyLegalOp( + [](xilinx::aievec::aie1::FMAOp op) { + auto lhsDefOp = op.getLhs().getDefiningOp(); + aievec::ConcatOp concatOp = nullptr; + if (lhsDefOp) + concatOp = dyn_cast(op.getLhs().getDefiningOp()); + if (!concatOp) + return true; - vector::BroadcastOp srcBcast = nullptr; - if (auto lhsOp = concatOp.getSources()[0].getDefiningOp()) - srcBcast = dyn_cast(lhsOp); - if (!srcBcast) { - auto rhsOp = op.getRhs().getDefiningOp(); - if (!rhsOp) - return true; - srcBcast = dyn_cast(rhsOp); - } + vector::BroadcastOp srcBcast = nullptr; + if (auto lhsOp = concatOp.getSources()[0].getDefiningOp()) + srcBcast = dyn_cast(lhsOp); + if (!srcBcast) { + auto rhsOp = op.getRhs().getDefiningOp(); + if (!rhsOp) + return true; + srcBcast = dyn_cast(rhsOp); + } - if (srcBcast) - if (auto srcOp = srcBcast.getSource().getDefiningOp()) - return !isa(srcOp); + if (srcBcast) + if (auto srcOp = srcBcast.getSource().getDefiningOp()) + return !isa(srcOp); - return 
true; - }); + return true; + }); - target.addDynamicallyLegalOp([](aievec::AddOp op) { + target.addDynamicallyLegalOp([](aievec::aie1::AddOp op) { auto lSrsOp = op.getLhs().getDefiningOp(); auto rSrsOp = op.getRhs().getDefiningOp(); - return (!lSrsOp || !lSrsOp.getSource().getDefiningOp()) && - (!rSrsOp || !rSrsOp.getSource().getDefiningOp()); + return (!lSrsOp || + !lSrsOp.getSource().getDefiningOp()) && + (!rSrsOp || + !rSrsOp.getSource().getDefiningOp()); }); target.addLegalDialect(); } @@ -3675,7 +3745,8 @@ static void configureAIEVecV2Legalizations(ConversionTarget &target, return false; }); - target.addIllegalOp(); + target.addIllegalOp(); } //===----------------------------------------------------------------------===// @@ -3703,9 +3774,11 @@ struct LowerVectorToAIEVec : PassWrapper> { return "Lower vector operations to AIE vector intrinsics"; } void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry + .insert(); } Option aieTarget{ diff --git a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp index a9a3bdbd10..ee02022bb8 100644 --- a/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp +++ b/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp @@ -412,8 +412,8 @@ struct FlattenMultDimTransferReadPattern auto inBoundsArrayAttrOpt = adaptor.getInBounds(); if (inBoundsArrayAttrOpt) { - SmallVector inBounds = llvm::to_vector( - inBoundsArrayAttrOpt.value().getAsValueRange()); + SmallVector inBounds = + llvm::to_vector(inBoundsArrayAttrOpt.getAsValueRange()); SmallVector newInBounds({false}); newInBounds[0] = std::all_of(inBounds.begin(), inBounds.end(), [](bool v) { return v; }); @@ -472,8 +472,8 @@ struct FlattenMultDimTransferWritePattern auto inBoundsArrayAttrOpt = adaptor.getInBounds(); if (inBoundsArrayAttrOpt) { - SmallVector inBounds = llvm::to_vector( - inBoundsArrayAttrOpt.value().getAsValueRange()); + SmallVector inBounds = 
+ llvm::to_vector(inBoundsArrayAttrOpt.getAsValueRange()); SmallVector newInBounds({false}); newInBounds[0] = std::all_of(inBounds.begin(), inBounds.end(), [](bool v) { return v; }); diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 75625e1e15..dab70eae74 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -1,4 +1,4 @@ -//===- AIEDialect.cpp -------------------------------------------*- C++ -*-===// +//===- AIEXDialect.cpp ------------------------------------------*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -193,7 +193,9 @@ LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { if (strides[1] && sizes[0] > (1 << wrap_bits) - 1) return emitOpError("Size 0 exceeds the [0:" + std::to_string((1 << wrap_bits) - 1) + "] range."); - if (strides[3] > (1 << step_bits)) + // strides[3] exceeding the range is ok iff the sizes[3] is one, which is + // checked below + if (strides[3] > (1 << step_bits) && sizes[3] != 1) return emitOpError("Stride 3 exceeds the [1:" + std::to_string(1 << step_bits) + "] range."); if (strides[2] > (1 << step_bits)) @@ -274,4 +276,84 @@ LogicalResult AIEX::NpuWriteBdOp::verify() { if (getIterationStride() > 0xFFFFF) return emitOpError("Iteration Stride exceeds the [0:1M-1] range."); return success(); -} \ No newline at end of file +} + +//===----------------------------------------------------------------------===// +// RuntimeSequenceOp +//===----------------------------------------------------------------------===// + +ParseResult AIEX::RuntimeSequenceOp::parse(OpAsmParser &parser, + OperationState &result) { + + StringAttr nameAttr; + (void)parser.parseOptionalSymbolName( + nameAttr, mlir::SymbolTable::getSymbolAttrName(), result.attributes); + + SmallVector entryArgs; + + // Entry arguments, e.g. 
(%addr: memref<1xi32>) + ParseResult argParseResult = parser.parseCommaSeparatedList( + OpAsmParser::Delimiter::Paren, [&]() -> ParseResult { + OpAsmParser::Argument argument; + if (parser.parseArgument(argument, true, true)) { + return failure(); + } + entryArgs.push_back(argument); + return success(); + }); + if (argParseResult) { + return argParseResult; + } + + // Body + auto *body = result.addRegion(); + ParseResult bodyParseResult = parser.parseRegion(*body, entryArgs, false); + if (bodyParseResult) { + return bodyParseResult; + } + + return success(); +} + +void AIEX::RuntimeSequenceOp::print(OpAsmPrinter &printer) { + Region &body = getRegion(); + + auto nameAttr = (*this)->getAttrOfType( + mlir::SymbolTable::getSymbolAttrName()); + if (nameAttr) { + printer << ' '; + printer.printSymbolName(nameAttr); + } + + printer << '('; + for (unsigned i = 0, n = body.getNumArguments(); i < n; i++) { + if (i > 0) { + printer << ", "; + } + printer.printRegionArgument(body.getArgument(i)); + } + printer << ')'; + + printer << ' '; + printer.printRegion(body, false, true); +} + +LogicalResult AIEX::RuntimeSequenceOp::verify() { + AIE::DeviceOp device = (*this)->getParentOfType(); + if (!device) { + // this check is redundant with the HasParent trait, but can't hurt + (*this)->emitOpError() << "must be inside AIE device operation."; + return failure(); + } + auto seq_ops = device.getOps(); + if (std::distance(seq_ops.begin(), seq_ops.end()) > 1) { + auto err = device.emitOpError() + << "Cannot have more than one runtime sequence per device."; + for (auto it = seq_ops.begin(); it != seq_ops.end(); ++it) { + AIEX::RuntimeSequenceOp seq_op = *it; + err.attachNote(seq_op.getLoc()) << "Sequence operation definition here."; + } + return failure(); + } + return success(); +} diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 5a27bf446b..6862d639cf 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ 
b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -12,7 +12,6 @@ #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/DenseMap.h" @@ -70,10 +69,10 @@ struct ShimDMAllocationGetter { }; } // namespace -struct RtpToNpuPattern : OpConversionPattern { +struct RtpToWrite32Pattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; - RtpToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + RtpToWrite32Pattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult @@ -118,12 +117,12 @@ struct RtpToNpuPattern : OpConversionPattern { } }; -struct PushToNpuPattern : OpConversionPattern { +struct PushQueuetoWrite32Pattern : OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; - PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + PushQueuetoWrite32Pattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult @@ -227,7 +226,11 @@ struct DmaToNpuPattern : OpConversionPattern { column = IntegerAttr::get(i32ty, col); // arg_idx - Block &entryBB = op->getParentOfType().getBody().front(); + AIEX::RuntimeSequenceOp seq_op = + op->getParentOfType(); + assert(seq_op && "NpuDmaMemcpyNdOp must be inside a RuntimeSequenceOp; " + "verify() should have ensured this."); + Block &entryBB = seq_op.getBody().front(); int arg_idx = -1; for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) { if (entryBB.getArgument(i) == memref) { @@ -282,7 +285,8 @@ struct DmaToNpuPattern : OpConversionPattern { // iteration_current // iteration_size - if (strides[3]) + // strides[3] doesn't need to lower to hardware if sizes[3] is one + if (strides[3] && sizes[3] != 1) iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1); // iteration_stride @@ -342,7 
+346,7 @@ struct DmaToNpuPattern : OpConversionPattern { /// Convert NpuDmaWaitOp into NpuSyncOp by retrieving the necessary /// information from the ShimDMAAllocationOp referenced through the /// symbol argument of this op. -struct DmaWaitToNpuPattern : OpConversionPattern { +struct DmaWaitToSyncPattern : OpConversionPattern { private: ShimDMAllocationGetter &allocGetter; @@ -350,8 +354,8 @@ struct DmaWaitToNpuPattern : OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; - DmaWaitToNpuPattern(MLIRContext *context, ShimDMAllocationGetter &getter, - PatternBenefit benefit = 1) + DmaWaitToSyncPattern(MLIRContext *context, ShimDMAllocationGetter &getter, + PatternBenefit benefit = 1) : OpConversionPattern(context, benefit), allocGetter(getter) {} LogicalResult @@ -373,11 +377,104 @@ struct DmaWaitToNpuPattern : OpConversionPattern { op, shimDmaAllocOp->getCol(), /* row */ 0, static_cast(shimDmaAllocOp->getChannelDir()), shimDmaAllocOp->getChannelIndex(), 1, 1); + + return success(); + } +}; + +struct WriteBdToBlockWritePattern : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + WriteBdToBlockWritePattern(MLIRContext *context, PatternBenefit benefit = 1) + : OpConversionPattern(context, benefit) {} + + LogicalResult + matchAndRewrite(NpuWriteBdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + + AIE::DeviceOp dev = op->getParentOfType(); + const AIE::AIETargetModel &tm = dev.getTargetModel(); + + auto bd_id = op.getBdId(); + uint32_t bd_addr = (op.getColumn() << tm.getColumnShift()) | + (op.getRow() << tm.getRowShift()) | + (0x1D000 + bd_id * 0x20); + + std::vector words(8, 0); + + // DMA_BDX_0 + words[0] = op.getBufferLength(); + + // DMA_BDX_1 + words[1] = op.getBufferOffset(); + + // DMA_BDX_2 + // En Packet , OoO BD ID , Packet ID , Packet Type + words[2] |= (op.getEnablePacket() & 0x1) << 30; + words[2] |= (op.getOutOfOrderId() & 0x3f) << 24; + words[2] |= (op.getPacketId() & 
0x1f) << 19; + words[2] |= (op.getPacketType() & 0x7) << 16; + + // DMA_BDX_3 + // TODO: Secure Access + words[3] |= (op.getD0Size() & 0x3ff) << 20; + words[3] |= op.getD0Stride() & 0xfffff; + + // DMA_BDX_4 + words[4] = 0x80000000; // burst length; + words[4] |= (op.getD1Size() & 0x3ff) << 20; + words[4] |= op.getD1Stride() & 0xfffff; + + // DMA_BDX_5 + // TODO: SIMID, AxCache, AXQoS + words[5] = op.getD2Stride() & 0xfffff; + + // DMA_BDX_6 + words[6] |= (op.getIterationCurrent() & 0x3f) << 26; + words[6] |= (op.getIterationSize() & 0x3f) << 20; + words[6] |= op.getIterationStride() & 0xfffff; + + // DMA_BDX_7 + // TODO: TLAST Suppress + words[7] |= (op.getNextBd() & 0xf) << 27; + words[7] |= (op.getUseNextBd() & 0x1) << 26; + words[7] |= (op.getValidBd() & 0x1) << 25; + words[7] |= (op.getLockRelVal() & 0xef) << 18; + words[7] |= (op.getLockRelId() & 0xf) << 13; + words[7] |= (op.getLockAcqEnable() & 0x1) << 12; + words[7] |= (op.getLockAcqVal() & 0xef) << 5; + words[7] |= op.getLockAcqId() & 0xf; + + MemRefType memrefType = MemRefType::get({8}, rewriter.getI32Type()); + TensorType tensorType = RankedTensorType::get({8}, rewriter.getI32Type()); + memref::GlobalOp global = nullptr; + { + OpBuilder::InsertionGuard guard(rewriter); + std::string name = "blockwrite_data_"; + rewriter.setInsertionPoint( + op->getParentOfType()); + int id = 0; + while (dev.lookupSymbol(name + std::to_string(id))) + id++; + name += std::to_string(id); + global = rewriter.create( + op->getLoc(), name, rewriter.getStringAttr("private"), memrefType, + DenseElementsAttr::get(tensorType, words), true, nullptr); + } + auto memref = rewriter.create(op->getLoc(), memrefType, + global.getName()); + (void)rewriter.replaceOpWithNewOp( + op, memref.getResult(), rewriter.getUI32IntegerAttr(bd_addr)); return success(); } }; struct AIEDmaToNpuPass : AIEDmaToNpuBase { + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override { 
ShimDMAllocationGetter cachingGetter; @@ -386,18 +483,22 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase { ConversionTarget target(getContext()); target.addLegalDialect(); + target.addLegalDialect(); target.addLegalOp(); target.addLegalOp(); - target.addIllegalOp(); + target.addIllegalOp(); target.addIllegalOp(); target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); RewritePatternSet patterns(&getContext()); patterns.insert(&getContext(), cachingGetter); - patterns.insert(&getContext(), cachingGetter); - patterns.insert(&getContext()); - patterns.insert(&getContext()); + patterns.insert(&getContext(), cachingGetter); + patterns.insert(&getContext()); + patterns.insert(&getContext()); + patterns.insert(&getContext()); if (failed(applyPartialConversion(device, target, std::move(patterns)))) signalPassFailure(); diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp index 79eb4671d8..4c1f46c9f2 100644 --- a/lib/Targets/AIETargetCDODirect.cpp +++ b/lib/Targets/AIETargetCDODirect.cpp @@ -324,6 +324,38 @@ LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd, lenInBytes); } + // ND zero padding. + std::optional> padDims = + bdOp.getPadDimensions(); + if (padDims) { + XAie_DmaPadTensor dmaPadTensor = {}; + dmaPadTensor.NumDim = padDims->size(); + dmaPadTensor.PadDesc = static_cast( + calloc(dmaPadTensor.NumDim, sizeof(XAie_PadDesc))); + if (!dmaPadTensor.PadDesc) + return bdOp.emitError("couldn't allocate array of XAie_PadDesc"); + // libxaie requires stride in multiples of 32b + double elementWidthIn32bWords = + static_cast(bdOp.getBufferElementTypeWidthInBytes()) / 4.0; + for (size_t i = 0; i < padDims->size(); i++) { + // Pass down dimensions in reverse order. 
+ int j = padDims->size() - i - 1; + uint8_t before; + uint8_t after; + if (j > 0) { + before = static_cast(padDims.value()[i].getConstPadBefore()); + after = static_cast(padDims.value()[i].getConstPadAfter()); + } else { + before = static_cast(padDims.value()[i].getConstPadBefore() * + elementWidthIn32bWords); + after = static_cast(padDims.value()[i].getConstPadAfter() * + elementWidthIn32bWords); + } + dmaPadTensor.PadDesc[j] = {before, after}; + } + TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetPadding, &dmaTileBd, + &dmaPadTensor); + } if (nextBdId) { auto enableNextBd = 1; TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetNextBd, &dmaTileBd, diff --git a/lib/Targets/AIETargetHSA.cpp b/lib/Targets/AIETargetHSA.cpp index 4711e11197..73de6bbd03 100644 --- a/lib/Targets/AIETargetHSA.cpp +++ b/lib/Targets/AIETargetHSA.cpp @@ -14,7 +14,7 @@ #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Targets/AIETargets.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // Eddie added to get the NPU func ops +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" @@ -68,23 +68,15 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { // Putting the standard header output << hsa_cpp_file_header; - // Getting the func op which has the data movement - if (targetOp.getOps().empty()) { - return success(); - } - // Getting the sequence function op which contains the instructions - mlir::func::FuncOp funcOp = NULL; - for (auto op : targetOp.getOps()) { - if (op.getName().str().compare("sequence") == 0) { - funcOp = op; - } - } - - // If no funcOp then just return - if (funcOp == NULL) { + auto sequenceOps = targetOp.getOps(); + if (sequenceOps.empty()) { + // If no sequenceOp then just return return success(); + } else if (std::distance(sequenceOps.begin(), sequenceOps.end()) > 1) { + return module.emitOpError("expected at most one sequence operation"); } + AIEX::RuntimeSequenceOp sequenceOp 
= *sequenceOps.begin(); collectTiles(targetOp, tiles); collectBuffers(targetOp, buffers); @@ -95,10 +87,11 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { // Looping over every Memcpy operation so we take the correct number of // buffers int num_ops = 0; - for (auto op : funcOp.getOps()) { + for (auto op : sequenceOp.getOps()) { // Getting the IDs of the buffers auto memref = op.getMemref(); - Block &entryBB = op->getParentOfType().getBody().front(); + Block &entryBB = + op->getParentOfType().getBody().front(); int arg_idx = -1; for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) { if (entryBB.getArgument(i) == memref) { @@ -117,8 +110,8 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { output << "\tuint64_t packet_id = 0;\n"; int op_count = 0; - for (auto op : funcOp.getOps()) { - auto dev = funcOp->getParentOfType(); + for (auto op : sequenceOp.getOps()) { + auto dev = sequenceOp->getParentOfType(); if (!dev) { op.emitOpError("couldn't get DeviceOp"); return failure(); @@ -134,6 +127,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { uint32_t ChannelId = infoOp->getChannelIndex(); bool isMM2S = channelDir == AIE::DMAChannelDir::MM2S; int col = infoOp->getCol(); + bool isPlio = infoOp->getPlio(); llvm::SmallVector strides = llvm::map_to_vector( llvm::reverse(op.getMixedStrides()), @@ -162,7 +156,8 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { // Getting the ID of the buffer that we are using auto memref = op.getMemref(); - Block &entryBB = op->getParentOfType().getBody().front(); + Block &entryBB = + op->getParentOfType().getBody().front(); int arg_idx = -1; for (int i = 0, e = entryBB.getNumArguments(); i < e; i++) { if (entryBB.getArgument(i) == memref) { @@ -182,7 +177,8 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { output << "\tmlir_aie_packet_nd_memcpy(&pkt" << op_count << ", 0 /* herd_id 
*/, " << col << " /* col */, " << isMM2S << " /* dir */, " << ChannelId - << "/* channel */, 4 /* Burst length */, 2 /* Memory space */, " + << "/* channel */, 4 /* Burst length */, " << (isPlio ? 1 : 2) + << " /* Memory space */, " "(uint64_t)buf" << arg_idx << " + " << offset << " /* Address */, " << sizes[0] * 4 << " /* 1d_length */, " << (strides[1] ? sizes[1] : 1) diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index d0b83d1212..fa0ccae3fc 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -104,66 +104,50 @@ void appendAddressPatch(std::vector &instructions, words[11] = 0; } -void appendWriteBdShimTile(std::vector &instructions, - NpuWriteBdOp op) { +void appendBlockWrite(std::vector &instructions, NpuBlockWriteOp op) { - auto words = reserveAndGetTail(instructions, 12); - const AIETargetModel &tm = op->getParentOfType().getTargetModel(); + Value memref = op.getData(); + int64_t width = cast(memref.getType()).getElementTypeBitWidth(); + if (width != 32) { + op.emitWarning("Only 32-bit data type is supported for now"); + return; + } + + memref::GetGlobalOp getGlobal = memref.getDefiningOp(); + if (!getGlobal) { + op.emitError("Only MemRefs from memref.get_global are supported"); + return; + } + + auto global = dyn_cast_if_present( + op->getParentOfType().lookupSymbol(getGlobal.getName())); + if (!global) { + op.emitError("Global symbol not found"); + return; + } + + auto initVal = global.getInitialValue(); + if (!initVal) { + op.emitError("Global symbol has no initial value"); + return; + } + + auto data = dyn_cast(*initVal); + if (!data) { + op.emitError("Global symbol initial value is not a dense int array"); + return; + } // XAIE_IO_BLOCKWRITE + auto words = reserveAndGetTail(instructions, data.size() + 4); words[0] = TXN_OPC_BLOCKWRITE; words[1] = 0; - - // RegOff - auto bd_id = op.getBdId(); - uint32_t bd_addr = (op.getColumn() << tm.getColumnShift()) | - (op.getRow() << tm.getRowShift()) | - (0x1D000 + 
bd_id * 0x20); - words[2] = bd_addr; // ADDR + words[2] = op.getAddress(); words[3] = words.size() * sizeof(uint32_t); // Operation Size - // DMA_BDX_0 - words[4] = op.getBufferLength(); - - // DMA_BDX_1 - words[5] = op.getBufferOffset(); - - // DMA_BDX_2 - // En Packet , OoO BD ID , Packet ID , Packet Type - words[6] |= (op.getEnablePacket() & 0x1) << 30; - words[6] |= (op.getOutOfOrderId() & 0x3f) << 24; - words[6] |= (op.getPacketId() & 0x1f) << 19; - words[6] |= (op.getPacketType() & 0x7) << 16; - - // DMA_BDX_3 - // TODO: Secure Access - words[7] |= (op.getD0Size() & 0x3ff) << 20; - words[7] |= op.getD0Stride() & 0xfffff; - - // DMA_BDX_4 - words[8] = 0x80000000; // burst length; - words[8] |= (op.getD1Size() & 0x3ff) << 20; - words[8] |= op.getD1Stride() & 0xfffff; - - // DMA_BDX_5 - // TODO: SIMID, AxCache, AXQoS - words[9] = op.getD2Stride() & 0xfffff; - - // DMA_BDX_6 - words[10] |= (op.getIterationCurrent() & 0x3f) << 26; - words[10] |= (op.getIterationSize() & 0x3f) << 20; - words[10] |= op.getIterationStride() & 0xfffff; - - // DMA_BDX_7 - // TODO: TLAST Suppress - words[11] |= (op.getNextBd() & 0xf) << 27; - words[11] |= (op.getUseNextBd() & 0x1) << 26; - words[11] |= (op.getValidBd() & 0x1) << 25; - words[11] |= (op.getLockRelVal() & 0xef) << 18; - words[11] |= (op.getLockRelId() & 0xf) << 13; - words[11] |= (op.getLockAcqEnable() & 0x1) << 12; - words[11] |= (op.getLockAcqVal() & 0xef) << 5; - words[11] |= op.getLockAcqId() & 0xf; + unsigned i = 4; + for (auto d : data) + words[i++] = d.getZExtValue(); } } // namespace @@ -179,12 +163,10 @@ std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { words[1] = 0x00000105; DeviceOp deviceOp = *module.getOps().begin(); - auto funcOps = deviceOp.getOps(); + auto sequenceOps = deviceOp.getOps(); int count = 0; - for (auto f : funcOps) { - if (f.isDeclaration()) - continue; - Block &entry = f.getRegion().front(); + for (auto f : sequenceOps) { + Block &entry = f.getBody().front(); for (auto &o : entry) 
{ llvm::TypeSwitch(&o) .Case([&](auto op) { @@ -195,13 +177,13 @@ std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { count++; appendWrite32(instructions, op); }) - .Case([&](auto op) { + .Case([&](auto op) { count++; - appendAddressPatch(instructions, op); + appendBlockWrite(instructions, op); }) - .Case([&](auto op) { + .Case([&](auto op) { count++; - appendWriteBdShimTile(instructions, op); + appendAddressPatch(instructions, op); }); } } diff --git a/lib/Targets/AIETargetXAIEV2.cpp b/lib/Targets/AIETargetXAIEV2.cpp index 0ca128de56..5099ed2345 100644 --- a/lib/Targets/AIETargetXAIEV2.cpp +++ b/lib/Targets/AIETargetXAIEV2.cpp @@ -725,24 +725,43 @@ mlir::LogicalResult AIETranslateToXAIEV2(ModuleOp module, raw_ostream &output) { } for (auto connectOp : b.getOps()) { - if (connectOp.getSourceBundle() == WireBundle::North) - // demux! - output - << "__mlir_aie_try(XAie_EnableAieToShimDmaStrmPort(" - << deviceInstRef << ", " << tileLocStr("x", "y") - << ", " - // << - // stringifyWireBundle(connectOp.sourceBundle()).upper() - << connectOp.sourceIndex() << "));\n"; - else if (connectOp.getDestBundle() == WireBundle::North) - // mux - output - << "__mlir_aie_try(XAie_EnableShimDmaToAieStrmPort(" - << deviceInstRef << ", " << tileLocStr("x", "y") - << ", " - // << - // stringifyWireBundle(connectOp.sourceBundle()).upper() - << connectOp.destIndex() << "));\n"; + + if (connectOp.getSourceBundle() == WireBundle::DMA || + connectOp.getDestBundle() == WireBundle::DMA) { + if (connectOp.getSourceBundle() == WireBundle::North) + // demux! 
+ output + << "__mlir_aie_try(XAie_EnableAieToShimDmaStrmPort(" + << deviceInstRef << ", " << tileLocStr("x", "y") + << ", " + // << + // stringifyWireBundle(connectOp.sourceBundle()).upper() + << connectOp.sourceIndex() << "));\n"; + else if (connectOp.getDestBundle() == WireBundle::North) + // mux + output + << "__mlir_aie_try(XAie_EnableShimDmaToAieStrmPort(" + << deviceInstRef << ", " << tileLocStr("x", "y") + << ", " + // << + // stringifyWireBundle(connectOp.sourceBundle()).upper() + << connectOp.destIndex() << "));\n"; + } + + else if (connectOp.getSourceBundle() == WireBundle::PLIO || + connectOp.getDestBundle() == WireBundle::PLIO) { + if (connectOp.getSourceBundle() == WireBundle::North) { + // mux + output << "__mlir_aie_try(XAie_AieToPlIntfEnable(" << deviceInstRef + << ", " << tileLocStr("x", "y") << ", " + << connectOp.destIndex() << ", PLIF_WIDTH_64));\n"; + } else if (connectOp.getDestBundle() == WireBundle::North) { + // mux + output << "__mlir_aie_try(XAie_PlToAieIntfEnable(" << deviceInstRef + << ", " << tileLocStr("x", "y") << ", " + << connectOp.destIndex() << ", PLIF_WIDTH_64));\n"; + } + } } } for (auto switchboxOp : targetOp.getOps()) { diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index d3df1b73c8..4214a20023 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/IR/Attributes.h" @@ -82,6 +83,7 @@ static void registerDialects(DialectRegistry ®istry) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); } // Output the buffer map for the given buffer operations, with the given offset. 
diff --git a/lib/Targets/AIEVecToCpp/CMakeLists.txt b/lib/Targets/AIEVecToCpp/CMakeLists.txt index 114cc41df5..f3fbc3934d 100644 --- a/lib/Targets/AIEVecToCpp/CMakeLists.txt +++ b/lib/Targets/AIEVecToCpp/CMakeLists.txt @@ -13,6 +13,7 @@ add_mlir_translation_library(MLIRTargetAIEVecCpp $(CMAKE_CURRENT_SRC_DIR)/../../../../include/aie/Targets LINK_LIBS PUBLIC + MLIRAIEVecAIE1Dialect MLIRAIEVecDialect MLIRIR MLIRSupport diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp index 5c80245fa0..08e912c9c3 100644 --- a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp +++ b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp @@ -13,6 +13,7 @@ #include "aie/Targets/AIETargets.h" +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/AIEVecUtils.h" #include "aie/Dialect/AIEVec/IR/AIEVecOps.h" @@ -1333,7 +1334,8 @@ static LogicalResult printFMAOrMulConvOperand(CppEmitter &emitter, T op, } // Generate the Mul op -static LogicalResult printOperation(CppEmitter &emitter, aievec::MulOp mulOp) { +static LogicalResult printOperation(CppEmitter &emitter, + aievec::aie1::MulOp mulOp) { auto lhs = mulOp.getLhs(); auto rhs = mulOp.getRhs(); @@ -1368,10 +1370,10 @@ static LogicalResult printOperation(CppEmitter &emitter, aievec::MulOp mulOp) { os << opname; os << "("; - if (failed(printFMAOrMulOperand(emitter, mulOp, 0))) + if (failed(printFMAOrMulOperand(emitter, mulOp, 0))) return failure(); os << ", "; - if (failed(printFMAOrMulOperand(emitter, mulOp, 1))) + if (failed(printFMAOrMulOperand(emitter, mulOp, 1))) return failure(); os << ")"; @@ -1513,7 +1515,8 @@ static LogicalResult printOperation(CppEmitter &emitter, } // Generate the Add op -static LogicalResult printOperation(CppEmitter &emitter, aievec::AddOp addOp) { +static LogicalResult printOperation(CppEmitter &emitter, + aievec::aie1::AddOp addOp) { auto lhs = addOp.getLhs(); auto rhs = addOp.getRhs(); @@ -1556,10 +1559,10 @@ static 
LogicalResult printOperation(CppEmitter &emitter, aievec::AddOp addOp) { // Otherwise this is complex scheme os << (floatType ? "fpadd" : "add" + std::to_string(lanes)); os << "("; - if (failed(printAddOrSubOperand(emitter, addOp, 0))) + if (failed(printAddOrSubOperand(emitter, addOp, 0))) return failure(); os << ", "; - if (failed(printAddOrSubOperand(emitter, addOp, 1))) + if (failed(printAddOrSubOperand(emitter, addOp, 1))) return failure(); os << ")"; @@ -1567,7 +1570,8 @@ static LogicalResult printOperation(CppEmitter &emitter, aievec::AddOp addOp) { } // Generate the Sub op -static LogicalResult printOperation(CppEmitter &emitter, aievec::SubOp subOp) { +static LogicalResult printOperation(CppEmitter &emitter, + aievec::aie1::SubOp subOp) { auto lhs = subOp.getLhs(); auto rhs = subOp.getRhs(); @@ -1610,10 +1614,10 @@ static LogicalResult printOperation(CppEmitter &emitter, aievec::SubOp subOp) { // Otherwise this is complex scheme os << (floatType ? "fpsub" : "sub" + std::to_string(lanes)); os << "("; - if (failed(printAddOrSubOperand(emitter, subOp, 0))) + if (failed(printAddOrSubOperand(emitter, subOp, 0))) return failure(); os << ", "; - if (failed(printAddOrSubOperand(emitter, subOp, 1))) + if (failed(printAddOrSubOperand(emitter, subOp, 1))) return failure(); os << ")"; @@ -1864,7 +1868,8 @@ static LogicalResult printOperation(CppEmitter &emitter, } // Generate the FMA op -static LogicalResult printOperation(CppEmitter &emitter, aievec::FMAOp fmaOp) { +static LogicalResult printOperation(CppEmitter &emitter, + aievec::aie1::FMAOp fmaOp) { auto acc = fmaOp.getAcc(); auto lhs = fmaOp.getLhs(); auto rhs = fmaOp.getRhs(); @@ -1902,10 +1907,10 @@ static LogicalResult printOperation(CppEmitter &emitter, aievec::FMAOp fmaOp) { os << "("; os << accName; os << ", "; - if (failed(printFMAOrMulOperand(emitter, fmaOp, 0))) + if (failed(printFMAOrMulOperand(emitter, fmaOp, 0))) return failure(); os << ", "; - if (failed(printFMAOrMulOperand(emitter, fmaOp, 1))) + if 
(failed(printFMAOrMulOperand(emitter, fmaOp, 1))) return failure(); os << ")"; @@ -3264,12 +3269,16 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) { .Case( [&](auto op) { return printOperation(*this, op); }) - .Case( + // AievecAie1 ops + .Case( + [&](auto op) { return printOperation(*this, op); }) + // Aievec ops + .Case( [&](auto op) { return printOperation(*this, op); }) .Default([&](Operation *) { return op.emitOpError("unable to find printer for op"); diff --git a/lib/Targets/AIEVecToCpp/TranslateRegistration.cpp b/lib/Targets/AIEVecToCpp/TranslateRegistration.cpp index 8936120d6b..85e7f646b2 100644 --- a/lib/Targets/AIEVecToCpp/TranslateRegistration.cpp +++ b/lib/Targets/AIEVecToCpp/TranslateRegistration.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include "aie/Dialect/AIEVec/AIE1/IR/AIEVecAIE1Ops.h" #include "aie/Dialect/AIEVec/IR/AIEVecDialect.h" #include "aie/Targets/AIETargets.h" @@ -57,7 +58,8 @@ void registerAIEVecToCppTranslation() { DLTIDialect, scf::SCFDialect, vector::VectorDialect, - xilinx::aievec::AIEVecDialect, + xilinx::aievec::aie1::AIEVecAIE1Dialect, + xilinx::aievec::AIEVecDialect, index::IndexDialect>(); // clang-format on }); diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py index 9a910c4203..0562194664 100644 --- a/programming_examples/basic/dma_transpose/aie2.py +++ b/programming_examples/basic/dma_transpose/aie2.py @@ -51,7 +51,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) # The strides below are configured to read across all rows in the same column diff --git a/programming_examples/basic/matrix_multiplication/README.md 
b/programming_examples/basic/matrix_multiplication/README.md index 88b701ffa2..7b001744b5 100644 --- a/programming_examples/basic/matrix_multiplication/README.md +++ b/programming_examples/basic/matrix_multiplication/README.md @@ -16,4 +16,22 @@ Subdirectories in this directory contain example designs that implement matrix m * [`single_core`](single_core) - This design performs matrix-matrix multiplication on a single AI Engine core. * [`whole_array`](whole_array) - This design evolves `single_core`, by splitting the computation and parallelizing it. It utilizes all available AI Engine cores simultaneously. -* [`matrix_vector`](matrix_vector) - This design is a specialization to the matrix-vector-multiplication case, which poses unique challenges due to lower computation density. *Work in progress.* \ No newline at end of file +* [`matrix_vector`](matrix_vector) - This design is a specialization to the matrix-vector-multiplication case, which poses unique challenges due to lower computation density. *Work in progress.* + +## Note on Numerical Tolerances + +This directory contains verification code that ensures the designs in the subdirectories produce the correct output. + +The designs can be configured to work on different input and output data types, based on the Makefile variables `dtype_in` and `dtype_out`. +In the default configuration, all designs consume integer inputs and produce integer outputs. +For this case, the verification checks for strict equivalence between the reference output computed on the host CPU and the output calculated on the AI Engine. +That is, verification will only pass for integer data types if the output is equivalent bit-by-bit. + +For floating point data types, the verification code allows the AI Engine output to deviate from the reference calculated on the host CPU by some limited maximal relative and absolute tolerance (defined in `common.h`). 
+This standard practice is necessary for the following reasons: + + - Operations on IEEE 754 floating point values are not associative. That is, the order of operations can affect the results. All designs in the subdirectories perform tiling of the input matrices, multiplying and accumulating sub-matrices in chunks. The reference calculation code on the CPU, on the other hand, does not perform tiling. As such, some differences due to non-associativity are expected. + - The reference on the host CPU is always computed in `float32`, even if the input data type is `bfloat16`, since the host CPU does not support native `bfloat16` multiplication. This means results are calculated with higher precision on the CPU and subsequently truncated, whereas the AI Engine is able to calculate results in a more performant manner thanks to natively using the lower precision data type. + - If the output datatype is lower-precision than the accumulation data type, the tiling in the `K` dimension affects the results. For example, when multiplying `bfloat16` numbers, the AI Engine accumulates results in higher-precision `float32`. Our designs perform such accumulation for `k` (tiling size in `K` dimension) times before writing the results back into the output buffer. If the output buffer is lower-precision, results are truncated at that time. A larger `k` dimension means fewer such truncations take place. The AI Engine also provides a higher-precision "cascade" data path, which can be used to accumulate results between cores, although none of the designs in this directory make use of this currently. + +In summary, different choices of data types, tiling strategies, and usage of AI Engine components, can all affect floating point results in slight ways. Deciding on different choices for these factors presents interesting trade-offs that must be considered on a case-by-case basis for the application at hand. 
diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index b2c6c14b53..cba6ff6363 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -109,11 +109,16 @@ std::vector load_instr_sequence(std::string instr_path) { // Matrix / Float / Math // -------------------------------------------------------------------------- -static inline std::int16_t random_int16_t() { +template +static inline T get_random(); + +template <> +std::int16_t get_random() { return (std::int16_t)rand() % 0x10000; } -static inline std::bfloat16_t random_bfloat16_t() { +template <> +std::bfloat16_t get_random() { // Random numbers should NOT be uniformly between 0 and 1, because that // would make the matrix product AB always close to 1. return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); @@ -165,6 +170,51 @@ bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, return diff < std::max(abs_th, epsilon * norm); } +template +static inline float get_abs_tol(); +template +static inline float get_rel_tol(); + +template <> +float get_abs_tol() { + return 0.0; +} + +template <> +float get_abs_tol() { + return 0.0; +} + +template <> +float get_abs_tol() { + return 0.5; +} + +template <> +float get_abs_tol() { + return 0.5; +} + +template <> +float get_rel_tol() { + return 0.0; +} + +template <> +float get_rel_tol() { + return 0.0; +} + +template <> +float get_rel_tol() { + return 0.05; +} + +template <> +float get_rel_tol() { + return 0.05; +} + template void print_matrix(const std::vector matrix, int n_cols, int n_printable_rows = 10, int n_printable_cols = 10, @@ -237,10 +287,14 @@ struct error { template std::optional> -verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual) { - const float absTol = 0.5; - const float relTol = 0.05; - if (!nearly_equal(expected, actual, relTol, absTol)) { 
+verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, + float abs_tol, float rel_tol) { + bool match = expected == actual; + if (abs_tol > 0 || rel_tol > 0) { + // Allow for some tolerance for float data types + match = nearly_equal(expected, actual, rel_tol, abs_tol); + } + if (!match) { return (struct error){row, col, expected, actual}; } return std::nullopt; @@ -275,7 +329,8 @@ void print_progress_bar(std::ostream &os, double progress, int len = 75) { template int verify(int M, int N, int K, std::vector A, std::vector B, - std::vector C, int verbosity = 0) { + std::vector C, int verbosity = 0, float abs_tol = 0.5, + float rel_tol = 0.05) { int n_errors = 0; std::vector> errors; Tout max_rel_error = (Tout)0.0f; @@ -285,8 +340,9 @@ int verify(int M, int N, int K, std::vector A, std::vector B, for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { - std::optional> error = verify_single( - std::cout, row, col, CRef[row * N + col], C[row * N + col]); + std::optional> error = + verify_single(std::cout, row, col, CRef[row * N + col], + C[row * N + col], abs_tol, rel_tol); if (error.has_value()) { if (n_errors < max_printable_errors) { errors.push_back(*error); @@ -316,7 +372,8 @@ int verify(int M, int N, int K, std::vector A, std::vector B, template int verify_stochastic(int M, int N, int K, std::vector A, std::vector B, std::vector C, int n_samples, - int verbosity = 0) { + int verbosity = 0, float abs_tol = 0.5, + float rel_tol = 0.05) { std::mt19937 rng; auto rows = std::views::iota(0, M); auto cols = std::views::iota(0, N); @@ -342,8 +399,8 @@ int verify_stochastic(int M, int N, int K, std::vector A, print_progress_bar(std::cerr, progress); } Tout ref = mul_acc(M, N, K, row, col, A, B); - std::optional> error = - verify_single(std::cout, row, col, ref, C[row * N + col]); + std::optional> error = verify_single( + std::cout, row, col, ref, C[row * N + col], abs_tol, rel_tol); if (error.has_value()) { if (n_errors < 
max_printable_errors) { errors.push_back(*error); diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index 9f336f1099..ba21462442 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -37,6 +37,32 @@ include ${current_dir}../../makefile-common M?=512 K?=512 N?=512 +dtype_in?=i16 +dtype_out?=i32 + +ifeq ($(dtype_in),bf16) + dtype_in_cpp=std::bfloat16_t +endif +ifeq ($(dtype_out),bf16) + dtype_out_cpp=std::bfloat16_t + dtype_acc_cpp=float +endif + +ifeq ($(dtype_in),i16) + dtype_in_cpp=int16_t +endif +ifeq ($(dtype_out),i16) + dtype_out_cpp=int16_t + dtype_acc_cpp=int16_t +endif +ifeq ($(dtype_out),i32) + dtype_out_cpp=int32_t + dtype_acc_cpp=int32_t +endif +ifeq ($(dtype_out),f32) + dtype_out_cpp=float + dtype_acc_cpp=float +endif trace_size?=65536 @@ -46,7 +72,7 @@ xclbin_target?=build/final_${target_suffix}.xclbin insts_target?=build/insts_${target_suffix}.txt runargs?=-v 2 --warmup 1 --iters 1 -aieargs+=-M $M -K $K -N $N +aieargs+=-M $M -K $K -N $N --dtype_in ${dtype_in} --dtype_out ${dtype_out} kernels_dir=${srcdir}/../../../../aie_kernels/aie2 @@ -69,7 +95,8 @@ ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o} ${targetname}.exe: ${srcdir}/test.cpp ${srcdir}/../test.cpp ${srcdir}/../common.h rm -rf _build mkdir -p _build - cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb" cmake ${srcdir}/.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} + cd _build && ${powershell} cmake -E env CXXFLAGS="-std=c++23 -ggdb -DDTYPE_IN=${dtype_in_cpp} -DDTYPE_OUT=${dtype_out_cpp} -DDTYPE_ACC=${dtype_acc_cpp}" \ + cmake ${srcdir}/.. -D CMAKE_C_COMPILER=gcc-13 -D CMAKE_CXX_COMPILER=g++-13 -DTARGET_NAME=${targetname} -Dsubdir=${subdir} cd _build && ${powershell} cmake --build . 
--config Release ifeq "${powershell}" "powershell.exe" cp _build/${targetname}.exe $@ diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/README.md b/programming_examples/basic/matrix_multiplication/matrix_vector/README.md index 1eae2e2c23..d29cbafec7 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/README.md +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/README.md @@ -17,7 +17,7 @@ In this design, one or multiple AI Engine compute cores (spread across hardware ## Differences from the [Whole-Array Matrix-Matrix Multiplication Design](../whole_array/README.md) - A specialized matrix-*vector* microkernel, named `matvec_vectorized` is used in this design, as opposed to the more general matrix-matrix microkernel (`matmul_vectorized`) used in the matrix-matrix-multiplication designs. -- The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`×`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `sequence()` function). +- The data movement in this design varies as follows: An identical `32`-element chunk of the vector `B` is **broadcast** to the cores in all columns, whereas _distinct_ subsequent `32`×`32`-sized tiles of the `A` matrix are **distributed** to the cores. As such, each core is responsible for a distinct `32`-element chunk of the output vector `C`. These chunks are assembled (**joined**) at the shim tile level (in the `aiex.runtime_sequence()`). - This design does not use all available compute cores. Instead, it uses at most one core in each hardware column. The variable `n_cores` defines the number of columns to be used. 
It would however be possible to extend this design to use all cores. ## Building and Running the Design diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 05ade0e2a1..5ecc36da04 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -185,7 +185,7 @@ def core_body(): # To/from AIE-array data movement - @FuncOp.from_py_func( + @runtime_sequence( T.memref(A_sz, dtype_in()), T.memref(B_sz, dtype_in()), T.memref(C_sz, dtype_out()), diff --git a/programming_examples/basic/matrix_multiplication/single_core/Makefile b/programming_examples/basic/matrix_multiplication/single_core/Makefile index a1da00108f..3fcab3f24d 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/Makefile +++ b/programming_examples/basic/matrix_multiplication/single_core/Makefile @@ -18,7 +18,7 @@ K?=256 N?=256 m?=64 k?=64 -n?=64 +n?=32 kernels=mm_${m}x${k}x${n} aieargs+=-m $m -k $k -n $n diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 5eef847850..08c1eeb0b2 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -25,20 +25,37 @@ def main(): argparser.add_argument("-N", type=int, default=256) argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) - argparser.add_argument("-n", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i16"], default="i16" + ) + argparser.add_argument( + "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i32" + ) args = argparser.parse_args() - my_matmul(args.M, args.K, 
args.N, args.m, args.k, args.n) + my_matmul( + args.M, args.K, args.N, args.m, args.k, args.n, args.dtype_in, args.dtype_out + ) + + +def ceildiv(a, b): + return (a + b - 1) // b -def my_matmul(M, K, N, m, k, n): +def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str): assert M % m == 0 assert K % k == 0 assert N % n == 0 - r = 4 - s = 8 - t = 4 + if dtype_in_str == "bf16": + r = 4 + s = 8 + t = 4 + elif dtype_in_str == "i16": + r = 4 + s = 4 + t = 4 assert m % r == 0 assert k % s == 0 @@ -48,10 +65,24 @@ def my_matmul(M, K, N, m, k, n): enable_tracing = False trace_size = 65536 + dtype_in = None + if dtype_in_str == "bf16": + dtype_in = T.bf16 + elif dtype_in_str == "i16": + dtype_in = T.i16 + dtype_out = None + if dtype_out_str == "bf16": + dtype_out = T.bf16 + elif dtype_out_str == "i16": + dtype_out = T.i16 + elif dtype_out_str == "f32": + dtype_out = T.f32 + elif dtype_out_str == "i32": + dtype_out = T.i32 + A_sz = M * K B_sz = K * N C_sz = M * N - C_sz_in_bytes = C_sz * 2 M_div_m = M // m K_div_k = K // k @@ -66,25 +97,30 @@ def my_matmul(M, K, N, m, k, n): with mlir_mod_ctx() as ctx: + C_sz_in_bytes = C_sz * dtype_out().width // 8 + @device(AIEDevice.npu1_1col) def device_body(): - memref_a_ty = T.memref(m, k, T.bf16()) - memref_b_ty = T.memref(k, n, T.bf16()) - memref_c_ty = T.memref(m, n, T.bf16()) + memref_a_ty = T.memref(m, k, dtype_in()) + memref_b_ty = T.memref(k, n, dtype_in()) + memref_c_ty = T.memref(m, n, dtype_out()) ofifo_memref_a_ty = TypeAttr.get(ObjectFifoType.get(memref_a_ty)) ofifo_memref_b_ty = TypeAttr.get(ObjectFifoType.get(memref_b_ty)) ofifo_memref_c_ty = TypeAttr.get(ObjectFifoType.get(memref_c_ty)) # AIE Core Function declarations - zero_scalar = external_func("zero_scalar_bf16", inputs=[memref_c_ty]) - zero = external_func("zero_bf16", inputs=[memref_c_ty]) + zero_scalar = external_func( + f"zero_scalar_{dtype_out_str}", inputs=[memref_c_ty] + ) + zero = external_func(f"zero_{dtype_out_str}", inputs=[memref_c_ty]) matmul_scalar 
= external_func( - "matmul_scalar_bf16_bf16", + f"matmul_scalar_{dtype_in_str}_{dtype_out_str}", inputs=[memref_a_ty, memref_b_ty, memref_c_ty], ) matmul = external_func( - "matmul_bf16_bf16", inputs=[memref_a_ty, memref_b_ty, memref_c_ty] + f"matmul_{dtype_in_str}_{dtype_out_str}", + inputs=[memref_a_ty, memref_b_ty, memref_c_ty], ) # Tile declarations @@ -195,10 +231,10 @@ def core_body(): # To/from AIE-array data movement - @FuncOp.from_py_func( - T.memref(A_sz, T.bf16()), - T.memref(B_sz, T.bf16()), - T.memref(C_sz, T.bf16()), + @runtime_sequence( + T.memref(A_sz, dtype_in()), + T.memref(B_sz, dtype_in()), + T.memref(C_sz, dtype_out()), ) def sequence(A, B, C): @@ -213,9 +249,7 @@ def sequence(A, B, C): # only do 5 tile rows at a time before synchronizing, so we can reuse BDs rows_per_block = 5 - for tile_row_block in range( - (M_div_m + rows_per_block - 1) // rows_per_block - ): + for tile_row_block in range(ceildiv(M_div_m, rows_per_block)): C_row_offset = tile_row_block * rows_per_block * m * N num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] diff --git a/programming_examples/basic/matrix_multiplication/sweep.sh b/programming_examples/basic/matrix_multiplication/sweep.sh index 40b0464093..c7ba36b1bb 100755 --- a/programming_examples/basic/matrix_multiplication/sweep.sh +++ b/programming_examples/basic/matrix_multiplication/sweep.sh @@ -3,8 +3,9 @@ # run this script from one of the subdirectories to perform a sweep, # e.g. from within whole_array, run ../sweep.sh. 
-csv_out=sweep_2.csv -log_out=sweep_2.log + +csv_out=my_sweep.csv +log_out=my_sweep.log runargs="--iters 20 --warmup 10" iterations=1 @@ -21,8 +22,17 @@ N_step=256 N_hi=4096 Ns=$(seq $N_lo $N_step $N_hi) -here=$(realpath $(dirname $BASH_SOURCE[0])) -cd $here +export m=64 +export k=64 +export n=64 +export dtype_in=i16 +export dtype_out=i16 +export n_aie_cols=4 +export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 + +# Print configuration used to run for reproducibility +env >>$log_out +cat Makefile >>$log_out printf "M,K,N" >>$csv_out for i in $(seq 1 $iterations); do @@ -33,12 +43,15 @@ printf "\n" >>$csv_out for M in $Ms; do for K in $Ks; do for N in $Ns; do + export M=$M + export K=$K + export N=$N echo ${M}x${K}x${N} 1>&2 - rm -r /lib/firmware/amdnpu/1502/*_unsigned.xclbin # Signing step may hang otherwise - M=${M} K=${K} N=${N} make all 1>>$log_out 2>&1 + make clean 1>>$log_out 2>&1 + make all 1>>$log_out 2>&1 printf "${M},${K},${N}" >>$csv_out for i in $(seq 1 $iterations); do - M=${M} K=${K} N=${N} runargs=${runargs} make run >.tmp_run.log + make run >.tmp_run.log cat .tmp_run.log $run_output >>$log_out t=$(cat .tmp_run.log | sed -rn 's/^Avg NPU matmul time: ([0-9.]+)us.$/\1/p') printf ",${t}" >>$csv_out diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index c838f30aeb..378f81a407 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -28,15 +28,32 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using A_DATATYPE = std::bfloat16_t; -using B_DATATYPE = std::bfloat16_t; -using C_DATATYPE = std::bfloat16_t; -using ACC_DATATYPE = float; +#ifndef DTYPE_IN +#define DTYPE_IN std::bfloat16_t +#endif +#ifndef DTYPE_OUT +#define DTYPE_OUT std::bfloat16_t +#endif +#ifndef DTYPE_ACC +#define DTYPE_ACC float +#endif +using A_DATATYPE = DTYPE_IN; +using B_DATATYPE = DTYPE_IN; +using C_DATATYPE = 
DTYPE_OUT; +using ACC_DATATYPE = DTYPE_ACC; #endif +#define XSTR(X) STR(X) +#define STR(X) #X + constexpr long long verify_stochastic_threshold = 1024 * 1024 * 1024; constexpr int verify_stochastic_n_samples = 1000; +// Verification tolerance +// See "Note on Numerical Tolerances" in README.md +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); + namespace po = boost::program_options; int main(int argc, const char *argv[]) { @@ -139,14 +156,14 @@ int main(int argc, const char *argv[]) { A_DATATYPE *bufA = bo_a.map(); std::vector AVec(A_VOLUME); for (int i = 0; i < A_VOLUME; i++) { - AVec[i] = matmul_common::random_bfloat16_t(); + AVec[i] = matmul_common::get_random(); // AVec[i] = i; } memcpy(bufA, AVec.data(), (AVec.size() * sizeof(A_DATATYPE))); B_DATATYPE *bufB = bo_b.map(); std::vector BVec(B_VOLUME); for (int i = 0; i < B_VOLUME; i++) { - BVec[i] = matmul_common::random_bfloat16_t(); + BVec[i] = matmul_common::get_random(); // Diagonal: // if(i % N == i / N) { // BVec[i] = 1.0; @@ -162,6 +179,10 @@ int main(int argc, const char *argv[]) { memset(bufOut, 0, OUT_SIZE); if (verbosity >= 2) { + std::cout << "DTYPE_IN = " XSTR(DTYPE_IN) "\n"; + std::cout << "DTYPE_OUT = " XSTR(DTYPE_OUT) "\n"; + std::cout << "Verification tolerance " << abs_tol << " absolute, " + << rel_tol << " relative.\n"; std::cout << "A = \n"; matmul_common::print_matrix(AVec, K); std::cout << "B = \n"; @@ -221,10 +242,11 @@ int main(int argc, const char *argv[]) { if (do_verify_stochastic) { errors = matmul_common::verify_stochastic( - M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity); + M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, + abs_tol, rel_tol); } else { errors = matmul_common::verify( - M, N, K, AVec, BVec, CVec); + M, N, K, AVec, BVec, CVec, abs_tol, rel_tol); } auto vstop = std::chrono::system_clock::now(); float vtime = diff --git a/programming_examples/basic/matrix_multiplication/whole_array/Makefile 
b/programming_examples/basic/matrix_multiplication/whole_array/Makefile index 31ee48950d..127606f721 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/Makefile +++ b/programming_examples/basic/matrix_multiplication/whole_array/Makefile @@ -15,7 +15,7 @@ M?=640 K?=896 N?=768 m?=16 -k?=64 +k?=32 n?=48 n_aie_cols?=2 diff --git a/programming_examples/basic/matrix_multiplication/whole_array/README.md b/programming_examples/basic/matrix_multiplication/whole_array/README.md index 0e8e7d20c8..61ea47aef7 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/README.md +++ b/programming_examples/basic/matrix_multiplication/whole_array/README.md @@ -22,7 +22,7 @@ At a high level, the code does the following (in order): 1. [**Defining Core Computations:**](#4-defining-core-computations) The `core_body()` function contains the code that will be loaded onto each AIE core. This code describes the matrix multiplication using the input submatrices `a` and `b` acquired through the ObjectFIFOs. The results are accumulated in the output submatrix `c`. -1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `sequence()` function sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization. +1. [**Defining External Data Transfer Sequences:**](#5-defining-external-data-transfer-sequences) The `aie.runtime_sequence()` op sets up matrix data movement from the host into the AIE compute cores, and back to the host after computation. It initializes Data Movement Accelerator (DMA) transfers, sets memory access patterns, and performs synchronization. 1. **Generating the Design:** The `my_matmul()` function triggers the code generation process and represents the main entry point of the design. 
The final print statement outputs the MLIR representation of the AIE array configuration. @@ -72,7 +72,7 @@ The input and output matrix sizes are given by the user. We subdivide the input 1. **Tiling to Compute Core Submatrix Chunks:** The input and output matrices stream to/from the AIE compute cores in chunks of size of `m`×`k`, `k`×`n` and `n`×`m`. Tiling into these chunks allows each of the computation cores to concurrently work on distinct sub-sections of the input matrices in parallel, which improves performance. This also reduces on-chip memory requirements. The final result is re-assembled using the sub-matrix results of all cores. - > This tiling occurs in the `sequence()` function describing the host-to-memory-tile transfer. + > This tiling occurs in the `aie.runtime_sequence()` operation describing the host-to-memory-tile transfer. We describe it further below, in section *"5. Defining External Data Transfer Sequences"*. 1. **Tiling to Vector Intrinsic Size:** The AIE compute cores calculate the matrix multiplication using efficient "multiply-accumulate" vector intrinsic instructions (`MAC` instructions). These hardware instructions process very small blocks of the matrix: size `r`×`s` blocks of `A` and size `s`×`t` blocks of `B`, producing an output of size `r`×`t` (`C`). @@ -198,7 +198,7 @@ We define a `core_body()` function for each compute core `i`, inside of which we ### 5. Defining External Data Transfer Sequences -The function signature of the `sequence()` function lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this function describes how these buffers are transfered from and to the host, including tiling the input matrices into `m`×`k` and `k`×`n`-sized sub-matrices, and combining the `m`×`n`-sized output tiles into the larger output `M`×`N` matrix buffer. 
+The signature of the `aie.runtime_sequence()` operation lists as its arguments all the external buffers from the host that we wish to read from or write to on the AI Engine's shim tiles. The body of this operation describes how these buffers are transferred from and to the host, including tiling the input matrices into `m`×`k` and `k`×`n`-sized sub-matrices, and combining the `m`×`n`-sized output tiles into the larger output `M`×`N` matrix buffer. * The `tile_row_block` variable segments the M (rows of A) into smaller chunks, each containing `rows_per_block` tile rows. This is done so the buffer descriptors (BDs) can be reused for efficient DMA transfers. * For each column `i`: diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 0bd8d119fb..cbdb689a72 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -25,11 +25,27 @@ def main(): argparser.add_argument("-N", type=int, default=512) argparser.add_argument("-m", type=int, default=64) argparser.add_argument("-k", type=int, default=64) - argparser.add_argument("-n", type=int, default=64) + argparser.add_argument("-n", type=int, default=32) argparser.add_argument("--n-aie-cols", type=int, choices=[1, 2, 4], default=4) + argparser.add_argument( + "--dtype_in", type=str, choices=["bf16", "i16"], default="i16" + ) + argparser.add_argument( + "--dtype_out", type=str, choices=["bf16", "i16", "f32", "i32"], default="i16" + ) args = argparser.parse_args() with mlir_mod_ctx() as ctx: - my_matmul(args.M, args.K, args.N, args.m, args.k, args.n, args.n_aie_cols) + my_matmul( + args.M, + args.K, + args.N, + args.m, + args.k, + args.n, + args.n_aie_cols, + args.dtype_in, + args.dtype_out, + ) # print(ctx.module.operation.verify()) print(ctx.module) @@ -38,14 +54,35 @@ def ceildiv(a, b): return (a + b - 1) // b -def
my_matmul(M, K, N, m, k, n, n_aie_cols): - r = 4 - s = 8 - t = 4 +def my_matmul(M, K, N, m, k, n, n_aie_cols, dtype_in_str, dtype_out_str): n_aie_rows = 4 n_aie_cores = n_aie_rows * n_aie_cols + dtype_in = None + if dtype_in_str == "bf16": + dtype_in = T.bf16 + elif dtype_in_str == "i16": + dtype_in = T.i16 + dtype_out = None + if dtype_out_str == "bf16": + dtype_out = T.bf16 + elif dtype_out_str == "i16": + dtype_out = T.i16 + elif dtype_out_str == "f32": + dtype_out = T.f32 + elif dtype_out_str == "i32": + dtype_out = T.i32 + + if dtype_in_str == "bf16": + r = 4 + s = 8 + t = 4 + elif dtype_in_str == "i16": + r = 4 + s = 4 + t = 4 + # Input matrix A: # Conceptually, we divide input A into (m * n_rows, k)-sized blocks. These # blocks are _broadcast_ across AIE core columns, then _distributed_ across @@ -90,22 +127,30 @@ def my_matmul(M, K, N, m, k, n, n_aie_cols): @device(dev) def device_body(): - A_l2_memref_ty = T.memref(m * k * n_A_tiles_per_shim, T.bf16()) - B_l2_memref_ty = T.memref(k * n, T.bf16()) - C_l2_memref_ty = T.memref(m * n * n_aie_rows, T.bf16()) - A_l1_memref_ty = T.memref(m, k, T.bf16()) - B_l1_memref_ty = T.memref(k, n, T.bf16()) - C_l1_memref_ty = T.memref(m, n, T.bf16()) + A_l2_memref_ty = T.memref(m * k * n_A_tiles_per_shim, dtype_in()) + B_l2_memref_ty = T.memref(k * n, dtype_in()) + C_l2_memref_ty = T.memref(m * n * n_aie_rows, dtype_out()) + A_l1_memref_ty = T.memref(m, k, dtype_in()) + B_l1_memref_ty = T.memref(k, n, dtype_in()) + C_l1_memref_ty = T.memref(m, n, dtype_out()) # AIE Core Function declarations zero_scalar = external_func("zero_scalar_bf16", inputs=[C_l1_memref_ty]) - zero = external_func("zero_bf16", inputs=[C_l1_memref_ty]) + zero_scalar = external_func( + f"zero_scalar_{dtype_out_str}", inputs=[C_l1_memref_ty] + ) + zero = external_func(f"zero_{dtype_out_str}", inputs=[C_l1_memref_ty]) matmul_scalar = external_func( "matmul_scalar_bf16_bf16", inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], ) + matmul_scalar = 
external_func( + f"matmul_scalar_{dtype_in_str}_{dtype_out_str}", + inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], + ) matmul = external_func( - "matmul_bf16_bf16", inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty] + f"matmul_{dtype_in_str}_{dtype_out_str}", + inputs=[A_l1_memref_ty, B_l1_memref_ty, C_l1_memref_ty], ) # Tile declarations as tile[row][col] @@ -249,10 +294,10 @@ def core_body(): yield_([]) # To/from AIE-array data movement - @FuncOp.from_py_func( - T.memref(M * K, T.bf16()), - T.memref(K * N, T.bf16()), - T.memref(M * N, T.bf16()), + @runtime_sequence( + T.memref(M * K, dtype_in()), + T.memref(K * N, dtype_in()), + T.memref(M * N, dtype_out()), ) def sequence(A, B, C): # We are limited in the number of BDs. After synchronizing, we can reuse BDs. diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_1_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_1_col.lit new file mode 100644 index 0000000000..9234eb1e67 --- /dev/null +++ b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_1_col.lit @@ -0,0 +1,11 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: mkdir -p %S/test_1_col +// RUN: cd %S/test_1_col +// RUN: make -f %S/Makefile clean +// RUN: env n_aie_cols=1 make -f %S/Makefile +// RUN: %run_on_npu env n_aie_cols=2 make -f %S/Makefile run | FileCheck %s +// CHECK: PASS! 
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit index d3c36a3897..859d669d0d 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit +++ b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_2_col.lit @@ -3,6 +3,8 @@ // // REQUIRES: ryzen_ai, chess // +// RUN: mkdir -p %S/test_2_col +// RUN: cd %S/test_2_col // RUN: make -f %S/Makefile clean // RUN: env n_aie_cols=2 make -f %S/Makefile // RUN: %run_on_npu env n_aie_cols=2 make -f %S/Makefile run | FileCheck %s diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit index 431d01d08a..170a68df0e 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit +++ b/programming_examples/basic/matrix_multiplication/whole_array/run_makefile_4_col.lit @@ -3,6 +3,8 @@ // // REQUIRES: ryzen_ai, chess // +// RUN: mkdir -p %S/test_4_col +// RUN: cd %S/test_4_col // RUN: make -f %S/Makefile clean // RUN: env n_aie_cols=4 make -f %S/Makefile // RUN: %run_on_npu env n_aie_cols=4 make -f %S/Makefile run | FileCheck %s diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py index dd02fc081d..bfdd226186 100644 --- a/programming_examples/basic/matrix_scalar_add/aie2.py +++ b/programming_examples/basic/matrix_scalar_add/aie2.py @@ -78,7 +78,7 @@ def core_body(): tensor_ty = T.memref(TILE_SIZE, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="out0", diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index 
10becd4e27..7ea797a84d 100644 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -60,7 +60,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/passthrough_dmas_plio/CMakeLists.txt b/programming_examples/basic/passthrough_dmas_plio/CMakeLists.txt new file mode 100644 index 0000000000..c17d3d365b --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. 
+ +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName proj_${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git 
a/programming_examples/basic/passthrough_dmas_plio/Makefile b/programming_examples/basic/passthrough_dmas_plio/Makefile new file mode 100644 index 0000000000..a88e6c49d6 --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/Makefile @@ -0,0 +1,48 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../makefile-common + +targetname = passThroughDMAs +LENGTH ?= 1024 + +all: input output + +build/aie-input-plio.mlir: ${srcdir}/aie2-input-plio.py + mkdir -p ${@D} + python3 $< ${LENGTH} > $@ + +build/aie-output-plio.mlir: ${srcdir}/aie2-output-plio.py + mkdir -p ${@D} + python3 $< ${LENGTH} > $@ + +input: build/aie-input-plio.mlir + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie-input-plio.mlir \ + -I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \ + -L/lib/x86_64-linux-gnu/ \ + ${srcdir}/test_vck5000.cpp \ + ${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o input.elf + +output: build/aie-output-plio.mlir + aiecc.py --link_against_hsa --host-target=x86_64-amd-linux-gnu build/aie-output-plio.mlir \ + -I${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/include \ + -L/lib/x86_64-linux-gnu/ \ + ${srcdir}/test_vck5000.cpp \ + ${srcdir}/../../../install/runtime_lib/x86_64-hsa/test_lib/src/test_library.cpp \ + -Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o output.elf + +run_vck5000: + test.elf + +clean: + rm -rf build 
aie-output-plio.mlir.prj aie-input-plio.mlir.prj core_* input.elf output.elf diff --git a/programming_examples/basic/passthrough_dmas_plio/README.md b/programming_examples/basic/passthrough_dmas_plio/README.md new file mode 100644 index 0000000000..182a1801c0 --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/README.md @@ -0,0 +1,27 @@ + + +# Passthrough DMAs with PLIO + +This reference design can be run on the VCK5000 Versal device. This design leverages the same data movement pattern as the [Passthrough DMAs](../passthrough-dmas) example design but it uses a soft DMA. Please see the [platforms repo](https://github.com/Xilinx/ROCm-air-platforms) for more information on how the programmable logic is integrated with the AIEs. This is meant to be an illustrative example to highlight how to integrate PL designs with AIE designs programmed using mlir-aie. + +In the platform, tile (26, 0) has PLIO connected to a DMA implemented in the programmable logic. There are two designs: `aie2-input-plio.py` uses the soft DMA to push data from DRAM into the AIEs, whereas `aie2-output-plio.py` uses the soft DMA to receive data from the AIEs and push it to DRAM. The soft DMA is programmed using the same mechanism as the ShimDMAs. + +In the [design](./aie2.py) data is brought from external memory to `ComputeTile2` and back, without modification from the tile, by using an implicit copy via the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0). + +The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md#object-fifo-link) of the programming guide.
+ + +To compile and run the design for VCK5000: +``` +make all +./output.elf // To run the kernel which outputs over PLIO +./input.elf // To run the kernel which inputs over PLIO +``` diff --git a/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py b/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py new file mode 100644 index 0000000000..19d776a772 --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/aie2-input-plio.py @@ -0,0 +1,62 @@ +# passthrough_dmas_plio/aie2-output-plio.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +N = 1024 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +dev = AIEDevice.xcvc1902 + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(1024, T.i32()) + + # Tile declarations + ShimTile1 = tile(30, 0) + ShimTile2 = tile(26, 0) + ComputeTile2 = tile(30, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile1, ComputeTile2, 2, memRef_ty, plio=True) + of_out = object_fifo("out", ComputeTile2, ShimTile2, 2, memRef_ty) + object_fifo_link(of_in, of_out) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + for _ in for_(sys.maxsize): + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, 
sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_passthrough() diff --git a/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py b/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py new file mode 100644 index 0000000000..925b86f6da --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/aie2-output-plio.py @@ -0,0 +1,64 @@ +# passthrough_dmas_plio/aie2-output-plio.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +N = 1024 + +if len(sys.argv) > 1: + N = int(sys.argv[1]) + +dev = AIEDevice.xcvc1902 + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(dev) + def device_body(): + memRef_ty = T.memref(1024, T.i32()) + + # Tile declarations + ShimTile1 = tile(26, 0) + ShimTile2 = tile(30, 0) + ComputeTile2 = tile(30, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile1, ComputeTile2, 2, memRef_ty) + of_out = object_fifo( + "out", ComputeTile2, ShimTile2, 2, memRef_ty, plio=True + ) + object_fifo_link(of_in, of_out) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + for _ in for_(sys.maxsize): + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, 
channel=0) + + print(ctx.module) + + +my_passthrough() diff --git a/programming_examples/basic/passthrough_dmas_plio/run_vck5000.lit b/programming_examples/basic/passthrough_dmas_plio/run_vck5000.lit new file mode 100644 index 0000000000..0d392dd2ef --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/run_vck5000.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: hsa +// +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile all +// RUN: %run_on_vck5000 ./input.elf +// RUN: %run_on_vck5000 ./output.elf diff --git a/programming_examples/basic/passthrough_dmas_plio/test_vck5000.cpp b/programming_examples/basic/passthrough_dmas_plio/test_vck5000.cpp new file mode 100644 index 0000000000..7ab2cb3e7e --- /dev/null +++ b/programming_examples/basic/passthrough_dmas_plio/test_vck5000.cpp @@ -0,0 +1,134 @@ +//===- test_vck5000.cpp -----------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memory_allocator.h" +#include "test_library.h" + +#include "aie_data_movement.cpp" +#include "aie_inc.cpp" + +#include "hsa/hsa.h" +#include "hsa/hsa_ext_amd.h" + +constexpr int DMA_COUNT = 1024; + +void hsa_check_status(const std::string func_name, hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char *status_string(new char[1024]); + hsa_status_string(status, &status_string); + std::cout << func_name << " failed: " << status_string << std::endl; + delete[] status_string; + } else { + std::cout << func_name << " success" << std::endl; + } +} + +int main(int argc, char *argv[]) { + uint64_t row = 0; + uint64_t col = 6; + + std::vector queues; + uint32_t aie_max_queue_size(0); + + aie_libxaie_ctx_t *xaie = mlir_aie_init_libxaie(); + + // This is going to initialize HSA, create a queue + // and get an agent + int ret = mlir_aie_init_device(xaie); + + if (ret) { + std::cout << "[ERROR] Error when calling mlir_aie_init_device)" + << std::endl; + return -1; + } + + // Getting access to all of the HSA agents + std::vector agents = xaie->agents; + + if (agents.empty()) { + std::cout << "No agents found. Exiting." 
<< std::endl; + return -1; + } + + std::cout << "Found " << agents.size() << " agents" << std::endl; + + hsa_queue_t *q = xaie->cmd_queue; + + // Adding to our vector of queues + queues.push_back(q); + assert(queues.size() > 0 && "No queues were sucesfully created!"); + + mlir_aie_configure_cores(xaie); + mlir_aie_configure_switchboxes(xaie); + mlir_aie_initialize_locks(xaie); + mlir_aie_configure_dmas(xaie); + mlir_aie_start_cores(xaie); + + // Allocating some device memory + ext_mem_model_t buf0, buf1, buf2; + uint32_t *in_a = (uint32_t *)mlir_aie_mem_alloc(xaie, buf0, DMA_COUNT); + uint32_t *in_b = (uint32_t *)mlir_aie_mem_alloc(xaie, buf1, DMA_COUNT); + uint32_t *out = (uint32_t *)mlir_aie_mem_alloc(xaie, buf2, DMA_COUNT); + mlir_aie_sync_mem_dev(buf0); + mlir_aie_sync_mem_dev(buf1); + mlir_aie_sync_mem_dev(buf2); + + if (in_a == nullptr || in_b == nullptr || out == nullptr) { + std::cout << "Could not allocate in device memory" << std::endl; + return -1; + } + + for (int i = 0; i < DMA_COUNT; i++) { + in_a[i] = i + 1; + in_b[i] = i + 1; + out[i] = 0xdeface; + } + + // Pass arguments in the order of dma_memcpys in the mlir + invoke_data_movement(queues[0], &agents[0], out, in_a); + + int errors = 0; + + for (int i = 0; i < DMA_COUNT; i++) { + uint32_t s = in_a[i]; + uint32_t d = out[i]; + if (d != s) { + errors++; + printf("mismatch %x != %x\n", d, s); + } + } + + // destroying the queue + hsa_queue_destroy(queues[0]); + + // Shutdown AIR and HSA + mlir_aie_deinit_libxaie(xaie); + + if (!errors) { + printf("PASS!\n"); + return 0; + } else { + printf("fail %d/%d.\n", errors, DMA_COUNT); + return -1; + } +} diff --git a/programming_examples/basic/passthrough_kernel/README.md b/programming_examples/basic/passthrough_kernel/README.md index 4ba311a690..c7e2446761 100644 --- a/programming_examples/basic/passthrough_kernel/README.md +++ b/programming_examples/basic/passthrough_kernel/README.md @@ -54,9 +54,9 @@ This design performs a memcpy operation on a vector of 
input data. The AIE desig 1. **Core Definition:** The `core_body` function loops through sub-vectors of the input data, acquiring elements from `of_in`, processing using `passThroughLine`, and outputting the result to `of_out`. -1. **Data Movement Configuration:** The `sequence` function configures data movement and synchronization on the `ShimTile` for input and output buffer management. +1. **Data Movement Configuration:** The `aie.runtime_sequence` operation configures data movement and synchronization on the `ShimTile` for input and output buffer management. -1. **Tracing Configuration (Optional):** Trace control, event groups, and buffer descriptors are set up in the `sequence` function when tracing is enabled. +1. **Tracing Configuration (Optional):** Trace control, event groups, and buffer descriptors are set up in the `aie.runtime_sequence` operation when tracing is enabled. 1. **Generate the design:** The `passthroughKernel()` function triggers the code generation process. The final print statement outputs the MLIR representation of the AIE array configuration. 
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index fcd6c84632..39ef9106bd 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -59,7 +59,7 @@ def core_body(): tensor_ty = T.memref(N, T.ui8()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, outTensor, notUsed): if trace_size > 0: trace_utils.configure_simple_tracing_aie2( diff --git a/programming_examples/basic/row_wise_bias_add/README.md b/programming_examples/basic/row_wise_bias_add/README.md index 5041106346..0912f3f6d2 100644 --- a/programming_examples/basic/row_wise_bias_add/README.md +++ b/programming_examples/basic/row_wise_bias_add/README.md @@ -21,7 +21,7 @@ The data movement and call into the kernel (see below) is described in `aie2.py` A single AIE core is configured to process chunks of `m`×`n` of `in` and chunks of `n` of `bias` to produce `m`×`n` chunks of output. Therefore, the output is tiled into `M/m`×`N/n` tiles, and the kernel function is called that number of times. To avoid unnecessarily reloading the `bias` vector, we iterate through these tiles in a column-major fashion. -The `strides` and `sizes` in the `sequence` function describe this column-major iteration. +The `strides` and `sizes` in the `aie.runtime_sequence` operation describe this column-major iteration. 
## Kernel diff --git a/programming_examples/basic/row_wise_bias_add/aie2.py b/programming_examples/basic/row_wise_bias_add/aie2.py index ca9a588582..dfe8a5ac14 100644 --- a/programming_examples/basic/row_wise_bias_add/aie2.py +++ b/programming_examples/basic/row_wise_bias_add/aie2.py @@ -57,9 +57,7 @@ def core_body(): yield_([]) yield_([]) - @FuncOp.from_py_func( - complete_in_memref, complete_bias_memref, complete_out_memref - ) + @runtime_sequence(complete_in_memref, complete_bias_memref, complete_out_memref) def sequence(inp, bias, out): npu_dma_memcpy_nd( metadata=in_fifo.sym_name.value, diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py index 87c8f33c31..14ed048f67 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -100,7 +100,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): npu_dma_memcpy_nd(metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py index fe6f049984..cb0e26a866 100644 --- a/programming_examples/basic/vector_reduce_add/aie2.py +++ b/programming_examples/basic/vector_reduce_add/aie2.py @@ -66,7 +66,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py index 31ee9f181a..bc5f30e34c 100644 --- 
a/programming_examples/basic/vector_reduce_max/aie2.py +++ b/programming_examples/basic/vector_reduce_max/aie2.py @@ -66,7 +66,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py index 430ad5f9ef..c39b9f5ae9 100644 --- a/programming_examples/basic/vector_reduce_min/aie2.py +++ b/programming_examples/basic/vector_reduce_min/aie2.py @@ -66,7 +66,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py index 46b44308b6..754f38c584 100644 --- a/programming_examples/basic/vector_scalar_add/aie2.py +++ b/programming_examples/basic/vector_scalar_add/aie2.py @@ -20,7 +20,6 @@ def my_vector_bias_add(): - @device(AIEDevice.npu1_1col) def device_body(): memRef_mem_tile_ty = T.memref(MEM_TILE_WIDTH, T.i32()) @@ -63,7 +62,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(PROBLEM_SIZE, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(inTensor, outTensor): npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, PROBLEM_SIZE] diff --git a/programming_examples/basic/vector_scalar_add_runlist/aie2.py b/programming_examples/basic/vector_scalar_add_runlist/aie2.py index 46b44308b6..754f38c584 
100644 --- a/programming_examples/basic/vector_scalar_add_runlist/aie2.py +++ b/programming_examples/basic/vector_scalar_add_runlist/aie2.py @@ -20,7 +20,6 @@ def my_vector_bias_add(): - @device(AIEDevice.npu1_1col) def device_body(): memRef_mem_tile_ty = T.memref(MEM_TILE_WIDTH, T.i32()) @@ -63,7 +62,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(PROBLEM_SIZE, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(inTensor, outTensor): npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, PROBLEM_SIZE] diff --git a/programming_examples/basic/vector_scalar_mul/README.md b/programming_examples/basic/vector_scalar_mul/README.md index 0923a99a62..016f175577 100644 --- a/programming_examples/basic/vector_scalar_mul/README.md +++ b/programming_examples/basic/vector_scalar_mul/README.md @@ -54,9 +54,9 @@ This design performs a memcpy operation on a vector of input data. The AIE desig 1. **Core Definition:** The `core_body` function loops through sub-vectors of the input data, acquiring elements from `of_in`, processing using `vector_scalar_mul_aie_scalar()` or `vector_scalar_mul_aie()`, and outputting the result to `of_out`. -1. **Data Movement Configuration:** The `sequence` function configures data movement and synchronization on the `ShimTile` for input and output buffer management. +1. **Data Movement Configuration:** The `aie.runtime_sequence` operation configures data movement and synchronization on the `ShimTile` for input and output buffer management. -1. **Tracing Configuration (Optional):** Trace control, event groups, and buffer descriptors are set up in the `sequence` function when tracing is enabled. +1. **Tracing Configuration (Optional):** Trace control, event groups, and buffer descriptors are set up in the `aie.runtime_sequence` operation when tracing is enabled. 1. 
**Generate the design:** The `my_vector_scalar()` function triggers the code generation process. The final print statement outputs the MLIR representation of the AIE array configuration. diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index b0a957393b..dd02a1010c 100644 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -83,7 +83,7 @@ def core_body(): tensor_ty = T.memref(N, T.i16()) scalar_ty = T.memref(1, T.i32()) - @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): if trace_size > 0: diff --git a/programming_examples/basic/vector_vector_add/Makefile b/programming_examples/basic/vector_vector_add/Makefile index 8f9367d49a..c1b116d226 100755 --- a/programming_examples/basic/vector_vector_add/Makefile +++ b/programming_examples/basic/vector_vector_add/Makefile @@ -25,6 +25,7 @@ build/aie.mlir: ${srcdir}/aie2.py build/final.xclbin: build/aie.mlir mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --no-xchesscc --no-xbridge --peano ${PEANO_DIR} \ --xclbin-name=${@F} --npu-insts-name=insts.txt ${Vector Vector Add -A simple binary operator, which uses a single AIE core to add two vectors together. The overall vector size in this design is `256` and it processed by the core in smaller sub tiles of size `16`. It shows how simple it can be to just feed data into the AIEs using the Object FIFO abstraction, and drain the results back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000. +A simple binary operator, which uses a single AIE core to add two vectors together. The overall vector size in this design is `256` and it processed by the core in smaller sub tiles of size `16`. 
It shows how simple it can be to just feed data into the AIEs using the Object FIFO abstraction, and drain the results back to external memory. This reference design can be run on either a Ryzen™ AI NPU or a VCK5000. + +> **NOTE:** This example makes use of the [Peano](https://github.com/Xilinx/llvm-aie) compiler to compile an elf executed on the AIE core. Peano is an open source single-core compiler for AI Engines. It is a fork of LLVM extended to support the in-order, exposed-pipeline VLIW processor core. The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory. diff --git a/programming_examples/basic/vector_vector_add/aie2.py b/programming_examples/basic/vector_vector_add/aie2.py index 62ad20534c..2a3595a754 100644 --- a/programming_examples/basic/vector_vector_add/aie2.py +++ b/programming_examples/basic/vector_vector_add/aie2.py @@ -76,7 +76,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_vector_add/run_makefile.lit b/programming_examples/basic/vector_vector_add/run_makefile.lit index 6875524001..de53923059 100644 --- a/programming_examples/basic/vector_vector_add/run_makefile.lit +++ b/programming_examples/basic/vector_vector_add/run_makefile.lit @@ -1,7 +1,7 @@ // (c) Copyright 2024 Advanced Micro Devices, Inc. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: ryzen_ai, chess +// REQUIRES: ryzen_ai, peano // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile diff --git a/programming_examples/basic/vector_vector_add/run_vck5000.lit b/programming_examples/basic/vector_vector_add/run_vck5000.lit index dcaa9f99c5..c1411b0c91 100644 --- a/programming_examples/basic/vector_vector_add/run_vck5000.lit +++ b/programming_examples/basic/vector_vector_add/run_vck5000.lit @@ -1,7 +1,7 @@ // (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: hsa, chess +// REQUIRES: hsa, peano // // RUN: make -f %S/Makefile clean // RUN: make -f %S/Makefile vck5000 diff --git a/programming_examples/basic/vector_vector_modulo/aie2.py b/programming_examples/basic/vector_vector_modulo/aie2.py index 83d5675e85..eb3e8f8d03 100644 --- a/programming_examples/basic/vector_vector_modulo/aie2.py +++ b/programming_examples/basic/vector_vector_modulo/aie2.py @@ -76,7 +76,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/basic/vector_vector_mul/aie2.py b/programming_examples/basic/vector_vector_mul/aie2.py index fa07bbe58a..414d62fa26 100644 --- a/programming_examples/basic/vector_vector_mul/aie2.py +++ b/programming_examples/basic/vector_vector_mul/aie2.py @@ -76,7 +76,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) 
npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index 73710e499a..f8db29952e 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -512,7 +512,7 @@ def core_body(): activationsInL3_ty = MemRefType.get((activationsIn,), int8_ty) weightsInL3_ty = MemRefType.get((totalWeights,), uint8_ty) - @FuncOp.from_py_func(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) + @runtime_sequence(activationsInL3_ty, weightsInL3_ty, activationsInL3_ty) def sequence(inputFromL3, weightsFromL3, outputToL3): if enableTrace: diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py index b6fb537a26..7b7e72c227 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -142,7 +142,7 @@ def core_body(): memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) - @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) + @runtime_sequence(tensor_ty, memRef_wts_ty, tensor_ty) def sequence(I, W, O): NpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10) diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index af0afd6a43..2b53fbd983 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -149,7 +149,7 @@ def core_body(): memRef_wts_ty = T.memref(weights, T.i8()) # memRef_16x16_ty = T.memref(16, 16, T.i32()) - @FuncOp.from_py_func(tensor_ty, memRef_wts_ty, tensor_ty) + @runtime_sequence(tensor_ty, memRef_wts_ty, tensor_ty) def sequence(I, W, O): if enableTrace: # 0x340D0: Trace Control 0 diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py index 4d0716fa1c..b91caa1fe6 100644 --- a/programming_examples/ml/eltwise_add/aie2.py +++ 
b/programming_examples/ml/eltwise_add/aie2.py @@ -127,7 +127,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): if trace_size > 0: diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py index 4966ecd06e..4eb2eeac8b 100644 --- a/programming_examples/ml/eltwise_mul/aie2.py +++ b/programming_examples/ml/eltwise_mul/aie2.py @@ -128,7 +128,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): if trace_size > 0: diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py index 2d62135f27..88c9c2860a 100644 --- a/programming_examples/ml/relu/aie2.py +++ b/programming_examples/ml/relu/aie2.py @@ -104,7 +104,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): if trace_size > 0: diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir index 856baf8b2d..d2e8540dd1 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir +++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir @@ -880,7 +880,7 @@ aie.device(npu1_3col) { } { link_with="conv2dk1_skip.o" } - func.func @sequence(%in0 : memref<16384xi32>, %wts0 : memref<53248xi32>, %out : memref<65536xi32>) { + aiex.runtime_sequence(%in0 : memref<16384xi32>, %wts0 : memref<53248xi32>, %out : memref<65536xi32>) { // Trace output // Trace_Event0, Trace_Event1: Select which events to trace. 
@@ -1006,7 +1006,6 @@ aie.device(npu1_3col) { aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0, 1]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32> aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 729ed2b0fb..1178b8e237 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -918,7 +918,7 @@ def core_body(): weightsInL3_ty_complete = MemRefType.get((totalWeights_complete,), int8_ty) - @FuncOp.from_py_func( + @runtime_sequence( activationsInL3_ty, weightsInL3_ty_complete, activationsOutL3_ty ) def sequence(inputFromL3, weightsFromL3, outputToL3): diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py index 812bd71781..56fd9069f2 100755 --- a/programming_examples/ml/softmax/aie2.py +++ b/programming_examples/ml/softmax/aie2.py @@ -107,7 +107,7 @@ def core_body(): # To/from AIE-array data movement tensor_ty = T.memref(N, T.bf16()) - @FuncOp.from_py_func(tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty) def sequence(A, C): if trace_size > 0: diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py index 9a66785bbb..026aa0a00f 100644 --- a/programming_examples/vision/color_detect/aie2_colorDetect.py +++ b/programming_examples/vision/color_detect/aie2_colorDetect.py @@ -249,7 +249,7 @@ def coreBody(): T.i32(), ) - @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty) + @runtime_sequence(tensor_ty, memRef_16x16_ty, tensor_ty) def sequence(I, B, O): npu_dma_memcpy_nd( metadata="inOF_L3L2", diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py 
b/programming_examples/vision/color_threshold/aie2_colorThreshold.py index fa067226dc..0cda1f7cc3 100644 --- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py +++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py @@ -248,7 +248,7 @@ def core_body(): tensorSize = width * height - @FuncOp.from_py_func( + @runtime_sequence( T.memref(tensorSize, T.i8()), T.memref(32, T.i32()), # not used T.memref(tensorSize, T.i8()), diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py index 3e095e356d..933f5404f7 100644 --- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py +++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py @@ -296,7 +296,7 @@ def core_body(): tensor_ty = T.memref(tensorSize, T.i8()) memRef_16x16_ty = T.memref(16, 16, T.i32()) - @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty) + @runtime_sequence(tensor_ty, memRef_16x16_ty, tensor_ty) def sequence(I, B, O): npu_dma_memcpy_nd( metadata="outOF_L2L3", diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index 6c3d1f4b51..abadaa785e 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -69,7 +69,7 @@ def core_body(): tensorSize = width * height tensor_ty = T.memref(tensorSize, T.i8()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, notUsed, outTensor): if enableTrace: # Trace output diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir index a9aa8a969d..cdaa8c1f35 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir @@ 
-46,7 +46,7 @@ module @passThroughLine_aie2 { aie.end } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core - func.func @sequence(%in : memref<518400xi32>, %arg1 : memref<1xi32>, %out : memref<518400xi32>) { + aiex.runtime_sequence(%in : memref<518400xi32>, %arg1 : memref<1xi32>, %out : memref<518400xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %tileheight = arith.constant 1080 : i64 @@ -56,7 +56,6 @@ module @passThroughLine_aie2 { aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32> aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir index 711cb7ff16..1bc1baccf4 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir @@ -46,7 +46,7 @@ module @passThroughLine_aie2 { aie.end } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core - func.func @sequence(%in : memref<2073600xi32>, %arg1 : memref<1xi32>, %out : memref<2073600xi32>) { + aiex.runtime_sequence(%in : memref<2073600xi32>, %arg1 : memref<1xi32>, %out : memref<2073600xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %tileheight = arith.constant 1080 : i64 @@ -57,7 +57,6 @@ module @passThroughLine_aie2 { aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32> 
aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir index 507a356b2a..fef2f487b1 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir @@ -46,7 +46,7 @@ module @passThroughLine_aie2 { aie.end } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core - func.func @sequence(%in : memref<1152xi32>, %arg1 : memref<1xi32>, %out : memref<1152xi32>) { + aiex.runtime_sequence(%in : memref<1152xi32>, %arg1 : memref<1xi32>, %out : memref<1152xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %tileheight = arith.constant 9 : i64 @@ -56,7 +56,6 @@ module @passThroughLine_aie2 { aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32> aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth, %c1]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } } diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py index 6925e6bd2d..0a0c17f1eb 100644 --- a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py +++ 
b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py @@ -52,7 +52,7 @@ def core_body(): memRef_48_ty = T.memref(48, T.i32()) - @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty) + @runtime_sequence(memRef_48_ty, memRef_48_ty, memRef_48_ty) def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="out", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48] diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py index 989808392c..4267abd903 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py @@ -56,7 +56,7 @@ def core_body(): memRef_48_ty = T.memref(48, T.i32()) # To/from AIE-array data movement - @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty) + @runtime_sequence(memRef_48_ty, memRef_48_ty, memRef_48_ty) def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48] diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py index b8c264ea28..0e77a63a2c 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py +++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py @@ -93,7 +93,7 @@ def core_body(): memRef_48_ty = T.memref(48, T.i32()) - @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty) + @runtime_sequence(memRef_48_ty, memRef_48_ty, memRef_48_ty) def sequence(inTensor, notUsed, outTensor): npu_dma_memcpy_nd( metadata="out", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48] diff --git a/programming_guide/section-2/section-2g/README.md b/programming_guide/section-2/section-2g/README.md index 0af27515d9..f6f35c042a 100644 --- 
a/programming_guide/section-2/section-2g/README.md +++ b/programming_guide/section-2/section-2g/README.md @@ -23,7 +23,7 @@ In the preceding sections, we looked at how we can describe data movement between tiles *within* the AIE-array. However, to do anything useful, we need to get data from outside the array, i.e., from the "host", into the AIE-array and back. On NPU devices, we can achieve this with the operations described in this section. -The operations that will be described in this section must be placed in a separate `sequence` function. The arguments to this function describe buffers that will be available on the host side; the body of the function describes how those buffers are moved into the AIE-array. [Section 3](../../section-3/) contains an example. +The operations that will be described in this section must be placed in a separate `aie.runtime_sequence` operation. The arguments to this function describe buffers that will be available on the host side; the body of the function describes how those buffers are moved into the AIE-array. [Section 3](../../section-3/) contains an example. 
### Guide to Managing Runtime Data Movement to/from Host Memory diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md index 0e1f44cabc..e46da9dc13 100644 --- a/programming_guide/section-3/README.md +++ b/programming_guide/section-3/README.md @@ -74,7 +74,7 @@ We also need to set up the data movement to/from the AIE-array: configure n-dime tensor_ty = T.memref(4096, T.i32()) scalar_ty = T.memref(1, T.i32()) - @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) diff --git a/programming_guide/section-3/aie2.py b/programming_guide/section-3/aie2.py index fc23ff6b17..198eaba58e 100644 --- a/programming_guide/section-3/aie2.py +++ b/programming_guide/section-3/aie2.py @@ -17,7 +17,6 @@ def my_vector_scalar(): - @device(AIEDevice.npu1_1col) def device_body(): memRef_ty = T.memref(1024, T.i32()) @@ -61,7 +60,7 @@ def core_body(): tensor_ty = T.memref(4096, T.i32()) scalar_ty = T.memref(1, T.i32()) - @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py index a9b4b70ab5..3cde8754c2 100644 --- a/programming_guide/section-4/section-4a/aie2.py +++ b/programming_guide/section-4/section-4a/aie2.py @@ -17,7 +17,6 @@ def my_vector_scalar(): - @device(AIEDevice.npu1_1col) def device_body(): memRef_ty = T.memref(1024, T.i32()) @@ -61,7 +60,7 @@ def core_body(): tensor_ty = T.memref(4096, T.i32()) scalar_ty = T.memref(1, T.i32()) - @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + 
@runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) diff --git a/programming_guide/section-4/section-4b/README.md b/programming_guide/section-4/section-4b/README.md index f62b787280..ea24f04ee5 100644 --- a/programming_guide/section-4/section-4b/README.md +++ b/programming_guide/section-4/section-4b/README.md @@ -44,7 +44,7 @@ The first necessary component for trace configuration is setting the right value offset=tensorSize, ) ``` -This block is defined within the sequence definition for `@FuncOp.from_py_func` where we define the shimDMA data movement to the 3 inout buffers. +This block is defined within the sequence definition for `@runtime_sequence` where we define the shimDMA data movement to the 3 inout buffers. > **Note** This simplification works very well for the trace buffer from a single tile to the shimDMA. However, if we want to do something more advaned like allocating the trace buffer from multiple tiles into a single larger buffer, this function will not be able to express that. For that, please consult the [README](../../../python/utils) under `python/utils` for more guidance on how to customize the trace configuration. 
### (1b) Define trace event routes from tile to shimDMA diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index 910d4b1a94..2f02e67d6f 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -68,7 +68,7 @@ def core_body(): tensor_ty = T.memref(4096, T.i32()) scalar_ty = T.memref(1, T.i32()) - @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): if enableTrace: trace_utils.configure_simple_tracing_aie2( diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index c4e381ee1a..f52a39ef9c 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -63,6 +63,8 @@ .expand_strided_metadata() .lower_affine() .convert_math_to_llvm() + .convert_index_to_llvm() + .arith_expand() .convert_arith_to_llvm() .finalize_memref_to_llvm() .convert_func_to_llvm(use_bare_ptr_memref_call_conv=True) @@ -583,7 +585,7 @@ async def process_xclbin_gen(self): if opts.xclbin_input: await self.do_call(task, ["xclbinutil", "--dump-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_input_partition.json"), - "--force", "--input", opts.xclbin_input]) + "--force", "--quiet", "--input", opts.xclbin_input]) with open(self.prepend_tmp("aie_input_partition.json")) as f: input_partition = json.load(f) with open(self.prepend_tmp("aie_partition.json")) as f: @@ -598,7 +600,7 @@ async def process_xclbin_gen(self): await self.do_call(task, ["xclbinutil"] + flag + ["--add-kernel", self.prepend_tmp("kernels.json"), "--add-replace-section", "AIE_PARTITION:JSON:" + self.prepend_tmp("aie_partition.json"), - "--force", "--output", opts.xclbin_name]) + "--force", "--quiet", "--output", opts.xclbin_name]) # fmt: on async def process_host_cgen(self, aie_target, file_with_addresses): diff --git a/python/dialects/aie.py b/python/dialects/aie.py index cd02b3381e..0018c7c919 100644 
--- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -108,7 +108,7 @@ def bd_dim_layout_array_attr_builder( if isinstance(tups, list) and all(isinstance(t, tuple) for t in tups): tups = list(map(lambda t: bd_dim_layout(*t), tups)) return Attribute.parse( - f'#aie', context=context + f'#aie', context=context ) @@ -116,7 +116,8 @@ def bd_dim_layout_array_attr_builder( def bd_dim_layout_array_array_attr_builder(tup_arrs: List[List[tuple]], context=None): tup_arrs = list(map(bd_dim_layout_array_attr_builder, tup_arrs)) return Attribute.parse( - f'#aie', context=context + f'#aie', + context=context, ) @@ -211,6 +212,7 @@ def __init__( dimensionsToStream=None, dimensionsFromStreamPerConsumer=None, via_DMA=None, + plio=None, ): self.datatype = datatype if not isinstance(consumerTiles, List): @@ -230,6 +232,7 @@ def __init__( dimensionsToStream=dimensionsToStream, dimensionsFromStreamPerConsumer=dimensionsFromStreamPerConsumer, via_DMA=via_DMA, + plio=plio, ) def acquire(self, port, num_elem): diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py index 350a2aca93..e01e732c65 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -4,7 +4,7 @@ from functools import partial import itertools from operator import itemgetter -from typing import Union +from typing import Union, Optional import numpy as np @@ -23,7 +23,7 @@ from .transform.structured import MixedValues, _dispatch_mixed_values from .._mlir_libs import get_dialect_registry from .._mlir_libs._aie import * -from ..ir import DictAttr, IntegerAttr, UnitAttr +from ..ir import DictAttr, IntegerAttr, UnitAttr, Type, InsertionPoint # noinspection PyUnresolvedReferences from ..extras.dialects.ext import memref @@ -46,6 +46,7 @@ def __init__( offsets: MixedValues = None, sizes: MixedValues = None, strides: MixedValues = None, + issue_token: Optional[bool] = None, ): x = 0 y = 0 @@ -74,6 +75,7 @@ def __init__( static_strides, metadata, bd_id, + issue_token=issue_token, ) @@ -759,3 +761,17 @@ def 
_find_next_channel(used_channels): if len(flows) == 1: flows = flows[0] return flows + + +# Runtime sequence + + +def runtime_sequence(*inputs: Type): + def decorator(f): + seq_op = RuntimeSequenceOp() + entry_block = seq_op.body.blocks.append(*inputs) + args = entry_block.arguments + with InsertionPoint(entry_block): + f(*args) + + return decorator diff --git a/python/utils/README.md b/python/utils/README.md index e2c9130ab5..d84e0ecbae 100644 --- a/python/utils/README.md +++ b/python/utils/README.md @@ -130,7 +130,7 @@ configure_tracing_aie2( `PortEvent` is defined in `aie.utils.trace` and `CoreEvent` is defined in `aie.utils.trace_events_enum`. ### Configure tile trace settings -Within the `func.func @sequence` block, we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA. +Within the `aiex.runtime_sequence` block, we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA. For a give AIE2 tile, we configure the trace control registers for the tile core and tile memory separately. There are 4 registers we generally use to configure the trace unit behavior. 2 are for configuring the general trace control and the other 2 are to specify which events our tile's trace hardware is monitoring. 
diff --git a/python/utils/xrt.py b/python/utils/xrt.py index 7319edd872..b1a1e00529 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -55,7 +55,7 @@ def call(self): h = self.kernel( opcode, self.insts_buffer.bo, - self.n_insts * 4, + self.n_insts, *[b.bo for b in self.buffers if b is not None], ) return h diff --git a/test/Conversion/AIEVecToLLVM/mac16_i16.mlir b/test/Conversion/AIEVecToLLVM/mac16_i16.mlir index 2b4d050554..0f24799f1c 100644 --- a/test/Conversion/AIEVecToLLVM/mac16_i16.mlir +++ b/test/Conversion/AIEVecToLLVM/mac16_i16.mlir @@ -1,7 +1,7 @@ // RUN: aie-opt %s --convert-aievec-to-llvm | FileCheck %s module { func.func @mac16(%arg0: vector<32xi16>, %arg1: vector<16xi16>, %arg2: vector<16xi48>) { - %0 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "1", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %0 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "1", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> return } } diff --git a/test/Conversion/AIEVecToLLVM/test-mac_elem.mlir b/test/Conversion/AIEVecToLLVM/test-mac_elem.mlir new file mode 100644 index 0000000000..7a8cd165eb --- /dev/null +++ b/test/Conversion/AIEVecToLLVM/test-mac_elem.mlir @@ -0,0 +1,89 @@ +// RUN: aie-opt %s -convert-aievec-to-llvm -split-input-file | FileCheck %s + +// CHECK-LABEL: mac_flat_vec +// CHECK-SAME: %[[V0:[a-zA-Z0-9]+]]: vector<16xbf16>, +// CHECK-SAME: %[[V1:.*]]: vector<16xbf16>, +// CHECK-SAME: %[[V2:.*]]: vector<16xf32>) +func.func @mac_flat_vec(%v0 : vector<16xbf16>, + %v1 : vector<16xbf16>, + %v2 : vector<16xf32>) -> vector<16xf32> { + // CHECK: %[[C60:.*]] = llvm.mlir.constant(60 : i32) : i32 + + // CHECK: %[[C0_0:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BV0:.*]] = llvm.bitcast 
%[[V0]] + // CHECK-SAME: : vector<16xbf16> to vector<8xi32> + // CHECK: %[[BV02C:.*]] = "xllvm.intr.aie2.set.I512.I256"(%[[BV0]], %[[C0_0]]) + // CHECK-SAME: : (vector<8xi32>, i32) -> vector<16xi32> + // CHECK: %[[V02C:.*]] = llvm.bitcast %[[BV02C]] + // CHECK-SAME: : vector<16xi32> to vector<32xbf16> + + // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BV1:.*]] = llvm.bitcast %[[V1]] + // CHECK-SAME: : vector<16xbf16> to vector<8xi32> + // CHECK: %[[BV12C:.*]] = "xllvm.intr.aie2.set.I512.I256"(%[[BV1]], %[[C0_1]]) + // CHECK-SAME: : (vector<8xi32>, i32) -> vector<16xi32> + // CHECK: %[[V12C:.*]] = llvm.bitcast %[[BV12C]] + // CHECK-SAME: : vector<16xi32> to vector<32xbf16> + + // CHECK: %[[BV2:.*]] = llvm.bitcast %[[V2]] + // CHECK-SAME: : vector<16xf32> to vector<8xi64> + // CHECK: %[[BRS:.*]] = "xllvm.intr.aie2.bf.mac16.conf"(%[[V02C]], %[[V12C]], + // CHECK-SAME: %[[BV2]], %[[C60]]) + // CHECK-SAME: : (vector<32xbf16>, vector<32xbf16>, + // CHECK-SAME: vector<8xi64>, i32) -> vector<8xi64> + // CHECK: %[[RS:.*]] = llvm.bitcast %[[BRS]] + // CHECK-SAME: : vector<8xi64> to vector<16xf32> + + %0 = aievec.mac_elem %v0, %v1, %v2 : vector<16xbf16>, vector<16xbf16>, vector<16xf32> + return %0 : vector<16xf32> +} + +// ----- + +// CHECK-LABEL: mac_2d_vec +// CHECK-SAME: %[[V02D:[a-zA-Z0-9]+]]: vector<4x4xbf16>, +// CHECK-SAME: %[[V12D:.*]]: vector<4x4xbf16>, +// CHECK-SAME: %[[V22D:.*]]: vector<4x4xf32>) +func.func @mac_2d_vec(%v0 : vector<4x4xbf16>, + %v1 : vector<4x4xbf16>, + %v2 : vector<4x4xf32>) -> vector<4x4xf32> { + // CHECK: %[[V0:.*]] = vector.shape_cast %[[V02D]] + // CHECK-SAME: : vector<4x4xbf16> to vector<16xbf16> + // CHECK: %[[V1:.*]] = vector.shape_cast %[[V12D]] + // CHECK-SAME: : vector<4x4xbf16> to vector<16xbf16> + // CHECK: %[[V2:.*]] = vector.shape_cast %[[V22D]] + // CHECK-SAME: : vector<4x4xf32> to vector<16xf32> + + // CHECK: %[[C60:.*]] = llvm.mlir.constant(60 : i32) : i32 + + // CHECK: %[[C0_0:.*]] = 
llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BV0:.*]] = llvm.bitcast %[[V0]] + // CHECK-SAME: : vector<16xbf16> to vector<8xi32> + // CHECK: %[[BV02C:.*]] = "xllvm.intr.aie2.set.I512.I256"(%[[BV0]], %[[C0_0]]) + // CHECK-SAME: : (vector<8xi32>, i32) -> vector<16xi32> + // CHECK: %[[V02C:.*]] = llvm.bitcast %[[BV02C]] + // CHECK-SAME: : vector<16xi32> to vector<32xbf16> + + // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[BV1:.*]] = llvm.bitcast %[[V1]] + // CHECK-SAME: : vector<16xbf16> to vector<8xi32> + // CHECK: %[[BV12C:.*]] = "xllvm.intr.aie2.set.I512.I256"(%[[BV1]], %[[C0_1]]) + // CHECK-SAME: : (vector<8xi32>, i32) -> vector<16xi32> + // CHECK: %[[V12C:.*]] = llvm.bitcast %[[BV12C]] + // CHECK-SAME: : vector<16xi32> to vector<32xbf16> + + // CHECK: %[[BV2:.*]] = llvm.bitcast %[[V2]] + // CHECK-SAME: : vector<16xf32> to vector<8xi64> + // CHECK: %[[BRS:.*]] = "xllvm.intr.aie2.bf.mac16.conf"(%[[V02C]], %[[V12C]], + // CHECK-SAME: %[[BV2]], %[[C60]]) + // CHECK-SAME: : (vector<32xbf16>, vector<32xbf16>, + // CHECK-SAME: vector<8xi64>, i32) -> vector<8xi64> + // CHECK: %[[RS:.*]] = llvm.bitcast %[[BRS]] + // CHECK-SAME: : vector<8xi64> to vector<16xf32> + + // CHECK: %[[RS2D:.*]] = vector.shape_cast %[[RS]] + // CHECK-SAME: : vector<16xf32> to vector<4x4xf32> + + %0 = aievec.mac_elem %v0, %v1, %v2 : vector<4x4xbf16>, vector<4x4xbf16>, vector<4x4xf32> + return %0 : vector<4x4xf32> +} \ No newline at end of file diff --git a/test/Conversion/AIEVecToLLVM/test-mac_intrinsic_names.mlir b/test/Conversion/AIEVecToLLVM/test-mac_intrinsic_names.mlir index f1e18c10a1..0572de9d14 100644 --- a/test/Conversion/AIEVecToLLVM/test-mac_intrinsic_names.mlir +++ b/test/Conversion/AIEVecToLLVM/test-mac_intrinsic_names.mlir @@ -10,9 +10,9 @@ module { %v16i32 = llvm.mlir.undef : vector<16xi32> %v8i32 = llvm.mlir.undef : vector<8xi32> %v8i80 = llvm.mlir.undef : vector<8xi80> - %0 = aievec.mac %v32i16, %v16i16, %v16i48 : vector<32xi16>, vector<16xi16>, 
vector<16xi48> - %1 = aievec.mac %v64i8, %v32i8, %v8i48 : vector<64xi8>, vector<32xi8>, vector<8xi48> - %2 = aievec.mac %v16i32, %v8i32, %v8i80 : vector<16xi32>, vector<8xi32>, vector<8xi80> + %0 = aievec_aie1.mac %v32i16, %v16i16, %v16i48 : vector<32xi16>, vector<16xi16>, vector<16xi48> + %1 = aievec_aie1.mac %v64i8, %v32i8, %v8i48 : vector<64xi8>, vector<32xi8>, vector<8xi48> + %2 = aievec_aie1.mac %v16i32, %v8i32, %v8i80 : vector<16xi32>, vector<8xi32>, vector<8xi80> return } } diff --git a/test/Conversion/AIEVecToLLVM/test-mac_params.mlir b/test/Conversion/AIEVecToLLVM/test-mac_params.mlir index f466204017..f329d04fce 100644 --- a/test/Conversion/AIEVecToLLVM/test-mac_params.mlir +++ b/test/Conversion/AIEVecToLLVM/test-mac_params.mlir @@ -2,17 +2,17 @@ module { func.func @test(%arg0: vector<32xi16>, %arg1: vector<16xi16>, %arg2: vector<16xi48>) { // check the parameters that go into separate constants - %0 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %1 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x0000", xstart = "2", xstep = "0", zoffsets = "0x03020100", zoffsets_hi = "0x07060504", zsquare = "0x0000", zstart = "7", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %0 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %1 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x0000", xstart = "2", xstep = "0", zoffsets = "0x03020100", 
zoffsets_hi = "0x07060504", zsquare = "0x0000", zstart = "7", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> // check the various combinations that make up the configuration value - %2 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %3 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %4 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %5 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "1", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %6 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = true} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %2 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, 
vector<16xi16>, vector<16xi48> + %3 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %4 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %5 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "1", fmsub = false} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %6 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0", fmsub = true} : vector<32xi16>, vector<16xi16>, vector<16xi48> // all of the configuration register values - %7 = aievec.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "1", fmsub = true} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %7 = aievec_aie1.mac %arg0, %arg1, %arg2 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "1", fmsub = true} : vector<32xi16>, vector<16xi16>, vector<16xi48> return } } diff --git a/test/Conversion/AIEVecToLLVM/test-mul_intrinsic_names.mlir 
b/test/Conversion/AIEVecToLLVM/test-mul_intrinsic_names.mlir index adfb98aa97..c172f14fe3 100644 --- a/test/Conversion/AIEVecToLLVM/test-mul_intrinsic_names.mlir +++ b/test/Conversion/AIEVecToLLVM/test-mul_intrinsic_names.mlir @@ -8,9 +8,9 @@ module { %v8i48 = llvm.mlir.undef : vector<8xi48> %v16i32 = llvm.mlir.undef : vector<16xi32> %v8i32 = llvm.mlir.undef : vector<8xi32> - %0 = aievec.mul %v32i16, %v16i16 : vector<32xi16>, vector<16xi16>, vector<16xi48> - %1 = aievec.mul %v64i8, %v32i8 : vector<64xi8>, vector<32xi8>, vector<8xi48> - %2 = aievec.mul %v16i32, %v8i32 : vector<16xi32>, vector<8xi32>, vector<8xi80> + %0 = aievec_aie1.mul %v32i16, %v16i16 : vector<32xi16>, vector<16xi16>, vector<16xi48> + %1 = aievec_aie1.mul %v64i8, %v32i8 : vector<64xi8>, vector<32xi8>, vector<8xi48> + %2 = aievec_aie1.mul %v16i32, %v8i32 : vector<16xi32>, vector<8xi32>, vector<8xi80> return } } diff --git a/test/Conversion/AIEVecToLLVM/test-mul_params.mlir b/test/Conversion/AIEVecToLLVM/test-mul_params.mlir index 07ce6ebffb..c5fc022ad2 100644 --- a/test/Conversion/AIEVecToLLVM/test-mul_params.mlir +++ b/test/Conversion/AIEVecToLLVM/test-mul_params.mlir @@ -2,16 +2,16 @@ module { func.func @test(%arg0: vector<32xi16>, %arg1: vector<16xi16>) { // check the parameters that go into separate constants - %0 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %1 = aievec.mul %arg0, %arg1 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x0000", xstart = "2", xstep = "0", zoffsets = "0x03020100", zoffsets_hi = "0x07060504", zsquare = "0x0000", zstart = "7", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %0 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", 
zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %1 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x0000", xstart = "2", xstep = "0", zoffsets = "0x03020100", zoffsets_hi = "0x07060504", zsquare = "0x0000", zstart = "7", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // check the various combinations that make up the configuration value - %2 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %3 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %4 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %5 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %2 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %3 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", 
xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %4 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "0"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %5 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x0000", xstart = "0", xstep = "0", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x0000", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // all of the configuration register values - %6 = aievec.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %6 = aievec_aie1.mul %arg0, %arg1 {xoffsets= "0x00000000", xoffsets_hi = "0x00000000", xsquare = "0x3210", xstart = "0", xstep = "4", zoffsets = "0x00000000", zoffsets_hi = "0x00000000", zsquare = "0x3210", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> return } } diff --git a/test/Conversion/DmaToNpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir index 0811153462..5462a6abf2 100644 --- a/test/Conversion/DmaToNpu/aiert_insts.mlir +++ b/test/Conversion/DmaToNpu/aiert_insts.mlir @@ -7,16 +7,16 @@ //===----------------------------------------------------------------------===// // RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s -// CHECK: aiex.npu.writebd {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, enable_packet = 
0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.blockwrite(%{{.*}}) {address = 118816 : ui32} : memref<8xi32> // CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} -// CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.blockwrite(%{{.*}}) {address = 118784 : ui32} : memref<8xi32> // CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} module { aie.device(npu1_4col) { memref.global "public" @of_toMem : memref<32xi32> memref.global "public" @of_fromMem : memref<32xi32> - func.func @sequence(%in : memref<4x2x8xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + aiex.runtime_sequence(%in : memref<4x2x8xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -24,9 +24,8 @@ module { %c8 = arith.constant 8 : i64 %c16 = arith.constant 16 : i64 %c32 = arith.constant 32 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64 } 
: memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32> - return + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0, %c1]) { metadata = @of_toMem, id = 1 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8, %c1]) { metadata = @of_fromMem, id = 0 : i64, issue_token = false } : memref<4x2x8xi32> } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) aie.shim_dma_allocation @of_toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir index 7aca4b981d..a6e6663f73 100644 --- a/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu.mlir @@ -18,10 +18,9 @@ module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<1xbf16> - func.func @sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) { + aiex.runtime_sequence(%arg0: memref<1xbf16>, %arg1: memref<1xbf16>, %arg2: memref<1xbf16>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<1xbf16> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) } diff --git a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir index eecbf921df..fd99c22dd8 100644 --- a/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir +++ b/test/Conversion/DmaToNpu/bad_dma_to_npu_datatype.mlir @@ -18,10 +18,9 @@ module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<65536xi64> - func.func @sequence(%arg0: memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) { + aiex.runtime_sequence(%arg0: 
memref<65536xi64>, %arg1: memref<65536xi64>, %arg2: memref<65536xi64>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xi64> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) } diff --git a/test/Conversion/DmaToNpu/bad_rtp_write.mlir b/test/Conversion/DmaToNpu/bad_rtp_write.mlir index bb5fe11eb3..fb2e617533 100644 --- a/test/Conversion/DmaToNpu/bad_rtp_write.mlir +++ b/test/Conversion/DmaToNpu/bad_rtp_write.mlir @@ -9,10 +9,9 @@ // RUN: aie-opt --aie-dma-to-npu -verify-diagnostics %s aie.device(npu1_4col) { - func.func @sequence() { + aiex.runtime_sequence() { // expected-error@+2 {{'aiex.npu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}} // expected-error@+1 {{failed to legalize operation 'aiex.npu.rtp_write' that was explicitly marked illegal}} aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } - return } } diff --git a/test/Conversion/DmaToNpu/dma_to_npu.mlir b/test/Conversion/DmaToNpu/dma_to_npu.mlir index 2cfa5444a2..de61a0d2c5 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu.mlir @@ -11,22 +11,20 @@ // RUN: aie-opt --split-input-file -aie-dma-to-npu %s | FileCheck %s // TODO - more -// CHECK-LABEL: dma_memcpy_nd_0 -// CHECK: aiex.npu.writebd -// CHECK-SAME: valid_bd = 1 : i32 +// CHECK: module +// CHECK: aiex.npu.blockwrite // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 0 : i32 -// CHECK: aiex.npu.writebd +// CHECK: aiex.npu.blockwrite // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 1 : i32 module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> - func.func @dma_memcpy_nd_0(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { + aiex.runtime_sequence(%arg0: 
memref<16xi32>, %arg1: memref<16xi32>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> - return } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) aie.shim_dma_allocation @toMem (S2MM, 0, 0) @@ -35,12 +33,11 @@ module { // ----- -// CHECK-LABEL: dma_wait_s2mm -// CHECK: aiex.npu.writebd -// CHECK-SAME: valid_bd = 1 : i32 +// CHECK: module // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 0 : i32 // CHECK: aiex.npu.write32 +// CHECK-SAME: value = 2147483649 // CHECK: aiex.npu.sync // CHECK-SAME: channel = 0 : i32 // CHECK-SAME: column = 0 : i32 @@ -51,10 +48,9 @@ module { module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> - func.func @dma_wait_s2mm(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} - return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) } @@ -62,12 +58,12 @@ module { // ----- -// CHECK-LABEL: dma_wait_mm2s -// CHECK: aiex.npu.writebd -// CHECK-SAME: valid_bd = 1 : i32 +// CHECK: module +// CHECK: aiex.npu.blockwrite // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 0 : i32 // CHECK: aiex.npu.write32 +// CHECK-SAME: value = 2147483649 // CHECK: aiex.npu.sync // CHECK-SAME: channel = 1 : i32 // CHECK-SAME: column = 1 : i32 @@ -78,10 +74,9 @@ module { module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> - func.func @dma_wait_mm2s(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - 
aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { issue_token = true, metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} - return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) } diff --git a/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir index 2178309055..89737ec34e 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir @@ -13,11 +13,10 @@ module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> - func.func @sequence() { + aiex.runtime_sequence() { // expected-error@+2 {{failed to legalize operation 'aiex.npu.dma_wait' that was explicitly marked illegal}} // expected-error@+1 {{couldn't find shim_dma_allocation op}} aiex.npu.dma_wait {symbol = @toMem} - return } } } diff --git a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir index 7d16d4d0f0..76f6a0ed88 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir @@ -11,14 +11,12 @@ // RUN: aie-opt -aie-dma-to-npu %s | FileCheck %s // TODO - more -// CHECK-LABEL: test1 -// CHECK: aiex.npu.writebd -// CHECK-SAME: valid_bd = 1 : i32 +// CHECK: aiex.npu.blockwrite // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 0 : i32 // CHECK: aiex.npu.write32 // CHECK-SAME: value = 2147483649 -// CHECK: aiex.npu.writebd +// CHECK: aiex.npu.blockwrite // CHECK: aiex.npu.address_patch // CHECK-SAME: arg_idx = 1 : i32 // CHECK: aiex.npu.write32 @@ -27,10 +25,9 @@ module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> - func.func 
@test1(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { + aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> - aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> - return + aiex.npu.dma_memcpy_nd (0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64, 1]) { metadata = @fromMem, id = 0 : i64, issue_token = false } : memref<16xi32> } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) aie.shim_dma_allocation @toMem (S2MM, 0, 0) diff --git a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir index 71bc4fe6e7..ebcc0b910d 100644 --- a/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_width_conversion.mlir @@ -12,27 +12,24 @@ // RUN: aie-opt --aie-dma-to-npu %s 2>&1 | FileCheck %s -//CHECK-LABEL: aie.device(xcve2302) { -//CHECK: memref.global "public" @toMem : memref<65536xbf16> -//CHECK: func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { -//CHECK: aiex.npu.writebd {bd_id = 0 : i32, buffer_length = 8192 : i32, buffer_offset = 0 : i32, column = 0 : i32, d0_size = 32 : i32, d0_stride = 0 : i32, d1_size = 64 : i32, d1_stride = 127 : i32, d2_stride = 31 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, row = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -//CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} -//CHECK: aiex.npu.write32 {address = 119300 : ui32, 
column = 0 : i32, row = 0 : i32, value = 2147680256 : ui32} -//CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} -//CHECK: return -//CHECK: } -//CHECK: aie.shim_dma_allocation @toMem(S2MM, 0, 0) -//CHECK: } - - +// CHECK-LABEL: aie.device(xcve2302) { +// CHECK: memref.global "public" @toMem : memref<65536xbf16> +// CHECK: memref.global "private" constant @blockwrite_data_0 : memref<8xi32> = dense<[8192, 0, 0, 33554432, -2080374657, 31, 0, 33554432]> +// CHECK: aiex.runtime_sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { +// CHECK: %0 = memref.get_global @blockwrite_data_0 : memref<8xi32> +// CHECK: aiex.npu.blockwrite(%0) {address = 118784 : ui32} : memref<8xi32> +// CHECK: aiex.npu.address_patch {addr = 118788 : ui32, arg_idx = 0 : i32, arg_plus = 0 : i32} +// CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147680256 : ui32} +// CHECK: aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} +// CHECK: } +// CHECK: aie.shim_dma_allocation @toMem(S2MM, 0, 0) module @shimDmaMemcpy{ aie.device(xcve2302) { memref.global "public" @toMem : memref<65536xbf16> - func.func @sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { + aiex.runtime_sequence(%arg0: memref<65536xbf16>, %arg1: memref<65536xbf16>, %arg2: memref<65536xbf16>) { aiex.npu.dma_memcpy_nd (2, 0, %arg0[0, 0, 0, 0][4, 4, 64, 64][0, 64, 256, 1]) {id = 0 : i64, metadata = @toMem} : memref<65536xbf16> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) } diff --git a/test/Conversion/DmaToNpu/push_to_queue.mlir b/test/Conversion/DmaToNpu/push_to_queue.mlir index 4f9adad9d7..b607b6974b 100644 --- 
a/test/Conversion/DmaToNpu/push_to_queue.mlir +++ b/test/Conversion/DmaToNpu/push_to_queue.mlir @@ -12,10 +12,9 @@ module { aie.device(npu1_4col) { - func.func @sequence() { + aiex.runtime_sequence() { aiex.npu.push_queue (0, 0, S2MM:1) {issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } aiex.npu.push_queue (2, 0, MM2S:0) {issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } - return } } } diff --git a/test/Conversion/DmaToNpu/rtp_write.mlir b/test/Conversion/DmaToNpu/rtp_write.mlir index 81d6e018fc..bfff00e7fb 100644 --- a/test/Conversion/DmaToNpu/rtp_write.mlir +++ b/test/Conversion/DmaToNpu/rtp_write.mlir @@ -16,10 +16,9 @@ module { %1 = aie.buffer(%0) {address = 1536 : i32, sym_name = "rtp"} : memref<16xi32> %2 = aie.tile(0, 2) %3 = aie.buffer(%2) {address = 3200 : i32, sym_name = "RTP"} : memref<16xi32> - func.func @sequence() { + aiex.runtime_sequence() { aiex.npu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" } aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } - return } } } diff --git a/test/Conversion/VectorToAIEVec/gemm64_int16_unroll16_vectorized.mlir b/test/Conversion/VectorToAIEVec/gemm64_int16_unroll16_vectorized.mlir index 6e104bf4d0..c196bd3765 100644 --- a/test/Conversion/VectorToAIEVec/gemm64_int16_unroll16_vectorized.mlir +++ b/test/Conversion/VectorToAIEVec/gemm64_int16_unroll16_vectorized.mlir @@ -58,7 +58,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VC:.*]] = aievec.ups %[[ACCk]] {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %2 = vector.transfer_read %arg0[%arg3, %arg5], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %2 = vector.transfer_read %arg0[%arg3, %arg5], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %3 = vector.transfer_read %arg1[%arg5, %arg4], %c0_i16 : memref, vector<16xi16> %4 = arith.muli %2, %3 : vector<16xi16> %5 = 
arith.addi %arg6, %4 : vector<16xi16> @@ -67,12 +67,12 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VB01:.*]] = aievec.concat %[[VB0]], %[[VB1]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk0:.*]] = aievec.mac %[[VB01]], %[[VA]], %[[VC]] + // CHECK: %[[ACCk0:.*]] = aievec_aie1.mac %[[VB01]], %[[VA]], %[[VC]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} // CHECK-SAME: : vector<32xi16>, vector<16xi16>, vector<16xi48> %6 = affine.apply #map1(%arg5) - %7 = vector.transfer_read %arg0[%arg3, %6], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %7 = vector.transfer_read %arg0[%arg3, %6], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %8 = vector.transfer_read %arg1[%6, %arg4], %c0_i16 : memref, vector<16xi16> %9 = arith.muli %7, %8 : vector<16xi16> %10 = arith.addi %5, %9 : vector<16xi16> @@ -81,7 +81,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %11 = affine.apply #map2(%arg5) - %12 = vector.transfer_read %arg0[%arg3, %11], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %12 = vector.transfer_read %arg0[%arg3, %11], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %13 = vector.transfer_read %arg1[%11, %arg4], %c0_i16 : memref, vector<16xi16> %14 = arith.muli %12, %13 : vector<16xi16> %15 = arith.addi %10, %14 : vector<16xi16> @@ -90,11 +90,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VB23:.*]] = aievec.concat %[[VB2]], %[[VB3]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk2:.*]] = aievec.mac 
%[[VB23]], %[[VA]], %[[ACCk0]] + // CHECK: %[[ACCk2:.*]] = aievec_aie1.mac %[[VB23]], %[[VA]], %[[ACCk0]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} %16 = affine.apply #map3(%arg5) - %17 = vector.transfer_read %arg0[%arg3, %16], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %17 = vector.transfer_read %arg0[%arg3, %16], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %18 = vector.transfer_read %arg1[%16, %arg4], %c0_i16 : memref, vector<16xi16> %19 = arith.muli %17, %18 : vector<16xi16> %20 = arith.addi %15, %19 : vector<16xi16> @@ -103,7 +103,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %21 = affine.apply #map4(%arg5) - %22 = vector.transfer_read %arg0[%arg3, %21], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %22 = vector.transfer_read %arg0[%arg3, %21], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %23 = vector.transfer_read %arg1[%21, %arg4], %c0_i16 : memref, vector<16xi16> %24 = arith.muli %22, %23 : vector<16xi16> %25 = arith.addi %20, %24 : vector<16xi16> @@ -112,11 +112,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VB45:.*]] = aievec.concat %[[VB4]], %[[VB5]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk4:.*]] = aievec.mac %[[VB45]], %[[VA]], %[[ACCk2]] + // CHECK: %[[ACCk4:.*]] = aievec_aie1.mac %[[VB45]], %[[VA]], %[[ACCk2]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} %26 = affine.apply #map5(%arg5) - %27 = vector.transfer_read %arg0[%arg3, %26], %c0_i16 
{permutation_map = #map} : memref, vector<16xi16> + %27 = vector.transfer_read %arg0[%arg3, %26], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %28 = vector.transfer_read %arg1[%26, %arg4], %c0_i16 : memref, vector<16xi16> %29 = arith.muli %27, %28 : vector<16xi16> %30 = arith.addi %25, %29 : vector<16xi16> @@ -125,7 +125,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %31 = affine.apply #map6(%arg5) - %32 = vector.transfer_read %arg0[%arg3, %31], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %32 = vector.transfer_read %arg0[%arg3, %31], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %33 = vector.transfer_read %arg1[%31, %arg4], %c0_i16 : memref, vector<16xi16> %34 = arith.muli %32, %33 : vector<16xi16> %35 = arith.addi %30, %34 : vector<16xi16> @@ -134,11 +134,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VB67:.*]] = aievec.concat %[[VB6]], %[[VB7]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk6:.*]] = aievec.mac %[[VB67]], %[[VA]], %[[ACCk4]] + // CHECK: %[[ACCk6:.*]] = aievec_aie1.mac %[[VB67]], %[[VA]], %[[ACCk4]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} %36 = affine.apply #map7(%arg5) - %37 = vector.transfer_read %arg0[%arg3, %36], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %37 = vector.transfer_read %arg0[%arg3, %36], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %38 = vector.transfer_read %arg1[%36, %arg4], %c0_i16 : memref, vector<16xi16> %39 = arith.muli %37, %38 : vector<16xi16> %40 = arith.addi %35, %39 : vector<16xi16> @@ -147,7 +147,7 @@ func.func 
@matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %41 = affine.apply #map8(%arg5) - %42 = vector.transfer_read %arg0[%arg3, %41], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %42 = vector.transfer_read %arg0[%arg3, %41], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %43 = vector.transfer_read %arg1[%41, %arg4], %c0_i16 : memref, vector<16xi16> %44 = arith.muli %42, %43 : vector<16xi16> %45 = arith.addi %40, %44 : vector<16xi16> @@ -156,11 +156,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VB89:.*]] = aievec.concat %[[VB8]], %[[VB9]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk8:.*]] = aievec.mac %[[VB89]], %[[VA]], %[[ACCk6]] + // CHECK: %[[ACCk8:.*]] = aievec_aie1.mac %[[VB89]], %[[VA]], %[[ACCk6]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} %46 = affine.apply #map9(%arg5) - %47 = vector.transfer_read %arg0[%arg3, %46], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %47 = vector.transfer_read %arg0[%arg3, %46], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %48 = vector.transfer_read %arg1[%46, %arg4], %c0_i16 : memref, vector<16xi16> %49 = arith.muli %47, %48 : vector<16xi16> %50 = arith.addi %45, %49 : vector<16xi16> @@ -169,7 +169,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %51 = affine.apply #map10(%arg5) - %52 = vector.transfer_read %arg0[%arg3, %51], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %52 = vector.transfer_read %arg0[%arg3, %51], %c0_i16 {in_bounds = [true], permutation_map = #map} 
: memref, vector<16xi16> %53 = vector.transfer_read %arg1[%51, %arg4], %c0_i16 : memref, vector<16xi16> %54 = arith.muli %52, %53 : vector<16xi16> %55 = arith.addi %50, %54 : vector<16xi16> @@ -178,11 +178,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VBab:.*]] = aievec.concat %[[VB10]], %[[VB11]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk10:.*]] = aievec.mac %[[VBab]], %[[VA]], %[[ACCk8]] + // CHECK: %[[ACCk10:.*]] = aievec_aie1.mac %[[VBab]], %[[VA]], %[[ACCk8]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} %56 = affine.apply #map11(%arg5) - %57 = vector.transfer_read %arg0[%arg3, %56], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %57 = vector.transfer_read %arg0[%arg3, %56], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %58 = vector.transfer_read %arg1[%56, %arg4], %c0_i16 : memref, vector<16xi16> %59 = arith.muli %57, %58 : vector<16xi16> %60 = arith.addi %55, %59 : vector<16xi16> @@ -191,7 +191,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %61 = affine.apply #map12(%arg5) - %62 = vector.transfer_read %arg0[%arg3, %61], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %62 = vector.transfer_read %arg0[%arg3, %61], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %63 = vector.transfer_read %arg1[%61, %arg4], %c0_i16 : memref, vector<16xi16> %64 = arith.muli %62, %63 : vector<16xi16> %65 = arith.addi %60, %64 : vector<16xi16> @@ -200,11 +200,11 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: 
%[[VBcd:.*]] = aievec.concat %[[VB12]], %[[VB13]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk12:.*]] = aievec.mac %[[VBcd]], %[[VA]], %[[ACCk10]] + // CHECK: %[[ACCk12:.*]] = aievec_aie1.mac %[[VBcd]], %[[VA]], %[[ACCk10]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "12", zstep = "1"} %66 = affine.apply #map13(%arg5) - %67 = vector.transfer_read %arg0[%arg3, %66], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %67 = vector.transfer_read %arg0[%arg3, %66], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %68 = vector.transfer_read %arg1[%66, %arg4], %c0_i16 : memref, vector<16xi16> %69 = arith.muli %67, %68 : vector<16xi16> %70 = arith.addi %65, %69 : vector<16xi16> @@ -213,7 +213,7 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> %71 = affine.apply #map14(%arg5) - %72 = vector.transfer_read %arg0[%arg3, %71], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %72 = vector.transfer_read %arg0[%arg3, %71], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %73 = vector.transfer_read %arg1[%71, %arg4], %c0_i16 : memref, vector<16xi16> %74 = arith.muli %72, %73 : vector<16xi16> %75 = arith.addi %70, %74 : vector<16xi16> @@ -222,19 +222,19 @@ func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memre // CHECK-SAME: {index = 0 : i8, offset = 0 : i32} // CHECK-SAME: : memref, vector<16xi16> // CHECK: %[[VBef:.*]] = aievec.concat %[[VB14]], %[[VB15]] : vector<16xi16>, vector<32xi16> - // CHECK: %[[ACCk14:.*]] = aievec.mac %[[VBef]], %[[VA]], %[[ACCk12]] + // CHECK: %[[ACCk14:.*]] = aievec_aie1.mac %[[VBef]], %[[VA]], %[[ACCk12]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", 
zoffsets_hi = "0", zstart = "14", zstep = "1"} // CHECK: %[[ACC:.*]] = aievec.srs %[[ACCk14]], %[[C0I32]] : vector<16xi48>, i32, vector<16xi16> %76 = affine.apply #map15(%arg5) - %77 = vector.transfer_read %arg0[%arg3, %76], %c0_i16 {permutation_map = #map} : memref, vector<16xi16> + %77 = vector.transfer_read %arg0[%arg3, %76], %c0_i16 {in_bounds = [true], permutation_map = #map} : memref, vector<16xi16> %78 = vector.transfer_read %arg1[%76, %arg4], %c0_i16 : memref, vector<16xi16> %79 = arith.muli %77, %78 : vector<16xi16> %80 = arith.addi %75, %79 : vector<16xi16> // CHECK: scf.yield %[[ACC]] : vector<16xi16> affine.yield %80 : vector<16xi16> } - // CHECK: vector.transfer_write %[[ACCn]], %[[MC]][%[[I]], %[[J]]] {in_bounds = [true]} : vector<16xi16>, memref + // CHECK: vector.transfer_write %[[ACCn]], %[[MC]][%[[I]], %[[J]]] : vector<16xi16>, memref vector.transfer_write %1, %arg2[%arg3, %arg4] : vector<16xi16>, memref } } diff --git a/test/Conversion/VectorToAIEVec/test-arith.mlir b/test/Conversion/VectorToAIEVec/test-arith.mlir index 04c984c2c4..8a9450a1e9 100644 --- a/test/Conversion/VectorToAIEVec/test-arith.mlir +++ b/test/Conversion/VectorToAIEVec/test-arith.mlir @@ -4,7 +4,7 @@ // CHECK-SAME: %[[LHS:.*]]: vector<16xi32>, // CHECK-SAME: %[[RHS:.*]]: vector<16xi32>) func.func @vecaddi(%arg0: vector<16xi32>, %arg1: vector<16xi32>) -> vector<16xi32> { - // CHECK: %[[RES:.*]] = aievec.add %[[LHS]], %[[RHS]] : vector<16xi32>, vector<16xi32>, vector<16xi32> + // CHECK: %[[RES:.*]] = aievec_aie1.add %[[LHS]], %[[RHS]] : vector<16xi32>, vector<16xi32>, vector<16xi32> %0 = arith.addi %arg0, %arg1 : vector<16xi32> // CHECK: return %[[RES]] : vector<16xi32> return %0 : vector<16xi32> @@ -14,7 +14,7 @@ func.func @vecaddi(%arg0: vector<16xi32>, %arg1: vector<16xi32>) -> vector<16xi3 // CHECK-SAME: %[[LHS:.*]]: vector<16xf32>, // CHECK-SAME: %[[RHS:.*]]: vector<16xf32>) func.func @vecaddf(%arg0: vector<16xf32>, %arg1: vector<16xf32>) -> vector<16xf32> { - // CHECK: 
%[[RES:.*]] = aievec.add %[[LHS]], %[[RHS]] : vector<16xf32>, vector<16xf32>, vector<16xf32> + // CHECK: %[[RES:.*]] = aievec_aie1.add %[[LHS]], %[[RHS]] : vector<16xf32>, vector<16xf32>, vector<16xf32> %0 = arith.addf %arg0, %arg1 : vector<16xf32> // CHECK: return %[[RES]] : vector<16xf32> return %0 : vector<16xf32> @@ -24,7 +24,7 @@ func.func @vecaddf(%arg0: vector<16xf32>, %arg1: vector<16xf32>) -> vector<16xf3 // CHECK-SAME: %[[LHS:.*]]: vector<16xi32>, // CHECK-SAME: %[[RHS:.*]]: vector<16xi32>) func.func @vecsubi(%arg0: vector<16xi32>, %arg1: vector<16xi32>) -> vector<16xi32> { - // CHECK: %[[RES:.*]] = aievec.sub %[[LHS]], %[[RHS]] : vector<16xi32>, vector<16xi32>, vector<16xi32> + // CHECK: %[[RES:.*]] = aievec_aie1.sub %[[LHS]], %[[RHS]] : vector<16xi32>, vector<16xi32>, vector<16xi32> %0 = arith.subi %arg0, %arg1 : vector<16xi32> // CHECK: return %[[RES]] : vector<16xi32> return %0 : vector<16xi32> @@ -34,7 +34,7 @@ func.func @vecsubi(%arg0: vector<16xi32>, %arg1: vector<16xi32>) -> vector<16xi3 // CHECK-SAME: %[[LHS:.*]]: vector<16xf32>, // CHECK-SAME: %[[RHS:.*]]: vector<16xf32>) func.func @vecsubf(%arg0: vector<16xf32>, %arg1: vector<16xf32>) -> vector<16xf32> { - // CHECK: %[[RES:.*]] = aievec.sub %[[LHS]], %[[RHS]] : vector<16xf32>, vector<16xf32>, vector<16xf32> + // CHECK: %[[RES:.*]] = aievec_aie1.sub %[[LHS]], %[[RHS]] : vector<16xf32>, vector<16xf32>, vector<16xf32> %0 = arith.subf %arg0, %arg1 : vector<16xf32> // CHECK: return %[[RES]] : vector<16xf32> return %0 : vector<16xf32> @@ -45,7 +45,7 @@ func.func @vecsubf(%arg0: vector<16xf32>, %arg1: vector<16xf32>) -> vector<16xf3 // CHECK-SAME: %[[RHS:.*]]: vector<16xi16>) func.func @vecmuli16(%arg0: vector<16xi16>, %arg1: vector<16xi16>) -> vector<16xi16> { // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: %[[MUL:.*]] = aievec.mul %[[LHS]], %[[RHS]] : vector<16xi16>, vector<16xi16>, vector<16xi48> + // CHECK: %[[MUL:.*]] = aievec_aie1.mul %[[LHS]], %[[RHS]] : vector<16xi16>, 
vector<16xi16>, vector<16xi48> // CHECK: %[[RES:.*]] = aievec.srs %[[MUL]], %[[C0]] : vector<16xi48>, i32, vector<16xi16> %0 = arith.muli %arg0, %arg1 : vector<16xi16> // CHECK: return %[[RES]] : vector<16xi16> @@ -57,7 +57,7 @@ func.func @vecmuli16(%arg0: vector<16xi16>, %arg1: vector<16xi16>) -> vector<16x // CHECK-SAME: %[[RHS:.*]]: vector<8xi32>) func.func @vecmuli32(%arg0: vector<8xi32>, %arg1: vector<8xi32>) -> vector<8xi32> { // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: %[[MUL:.*]] = aievec.mul %[[LHS]], %[[RHS]] : vector<8xi32>, vector<8xi32>, vector<8xi80> + // CHECK: %[[MUL:.*]] = aievec_aie1.mul %[[LHS]], %[[RHS]] : vector<8xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[RES:.*]] = aievec.srs %[[MUL]], %[[C0]] : vector<8xi80>, i32, vector<8xi32> %0 = arith.muli %arg0, %arg1 : vector<8xi32> // CHECK: return %[[RES]] : vector<8xi32> @@ -68,7 +68,7 @@ func.func @vecmuli32(%arg0: vector<8xi32>, %arg1: vector<8xi32>) -> vector<8xi32 // CHECK-SAME: %[[LHS:.*]]: vector<8xf32>, // CHECK-SAME: %[[RHS:.*]]: vector<8xf32>) func.func @vecmulf(%arg0: vector<8xf32>, %arg1: vector<8xf32>) -> vector<8xf32> { - // CHECK: %[[RES:.*]] = aievec.mul %[[LHS]], %[[RHS]] : vector<8xf32>, vector<8xf32>, vector<8xf32> + // CHECK: %[[RES:.*]] = aievec_aie1.mul %[[LHS]], %[[RHS]] : vector<8xf32>, vector<8xf32>, vector<8xf32> %0 = arith.mulf %arg0, %arg1 : vector<8xf32> // CHECK: return %[[RES]] : vector<8xf32> return %0 : vector<8xf32> diff --git a/test/Conversion/VectorToAIEVec/test-mac.mlir b/test/Conversion/VectorToAIEVec/test-mac.mlir index e700e4fd98..700362afcf 100644 --- a/test/Conversion/VectorToAIEVec/test-mac.mlir +++ b/test/Conversion/VectorToAIEVec/test-mac.mlir @@ -10,7 +10,7 @@ func.func @muladd2mac_i32(%a : vector<8xi32>, // CHECK: %[[C0:.*]] = arith.constant 0 : i32 // CHECK: %[[AA:.*]] = aievec.concat %[[A]], %[[A]] : vector<8xi32>, vector<16xi32> // CHECK: %[[ACC:.*]] = aievec.ups %[[C]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - // CHECK: 
%[[MAC:.*]] = aievec.mac %[[AA]], %[[B]], %[[ACC]] : vector<16xi32>, vector<8xi32>, vector<8xi80> + // CHECK: %[[MAC:.*]] = aievec_aie1.mac %[[AA]], %[[B]], %[[ACC]] : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[RES:.*]] = aievec.srs %[[MAC]], %[[C0]] : vector<8xi80>, i32, vector<8xi32> %0 = arith.muli %a, %b : vector<8xi32> %1 = arith.addi %0, %c : vector<8xi32> @@ -28,7 +28,7 @@ func.func @muladd2mac_inv(%a : vector<8xi32>, // CHECK: %[[C0:.*]] = arith.constant 0 : i32 // CHECK: %[[AA:.*]] = aievec.concat %[[A]], %[[A]] : vector<8xi32>, vector<16xi32> // CHECK: %[[ACC:.*]] = aievec.ups %[[C]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - // CHECK: %[[MAC:.*]] = aievec.mac %[[AA]], %[[B]], %[[ACC]] : vector<16xi32>, vector<8xi32>, vector<8xi80> + // CHECK: %[[MAC:.*]] = aievec_aie1.mac %[[AA]], %[[B]], %[[ACC]] : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[RES:.*]] = aievec.srs %[[MAC]], %[[C0]] : vector<8xi80>, i32, vector<8xi32> %0 = arith.muli %a, %b : vector<8xi32> %1 = arith.addi %c, %0 : vector<8xi32> @@ -47,12 +47,12 @@ func.func @splatAndMac2SplatMac(%a : vector<8xi32>, %1 = vector.broadcast %0 : i32 to vector<8xi32> %2 = aievec.concat %1, %1 : vector<8xi32>, vector<16xi32> %3 = aievec.ups %c {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %4 = aievec.mac %2, %b, %3 : vector<16xi32>, vector<8xi32>, vector<8xi80> + %4 = aievec_aie1.mac %2, %b, %3 : vector<16xi32>, vector<8xi32>, vector<8xi80> return %4 : vector<8xi80> // CHECK-DAG: %[[CVZ:.*]] = arith.constant dense<0> : vector<8xi32> // CHECK-DAG: %[[BB:.*]] = aievec.concat %[[B]], %[[CVZ]] : vector<8xi32>, vector<16xi32> // CHECK-DAG: %[[ACC:.*]] = aievec.ups %[[C]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - // CHECK-DAG: %[[MAC:.*]] = aievec.mac %[[BB]], %[[A]], %[[ACC]] + // CHECK-DAG: %[[MAC:.*]] = aievec_aie1.mac %[[BB]], %[[A]], %[[ACC]] // CHECK-SAME: {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", // CHECK-SAME: zstart = "2"} : 
vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: return %[[MAC]] : vector<8xi80> @@ -69,12 +69,12 @@ func.func @splatAndMac2SplatMac_inv(%a : vector<8xi32>, %1 = vector.broadcast %0 : i32 to vector<8xi32> %2 = aievec.concat %a, %a : vector<8xi32>, vector<16xi32> %3 = aievec.ups %c {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %4 = aievec.mac %2, %1, %3 : vector<16xi32>, vector<8xi32>, vector<8xi80> + %4 = aievec_aie1.mac %2, %1, %3 : vector<16xi32>, vector<8xi32>, vector<8xi80> return %4 : vector<8xi80> // CHECK-DAG: %[[CVZ:.*]] = arith.constant dense<0> : vector<8xi32> // CHECK-DAG: %[[AA:.*]] = aievec.concat %[[A]], %[[CVZ]] : vector<8xi32>, vector<16xi32> // CHECK-DAG: %[[ACC:.*]] = aievec.ups %[[C]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - // CHECK-DAG: %[[MAC:.*]] = aievec.mac %[[AA]], %[[B]], %[[ACC]] + // CHECK-DAG: %[[MAC:.*]] = aievec_aie1.mac %[[AA]], %[[B]], %[[ACC]] // CHECK-SAME: {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", // CHECK-SAME: zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: return %[[MAC]] : vector<8xi80> @@ -91,12 +91,12 @@ func.func @splatAndMac2SplatMacI16(%a : vector<16xi16>, %1 = vector.broadcast %0 : i16 to vector<16xi16> %2 = aievec.concat %1, %1 : vector<16xi16>, vector<32xi16> %3 = aievec.ups %c {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %4 = aievec.mac %2, %b, %3 : vector<32xi16>, vector<16xi16>, vector<16xi48> + %4 = aievec_aie1.mac %2, %b, %3 : vector<32xi16>, vector<16xi16>, vector<16xi48> return %4 : vector<16xi48> // CHECK-DAG: %[[CVZ:.*]] = arith.constant dense<0> : vector<16xi16> // CHECK-DAG: %[[BB:.*]] = aievec.concat %[[B]], %[[CVZ]] : vector<16xi16>, vector<32xi16> // CHECK-DAG: %[[ACC:.*]] = aievec.ups %[[C]] {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - // CHECK-DAG: %[[MAC:.*]] = aievec.mac %[[BB]], %[[A]], %[[ACC]] + // CHECK-DAG: %[[MAC:.*]] = aievec_aie1.mac %[[BB]], %[[A]], %[[ACC]] // CHECK-SAME: {xoffsets = "0x73727170", xoffsets_hi = 
"0x77767574", xsquare = "0x3120", // CHECK-SAME: xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} // CHECK-SAME: : vector<32xi16>, vector<16xi16>, vector<16xi48> diff --git a/test/Conversion/VectorToAIEVec/test-upd.mlir b/test/Conversion/VectorToAIEVec/test-upd.mlir index ea7f61d3b2..2edec34d08 100644 --- a/test/Conversion/VectorToAIEVec/test-upd.mlir +++ b/test/Conversion/VectorToAIEVec/test-upd.mlir @@ -8,10 +8,10 @@ func.func @veccopy_i8(%arg0: memref<256xi8>, %arg1: memref<256xi8>) { %c0_i8 = arith.constant 0 : i8 affine.for %arg2 = 0 to 256 step 16 { // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi8>, vector<16xi8> - // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} {in_bounds = [true]} : memref<256xi8>, vector<16xi8> + // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} : memref<256xi8>, vector<16xi8> %0 = vector.transfer_read %arg0[%arg2], %c0_i8 : memref<256xi8>, vector<16xi8> - // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi8>, memref<256xi8> - // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi8>, memref<256xi8> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} : vector<16xi8>, memref<256xi8> + // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} : vector<16xi8>, memref<256xi8> vector.transfer_write %0, %arg1[%arg2] : vector<16xi8>, memref<256xi8> } return @@ -27,11 +27,11 @@ func.func @veccopy_i16(%arg0: memref<256xi16>, %arg1: memref<256xi16>) { affine.for %arg2 = 0 to 256 step 16 { // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi16>, vector<16xi16> // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi16>, vector<16xi16> - // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} {in_bounds = [true]} : memref<256xi16>, vector<16xi16> + // CHECK-V2-LLVM: %[[LD:.*]] = 
vector.transfer_read {{.*}}, {{.*}} : memref<256xi16>, vector<16xi16> %0 = vector.transfer_read %arg0[%arg2], %c0_i16 : memref<256xi16>, vector<16xi16> - // CHECK: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi16>, memref<256xi16> - // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi16>, memref<256xi16> - // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi16>, memref<256xi16> + // CHECK: vector.transfer_write %[[LD]], {{.*}} : vector<16xi16>, memref<256xi16> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} : vector<16xi16>, memref<256xi16> + // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} : vector<16xi16>, memref<256xi16> vector.transfer_write %0, %arg1[%arg2] : vector<16xi16>, memref<256xi16> } return @@ -47,11 +47,11 @@ func.func @veccopy_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { affine.for %arg2 = 0 to 256 step 8 { // CHECK: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi32>, vector<8xi32> // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi32>, vector<8xi32> - // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} {in_bounds = [true]} : memref<256xi32>, vector<8xi32> + // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} : memref<256xi32>, vector<8xi32> %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<8xi32> - // CHECK: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<8xi32>, memref<256xi32> - // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<8xi32>, memref<256xi32> - // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<8xi32>, memref<256xi32> + // CHECK: vector.transfer_write %[[LD]], {{.*}} : vector<8xi32>, memref<256xi32> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} : vector<8xi32>, memref<256xi32> + // CHECK-V2-LLVM: 
vector.transfer_write %[[LD]], {{.*}} : vector<8xi32>, memref<256xi32> vector.transfer_write %0, %arg1[%arg2] : vector<8xi32>, memref<256xi32> } return @@ -68,11 +68,11 @@ func.func @veccopy_long_i32(%arg0: memref<256xi32>, %arg1: memref<256xi32>) { // CHECK: %[[LD0:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi32>, vector<16xi32> // CHECK-NEXT: %[[LD1:.*]] = aievec.upd {{.*}}, %[[LD0]] {index = 1 : i8, offset = 256 : i32} : memref<256xi32>, vector<16xi32> // CHECK-V2: %[[LD:.*]] = aievec.upd {{.*}} {index = 0 : i8, offset = 0 : i32} : memref<256xi32>, vector<16xi32> - // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} {in_bounds = [true]} : memref<256xi32>, vector<16xi32> + // CHECK-V2-LLVM: %[[LD:.*]] = vector.transfer_read {{.*}}, {{.*}} : memref<256xi32>, vector<16xi32> %0 = vector.transfer_read %arg0[%arg2], %c0_i32 : memref<256xi32>, vector<16xi32> - // CHECK: vector.transfer_write %[[LD1]], {{.*}} {in_bounds = [true]} : vector<16xi32>, memref<256xi32> - // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi32>, memref<256xi32> - // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} {in_bounds = [true]} : vector<16xi32>, memref<256xi32> + // CHECK: vector.transfer_write %[[LD1]], {{.*}} : vector<16xi32>, memref<256xi32> + // CHECK-V2: vector.transfer_write %[[LD]], {{.*}} : vector<16xi32>, memref<256xi32> + // CHECK-V2-LLVM: vector.transfer_write %[[LD]], {{.*}} : vector<16xi32>, memref<256xi32> vector.transfer_write %0, %arg1[%arg2] : vector<16xi32>, memref<256xi32> } return diff --git a/test/Conversion/VectorToAIEVec/test-vector-fma.mlir b/test/Conversion/VectorToAIEVec/test-vector-fma.mlir new file mode 100644 index 0000000000..4f35823a22 --- /dev/null +++ b/test/Conversion/VectorToAIEVec/test-vector-fma.mlir @@ -0,0 +1,37 @@ +// RUN: aie-opt %s -convert-vector-to-aievec="aie-target=aie2" -split-input-file | FileCheck %s + +// CHECK-LABEL: test_fma_bf16 +// CHECK-SAME: 
%[[V0:[a-zA-Z0-9]+]]: vector<16xbf16>, +// CHECK-SAME: %[[V1:.*]]: vector<16xbf16>, +// CHECK-SAME: %[[V2:.*]]: vector<16xbf16>) +func.func @test_fma_bf16(%v0: vector<16xbf16>, + %v1: vector<16xbf16>, + %v2: vector<16xbf16>) -> vector<16xbf16> { + // CHECK: %[[ACC:.*]] = aievec.ups %[[V2]] {shift = 0 : i8} + // CHECK-SAME: : vector<16xbf16>, vector<16xf32> + // CHECK: %[[FMA:.*]] = aievec.mac_elem %[[V0]], %[[V1]], %[[ACC]] + // CHECK-SAME: : vector<16xbf16>, vector<16xbf16>, vector<16xf32> + // CHECK: %[[RES:.*]] = aievec.srs %[[FMA]], %{{[a-zA-Z0-9]+}} + // CHECK-SAME: : vector<16xf32>, i32, vector<16xbf16> + // CHECK: return %[[RES]] : vector<16xbf16> + %0 = vector.fma %v0, %v1, %v2 : vector<16xbf16> + return %0 : vector<16xbf16> +} + +// ----- + +// CHECK-LABEL: test_fma_f32 +// CHECK-SAME: %[[V0:[a-zA-Z0-9]+]]: vector<16xbf16>, +// CHECK-SAME: %[[V1:.*]]: vector<16xbf16>, +// CHECK-SAME: %[[V2:.*]]: vector<16xf32>) +func.func @test_fma_f32(%v0: vector<16xbf16>, + %v1: vector<16xbf16>, + %v2: vector<16xf32>) -> vector<16xf32> { + // CHECK: %[[RES:.*]] = aievec.mac_elem %[[V0]], %[[V1]], %[[V2]] + // CHECK-SAME: : vector<16xbf16>, vector<16xbf16>, vector<16xf32> + // CHECK: return %[[RES]] : vector<16xf32> + %v0f32 = arith.extf %v0 : vector<16xbf16> to vector<16xf32> + %v1f32 = arith.extf %v1 : vector<16xbf16> to vector<16xf32> + %0 = vector.fma %v0f32, %v1f32, %v2 : vector<16xf32> + return %0 : vector<16xf32> +} diff --git a/test/Targets/AIEGenerateXAIE/plio_shimmux.mlir b/test/Targets/AIEGenerateXAIE/plio_shimmux.mlir index d8ee2023e0..415090dfac 100644 --- a/test/Targets/AIEGenerateXAIE/plio_shimmux.mlir +++ b/test/Targets/AIEGenerateXAIE/plio_shimmux.mlir @@ -26,7 +26,6 @@ // CHECK: __mlir_aie_try(XAie_StrmConnCctEnable(&(ctx->DevInst), XAie_TileLoc(x,y), NORTH, 0, SOUTH, 0)); // CHECK: x = 2; // CHECK: y = 0; -// CHECK: __mlir_aie_try(XAie_EnableAieToShimDmaStrmPort(&(ctx->DevInst), XAie_TileLoc(x,y), 2)); module { aie.device(xcvc1902) { diff --git 
a/test/Targets/AIETargetHSA/input_with_addresses.mlir b/test/Targets/AIETargetHSA/input_with_addresses.mlir index 36fc056f55..be4ece326b 100644 --- a/test/Targets/AIETargetHSA/input_with_addresses.mlir +++ b/test/Targets/AIETargetHSA/input_with_addresses.mlir @@ -54,11 +54,10 @@ module { aie.shim_dma_allocation @in0(MM2S, 0, 6) aie.shim_dma_allocation @out0(S2MM, 0, 6) - func.func @sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @out0} : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @in0} : memref<64xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } } diff --git a/test/Targets/AIEVecToCpp/translate_conv2d_uij_f32.mlir b/test/Targets/AIEVecToCpp/translate_conv2d_uij_f32.mlir index 0275924408..992339f6db 100644 --- a/test/Targets/AIEVecToCpp/translate_conv2d_uij_f32.mlir +++ b/test/Targets/AIEVecToCpp/translate_conv2d_uij_f32.mlir @@ -30,22 +30,22 @@ func.func @conv2d_0(%arg0: memref, %arg1: memref, %arg2: memref< scf.for %arg4 = %c0_3 to %1 step %c8_4 { %6 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xf32> %7 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %8 = aievec.mac %7, %2, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %8 = aievec_aie1.mac %7, %2, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %c1_5 = arith.constant 1 : index %9 = arith.addi %arg4, %c1_5 : index %10 = aievec.upd %arg0[%arg3, %9], %7 {index = 1 : i8, offset = 
224 : i32} : memref, vector<16xf32> - %11 = aievec.mac %10, %2, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - %12 = aievec.mac %10, %2, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %11 = aievec_aie1.mac %10, %2, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %12 = aievec_aie1.mac %10, %2, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %13 = aievec.upd %arg0[%4, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %14 = aievec.mac %13, %2, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %14 = aievec_aie1.mac %13, %2, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %15 = aievec.upd %arg0[%4, %9], %13 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> - %16 = aievec.mac %15, %2, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - %17 = aievec.mac %15, %2, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %16 = aievec_aie1.mac %15, %2, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %17 = aievec_aie1.mac %15, %2, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %18 = aievec.upd %arg0[%5, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %19 = aievec.mac %18, %2, %17 {xoffsets = "0x76543210", xstart = "0", 
zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %19 = aievec_aie1.mac %18, %2, %17 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %20 = aievec.upd %arg0[%5, %9], %18 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> - %21 = aievec.mac %20, %2, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - %22 = aievec.mac %20, %3, %21 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %21 = aievec_aie1.mac %20, %2, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %22 = aievec_aie1.mac %20, %3, %21 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> vector.transfer_write %22, %arg2[%arg3, %arg4] : vector<8xf32>, memref } } @@ -119,22 +119,22 @@ func.func @conv2d_1(%arg0: memref, %arg1: memref, %arg2: memre scf.for %arg4 = %c0_2 to %c256 step %c8_3 { %5 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xf32> %6 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %7 = aievec.mac %6, %1, %5 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %7 = aievec_aie1.mac %6, %1, %5 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %c1_4 = arith.constant 1 : index %8 = arith.addi %arg4, %c1_4 : index %9 = aievec.upd %arg0[%arg3, %8], %6 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> - %10 = aievec.mac %9, %1, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, 
vector<8xf32>, vector<8xf32> - %11 = aievec.mac %9, %1, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %10 = aievec_aie1.mac %9, %1, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %11 = aievec_aie1.mac %9, %1, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %12 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %13 = aievec.mac %12, %1, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %13 = aievec_aie1.mac %12, %1, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %14 = aievec.upd %arg0[%3, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> - %15 = aievec.mac %14, %1, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - %16 = aievec.mac %14, %1, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %15 = aievec_aie1.mac %14, %1, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %16 = aievec_aie1.mac %14, %1, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %17 = aievec.upd %arg0[%4, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> - %18 = aievec.mac %17, %1, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %18 = aievec_aie1.mac %17, %1, %16 {xoffsets = "0x76543210", xstart = "0", 
zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %19 = aievec.upd %arg0[%4, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> - %20 = aievec.mac %19, %1, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - %21 = aievec.mac %19, %2, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %20 = aievec_aie1.mac %19, %1, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + %21 = aievec_aie1.mac %19, %2, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> vector.transfer_write %21, %arg2[%arg3, %arg4] : vector<8xf32>, memref } } diff --git a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i16.mlir b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i16.mlir index 98e60ef286..e968e1cdec 100644 --- a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i16.mlir +++ b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i16.mlir @@ -30,19 +30,19 @@ module { %3 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi16>, vector<16xi16> %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> %5 = aievec.ups %3 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %6 = aievec.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %c1_4 = arith.constant 1 : index 
%7 = arith.addi %arg4, %c1_4 : index %8 = aievec.upd %arg0[%arg3, %7], %4 {index = 1 : i8, offset = 240 : i32} : memref<2048x2048xi16>, vector<32xi16> - %9 = aievec.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %10 = aievec.upd %arg0[%1, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> - %11 = aievec.mac %10, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %11 = aievec_aie1.mac %10, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %12 = aievec.upd %arg0[%1, %7], %10 {index = 1 : i8, offset = 240 : i32} : memref<2048x2048xi16>, vector<32xi16> - %13 = aievec.mac %12, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %14 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> - %15 = aievec.mac %14, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", 
zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %16 = aievec.upd %arg0[%2, %7], %14 {index = 1 : i8, offset = 240 : i32} : memref<2048x2048xi16>, vector<32xi16> - %17 = aievec.mac %16, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %17 = aievec_aie1.mac %16, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %18 = aievec.srs %17, %c0_i32 : vector<16xi48>, i32, vector<16xi16> vector.transfer_write %18, %arg2[%arg3, %arg4] : vector<16xi16>, memref<2046x2046xi16> } @@ -116,19 +116,19 @@ module { %3 = aievec.upd %arg2[%arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi16> %4 = aievec.upd %arg0[%arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %5 = aievec.ups %3 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %6 = aievec.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %c1_3 = arith.constant 1 : index %7 = arith.addi %arg6, %c1_3 : index %8 = aievec.upd %arg0[%arg5, %7], %4 {index = 1 : i8, offset = 240 : i32} : memref, 
vector<32xi16> - %9 = aievec.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %10 = aievec.upd %arg0[%1, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> - %11 = aievec.mac %10, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %11 = aievec_aie1.mac %10, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %12 = aievec.upd %arg0[%1, %7], %10 {index = 1 : i8, offset = 240 : i32} : memref, vector<32xi16> - %13 = aievec.mac %12, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %14 = aievec.upd %arg0[%2, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> - %15 = aievec.mac %14, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x03020100", xoffsets_hi 
= "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %16 = aievec.upd %arg0[%2, %7], %14 {index = 1 : i8, offset = 240 : i32} : memref, vector<32xi16> - %17 = aievec.mac %16, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %17 = aievec_aie1.mac %16, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %18 = aievec.srs %17, %c0_i32 : vector<16xi48>, i32, vector<16xi16> vector.transfer_write %18, %arg2[%arg5, %arg6] : vector<16xi16>, memref } @@ -202,16 +202,16 @@ module { %6 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %7 = aievec.upd %arg0[%arg3, %arg4], %6 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> %8 = aievec.ups %5 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %9 = aievec.mac %7, %2, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %10 = aievec.mac %7, %2, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %9 = aievec_aie1.mac %7, %2, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %10 = aievec_aie1.mac %7, %2, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", 
zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %11 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %12 = aievec.upd %arg0[%3, %arg4], %11 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> - %13 = aievec.mac %12, %2, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %14 = aievec.mac %12, %2, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %13 = aievec_aie1.mac %12, %2, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %14 = aievec_aie1.mac %12, %2, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %15 = aievec.upd %arg0[%4, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %16 = aievec.upd %arg0[%4, %arg4], %15 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> - %17 = aievec.mac %16, %2, %14 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %18 = aievec.mac %16, %2, %17 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %17 = aievec_aie1.mac %16, %2, %14 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart 
= "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %18 = aievec_aie1.mac %16, %2, %17 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %19 = aievec.srs %18, %c0_i32 : vector<16xi48>, i32, vector<16xi16> vector.transfer_write %19, %arg2[%arg3, %arg4] : vector<16xi16>, memref } @@ -282,16 +282,16 @@ module { %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %6 = aievec.upd %arg0[%arg3, %arg4], %5 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> %7 = aievec.ups %4 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> - %8 = aievec.mac %6, %1, %7 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %9 = aievec.mac %6, %1, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %8 = aievec_aie1.mac %6, %1, %7 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %9 = aievec_aie1.mac %6, %1, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %10 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %11 = aievec.upd %arg0[%2, %arg4], %10 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> - %12 = aievec.mac %11, %1, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", 
xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %13 = aievec.mac %11, %1, %12 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %12 = aievec_aie1.mac %11, %1, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %13 = aievec_aie1.mac %11, %1, %12 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %14 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> %15 = aievec.upd %arg0[%3, %arg4], %14 {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> - %16 = aievec.mac %15, %1, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> - %17 = aievec.mac %15, %1, %16 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %16 = aievec_aie1.mac %15, %1, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> + %17 = aievec_aie1.mac %15, %1, %16 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %18 = aievec.srs %17, 
%c0_i32 : vector<16xi48>, i32, vector<16xi16> vector.transfer_write %18, %arg2[%arg3, %arg4] : vector<16xi16>, memref } diff --git a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i32.mlir b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i32.mlir index 779c703c54..74dfb4e7db 100644 --- a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i32.mlir +++ b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i32.mlir @@ -31,22 +31,22 @@ func.func @conv2d_0(%arg0: memref<2048x2048xi32>, %arg1: memref<9xi32>, %arg2: m %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi32>, vector<8xi32> %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %c1_5 = arith.constant 1 : index %8 = arith.addi %arg4, %c1_5 : index %9 = aievec.upd %arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> - %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : 
memref<2048x2048xi32>, vector<16xi32> - %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> - %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> - %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> - %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %20 = aievec_aie1.mac 
%19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %22 = aievec.srs %21, %c0_i32: vector<8xi80>, i32, vector<8xi32> vector.transfer_write %22, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> } @@ -125,22 +125,22 @@ func.func @conv2d_1(%arg0: memref, %arg1: memref, %arg2: memref< %4 = aievec.upd %arg2[%arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xi32> %5 = aievec.upd %arg0[%arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %c1_4 = arith.constant 1 : index %8 = arith.addi %arg6, %c1_4 : index %9 = aievec.upd %arg0[%arg5, %8], %5 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %12 = aievec.upd %arg0[%2, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, 
vector<16xi32> - %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %17 = aievec.upd %arg0[%3, %arg6] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> - %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %20 = aievec_aie1.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = 
"0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %22 = aievec.srs %21, %c0_i32 : vector<8xi80>, i32, vector<8xi32> vector.transfer_write %22, %arg2[%arg5, %arg6] : vector<8xi32>, memref } @@ -218,22 +218,22 @@ func.func @conv2d_2(%arg0: memref, %arg1: memref, %arg2: memref< %6 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xi32> %7 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> %8 = aievec.ups %6 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %9 = aievec.mac %7, %2, %8 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %9 = aievec_aie1.mac %7, %2, %8 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %c1_5 = arith.constant 1 : index %10 = arith.addi %arg4, %c1_5 : index %11 = aievec.upd %arg0[%arg3, %10], %7 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %12 = aievec.mac %11, %2, %9 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %13 = aievec.mac %11, %2, %12 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %12 = aievec_aie1.mac %11, %2, %9 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %13 = aievec_aie1.mac %11, %2, %12 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %14 = aievec.upd %arg0[%4, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> - %15 = aievec.mac %14, %2, %13 {xoffsets = 
"0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %15 = aievec_aie1.mac %14, %2, %13 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %16 = aievec.upd %arg0[%4, %10], %14 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %17 = aievec.mac %16, %2, %15 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %18 = aievec.mac %16, %2, %17 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %17 = aievec_aie1.mac %16, %2, %15 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %18 = aievec_aie1.mac %16, %2, %17 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %19 = aievec.upd %arg0[%5, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> - %20 = aievec.mac %19, %2, %18 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %20 = aievec_aie1.mac %19, %2, %18 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %21 = aievec.upd %arg0[%5, %10], %19 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %22 = aievec.mac %21, %2, %20 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %23 = aievec.mac %21, %3, %22 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %22 = aievec_aie1.mac %21, %2, %20 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, 
vector<8xi80> + %23 = aievec_aie1.mac %21, %3, %22 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %24 = aievec.srs %23, %c0_i32 : vector<8xi80>, i32, vector<8xi32> vector.transfer_write %24, %arg2[%arg3, %arg4] : vector<8xi32>, memref } @@ -310,22 +310,22 @@ func.func @conv2d_3(%arg0: memref, %arg1: memref, %arg2: memre %5 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xi32> %6 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> %7 = aievec.ups %5 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> - %8 = aievec.mac %6, %1, %7 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %8 = aievec_aie1.mac %6, %1, %7 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %c1_4 = arith.constant 1 : index %9 = arith.addi %arg4, %c1_4 : index %10 = aievec.upd %arg0[%arg3, %9], %6 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %11 = aievec.mac %10, %1, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %12 = aievec.mac %10, %1, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %11 = aievec_aie1.mac %10, %1, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %12 = aievec_aie1.mac %10, %1, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %13 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> - %14 = aievec.mac %13, %1, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : 
vector<16xi32>, vector<8xi32>, vector<8xi80> + %14 = aievec_aie1.mac %13, %1, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %15 = aievec.upd %arg0[%3, %9], %13 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %16 = aievec.mac %15, %1, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %17 = aievec.mac %15, %1, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %16 = aievec_aie1.mac %15, %1, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %17 = aievec_aie1.mac %15, %1, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %18 = aievec.upd %arg0[%4, %arg4] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> - %19 = aievec.mac %18, %1, %17 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %19 = aievec_aie1.mac %18, %1, %17 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %20 = aievec.upd %arg0[%4, %9], %18 {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> - %21 = aievec.mac %20, %1, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> - %22 = aievec.mac %20, %2, %21 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %21 = aievec_aie1.mac %20, %1, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + %22 = aievec_aie1.mac %20, %2, %21 {xoffsets = 
"0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %23 = aievec.srs %22, %c0_i32 : vector<8xi80>, i32, vector<8xi32> vector.transfer_write %23, %arg2[%arg3, %arg4] : vector<8xi32>, memref } diff --git a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i8.mlir b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i8.mlir index b640526f92..48f950e305 100644 --- a/test/Targets/AIEVecToCpp/translate_conv2d_uij_i8.mlir +++ b/test/Targets/AIEVecToCpp/translate_conv2d_uij_i8.mlir @@ -31,14 +31,14 @@ func.func @conv2d_0(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memre %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<16x256xi8>, vector<16xi8> %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> %6 = aievec.ups %4 {shift = 10 : i8} : vector<16xi8>, vector<16xi48> - %7 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %8 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %7 = aievec_aie1.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %8 = aievec_aie1.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %9 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> - %10 = aievec.mac %0, %9, %7 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = 
"16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %11 = aievec.mac %0, %9, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %10 = aievec_aie1.mac %0, %9, %7 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %11 = aievec_aie1.mac %0, %9, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %12 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> - %13 = aievec.mac %1, %12, %10 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %14 = aievec.mac %1, %12, %11 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %13 = aievec_aie1.mac %1, %12, %10 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %14 = aievec_aie1.mac %1, %12, %11 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %15 = aievec.srs %13, %c10 : vector<16xi48>, i32, vector<16xi16> %16 = aievec.srs %14, %c10 : vector<16xi48>, i32, vector<16xi16> %17 = aievec.concat %15, %16 : 
vector<16xi16>, vector<32xi16> @@ -115,14 +115,14 @@ func.func @conv2d_1(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memre %c16_3 = arith.constant 16 : index scf.for %arg4 = %c0_2 to %c256 step %c16_3 { %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> - %5 = aievec.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %6 = aievec.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %5 = aievec_aie1.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %6 = aievec_aie1.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %7 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> - %8 = aievec.mac %0, %7, %5 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %9 = aievec.mac %0, %7, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %8 = aievec_aie1.mac %0, %7, %5 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %9 = aievec_aie1.mac %0, 
%7, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %10 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> - %11 = aievec.mac %1, %10, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> - %12 = aievec.mac %1, %10, %9 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %11 = aievec_aie1.mac %1, %10, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> + %12 = aievec_aie1.mac %1, %10, %9 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "4", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> %13 = aievec.srs %11, %c10 : vector<16xi48>, i32, vector<16xi16> %14 = aievec.srs %12, %c10 : vector<16xi48>, i32, vector<16xi16> %15 = aievec.concat %13, %14 : vector<16xi16>, vector<32xi16> diff --git a/test/Targets/NPU/npu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir index 847a51dbaf..560ac1171c 100644 --- a/test/Targets/NPU/npu_instgen.mlir +++ b/test/Targets/NPU/npu_instgen.mlir @@ -8,10 +8,10 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-translate --aie-npu-instgen %s | FileCheck %s +// RUN: aie-opt --aie-dma-to-npu %s | aie-translate --aie-npu-instgen | FileCheck %s module { aie.device(npu1_4col) { - func.func @test0(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { + aiex.runtime_sequence(%arg0: memref<16xf32>, 
%arg1: memref<16xf32>) { // TXN header // CHECK: 06030100 @@ -73,7 +73,6 @@ module { // CHECK: 00030401 // CHECK: 05010200 aiex.npu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } - return } } } diff --git a/test/aiecc/buffers_xclbin.mlir b/test/aiecc/buffers_xclbin.mlir index 82aff8ae03..ddc01886d2 100644 --- a/test/aiecc/buffers_xclbin.mlir +++ b/test/aiecc/buffers_xclbin.mlir @@ -94,7 +94,7 @@ module { %02 = aie.tile(0, 2) %12 = aie.tile(1, 2) %22 = aie.tile(2, 2) - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) { + aiex.runtime_sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, metadata = @in0} : memref<1024xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 1 : i64, metadata = @out0} : memref<1024xi32> aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} @@ -104,7 +104,6 @@ module { aiex.npu.dma_memcpy_nd (0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 2 : i64, metadata = @in2} : memref<1024xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 3 : i64, metadata = @out2} : memref<1024xi32> aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } } \ No newline at end of file diff --git a/test/aievec/conv2d_f32.mlir b/test/aievec/conv2d_f32.mlir deleted file mode 100644 index 85be5c3314..0000000000 --- a/test/aievec/conv2d_f32.mlir +++ /dev/null @@ -1,59 +0,0 @@ -// RUN: aie-opt %s --affine-loop-unroll="unroll-full 
unroll-full-threshold=3" --canonicalize -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize -unaligned-loads-check=false -split-input-file | FileCheck %s - -//CHECK-LABEL: func.func @conv2d(%arg0: memref<2048x2048xf32>, %arg1: memref<9xf32>, %arg2: memref<2046x2046xf32>) { -func.func @conv2d (%A: memref<2048x2048xf32>, %B: memref<9xf32>, %C: memref<2046x2046xf32>) { - affine.for %arg3 = 0 to 2046 { - affine.for %arg4 = 0 to 2046 { - //3x3 stencil - affine.for %arg5 = 0 to 3 { - affine.for %arg6 = 0 to 3 { - //Load the output point - %ci = affine.load %C[%arg3, %arg4] : memref<2046x2046xf32> - %a11 = affine.load %A[%arg3+%arg5, %arg4+%arg6] : memref<2048x2048xf32> - %b11 = affine.load %B[3*%arg5+%arg6] : memref<9xf32> - %p11 = arith.mulf %a11, %b11 : f32 - %c11 = arith.addf %ci, %p11 : f32 - //Store accumulated sum - affine.store %c11, %C[%arg3, %arg4] : memref<2046x2046xf32> - } - } - } - } - return -} - -//CHECK-NEXT: %c8 = arith.constant 8 : index -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %0 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : i32} : memref<9xf32>, vector<8xf32> -//CHECK-NEXT: %1 = aievec.upd %arg1[%c8] {index = 0 : i8, offset = 0 : i32} : memref<9xf32>, vector<8xf32> -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c2046 = arith.constant 2046 : index -//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c2046 step %c1 { -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: %2 = arith.addi %arg3, %c1_1 : index -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %3 = arith.addi %arg3, %c2 : index -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c2046_3 = arith.constant 2046 : index -//CHECK-NEXT: %c8_4 = arith.constant 8 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { -//CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xf32>, vector<8xf32> -//CHECK-NEXT: %5 = 
aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %6 = aievec.mac %5, %0, %4 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %c1_5 = arith.constant 1 : index -//CHECK-NEXT: %7 = arith.addi %arg4, %c1_5 : index -//CHECK-NEXT: %8 = aievec.upd %arg0[%arg3, %7], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %9 = aievec.mac %8, %0, %6 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %10 = aievec.mac %8, %0, %9 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %11 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %12 = aievec.mac %11, %0, %10 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %13 = aievec.upd %arg0[%2, %7], %11 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %14 = aievec.mac %13, %0, %12 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %15 = aievec.mac %13, %0, %14 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %16 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %17 = aievec.mac %16, %0, %15 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %18 = aievec.upd %arg0[%3, %7], %16 {index = 1 : i8, offset = 224 : i32} : 
memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %19 = aievec.mac %18, %0, %17 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %20 = aievec.mac %18, %1, %19 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<2046x2046xf32> diff --git a/test/aievec/conv2d_i16.mlir b/test/aievec/conv2d_i16.mlir deleted file mode 100644 index 47ead7ea20..0000000000 --- a/test/aievec/conv2d_i16.mlir +++ /dev/null @@ -1,55 +0,0 @@ -// RUN: aie-opt %s --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicalize -affine-super-vectorize="virtual-vector-size=16" --aie-vectorize="zero-offset=4" -unaligned-loads-check=false -split-input-file | FileCheck %s - -//CHECK-LABEL: func.func @conv2d(%arg0: memref<2048x2048xi16>, %arg1: memref<12xi16>, %arg2: memref<2046x2046xi16>) { -func.func @conv2d (%A: memref<2048x2048xi16>, %B: memref<12xi16>, %C: memref<2046x2046xi16>) { - affine.for %arg3 = 0 to 2046 { - affine.for %arg4 = 0 to 2046 { - //3x3 stencil - affine.for %arg5 = 0 to 3 { - affine.for %arg6 = 0 to 3 { - //Load the output point - %ci = affine.load %C[%arg3, %arg4] : memref<2046x2046xi16> - %a11 = affine.load %A[%arg3+%arg5, %arg4+%arg6] : memref<2048x2048xi16> - %b11 = affine.load %B[4*%arg5+%arg6] : memref<12xi16> - %p11 = arith.muli %a11, %b11 : i16 - %c11 = arith.addi %ci, %p11 : i16 - //Store accumulated sum - affine.store %c11, %C[%arg3, %arg4] : memref<2046x2046xi16> - } - } - } - } - return -} - -//CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %0 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : i32} : memref<12xi16>, vector<16xi16> -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c2046 = arith.constant 2046 : index 
-//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c2046 step %c1 { -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: %1 = arith.addi %arg3, %c1_1 : index -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %2 = arith.addi %arg3, %c2 : index -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c2046_3 = arith.constant 2046 : index -//CHECK-NEXT: %c16 = arith.constant 16 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c16 { -//CHECK-NEXT: %3 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi16>, vector<16xi16> -//CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4], %4 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %6 = aievec.ups %3 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %7 = aievec.mac %5, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %8 = aievec.mac %5, %0, %7 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %9 = aievec.upd %arg0[%1, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %10 = aievec.upd %arg0[%1, %arg4], %9 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %12 = aievec.mac %10, %0, 
%11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %13 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %14 = aievec.upd %arg0[%2, %arg4], %13 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %15 = aievec.mac %14, %0, %12 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %16 = aievec.mac %14, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %17 = aievec.srs %16, %c0_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: vector.transfer_write %17, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<2046x2046xi16> diff --git a/test/aievec/conv2d_i16_after_polygeist.mlir b/test/aievec/conv2d_i16_after_polygeist.mlir index fb8d1b6337..5240ac3119 100644 --- a/test/aievec/conv2d_i16_after_polygeist.mlir +++ b/test/aievec/conv2d_i16_after_polygeist.mlir @@ -45,4 +45,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness" // CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2:.*]], %[[T1:.*]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> // CHECK: %[[T4:.*]] = aievec.srs %[[T3:.*]], %[[C10]] : vector<16xi64>, i32, vector<16xi16> -// CHECK: vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<16xi16>, memref +// CHECK: 
vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] : vector<16xi16>, memref diff --git a/test/aievec/conv2d_i16_after_polygeist_2.mlir b/test/aievec/conv2d_i16_after_polygeist_2.mlir index fdb14c028b..144fd8472f 100644 --- a/test/aievec/conv2d_i16_after_polygeist_2.mlir +++ b/test/aievec/conv2d_i16_after_polygeist_2.mlir @@ -44,4 +44,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness" // CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2:.*]], %[[T1:.*]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> // CHECK: %[[T4:.*]] = aievec.srs %[[T3:.*]], %[[C10]] : vector<16xi64>, i32, vector<16xi32> -// CHECK: vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<16xi32>, memref +// CHECK: vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] : vector<16xi32>, memref diff --git a/test/aievec/conv2d_i32.mlir b/test/aievec/conv2d_i32.mlir deleted file mode 100644 index 5da5e47031..0000000000 --- a/test/aievec/conv2d_i32.mlir +++ /dev/null @@ -1,62 +0,0 @@ -// RUN: aie-opt %s --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicalize -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize -unaligned-loads-check=false -split-input-file | FileCheck %s - -//CHECK-LABEL: func.func @conv2d(%arg0: memref<2048x2048xi32>, %arg1: memref<9xi32>, %arg2: memref<2046x2046xi32>) { -func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<9xi32>, %C: memref<2046x2046xi32>) { - affine.for %arg3 = 0 to 2046 { - affine.for %arg4 = 0 to 2046 { - //3x3 stencil - affine.for %arg5 = 0 to 3 { - affine.for %arg6 = 0 to 3 { - //Load the output point - %ci = affine.load %C[%arg3, %arg4] : memref<2046x2046xi32> - %a11 = affine.load %A[%arg3+%arg5, %arg4+%arg6] : memref<2048x2048xi32> - %b11 = affine.load %B[3*%arg5+%arg6] : 
memref<9xi32> - %p11 = arith.muli %a11, %b11 : i32 - %c11 = arith.addi %ci, %p11 : i32 - //Store accumulated sum - affine.store %c11, %C[%arg3, %arg4] : memref<2046x2046xi32> - } - } - } - } - return -} - -//CHECK-NEXT: %c8 = arith.constant 8 : index -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 -//CHECK-NEXT: %0 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : i32} : memref<9xi32>, vector<8xi32> -//CHECK-NEXT: %1 = aievec.upd %arg1[%c8] {index = 0 : i8, offset = 0 : i32} : memref<9xi32>, vector<8xi32> -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c2046 = arith.constant 2046 : index -//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c2046 step %c1 { -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: %2 = arith.addi %arg3, %c1_1 : index -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %3 = arith.addi %arg3, %c2 : index -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c2046_3 = arith.constant 2046 : index -//CHECK-NEXT: %c8_4 = arith.constant 8 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { -//CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi32>, vector<8xi32> -//CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %c1_5 = arith.constant 1 : index -//CHECK-NEXT: %8 = arith.addi %arg4, %c1_5 : index -//CHECK-NEXT: %9 = aievec.upd %arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = 
"0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %22 = aievec.srs %21, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %22, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> diff --git 
a/test/aievec/conv2d_i8.mlir b/test/aievec/conv2d_i8.mlir deleted file mode 100644 index 48212c4b96..0000000000 --- a/test/aievec/conv2d_i8.mlir +++ /dev/null @@ -1,60 +0,0 @@ -// RUN: aie-opt %s --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicalize -affine-super-vectorize="virtual-vector-size=16" --aie-vectorize -split-input-file | FileCheck %s - -// CHECK-LABEL: func.func @conv2d(%arg0: memref<18x288xi8>, %arg1: memref<48xi8>, %arg2: memref<16x256xi8>) { -func.func @conv2d (%A: memref<18x288xi8>, %B: memref<48xi8>, %C: memref<16x256xi8>) { - affine.for %arg3 = 0 to 16 { - affine.for %arg4 = 0 to 256 { - //3x3 stencil - affine.for %arg5 = 0 to 3 { - affine.for %arg6 = 0 to 3 { - //Load the output point - %ci = affine.load %C[%arg3, %arg4] : memref<16x256xi8> - %a11 = affine.load %A[%arg3+%arg5, %arg4+%arg6] : memref<18x288xi8> - %b11 = affine.load %B[16*%arg5 + 2*%arg6] : memref<48xi8> - %p11 = arith.muli %a11, %b11 : i8 - %c11 = arith.addi %ci, %p11 : i8 - //Store accumulated sum - affine.store %c11, %C[%arg3, %arg4] : memref<16x256xi8> - } - } - } - } - return -} - -//CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 -//CHECK-NEXT: %c32 = arith.constant 32 : index -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %0 = aievec.upd %arg1[%c0] {index = 0 : i8, offset = 0 : i32} : memref<48xi8>, vector<64xi8> -//CHECK-NEXT: %1 = aievec.upd %arg1[%c32], %0 {index = 1 : i8, offset = 0 : i32} : memref<48xi8>, vector<64xi8> -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c16 = arith.constant 16 : index -//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c16 step %c1 { -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: %2 = arith.addi %arg3, %c1_1 : index -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %3 = arith.addi %arg3, %c2 : index -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c256 = arith.constant 256 : index -//CHECK-NEXT: %c16_3 = arith.constant 
16 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c256 step %c16_3 { -//CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<16x256xi8>, vector<16xi8> -//CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %6 = aievec.ups %4 {shift = 0 : i8} : vector<16xi8>, vector<16xi48> -//CHECK-NEXT: %7 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %8 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %9 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %10 = aievec.mac %0, %9, %7 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %11 = aievec.mac %0, %9, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %12 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %13 = aievec.mac %1, %12, %10 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %14 = aievec.mac %1, %12, %11 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = 
"2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %15 = aievec.srs %13, %c0_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: %16 = aievec.srs %14, %c0_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: %17 = aievec.concat %15, %16 : vector<16xi16>, vector<32xi16> -//CHECK-NEXT: %18 = aievec.select %17 {select = "0xcccccccc", xoffsets = "0x0c080400", xoffsets_hi = "0x0", xsquare = "0x1010", xstart = "0", yoffsets = "0x0c080400", yoffsets_hi = "0x0", ysquare = "0x1010", ystart = "4"} : vector<32xi16>, vector<32xi16> -//CHECK-NEXT: %19 = aievec.ext %18 {index = 0 : i8} : vector<32xi16>, vector<16xi16> -//CHECK-NEXT: %20 = aievec.pack %19 : vector<16xi16>, vector<16xi8> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi8>, memref<16x256xi8> - diff --git a/test/aievec/conv2d_i8_after_polygeist.mlir b/test/aievec/conv2d_i8_after_polygeist.mlir index eaa5e40451..1cb5817446 100644 --- a/test/aievec/conv2d_i8_after_polygeist.mlir +++ b/test/aievec/conv2d_i8_after_polygeist.mlir @@ -44,4 +44,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness" // CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<64xi8> // CHECK: %[[T3:.*]] = aievec.mul_conv %[[T2:.*]], %[[T1:.*]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> // CHECK: %[[T4:.*]] = aievec.cast %[[T3:.*]] {isResAcc = false} : vector<32xi32>, vector<32xi32> -// CHECK: vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<32xi32>, memref +// CHECK: vector.transfer_write %[[T4:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] : vector<32xi32>, memref diff --git a/test/aievec/conv2d_msc_uij_f32_noinit.mlir b/test/aievec/conv2d_msc_uij_f32_noinit.mlir index 9dc7945845..9c68918530 100644 --- a/test/aievec/conv2d_msc_uij_f32_noinit.mlir +++ b/test/aievec/conv2d_msc_uij_f32_noinit.mlir @@ -84,21 +84,20 @@ 
func.func @conv2d (%A: memref<2048x2048xf32>, %B: memref<9xf32>, %C: memref<2046 //CHECK-NEXT: %c8_4 = arith.constant 8 : index //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %5 = aievec.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %6 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %7 = aievec.upd %arg0[%arg3, %6], %4 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %8 = aievec.mac %7, %0, %5 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %9 = aievec.mac %7, %0, %8 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %8 = aievec_aie1.mac %7, %0, %5 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %9 = aievec_aie1.mac %7, %0, %8 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %10 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %9 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %11 = aievec_aie1.mac %10, %0, %9 {fmsub = true, xoffsets = 
"0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %6], %10 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %14 = aievec.mac %12, %0, %13 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %14 = aievec_aie1.mac %12, %0, %13 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %15 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %16 = aievec.mac %15, %0, %14 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %16 = aievec_aie1.mac %15, %0, %14 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %17 = aievec.upd %arg0[%3, %6], %15 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %19 = aievec.mac %17, %1, %18 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> 
-//CHECK-NEXT: vector.transfer_write %19, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<2046x2046xf32> - +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %19 = aievec_aie1.mac %17, %1, %18 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: vector.transfer_write %19, %arg2[%arg3, %arg4] : vector<8xf32>, memref<2046x2046xf32> diff --git a/test/aievec/conv2d_msc_uij_i32_noinit.mlir b/test/aievec/conv2d_msc_uij_i32_noinit.mlir index 06fa406c3f..e74612fa89 100644 --- a/test/aievec/conv2d_msc_uij_i32_noinit.mlir +++ b/test/aievec/conv2d_msc_uij_i32_noinit.mlir @@ -85,21 +85,21 @@ func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<9xi32>, %C: memref<2046 //CHECK-NEXT: %c8_4 = arith.constant 8 : index //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %5 = aievec.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %6 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %7 = aievec.upd %arg0[%arg3, %6], %4 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %8 = aievec.mac %7, %0, %5 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %9 = aievec.mac %7, %0, %8 {fmsub = true, xoffsets = "0x76543210", 
xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %8 = aievec_aie1.mac %7, %0, %5 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %9 = aievec_aie1.mac %7, %0, %8 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %10 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %9 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %11 = aievec_aie1.mac %10, %0, %9 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %6], %10 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %14 = aievec.mac %12, %0, %13 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %14 = aievec_aie1.mac %12, %0, %13 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %15 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %16 = 
aievec.mac %15, %0, %14 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %16 = aievec_aie1.mac %15, %0, %14 {fmsub = true, xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %17 = aievec.upd %arg0[%3, %6], %15 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %19 = aievec.mac %17, %1, %18 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {fmsub = true, xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %19 = aievec_aie1.mac %17, %1, %18 {fmsub = true, xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %20 = aievec.srs %19, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> +//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> diff --git a/test/aievec/conv2d_uij_f32.mlir b/test/aievec/conv2d_uij_f32.mlir index 1bafe8beed..13ac8a16ac 100644 --- a/test/aievec/conv2d_uij_f32.mlir +++ b/test/aievec/conv2d_uij_f32.mlir @@ -89,23 +89,23 @@ func.func @conv2d (%A: memref<2048x2048xf32>, %B: memref<9xf32>, %C: memref<2046 //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { //CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xf32>, 
vector<8xf32> //CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %6 = aievec.mac %5, %0, %4 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %6 = aievec_aie1.mac %5, %0, %4 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %7 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %8 = aievec.upd %arg0[%arg3, %7], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %9 = aievec.mac %8, %0, %6 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %10 = aievec.mac %8, %0, %9 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %10 = aievec_aie1.mac %8, %0, %9 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %11 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %12 = aievec.mac %11, %0, %10 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %12 = aievec_aie1.mac %11, %0, %10 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %13 = aievec.upd %arg0[%2, %7], %11 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> 
-//CHECK-NEXT: %14 = aievec.mac %13, %0, %12 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %15 = aievec.mac %13, %0, %14 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %14 = aievec_aie1.mac %13, %0, %12 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %15 = aievec_aie1.mac %13, %0, %14 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %16 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %17 = aievec.mac %16, %0, %15 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %17 = aievec_aie1.mac %16, %0, %15 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %18 = aievec.upd %arg0[%3, %7], %16 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %19 = aievec.mac %18, %0, %17 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %20 = aievec.mac %18, %1, %19 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<2046x2046xf32> +//CHECK-NEXT: %19 = aievec_aie1.mac %18, %0, %17 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %20 = aievec_aie1.mac %18, %1, %19 {xoffsets = 
"0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] : vector<8xf32>, memref<2046x2046xf32> // This test case will directly return the result generated from -affine-super-vectorize when // -unaligned-loads-check=true. The reason is that in transfer_read %arg2[%arg3, %arg4], diff --git a/test/aievec/conv2d_uij_f32_noinit.mlir b/test/aievec/conv2d_uij_f32_noinit.mlir index 9068c24f6b..92b99e2745 100644 --- a/test/aievec/conv2d_uij_f32_noinit.mlir +++ b/test/aievec/conv2d_uij_f32_noinit.mlir @@ -84,20 +84,20 @@ func.func @conv2d (%A: memref<2048x2048xf32>, %B: memref<9xf32>, %C: memref<2046 //CHECK-NEXT: %c8_4 = arith.constant 8 : index //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %5 = aievec.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %6 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %7 = aievec.upd %arg0[%arg3, %6], %4 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %8 = aievec.mac %7, %0, %5 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %9 = aievec.mac %7, %0, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %8 = aievec_aie1.mac %7, %0, %5 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, 
vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %9 = aievec_aie1.mac %7, %0, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %10 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %9 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %11 = aievec_aie1.mac %10, %0, %9 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %6], %10 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %14 = aievec.mac %12, %0, %13 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %14 = aievec_aie1.mac %12, %0, %13 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %15 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %16 = aievec.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %16 = aievec_aie1.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> //CHECK-NEXT: %17 = aievec.upd 
%arg0[%3, %6], %15 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xf32>, vector<16xf32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %19 = aievec.mac %17, %1, %18 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: vector.transfer_write %19, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<2046x2046xf32> +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: %19 = aievec_aie1.mac %17, %1, %18 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +//CHECK-NEXT: vector.transfer_write %19, %arg2[%arg3, %arg4] : vector<8xf32>, memref<2046x2046xf32> diff --git a/test/aievec/conv2d_uij_f32_unbounded.mlir b/test/aievec/conv2d_uij_f32_unbounded.mlir index 964357a66d..3275862743 100644 --- a/test/aievec/conv2d_uij_f32_unbounded.mlir +++ b/test/aievec/conv2d_uij_f32_unbounded.mlir @@ -98,23 +98,23 @@ func.func @conv2d_0 (%A: memref, %B: memref, %C: memref // CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_7]] step %[[VAL_18]] { // CHECK: %[[VAL_20:.*]] = aievec.upd %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xf32> // CHECK: %[[VAL_21:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_22:.*]] = aievec.mac %[[VAL_21]], %[[VAL_8]], %[[VAL_20]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_22:.*]] = aievec_aie1.mac %[[VAL_21]], %[[VAL_8]], %[[VAL_20]] {xoffsets = "0x76543210", xstart = 
"0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_23:.*]] = arith.constant 1 : index // CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_19]], %[[VAL_23]] : index // CHECK: %[[VAL_25:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_24]]], %[[VAL_21]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_26:.*]] = aievec.mac %[[VAL_25]], %[[VAL_8]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_27:.*]] = aievec.mac %[[VAL_25]], %[[VAL_8]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_26:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_8]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_27:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_8]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_28:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_29:.*]] = aievec.mac %[[VAL_28]], %[[VAL_8]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_29:.*]] = aievec_aie1.mac %[[VAL_28]], %[[VAL_8]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_30:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_24]]], %[[VAL_28]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_31:.*]] = aievec.mac %[[VAL_30]], %[[VAL_8]], %[[VAL_29]] {xoffsets = "0x76543210", xstart 
= "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_32:.*]] = aievec.mac %[[VAL_30]], %[[VAL_8]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_31:.*]] = aievec_aie1.mac %[[VAL_30]], %[[VAL_8]], %[[VAL_29]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_32:.*]] = aievec_aie1.mac %[[VAL_30]], %[[VAL_8]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_33:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_16]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_34:.*]] = aievec.mac %[[VAL_33]], %[[VAL_8]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_34:.*]] = aievec_aie1.mac %[[VAL_33]], %[[VAL_8]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_35:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_16]], %[[VAL_24]]], %[[VAL_33]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_36:.*]] = aievec.mac %[[VAL_35]], %[[VAL_8]], %[[VAL_34]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_37:.*]] = aievec.mac %[[VAL_35]], %[[VAL_9]], %[[VAL_36]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: vector.transfer_write %[[VAL_37]], %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {in_bounds = [true]} : vector<8xf32>, memref +// CHECK: %[[VAL_36:.*]] = aievec_aie1.mac 
%[[VAL_35]], %[[VAL_8]], %[[VAL_34]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_37:.*]] = aievec_aie1.mac %[[VAL_35]], %[[VAL_9]], %[[VAL_36]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: vector.transfer_write %[[VAL_37]], %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] : vector<8xf32>, memref //CHECK-LABEL: func.func @conv2d_1 func.func @conv2d_1 (%A: memref, %B: memref, %C: memref) { @@ -213,20 +213,20 @@ func.func @conv2d_1 (%A: memref, %B: memref, %C: memref, vector<8xf32> // CHECK: %[[VAL_20:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_21:.*]] = aievec.mac %[[VAL_20]], %[[VAL_6]], %[[VAL_19]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_21:.*]] = aievec_aie1.mac %[[VAL_20]], %[[VAL_6]], %[[VAL_19]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_22:.*]] = arith.constant 1 : index // CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_18]], %[[VAL_22]] : index // CHECK: %[[VAL_24:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_23]]], %[[VAL_20]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_25:.*]] = aievec.mac %[[VAL_24]], %[[VAL_6]], %[[VAL_21]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_26:.*]] = aievec.mac %[[VAL_24]], %[[VAL_6]], %[[VAL_25]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_25:.*]] = aievec_aie1.mac %[[VAL_24]], %[[VAL_6]], %[[VAL_21]] {xoffsets = 
"0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_26:.*]] = aievec_aie1.mac %[[VAL_24]], %[[VAL_6]], %[[VAL_25]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_27:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_28:.*]] = aievec.mac %[[VAL_27]], %[[VAL_6]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_28:.*]] = aievec_aie1.mac %[[VAL_27]], %[[VAL_6]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_29:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_23]]], %[[VAL_27]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_30:.*]] = aievec.mac %[[VAL_29]], %[[VAL_6]], %[[VAL_28]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_31:.*]] = aievec.mac %[[VAL_29]], %[[VAL_6]], %[[VAL_30]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_30:.*]] = aievec_aie1.mac %[[VAL_29]], %[[VAL_6]], %[[VAL_28]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_31:.*]] = aievec_aie1.mac %[[VAL_29]], %[[VAL_6]], %[[VAL_30]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_32:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xf32> -// CHECK: 
%[[VAL_33:.*]] = aievec.mac %[[VAL_32]], %[[VAL_6]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_33:.*]] = aievec_aie1.mac %[[VAL_32]], %[[VAL_6]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> // CHECK: %[[VAL_34:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_23]]], %[[VAL_32]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xf32> -// CHECK: %[[VAL_35:.*]] = aievec.mac %[[VAL_34]], %[[VAL_6]], %[[VAL_33]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: %[[VAL_36:.*]] = aievec.mac %[[VAL_34]], %[[VAL_7]], %[[VAL_35]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -// CHECK: vector.transfer_write %[[VAL_36]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_18]]] {in_bounds = [true]} : vector<8xf32>, memref \ No newline at end of file +// CHECK: %[[VAL_35:.*]] = aievec_aie1.mac %[[VAL_34]], %[[VAL_6]], %[[VAL_33]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: %[[VAL_36:.*]] = aievec_aie1.mac %[[VAL_34]], %[[VAL_7]], %[[VAL_35]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: vector.transfer_write %[[VAL_36]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_18]]] : vector<8xf32>, memref diff --git a/test/aievec/conv2d_uij_i16.mlir b/test/aievec/conv2d_uij_i16.mlir index 2659effd9b..9cec6ad1f8 100644 --- a/test/aievec/conv2d_uij_i16.mlir +++ b/test/aievec/conv2d_uij_i16.mlir @@ -90,15 +90,15 @@ func.func @conv2d (%A: memref<2048x2048xi16>, %B: memref<12xi16>, %C: memref<204 //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : 
i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4], %4 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %6 = aievec.ups %3 {shift = 0 : i8} : vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %7 = aievec.mac %5, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %8 = aievec.mac %5, %0, %7 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %8 = aievec_aie1.mac %5, %0, %7 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %9 = aievec.upd %arg0[%1, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %10 = aievec.upd %arg0[%1, %arg4], %9 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %12 = aievec.mac %10, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> 
+//CHECK-NEXT: %11 = aievec_aie1.mac %10, %0, %8 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %12 = aievec_aie1.mac %10, %0, %11 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %13 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %14 = aievec.upd %arg0[%2, %arg4], %13 {index = 1 : i8, offset = 256 : i32} : memref<2048x2048xi16>, vector<32xi16> -//CHECK-NEXT: %15 = aievec.mac %14, %0, %12 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %16 = aievec.mac %14, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %15 = aievec_aie1.mac %14, %0, %12 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %17 = aievec.srs %16, %c0_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: vector.transfer_write %17, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<2046x2046xi16> +//CHECK-NEXT: vector.transfer_write %17, %arg2[%arg3, 
%arg4] : vector<16xi16>, memref<2046x2046xi16> diff --git a/test/aievec/conv2d_uij_i16_noinit.mlir b/test/aievec/conv2d_uij_i16_noinit.mlir index 92232db147..e68b30cb5b 100644 --- a/test/aievec/conv2d_uij_i16_noinit.mlir +++ b/test/aievec/conv2d_uij_i16_noinit.mlir @@ -84,15 +84,15 @@ func.func @conv2d (%A: memref<18x288xi16>, %B: memref<12xi16>, %C: memref<16x256 //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c256 step %c16_3 { //CHECK-NEXT: %3 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi16>, vector<32xi16> //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4], %3 {index = 1 : i8, offset = 256 : i32} : memref<18x288xi16>, vector<32xi16> -//CHECK-NEXT: %5 = aievec.mul %4, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %6 = aievec.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %6 = aievec_aie1.mac %4, %0, %5 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %7 = aievec.upd %arg0[%1, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi16>, vector<32xi16> //CHECK-NEXT: %8 = aievec.upd %arg0[%1, %arg4], %7 {index = 1 : i8, offset = 256 : i32} : memref<18x288xi16>, vector<32xi16> -//CHECK-NEXT: %9 = aievec.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = 
"0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %10 = aievec.mac %8, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %9 = aievec_aie1.mac %8, %0, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %10 = aievec_aie1.mac %8, %0, %9 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %11 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi16>, vector<32xi16> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %arg4], %11 {index = 1 : i8, offset = 256 : i32} : memref<18x288xi16>, vector<32xi16> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %14 = aievec.mac %12, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %10 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %14 = aievec_aie1.mac %12, %0, %13 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", 
zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %15 = aievec.srs %14, %c10_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: vector.transfer_write %15, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> +//CHECK-NEXT: vector.transfer_write %15, %arg2[%arg3, %arg4] : vector<16xi16>, memref<16x256xi16> diff --git a/test/aievec/conv2d_uij_i16_noinit_aie-ml.mlir b/test/aievec/conv2d_uij_i16_noinit_aie-ml.mlir index b7077fda89..7eef56b809 100644 --- a/test/aievec/conv2d_uij_i16_noinit_aie-ml.mlir +++ b/test/aievec/conv2d_uij_i16_noinit_aie-ml.mlir @@ -93,4 +93,4 @@ func.func @conv2d (%A: memref<18x288xi16>, %B: memref<12xi16>, %C: memref<16x256 // CHECK: %[[T10:.*]] = aievec.upd %[[A0]][%[[T5:.*]], %[[A4]]] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi16>, vector<32xi16> // CHECK: %[[T11:.*]] = aievec.fma_conv %[[T10:.*]], %[[T3:.*]], %[[T9:.*]] {M = 16 : i32, N = 4 : i32} : vector<32xi16>, vector<32xi16>, vector<16xi64> // CHECK: %[[T12:.*]] = aievec.srs %[[T11:.*]], %[[C10]] : vector<16xi64>, i32, vector<16xi16> -// CHECK: vector.transfer_write %[[T12:.*]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<16xi16>, memref<16x256xi16> +// CHECK: vector.transfer_write %[[T12:.*]], %[[A2]][%[[A3]], %[[A4]]] : vector<16xi16>, memref<16x256xi16> diff --git a/test/aievec/conv2d_uij_i16_unbounded.mlir b/test/aievec/conv2d_uij_i16_unbounded.mlir index 0018e021f7..474ddf0bd9 100644 --- a/test/aievec/conv2d_uij_i16_unbounded.mlir +++ b/test/aievec/conv2d_uij_i16_unbounded.mlir @@ -99,18 +99,18 @@ func.func @conv2d_0 (%A: memref, %B: memref, %C: memref // CHECK: %[[VAL_19:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_17]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_20:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_17]]], %[[VAL_19]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> // CHECK: 
%[[VAL_21:.*]] = aievec.ups %[[VAL_18]] {shift = 0 : i8} : vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_22:.*]] = aievec.mac %[[VAL_20]], %[[VAL_7]], %[[VAL_21]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_23:.*]] = aievec.mac %[[VAL_20]], %[[VAL_7]], %[[VAL_22]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_22:.*]] = aievec_aie1.mac %[[VAL_20]], %[[VAL_7]], %[[VAL_21]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_23:.*]] = aievec_aie1.mac %[[VAL_20]], %[[VAL_7]], %[[VAL_22]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // CHECK: %[[VAL_24:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_17]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_25:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_17]]], %[[VAL_24]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> -// CHECK: %[[VAL_26:.*]] = aievec.mac %[[VAL_25]], %[[VAL_7]], %[[VAL_23]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_27:.*]] = aievec.mac %[[VAL_25]], %[[VAL_7]], %[[VAL_26]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : 
vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_26:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_7]], %[[VAL_23]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_27:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_7]], %[[VAL_26]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // CHECK: %[[VAL_28:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_17]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_29:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_17]]], %[[VAL_28]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> -// CHECK: %[[VAL_30:.*]] = aievec.mac %[[VAL_29]], %[[VAL_7]], %[[VAL_27]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_31:.*]] = aievec.mac %[[VAL_29]], %[[VAL_7]], %[[VAL_30]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_30:.*]] = aievec_aie1.mac %[[VAL_29]], %[[VAL_7]], %[[VAL_27]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_31:.*]] = aievec_aie1.mac %[[VAL_29]], %[[VAL_7]], %[[VAL_30]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, 
vector<16xi48> // CHECK: %[[VAL_32:.*]] = aievec.srs %[[VAL_31]], %[[C0]] : vector<16xi48>, i32, vector<16xi16> -// CHECK: vector.transfer_write %[[VAL_32]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_17]]] {in_bounds = [true]} : vector<16xi16>, memref +// CHECK: vector.transfer_write %[[VAL_32]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_17]]] : vector<16xi16>, memref //CHECK-LABEL: func.func @conv2d_1 func.func @conv2d_1 (%A: memref, %B: memref, %C: memref) { @@ -208,15 +208,15 @@ func.func @conv2d_1 (%A: memref, %B: memref, %C: memref, vector<32xi16> // CHECK: %[[VAL_19:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_8]], %[[VAL_16]]], %[[VAL_18]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_20:.*]] = aievec.ups %[[VAL_17]] {shift = 0 : i8} : vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_21:.*]] = aievec.mac %[[VAL_19]], %[[VAL_5]], %[[VAL_20]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_22:.*]] = aievec.mac %[[VAL_19]], %[[VAL_5]], %[[VAL_21]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_21:.*]] = aievec_aie1.mac %[[VAL_19]], %[[VAL_5]], %[[VAL_20]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_22:.*]] = aievec_aie1.mac %[[VAL_19]], %[[VAL_5]], %[[VAL_21]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // CHECK: %[[VAL_23:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_16]]] {index = 
0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_24:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_16]]], %[[VAL_23]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> -// CHECK: %[[VAL_25:.*]] = aievec.mac %[[VAL_24]], %[[VAL_5]], %[[VAL_22]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_26:.*]] = aievec.mac %[[VAL_24]], %[[VAL_5]], %[[VAL_25]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_25:.*]] = aievec_aie1.mac %[[VAL_24]], %[[VAL_5]], %[[VAL_22]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_26:.*]] = aievec_aie1.mac %[[VAL_24]], %[[VAL_5]], %[[VAL_25]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "6", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // CHECK: %[[VAL_27:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_16]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> // CHECK: %[[VAL_28:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_16]]], %[[VAL_27]] {index = 1 : i8, offset = 256 : i32} : memref, vector<32xi16> -// CHECK: %[[VAL_29:.*]] = aievec.mac %[[VAL_28]], %[[VAL_5]], %[[VAL_26]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -// CHECK: %[[VAL_30:.*]] = aievec.mac %[[VAL_28]], %[[VAL_5]], %[[VAL_29]] {xoffsets = "0x03020100", xoffsets_hi 
= "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_29:.*]] = aievec_aie1.mac %[[VAL_28]], %[[VAL_5]], %[[VAL_26]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "8", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +// CHECK: %[[VAL_30:.*]] = aievec_aie1.mac %[[VAL_28]], %[[VAL_5]], %[[VAL_29]] {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "10", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> // CHECK: %[[VAL_31:.*]] = aievec.srs %[[VAL_30]], %[[C0]] : vector<16xi48>, i32, vector<16xi16> -// CHECK: vector.transfer_write %[[VAL_31]], %[[VAL_2]]{{\[}}%[[VAL_8]], %[[VAL_16]]] {in_bounds = [true]} : vector<16xi16>, memref +// CHECK: vector.transfer_write %[[VAL_31]], %[[VAL_2]]{{\[}}%[[VAL_8]], %[[VAL_16]]] : vector<16xi16>, memref diff --git a/test/aievec/conv2d_uij_i32.mlir b/test/aievec/conv2d_uij_i32.mlir index 7d3754668e..48d80cbf11 100644 --- a/test/aievec/conv2d_uij_i32.mlir +++ b/test/aievec/conv2d_uij_i32.mlir @@ -91,22 +91,22 @@ func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<9xi32>, %C: memref<2046 //CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi32>, vector<8xi32> //CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> //CHECK-NEXT: %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, 
vector<8xi32>, vector<8xi80> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %8 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %9 = aievec.upd %arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", 
zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %20 = aievec_aie1.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %22 = aievec.srs %21, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %22, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> +//CHECK-NEXT: vector.transfer_write %22, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> diff --git a/test/aievec/conv2d_uij_i32_noinit.mlir b/test/aievec/conv2d_uij_i32_noinit.mlir index 5b8f773b3c..f6ae5ae876 100644 --- 
a/test/aievec/conv2d_uij_i32_noinit.mlir +++ b/test/aievec/conv2d_uij_i32_noinit.mlir @@ -85,22 +85,22 @@ func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<9xi32>, %C: memref<2046 //CHECK-NEXT: %c8_4 = arith.constant 8 : index //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c2046_3 step %c8_4 { //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %5 = aievec.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %5 = aievec_aie1.mul %4, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %6 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %7 = aievec.upd %arg0[%arg3, %6], %4 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %8 = aievec.mac %7, %0, %5 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %9 = aievec.mac %7, %0, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %8 = aievec_aie1.mac %7, %0, %5 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %9 = aievec_aie1.mac %7, %0, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %10 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %9 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %11 = aievec_aie1.mac %10, %0, 
%9 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %6], %10 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %14 = aievec.mac %12, %0, %13 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %14 = aievec_aie1.mac %12, %0, %13 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %15 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %16 = aievec.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %16 = aievec_aie1.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %17 = aievec.upd %arg0[%3, %6], %15 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %19 = aievec.mac %17, %1, %18 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "1", zoffsets = 
"0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %19 = aievec_aie1.mac %17, %1, %18 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %20 = aievec.srs %19, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> +//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> diff --git a/test/aievec/conv2d_uij_i32_unbounded.mlir b/test/aievec/conv2d_uij_i32_unbounded.mlir index 1ee65de3ef..efe893d169 100644 --- a/test/aievec/conv2d_uij_i32_unbounded.mlir +++ b/test/aievec/conv2d_uij_i32_unbounded.mlir @@ -100,24 +100,24 @@ func.func @conv2d_0 (%A: memref, %B: memref, %C: memref // CHECK: %[[VAL_20:.*]] = aievec.upd %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<8xi32> // CHECK: %[[VAL_21:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> // CHECK: %[[VAL_22:.*]] = aievec.ups %[[VAL_20]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_23:.*]] = aievec.mac %[[VAL_21]], %[[VAL_8]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_23:.*]] = aievec_aie1.mac %[[VAL_21]], %[[VAL_8]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_24:.*]] = arith.constant 1 : index // CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_19]], %[[VAL_24]] : index // CHECK: %[[VAL_26:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_25]]], %[[VAL_21]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_27:.*]] = aievec.mac %[[VAL_26]], %[[VAL_8]], 
%[[VAL_23]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_28:.*]] = aievec.mac %[[VAL_26]], %[[VAL_8]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_27:.*]] = aievec_aie1.mac %[[VAL_26]], %[[VAL_8]], %[[VAL_23]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_28:.*]] = aievec_aie1.mac %[[VAL_26]], %[[VAL_8]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_29:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_30:.*]] = aievec.mac %[[VAL_29]], %[[VAL_8]], %[[VAL_28]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_30:.*]] = aievec_aie1.mac %[[VAL_29]], %[[VAL_8]], %[[VAL_28]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_31:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_25]]], %[[VAL_29]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_32:.*]] = aievec.mac %[[VAL_31]], %[[VAL_8]], %[[VAL_30]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_33:.*]] = aievec.mac %[[VAL_31]], %[[VAL_8]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_32:.*]] = aievec_aie1.mac %[[VAL_31]], %[[VAL_8]], %[[VAL_30]] {xoffsets = "0x76543210", xstart = "1", zoffsets = 
"0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_33:.*]] = aievec_aie1.mac %[[VAL_31]], %[[VAL_8]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_34:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_16]], %[[VAL_19]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_35:.*]] = aievec.mac %[[VAL_34]], %[[VAL_8]], %[[VAL_33]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_35:.*]] = aievec_aie1.mac %[[VAL_34]], %[[VAL_8]], %[[VAL_33]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_36:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_16]], %[[VAL_25]]], %[[VAL_34]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_37:.*]] = aievec.mac %[[VAL_36]], %[[VAL_8]], %[[VAL_35]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_38:.*]] = aievec.mac %[[VAL_36]], %[[VAL_9]], %[[VAL_37]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_37:.*]] = aievec_aie1.mac %[[VAL_36]], %[[VAL_8]], %[[VAL_35]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_38:.*]] = aievec_aie1.mac %[[VAL_36]], %[[VAL_9]], %[[VAL_37]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_39:.*]] = aievec.srs %[[VAL_38]], %[[C0]] : vector<8xi80>, i32, vector<8xi32> -// CHECK: vector.transfer_write %[[VAL_39]], %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] {in_bounds = 
[true]} : vector<8xi32>, memref +// CHECK: vector.transfer_write %[[VAL_39]], %[[VAL_2]]{{\[}}%[[VAL_12]], %[[VAL_19]]] : vector<8xi32>, memref // CHECK-LABEL: func.func @conv2d_1 func.func @conv2d_1 (%A: memref, %B: memref, %C: memref) { @@ -218,21 +218,21 @@ func.func @conv2d_1 (%A: memref, %B: memref, %C: memref, vector<8xi32> // CHECK: %[[VAL_20:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> // CHECK: %[[VAL_21:.*]] = aievec.ups %[[VAL_19]] {shift = 0 : i8} : vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_22:.*]] = aievec.mac %[[VAL_20]], %[[VAL_6]], %[[VAL_21]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_22:.*]] = aievec_aie1.mac %[[VAL_20]], %[[VAL_6]], %[[VAL_21]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_23:.*]] = arith.constant 1 : index // CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_18]], %[[VAL_23]] : index // CHECK: %[[VAL_25:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_10]], %[[VAL_24]]], %[[VAL_20]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_26:.*]] = aievec.mac %[[VAL_25]], %[[VAL_6]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_27:.*]] = aievec.mac %[[VAL_25]], %[[VAL_6]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_26:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_6]], %[[VAL_22]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_27:.*]] = aievec_aie1.mac %[[VAL_25]], %[[VAL_6]], %[[VAL_26]] {xoffsets = "0x76543210", xstart = "2", zoffsets = 
"0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_28:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_29:.*]] = aievec.mac %[[VAL_28]], %[[VAL_6]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_29:.*]] = aievec_aie1.mac %[[VAL_28]], %[[VAL_6]], %[[VAL_27]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_30:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_12]], %[[VAL_24]]], %[[VAL_28]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_31:.*]] = aievec.mac %[[VAL_30]], %[[VAL_6]], %[[VAL_29]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_32:.*]] = aievec.mac %[[VAL_30]], %[[VAL_6]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_31:.*]] = aievec_aie1.mac %[[VAL_30]], %[[VAL_6]], %[[VAL_29]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_32:.*]] = aievec_aie1.mac %[[VAL_30]], %[[VAL_6]], %[[VAL_31]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_33:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_18]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_34:.*]] = aievec.mac %[[VAL_33]], %[[VAL_6]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_34:.*]] = aievec_aie1.mac %[[VAL_33]], 
%[[VAL_6]], %[[VAL_32]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_35:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_24]]], %[[VAL_33]] {index = 1 : i8, offset = 224 : i32} : memref, vector<16xi32> -// CHECK: %[[VAL_36:.*]] = aievec.mac %[[VAL_35]], %[[VAL_6]], %[[VAL_34]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -// CHECK: %[[VAL_37:.*]] = aievec.mac %[[VAL_35]], %[[VAL_7]], %[[VAL_36]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_36:.*]] = aievec_aie1.mac %[[VAL_35]], %[[VAL_6]], %[[VAL_34]] {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +// CHECK: %[[VAL_37:.*]] = aievec_aie1.mac %[[VAL_35]], %[[VAL_7]], %[[VAL_36]] {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> // CHECK: %[[VAL_38:.*]] = aievec.srs %[[VAL_37]], %[[C0]] : vector<8xi80>, i32, vector<8xi32> -// CHECK: vector.transfer_write %[[VAL_38]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_18]]] {in_bounds = [true]} : vector<8xi32>, memref +// CHECK: vector.transfer_write %[[VAL_38]], %[[VAL_2]]{{\[}}%[[VAL_10]], %[[VAL_18]]] : vector<8xi32>, memref diff --git a/test/aievec/conv2d_uij_i8.mlir b/test/aievec/conv2d_uij_i8.mlir index ffe06d9131..ef68be8060 100644 --- a/test/aievec/conv2d_uij_i8.mlir +++ b/test/aievec/conv2d_uij_i8.mlir @@ -91,18 +91,18 @@ func.func @conv2d (%A: memref<18x288xi8>, %B: memref<48xi8>, %C: memref<16x256xi //CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<16x256xi8>, vector<16xi8> //CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> 
//CHECK-NEXT: %6 = aievec.ups %4 {shift = 10 : i8} : vector<16xi8>, vector<16xi48> -//CHECK-NEXT: %7 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %8 = aievec.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %7 = aievec_aie1.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %8 = aievec_aie1.mac %0, %5, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %9 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %10 = aievec.mac %0, %9, %7 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %11 = aievec.mac %0, %9, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %10 = aievec_aie1.mac %0, %9, %7 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %11 = aievec_aie1.mac %0, %9, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = 
"0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %12 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %13 = aievec.mac %1, %12, %10 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %14 = aievec.mac %1, %12, %11 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %13 = aievec_aie1.mac %1, %12, %10 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %14 = aievec_aie1.mac %1, %12, %11 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %15 = aievec.srs %13, %c10_i32 : vector<16xi48>, i32, vector<16xi16> //CHECK-NEXT: %16 = aievec.srs %14, %c10_i32 : vector<16xi48>, i32, vector<16xi16> //CHECK-NEXT: %17 = aievec.concat %15, %16 : vector<16xi16>, vector<32xi16> //CHECK-NEXT: %18 = aievec.select %17 {select = "0xcccccccc", xoffsets = "0x0c080400", xoffsets_hi = "0x0", xsquare = "0x1010", xstart = "0", yoffsets = "0x0c080400", yoffsets_hi = "0x0", ysquare = "0x1010", ystart = "4"} : vector<32xi16>, vector<32xi16> //CHECK-NEXT: %19 = aievec.ext %18 {index = 0 : i8} : vector<32xi16>, vector<16xi16> //CHECK-NEXT: %20 = aievec.pack %19 : vector<16xi16>, vector<16xi8> -//CHECK-NEXT: vector.transfer_write %20, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi8>, memref<16x256xi8> +//CHECK-NEXT: vector.transfer_write %20, 
%arg2[%arg3, %arg4] : vector<16xi8>, memref<16x256xi8> diff --git a/test/aievec/conv2d_uij_i8_noinit.mlir b/test/aievec/conv2d_uij_i8_noinit.mlir index 799c6f555e..298f9ad0bc 100644 --- a/test/aievec/conv2d_uij_i8_noinit.mlir +++ b/test/aievec/conv2d_uij_i8_noinit.mlir @@ -85,18 +85,18 @@ func.func @conv2d (%A: memref<18x288xi8>, %B: memref<48xi8>, %C: memref<16x256xi //CHECK-NEXT: %c16_3 = arith.constant 16 : index //CHECK-NEXT: scf.for %arg4 = %c0_2 to %c256 step %c16_3 { //CHECK-NEXT: %4 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %5 = aievec.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %6 = aievec.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %5 = aievec_aie1.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %6 = aievec_aie1.mul %0, %4 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "0", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %7 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %8 = aievec.mac %0, %7, %5 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %9 = aievec.mac %0, %7, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", 
zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %8 = aievec_aie1.mac %0, %7, %5 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %9 = aievec_aie1.mac %0, %7, %6 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "16", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %10 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<32xi8> -//CHECK-NEXT: %11 = aievec.mac %1, %10, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> -//CHECK-NEXT: %12 = aievec.mac %1, %10, %9 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %11 = aievec_aie1.mac %1, %10, %8 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "0", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> +//CHECK-NEXT: %12 = aievec_aie1.mac %1, %10, %9 {xoffsets = "0x00000000", xsquare = "0x1010", xstart = "32", xstep = "4", zoffsets = "0x43322110", zsquare = "0x2110", zstart = "8", zstep = "2"} : vector<64xi8>, vector<32xi8>, vector<16xi48> //CHECK-NEXT: %13 = aievec.srs %11, %c10_i32 : vector<16xi48>, i32, vector<16xi16> //CHECK-NEXT: %14 = aievec.srs %12, %c10_i32 : vector<16xi48>, i32, vector<16xi16> //CHECK-NEXT: %15 = aievec.concat %13, %14 : vector<16xi16>, vector<32xi16> //CHECK-NEXT: %16 = aievec.select %15 {select = "0xcccccccc", xoffsets = 
"0x0c080400", xoffsets_hi = "0x0", xsquare = "0x1010", xstart = "0", yoffsets = "0x0c080400", yoffsets_hi = "0x0", ysquare = "0x1010", ystart = "4"} : vector<32xi16>, vector<32xi16> //CHECK-NEXT: %17 = aievec.ext %16 {index = 0 : i8} : vector<32xi16>, vector<16xi16> //CHECK-NEXT: %18 = aievec.pack %17 : vector<16xi16>, vector<16xi8> -//CHECK-NEXT: vector.transfer_write %18, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi8>, memref<16x256xi8> +//CHECK-NEXT: vector.transfer_write %18, %arg2[%arg3, %arg4] : vector<16xi8>, memref<16x256xi8> diff --git a/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir b/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir index 7e5f8d90a8..9fd3dbf9e4 100644 --- a/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir +++ b/test/aievec/conv2d_uij_i8_noinit_aie-ml.mlir @@ -94,4 +94,4 @@ func.func @conv2d (%A: memref<18x288xi8>, %B: memref<48xi8>, %C: memref<16x256xi // CHECK: %[[T10:.*]] = aievec.upd %[[A0]][%[[T5]], %[[A4]]] {index = 0 : i8, offset = 0 : i32} : memref<18x288xi8>, vector<64xi8> // CHECK: %[[T11:.*]] = aievec.fma_conv %[[T10]], %[[T3]], %[[T9]] {M = 32 : i32, N = 8 : i32} : vector<64xi8>, vector<64xi8>, vector<32xi32> // CHECK: %[[T12:.*]] = aievec.srs %[[T11]], %[[C0I32]] : vector<32xi32>, i32, vector<32xi8> -// CHECK: vector.transfer_write %[[T12]], %[[A2]][%[[A3]], %[[A4]]] {in_bounds = [true]} : vector<32xi8>, memref<16x256xi8> +// CHECK: vector.transfer_write %[[T12]], %[[A2]][%[[A3]], %[[A4]]] : vector<32xi8>, memref<16x256xi8> diff --git a/test/aievec/conv2d_uj_i16.mlir b/test/aievec/conv2d_uj_i16.mlir index db4512a2c9..0d5d479aeb 100644 --- a/test/aievec/conv2d_uj_i16.mlir +++ b/test/aievec/conv2d_uj_i16.mlir @@ -54,8 +54,8 @@ func.func @conv2d (%A: memref<2048x2048xi16>, %B: memref<3x3xi16>, %C: memref<20 //CHECK-NEXT: %3 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %4 = aievec.upd %arg0[%2, %arg4], %3 {index = 1 : i8, offset = 256 : i32} : 
memref<2048x2048xi16>, vector<32xi16> //CHECK-NEXT: %5 = aievec.upd %arg1[%arg5, %c0] {index = 0 : i8, offset = 0 : i32} : memref<3x3xi16>, vector<16xi16> -//CHECK-NEXT: %6 = aievec.mac %4, %5, %1 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> -//CHECK-NEXT: %7 = aievec.mac %4, %5, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %6 = aievec_aie1.mac %4, %5, %1 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> +//CHECK-NEXT: %7 = aievec_aie1.mac %4, %5, %6 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x2110", xstart = "2", zoffsets = "0", zoffsets_hi = "0", zstart = "2", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> //CHECK-NEXT: %8 = aievec.srs %7, %c0_i32 : vector<16xi48>, i32, vector<16xi16> -//CHECK-NEXT: vector.transfer_write %8, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<16xi16>, memref<2046x2046xi16> +//CHECK-NEXT: vector.transfer_write %8, %arg2[%arg3, %arg4] : vector<16xi16>, memref<2046x2046xi16> diff --git a/test/aievec/conv2d_uj_i32.mlir b/test/aievec/conv2d_uj_i32.mlir index 637644a804..b1bcc34438 100644 --- a/test/aievec/conv2d_uj_i32.mlir +++ b/test/aievec/conv2d_uj_i32.mlir @@ -55,9 +55,9 @@ func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<3x3xi32>, %C: memref<20 //CHECK-NEXT: %3 = arith.addi %arg3, %arg5 : index //CHECK-NEXT: %4 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> //CHECK-NEXT: %5 = aievec.upd %arg1[%arg5, %c0] {index = 0 : i8, offset = 0 : i32} : memref<3x3xi32>, 
vector<8xi32> -//CHECK-NEXT: %6 = aievec.mac %4, %5, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %6 = aievec_aie1.mac %4, %5, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %7 = aievec.upd %arg0[%3, %2], %4 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %8 = aievec.mac %7, %5, %6 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %9 = aievec.mac %7, %5, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %8 = aievec_aie1.mac %7, %5, %6 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %9 = aievec_aie1.mac %7, %5, %8 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %10 = aievec.srs %9, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %10, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> +//CHECK-NEXT: vector.transfer_write %10, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> diff --git a/test/aievec/gemm64_int16_unroll32_after_polygeist.mlir b/test/aievec/gemm64_int16_unroll32_after_polygeist.mlir deleted file mode 100644 index 8e78b59b1f..0000000000 --- a/test/aievec/gemm64_int16_unroll32_after_polygeist.mlir +++ /dev/null @@ -1,475 +0,0 @@ -// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" -aieml=true --aie-vectorize -canonicalize | FileCheck %s - -module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry : vector<2xi64>>, 
#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} { - func.func @matmul(%arg0: memref, %arg1: memref, %arg2: memref) attributes {llvm.linkage = #llvm.linkage} { - affine.for %arg3 = 0 to 64 { - affine.for %arg4 = 0 to 64 { - affine.for %arg5 = 0 to 64 step 32 { - %0 = affine.load %arg0[%arg3, %arg5] : memref - %1 = arith.extsi %0 : i16 to i32 - %2 = affine.load %arg1[%arg5, %arg4] : memref - %3 = arith.extsi %2 : i16 to i32 - %4 = arith.muli %1, %3 : i32 - %5 = affine.load %arg2[%arg3, %arg4] : memref - %6 = arith.trunci %4 : i32 to i16 - %7 = arith.addi %5, %6 : i16 - affine.store %7, %arg2[%arg3, %arg4] : memref - %8 = affine.load %arg0[%arg3, %arg5 + 1] : memref - %9 = arith.extsi %8 : i16 to i32 - %10 = affine.load %arg1[%arg5 + 1, %arg4] : memref - %11 = arith.extsi %10 : i16 to i32 - %12 = arith.muli %9, %11 : i32 - %13 = affine.load %arg2[%arg3, %arg4] : memref - %14 = arith.trunci %12 : i32 to i16 - %15 = arith.addi %13, %14 : i16 - affine.store %15, %arg2[%arg3, %arg4] : memref - %16 = affine.load %arg0[%arg3, %arg5 + 2] : memref - %17 = arith.extsi %16 : i16 to i32 - %18 = affine.load %arg1[%arg5 + 2, %arg4] : memref - %19 = arith.extsi %18 : i16 to i32 - %20 = arith.muli %17, %19 : i32 - %21 = affine.load %arg2[%arg3, %arg4] : memref - %22 = arith.trunci %20 : i32 to i16 - %23 = arith.addi %21, %22 : i16 - affine.store %23, %arg2[%arg3, %arg4] : memref - %24 = affine.load %arg0[%arg3, %arg5 + 3] : memref - %25 = arith.extsi %24 : i16 to i32 - %26 = affine.load %arg1[%arg5 
+ 3, %arg4] : memref - %27 = arith.extsi %26 : i16 to i32 - %28 = arith.muli %25, %27 : i32 - %29 = affine.load %arg2[%arg3, %arg4] : memref - %30 = arith.trunci %28 : i32 to i16 - %31 = arith.addi %29, %30 : i16 - affine.store %31, %arg2[%arg3, %arg4] : memref - %32 = affine.load %arg0[%arg3, %arg5 + 4] : memref - %33 = arith.extsi %32 : i16 to i32 - %34 = affine.load %arg1[%arg5 + 4, %arg4] : memref - %35 = arith.extsi %34 : i16 to i32 - %36 = arith.muli %33, %35 : i32 - %37 = affine.load %arg2[%arg3, %arg4] : memref - %38 = arith.trunci %36 : i32 to i16 - %39 = arith.addi %37, %38 : i16 - affine.store %39, %arg2[%arg3, %arg4] : memref - %40 = affine.load %arg0[%arg3, %arg5 + 5] : memref - %41 = arith.extsi %40 : i16 to i32 - %42 = affine.load %arg1[%arg5 + 5, %arg4] : memref - %43 = arith.extsi %42 : i16 to i32 - %44 = arith.muli %41, %43 : i32 - %45 = affine.load %arg2[%arg3, %arg4] : memref - %46 = arith.trunci %44 : i32 to i16 - %47 = arith.addi %45, %46 : i16 - affine.store %47, %arg2[%arg3, %arg4] : memref - %48 = affine.load %arg0[%arg3, %arg5 + 6] : memref - %49 = arith.extsi %48 : i16 to i32 - %50 = affine.load %arg1[%arg5 + 6, %arg4] : memref - %51 = arith.extsi %50 : i16 to i32 - %52 = arith.muli %49, %51 : i32 - %53 = affine.load %arg2[%arg3, %arg4] : memref - %54 = arith.trunci %52 : i32 to i16 - %55 = arith.addi %53, %54 : i16 - affine.store %55, %arg2[%arg3, %arg4] : memref - %56 = affine.load %arg0[%arg3, %arg5 + 7] : memref - %57 = arith.extsi %56 : i16 to i32 - %58 = affine.load %arg1[%arg5 + 7, %arg4] : memref - %59 = arith.extsi %58 : i16 to i32 - %60 = arith.muli %57, %59 : i32 - %61 = affine.load %arg2[%arg3, %arg4] : memref - %62 = arith.trunci %60 : i32 to i16 - %63 = arith.addi %61, %62 : i16 - affine.store %63, %arg2[%arg3, %arg4] : memref - %64 = affine.load %arg0[%arg3, %arg5 + 8] : memref - %65 = arith.extsi %64 : i16 to i32 - %66 = affine.load %arg1[%arg5 + 8, %arg4] : memref - %67 = arith.extsi %66 : i16 to i32 - %68 = arith.muli 
%65, %67 : i32 - %69 = affine.load %arg2[%arg3, %arg4] : memref - %70 = arith.trunci %68 : i32 to i16 - %71 = arith.addi %69, %70 : i16 - affine.store %71, %arg2[%arg3, %arg4] : memref - %72 = affine.load %arg0[%arg3, %arg5 + 9] : memref - %73 = arith.extsi %72 : i16 to i32 - %74 = affine.load %arg1[%arg5 + 9, %arg4] : memref - %75 = arith.extsi %74 : i16 to i32 - %76 = arith.muli %73, %75 : i32 - %77 = affine.load %arg2[%arg3, %arg4] : memref - %78 = arith.trunci %76 : i32 to i16 - %79 = arith.addi %77, %78 : i16 - affine.store %79, %arg2[%arg3, %arg4] : memref - %80 = affine.load %arg0[%arg3, %arg5 + 10] : memref - %81 = arith.extsi %80 : i16 to i32 - %82 = affine.load %arg1[%arg5 + 10, %arg4] : memref - %83 = arith.extsi %82 : i16 to i32 - %84 = arith.muli %81, %83 : i32 - %85 = affine.load %arg2[%arg3, %arg4] : memref - %86 = arith.trunci %84 : i32 to i16 - %87 = arith.addi %85, %86 : i16 - affine.store %87, %arg2[%arg3, %arg4] : memref - %88 = affine.load %arg0[%arg3, %arg5 + 11] : memref - %89 = arith.extsi %88 : i16 to i32 - %90 = affine.load %arg1[%arg5 + 11, %arg4] : memref - %91 = arith.extsi %90 : i16 to i32 - %92 = arith.muli %89, %91 : i32 - %93 = affine.load %arg2[%arg3, %arg4] : memref - %94 = arith.trunci %92 : i32 to i16 - %95 = arith.addi %93, %94 : i16 - affine.store %95, %arg2[%arg3, %arg4] : memref - %96 = affine.load %arg0[%arg3, %arg5 + 12] : memref - %97 = arith.extsi %96 : i16 to i32 - %98 = affine.load %arg1[%arg5 + 12, %arg4] : memref - %99 = arith.extsi %98 : i16 to i32 - %100 = arith.muli %97, %99 : i32 - %101 = affine.load %arg2[%arg3, %arg4] : memref - %102 = arith.trunci %100 : i32 to i16 - %103 = arith.addi %101, %102 : i16 - affine.store %103, %arg2[%arg3, %arg4] : memref - %104 = affine.load %arg0[%arg3, %arg5 + 13] : memref - %105 = arith.extsi %104 : i16 to i32 - %106 = affine.load %arg1[%arg5 + 13, %arg4] : memref - %107 = arith.extsi %106 : i16 to i32 - %108 = arith.muli %105, %107 : i32 - %109 = affine.load %arg2[%arg3, 
%arg4] : memref - %110 = arith.trunci %108 : i32 to i16 - %111 = arith.addi %109, %110 : i16 - affine.store %111, %arg2[%arg3, %arg4] : memref - %112 = affine.load %arg0[%arg3, %arg5 + 14] : memref - %113 = arith.extsi %112 : i16 to i32 - %114 = affine.load %arg1[%arg5 + 14, %arg4] : memref - %115 = arith.extsi %114 : i16 to i32 - %116 = arith.muli %113, %115 : i32 - %117 = affine.load %arg2[%arg3, %arg4] : memref - %118 = arith.trunci %116 : i32 to i16 - %119 = arith.addi %117, %118 : i16 - affine.store %119, %arg2[%arg3, %arg4] : memref - %120 = affine.load %arg0[%arg3, %arg5 + 15] : memref - %121 = arith.extsi %120 : i16 to i32 - %122 = affine.load %arg1[%arg5 + 15, %arg4] : memref - %123 = arith.extsi %122 : i16 to i32 - %124 = arith.muli %121, %123 : i32 - %125 = affine.load %arg2[%arg3, %arg4] : memref - %126 = arith.trunci %124 : i32 to i16 - %127 = arith.addi %125, %126 : i16 - affine.store %127, %arg2[%arg3, %arg4] : memref - %128 = affine.load %arg0[%arg3, %arg5 + 16] : memref - %129 = arith.extsi %128 : i16 to i32 - %130 = affine.load %arg1[%arg5 + 16, %arg4] : memref - %131 = arith.extsi %130 : i16 to i32 - %132 = arith.muli %129, %131 : i32 - %133 = affine.load %arg2[%arg3, %arg4] : memref - %134 = arith.trunci %132 : i32 to i16 - %135 = arith.addi %133, %134 : i16 - affine.store %135, %arg2[%arg3, %arg4] : memref - %136 = affine.load %arg0[%arg3, %arg5 + 17] : memref - %137 = arith.extsi %136 : i16 to i32 - %138 = affine.load %arg1[%arg5 + 17, %arg4] : memref - %139 = arith.extsi %138 : i16 to i32 - %140 = arith.muli %137, %139 : i32 - %141 = affine.load %arg2[%arg3, %arg4] : memref - %142 = arith.trunci %140 : i32 to i16 - %143 = arith.addi %141, %142 : i16 - affine.store %143, %arg2[%arg3, %arg4] : memref - %144 = affine.load %arg0[%arg3, %arg5 + 18] : memref - %145 = arith.extsi %144 : i16 to i32 - %146 = affine.load %arg1[%arg5 + 18, %arg4] : memref - %147 = arith.extsi %146 : i16 to i32 - %148 = arith.muli %145, %147 : i32 - %149 = affine.load 
%arg2[%arg3, %arg4] : memref - %150 = arith.trunci %148 : i32 to i16 - %151 = arith.addi %149, %150 : i16 - affine.store %151, %arg2[%arg3, %arg4] : memref - %152 = affine.load %arg0[%arg3, %arg5 + 19] : memref - %153 = arith.extsi %152 : i16 to i32 - %154 = affine.load %arg1[%arg5 + 19, %arg4] : memref - %155 = arith.extsi %154 : i16 to i32 - %156 = arith.muli %153, %155 : i32 - %157 = affine.load %arg2[%arg3, %arg4] : memref - %158 = arith.trunci %156 : i32 to i16 - %159 = arith.addi %157, %158 : i16 - affine.store %159, %arg2[%arg3, %arg4] : memref - %160 = affine.load %arg0[%arg3, %arg5 + 20] : memref - %161 = arith.extsi %160 : i16 to i32 - %162 = affine.load %arg1[%arg5 + 20, %arg4] : memref - %163 = arith.extsi %162 : i16 to i32 - %164 = arith.muli %161, %163 : i32 - %165 = affine.load %arg2[%arg3, %arg4] : memref - %166 = arith.trunci %164 : i32 to i16 - %167 = arith.addi %165, %166 : i16 - affine.store %167, %arg2[%arg3, %arg4] : memref - %168 = affine.load %arg0[%arg3, %arg5 + 21] : memref - %169 = arith.extsi %168 : i16 to i32 - %170 = affine.load %arg1[%arg5 + 21, %arg4] : memref - %171 = arith.extsi %170 : i16 to i32 - %172 = arith.muli %169, %171 : i32 - %173 = affine.load %arg2[%arg3, %arg4] : memref - %174 = arith.trunci %172 : i32 to i16 - %175 = arith.addi %173, %174 : i16 - affine.store %175, %arg2[%arg3, %arg4] : memref - %176 = affine.load %arg0[%arg3, %arg5 + 22] : memref - %177 = arith.extsi %176 : i16 to i32 - %178 = affine.load %arg1[%arg5 + 22, %arg4] : memref - %179 = arith.extsi %178 : i16 to i32 - %180 = arith.muli %177, %179 : i32 - %181 = affine.load %arg2[%arg3, %arg4] : memref - %182 = arith.trunci %180 : i32 to i16 - %183 = arith.addi %181, %182 : i16 - affine.store %183, %arg2[%arg3, %arg4] : memref - %184 = affine.load %arg0[%arg3, %arg5 + 23] : memref - %185 = arith.extsi %184 : i16 to i32 - %186 = affine.load %arg1[%arg5 + 23, %arg4] : memref - %187 = arith.extsi %186 : i16 to i32 - %188 = arith.muli %185, %187 : i32 - %189 = 
affine.load %arg2[%arg3, %arg4] : memref - %190 = arith.trunci %188 : i32 to i16 - %191 = arith.addi %189, %190 : i16 - affine.store %191, %arg2[%arg3, %arg4] : memref - %192 = affine.load %arg0[%arg3, %arg5 + 24] : memref - %193 = arith.extsi %192 : i16 to i32 - %194 = affine.load %arg1[%arg5 + 24, %arg4] : memref - %195 = arith.extsi %194 : i16 to i32 - %196 = arith.muli %193, %195 : i32 - %197 = affine.load %arg2[%arg3, %arg4] : memref - %198 = arith.trunci %196 : i32 to i16 - %199 = arith.addi %197, %198 : i16 - affine.store %199, %arg2[%arg3, %arg4] : memref - %200 = affine.load %arg0[%arg3, %arg5 + 25] : memref - %201 = arith.extsi %200 : i16 to i32 - %202 = affine.load %arg1[%arg5 + 25, %arg4] : memref - %203 = arith.extsi %202 : i16 to i32 - %204 = arith.muli %201, %203 : i32 - %205 = affine.load %arg2[%arg3, %arg4] : memref - %206 = arith.trunci %204 : i32 to i16 - %207 = arith.addi %205, %206 : i16 - affine.store %207, %arg2[%arg3, %arg4] : memref - %208 = affine.load %arg0[%arg3, %arg5 + 26] : memref - %209 = arith.extsi %208 : i16 to i32 - %210 = affine.load %arg1[%arg5 + 26, %arg4] : memref - %211 = arith.extsi %210 : i16 to i32 - %212 = arith.muli %209, %211 : i32 - %213 = affine.load %arg2[%arg3, %arg4] : memref - %214 = arith.trunci %212 : i32 to i16 - %215 = arith.addi %213, %214 : i16 - affine.store %215, %arg2[%arg3, %arg4] : memref - %216 = affine.load %arg0[%arg3, %arg5 + 27] : memref - %217 = arith.extsi %216 : i16 to i32 - %218 = affine.load %arg1[%arg5 + 27, %arg4] : memref - %219 = arith.extsi %218 : i16 to i32 - %220 = arith.muli %217, %219 : i32 - %221 = affine.load %arg2[%arg3, %arg4] : memref - %222 = arith.trunci %220 : i32 to i16 - %223 = arith.addi %221, %222 : i16 - affine.store %223, %arg2[%arg3, %arg4] : memref - %224 = affine.load %arg0[%arg3, %arg5 + 28] : memref - %225 = arith.extsi %224 : i16 to i32 - %226 = affine.load %arg1[%arg5 + 28, %arg4] : memref - %227 = arith.extsi %226 : i16 to i32 - %228 = arith.muli %225, %227 : 
i32 - %229 = affine.load %arg2[%arg3, %arg4] : memref - %230 = arith.trunci %228 : i32 to i16 - %231 = arith.addi %229, %230 : i16 - affine.store %231, %arg2[%arg3, %arg4] : memref - %232 = affine.load %arg0[%arg3, %arg5 + 29] : memref - %233 = arith.extsi %232 : i16 to i32 - %234 = affine.load %arg1[%arg5 + 29, %arg4] : memref - %235 = arith.extsi %234 : i16 to i32 - %236 = arith.muli %233, %235 : i32 - %237 = affine.load %arg2[%arg3, %arg4] : memref - %238 = arith.trunci %236 : i32 to i16 - %239 = arith.addi %237, %238 : i16 - affine.store %239, %arg2[%arg3, %arg4] : memref - %240 = affine.load %arg0[%arg3, %arg5 + 30] : memref - %241 = arith.extsi %240 : i16 to i32 - %242 = affine.load %arg1[%arg5 + 30, %arg4] : memref - %243 = arith.extsi %242 : i16 to i32 - %244 = arith.muli %241, %243 : i32 - %245 = affine.load %arg2[%arg3, %arg4] : memref - %246 = arith.trunci %244 : i32 to i16 - %247 = arith.addi %245, %246 : i16 - affine.store %247, %arg2[%arg3, %arg4] : memref - %248 = affine.load %arg0[%arg3, %arg5 + 31] : memref - %249 = arith.extsi %248 : i16 to i32 - %250 = affine.load %arg1[%arg5 + 31, %arg4] : memref - %251 = arith.extsi %250 : i16 to i32 - %252 = arith.muli %249, %251 : i32 - %253 = affine.load %arg2[%arg3, %arg4] : memref - %254 = arith.trunci %252 : i32 to i16 - %255 = arith.addi %253, %254 : i16 - affine.store %255, %arg2[%arg3, %arg4] : memref - } - } - } - return - } -} -// CHECK-LABEL: @matmul -// CHECK-SAME: %[[A0:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[A1:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[A2:[0-9a-zA-Z]*]]: memref -// CHECK: %[[C31:.*]] = arith.constant 31 : index -// CHECK: %[[C30:.*]] = arith.constant 30 : index -// CHECK: %[[C29:.*]] = arith.constant 29 : index -// CHECK: %[[C28:.*]] = arith.constant 28 : index -// CHECK: %[[C27:.*]] = arith.constant 27 : index -// CHECK: %[[C26:.*]] = arith.constant 26 : index -// CHECK: %[[C25:.*]] = arith.constant 25 : index -// CHECK: %[[C24:.*]] = arith.constant 24 : index -// CHECK: 
%[[C23:.*]] = arith.constant 23 : index -// CHECK: %[[C22:.*]] = arith.constant 22 : index -// CHECK: %[[C21:.*]] = arith.constant 21 : index -// CHECK: %[[C20:.*]] = arith.constant 20 : index -// CHECK: %[[C19:.*]] = arith.constant 19 : index -// CHECK: %[[C18:.*]] = arith.constant 18 : index -// CHECK: %[[C17:.*]] = arith.constant 17 : index -// CHECK: %[[C16:.*]] = arith.constant 16 : index -// CHECK: %[[C15:.*]] = arith.constant 15 : index -// CHECK: %[[C14:.*]] = arith.constant 14 : index -// CHECK: %[[C13:.*]] = arith.constant 13 : index -// CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: %[[C11:.*]] = arith.constant 11 : index -// CHECK: %[[C10:.*]] = arith.constant 10 : index -// CHECK: %[[C9:.*]] = arith.constant 9 : index -// CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[C7:.*]] = arith.constant 7 : index -// CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C4:.*]] = arith.constant 4 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C32:.*]] = arith.constant 32 : index -// CHECK: %[[C0I32:.*]] = arith.constant 0 : i32 -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C64:.*]] = arith.constant 64 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: scf.for %[[A3:.*]] = %[[C0:.*]] to %[[C64:.*]] step %[[C1:.*]] { -// CHECK: scf.for %[[A4:.*]] = %[[C0:.*]] to %[[C64:.*]] step %[[C32:.*]] { -// CHECK: %[[T0:.*]] = aievec.upd %[[A2]][%[[A3:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T1:.*]] = aievec.ups %[[T0:.*]] {shift = 0 : i8} : vector<32xi16>, vector<32xi32> -// CHECK: scf.for %[[A5:.*]] = %[[C0:.*]] to %[[C64:.*]] step %[[C32:.*]] { -// CHECK: %[[T2:.*]] = aievec.upd %[[A0]][%[[A3:.*]], %[[A5:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T3:.*]] = aievec.upd %[[A1]][%[[A5:.*]], %[[A4:.*]]] {index 
= 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T4:.*]] = aievec.broadcast %[[T2:.*]] {idx = 0 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T5:.*]] = aievec.mac_elem %[[T3:.*]], %[[T4:.*]], %[[T1:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T6:.*]] = arith.addi %[[A5:.*]], %[[C1:.*]] : index -// CHECK: %[[T7:.*]] = aievec.upd %[[A1]][%[[T6:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T8:.*]] = aievec.broadcast %[[T2:.*]] {idx = 1 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T9:.*]] = aievec.mac_elem %[[T7:.*]], %[[T8:.*]], %[[T5:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T10:.*]] = arith.addi %[[A5:.*]], %[[C2:.*]] : index -// CHECK: %[[T11:.*]] = aievec.upd %[[A1]][%[[T10:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T12:.*]] = aievec.broadcast %[[T2:.*]] {idx = 2 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T13:.*]] = aievec.mac_elem %[[T11:.*]], %[[T12:.*]], %[[T9:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T14:.*]] = arith.addi %[[A5:.*]], %[[C3:.*]] : index -// CHECK: %[[T15:.*]] = aievec.upd %[[A1]][%[[T14:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T16:.*]] = aievec.broadcast %[[T2:.*]] {idx = 3 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T17:.*]] = aievec.mac_elem %[[T15:.*]], %[[T16:.*]], %[[T13:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T18:.*]] = arith.addi %[[A5:.*]], %[[C4:.*]] : index -// CHECK: %[[T19:.*]] = aievec.upd %[[A1]][%[[T18:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T20:.*]] = aievec.broadcast %[[T2:.*]] {idx = 4 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T21:.*]] = aievec.mac_elem %[[T19:.*]], %[[T20:.*]], %[[T17:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T22:.*]] = 
arith.addi %[[A5:.*]], %[[C5:.*]] : index -// CHECK: %[[T23:.*]] = aievec.upd %[[A1]][%[[T22:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T24:.*]] = aievec.broadcast %[[T2:.*]] {idx = 5 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T25:.*]] = aievec.mac_elem %[[T23:.*]], %[[T24:.*]], %[[T21:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T26:.*]] = arith.addi %[[A5:.*]], %[[C6:.*]] : index -// CHECK: %[[T27:.*]] = aievec.upd %[[A1]][%[[T26:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T28:.*]] = aievec.broadcast %[[T2:.*]] {idx = 6 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T29:.*]] = aievec.mac_elem %[[T27:.*]], %[[T28:.*]], %[[T25:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T30:.*]] = arith.addi %[[A5:.*]], %[[C7:.*]] : index -// CHECK: %[[T31:.*]] = aievec.upd %[[A1]][%[[T30:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T32:.*]] = aievec.broadcast %[[T2:.*]] {idx = 7 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T33:.*]] = aievec.mac_elem %[[T31:.*]], %[[T32:.*]], %[[T29:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T34:.*]] = arith.addi %[[A5:.*]], %[[C8:.*]] : index -// CHECK: %[[T35:.*]] = aievec.upd %[[A1]][%[[T34:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T36:.*]] = aievec.broadcast %[[T2:.*]] {idx = 8 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T37:.*]] = aievec.mac_elem %[[T35:.*]], %[[T36:.*]], %[[T33:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T38:.*]] = arith.addi %[[A5:.*]], %[[C9:.*]] : index -// CHECK: %[[T39:.*]] = aievec.upd %[[A1]][%[[T38:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T40:.*]] = aievec.broadcast %[[T2:.*]] {idx = 9 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T41:.*]] = 
aievec.mac_elem %[[T39:.*]], %[[T40:.*]], %[[T37:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T42:.*]] = arith.addi %[[A5:.*]], %[[C10:.*]] : index -// CHECK: %[[T43:.*]] = aievec.upd %[[A1]][%[[T42:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T44:.*]] = aievec.broadcast %[[T2:.*]] {idx = 10 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T45:.*]] = aievec.mac_elem %[[T43:.*]], %[[T44:.*]], %[[T41:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T46:.*]] = arith.addi %[[A5:.*]], %[[C11:.*]] : index -// CHECK: %[[T47:.*]] = aievec.upd %[[A1]][%[[T46:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T48:.*]] = aievec.broadcast %[[T2:.*]] {idx = 11 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T49:.*]] = aievec.mac_elem %[[T47:.*]], %[[T48:.*]], %[[T45:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T50:.*]] = arith.addi %[[A5:.*]], %[[C12:.*]] : index -// CHECK: %[[T51:.*]] = aievec.upd %[[A1]][%[[T50:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T52:.*]] = aievec.broadcast %[[T2:.*]] {idx = 12 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T53:.*]] = aievec.mac_elem %[[T51:.*]], %[[T52:.*]], %[[T49:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T54:.*]] = arith.addi %[[A5:.*]], %[[C13:.*]] : index -// CHECK: %[[T55:.*]] = aievec.upd %[[A1]][%[[T54:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T56:.*]] = aievec.broadcast %[[T2:.*]] {idx = 13 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T57:.*]] = aievec.mac_elem %[[T55:.*]], %[[T56:.*]], %[[T53:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T58:.*]] = arith.addi %[[A5:.*]], %[[C14:.*]] : index -// CHECK: %[[T59:.*]] = aievec.upd %[[A1]][%[[T58:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, 
vector<32xi16> -// CHECK: %[[T60:.*]] = aievec.broadcast %[[T2:.*]] {idx = 14 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T61:.*]] = aievec.mac_elem %[[T59:.*]], %[[T60:.*]], %[[T57:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T62:.*]] = arith.addi %[[A5:.*]], %[[C15:.*]] : index -// CHECK: %[[T63:.*]] = aievec.upd %[[A1]][%[[T62:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T64:.*]] = aievec.broadcast %[[T2:.*]] {idx = 15 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T65:.*]] = aievec.mac_elem %[[T63:.*]], %[[T64:.*]], %[[T61:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T66:.*]] = arith.addi %[[A5:.*]], %[[C16:.*]] : index -// CHECK: %[[T67:.*]] = aievec.upd %[[A1]][%[[T66:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T68:.*]] = aievec.broadcast %[[T2:.*]] {idx = 16 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T69:.*]] = aievec.mac_elem %[[T67:.*]], %[[T68:.*]], %[[T65:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T70:.*]] = arith.addi %[[A5:.*]], %[[C17:.*]] : index -// CHECK: %[[T71:.*]] = aievec.upd %[[A1]][%[[T70:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T72:.*]] = aievec.broadcast %[[T2:.*]] {idx = 17 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T73:.*]] = aievec.mac_elem %[[T71:.*]], %[[T72:.*]], %[[T69:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T74:.*]] = arith.addi %[[A5:.*]], %[[C18:.*]] : index -// CHECK: %[[T75:.*]] = aievec.upd %[[A1]][%[[T74:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T76:.*]] = aievec.broadcast %[[T2:.*]] {idx = 18 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T77:.*]] = aievec.mac_elem %[[T75:.*]], %[[T76:.*]], %[[T73:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T78:.*]] = arith.addi 
%[[A5:.*]], %[[C19:.*]] : index -// CHECK: %[[T79:.*]] = aievec.upd %[[A1]][%[[T78:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T80:.*]] = aievec.broadcast %[[T2:.*]] {idx = 19 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T81:.*]] = aievec.mac_elem %[[T79:.*]], %[[T80:.*]], %[[T77:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T82:.*]] = arith.addi %[[A5:.*]], %[[C20:.*]] : index -// CHECK: %[[T83:.*]] = aievec.upd %[[A1]][%[[T82:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T84:.*]] = aievec.broadcast %[[T2:.*]] {idx = 20 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T85:.*]] = aievec.mac_elem %[[T83:.*]], %[[T84:.*]], %[[T81:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T86:.*]] = arith.addi %[[A5:.*]], %[[C21:.*]] : index -// CHECK: %[[T87:.*]] = aievec.upd %[[A1]][%[[T86:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T88:.*]] = aievec.broadcast %[[T2:.*]] {idx = 21 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T89:.*]] = aievec.mac_elem %[[T87:.*]], %[[T88:.*]], %[[T85:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T90:.*]] = arith.addi %[[A5:.*]], %[[C22:.*]] : index -// CHECK: %[[T91:.*]] = aievec.upd %[[A1]][%90, %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T92:.*]] = aievec.broadcast %[[T2:.*]] {idx = 22 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T93:.*]] = aievec.mac_elem %[[T91:.*]], %[[T92:.*]], %[[T89:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T94:.*]] = arith.addi %[[A5:.*]], %[[C23:.*]] : index -// CHECK: %[[T95:.*]] = aievec.upd %[[A1]][%[[T94:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T96:.*]] = aievec.broadcast %[[T2:.*]] {idx = 23 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T97:.*]] = 
aievec.mac_elem %[[T95:.*]], %[[T96:.*]], %[[T93:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T98:.*]] = arith.addi %[[A5:.*]], %[[C24:.*]] : index -// CHECK: %[[T99:.*]] = aievec.upd %[[A1]][%[[T98:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T100:.*]] = aievec.broadcast %[[T2:.*]] {idx = 24 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T101:.*]] = aievec.mac_elem %[[T99:.*]], %[[T100:.*]], %[[T97:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T102:.*]] = arith.addi %[[A5:.*]], %[[C25:.*]] : index -// CHECK: %[[T103:.*]] = aievec.upd %[[A1]][%[[T102:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T104:.*]] = aievec.broadcast %[[T2:.*]] {idx = 25 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T105:.*]] = aievec.mac_elem %[[T103:.*]], %[[T104:.*]], %[[T101:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T106:.*]] = arith.addi %[[A5:.*]], %[[C26:.*]] : index -// CHECK: %[[T107:.*]] = aievec.upd %[[A1]][%[[T106:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T108:.*]] = aievec.broadcast %[[T2:.*]] {idx = 26 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T109:.*]] = aievec.mac_elem %[[T107:.*]], %[[T108:.*]], %[[T105:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T110:.*]] = arith.addi %[[A5:.*]], %[[C27:.*]] : index -// CHECK: %[[T111:.*]] = aievec.upd %[[A1]][%[[T110:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T112:.*]] = aievec.broadcast %[[T2:.*]] {idx = 27 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T113:.*]] = aievec.mac_elem %[[T111:.*]], %[[T112:.*]], %[[T109:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T114:.*]] = arith.addi %[[A5:.*]], %[[C28:.*]] : index -// CHECK: %[[T115:.*]] = aievec.upd %[[A1]][%[[T114:.*]], %[[A4:.*]]] {index = 0 : i8, 
offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T116:.*]] = aievec.broadcast %[[T2:.*]] {idx = 28 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T117:.*]] = aievec.mac_elem %[[T115:.*]], %[[T116:.*]], %[[T113:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T118:.*]] = arith.addi %[[A5:.*]], %[[C29:.*]] : index -// CHECK: %[[T119:.*]] = aievec.upd %[[A1]][%[[T118:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T120:.*]] = aievec.broadcast %[[T2:.*]] {idx = 29 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T121:.*]] = aievec.mac_elem %[[T119:.*]], %[[T120:.*]], %[[T117:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T122:.*]] = arith.addi %[[A5:.*]], %[[C30:.*]] : index -// CHECK: %[[T123:.*]] = aievec.upd %[[A1]][%[[T122:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T124:.*]] = aievec.broadcast %[[T2:.*]] {idx = 30 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T125:.*]] = aievec.mac_elem %[[T123:.*]], %[[T124:.*]], %[[T121:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T126:.*]] = arith.addi %[[A5:.*]], %[[C31:.*]] : index -// CHECK: %[[T127:.*]] = aievec.upd %[[A1]][%[[T126:.*]], %[[A4:.*]]] {index = 0 : i8, offset = 0 : i32} : memref, vector<32xi16> -// CHECK: %[[T128:.*]] = aievec.broadcast %[[T2:.*]] {idx = 31 : i8} : vector<32xi16>, vector<32xi16> -// CHECK: %[[T129:.*]] = aievec.mac_elem %[[T127:.*]], %[[T128:.*]], %[[T125:.*]] : vector<32xi16>, vector<32xi16>, vector<32xi32> -// CHECK: %[[T130:.*]] = aievec.srs %[[T129:.*]], %[[C0I32]] : vector<32xi32>, i32, vector<32xi16> -// CHECK: vector.transfer_write %[[T130:.*]], %[[A2]][%[[A3:.*]], %[[A4:.*]]] {in_bounds = [true]} : vector<32xi16>, memref diff --git a/test/aievec/gemm64_tile448_unroll8_unaligned_loads.mlir b/test/aievec/gemm64_tile448_unroll8_unaligned_loads.mlir deleted file mode 100644 index c93301505b..0000000000 --- 
a/test/aievec/gemm64_tile448_unroll8_unaligned_loads.mlir +++ /dev/null @@ -1,150 +0,0 @@ -// This test case will directly return the result generated from -affine-super-vectorize -// because in transfer_read %arg1[%arg5, %arg7], lower dim iv %arg7's corresponding loop -// (affine.for %arg7 = #map0(%arg4) to #map1(%arg4))'s upper bound’s affine_map(<(d0) -> (d0 + 4)>) -// result's offset(4) is not divisible by the vector lane size(8). - -// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize 2>&1 | FileCheck %s - -// CHECK-LABEL: Loop upper bound's affine map offset of inner index of vector.transfer_read is not divisible by number of vector lanes. -// CHECK-LABEL: Cannot apply aie-vectorize to func.func because alignment check has failed. - -#map0 = affine_map<(d0) -> (d0)> -#map1 = affine_map<(d0) -> (d0 + 4)> - -module { - func.func @matmul(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) { - affine.for %arg3 = 0 to 64 step 4 { - affine.for %arg4 = 0 to 64 step 4 { - affine.for %arg5 = 0 to 64 step 8 { - affine.for %arg6 = #map0(%arg3) to #map1(%arg3) { - affine.for %arg7 = #map0(%arg4) to #map1(%arg4) { - %0 = affine.load %arg0[%arg6, %arg5] : memref<64x64xf32> - %1 = affine.load %arg1[%arg5, %arg7] : memref<64x64xf32> - %2 = arith.mulf %0, %1 : f32 - %3 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %4 = arith.addf %3, %2 : f32 - affine.store %4, %arg2[%arg6, %arg7] : memref<64x64xf32> - %6 = affine.load %arg0[%arg6, %arg5+1] : memref<64x64xf32> - %7 = affine.load %arg1[%arg5+1, %arg7] : memref<64x64xf32> - %8 = arith.mulf %6, %7 : f32 - %9 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %10 = arith.addf %9, %8 : f32 - affine.store %10, %arg2[%arg6, %arg7] : memref<64x64xf32> - %12 = affine.load %arg0[%arg6, %arg5+2] : memref<64x64xf32> - %13 = affine.load %arg1[%arg5+2, %arg7] : memref<64x64xf32> - %14 = arith.mulf %12, %13 : f32 - %15 = affine.load %arg2[%arg6, %arg7] : 
memref<64x64xf32> - %16 = arith.addf %15, %14 : f32 - affine.store %16, %arg2[%arg6, %arg7] : memref<64x64xf32> - %18 = affine.load %arg0[%arg6, %arg5+3] : memref<64x64xf32> - %19 = affine.load %arg1[%arg5+3, %arg7] : memref<64x64xf32> - %20 = arith.mulf %18, %19 : f32 - %21 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %22 = arith.addf %21, %20 : f32 - affine.store %22, %arg2[%arg6, %arg7] : memref<64x64xf32> - %24 = affine.load %arg0[%arg6, %arg5+4] : memref<64x64xf32> - %25 = affine.load %arg1[%arg5+4, %arg7] : memref<64x64xf32> - %26 = arith.mulf %24, %25 : f32 - %27 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %28 = arith.addf %27, %26 : f32 - affine.store %28, %arg2[%arg6, %arg7] : memref<64x64xf32> - %30 = affine.load %arg0[%arg6, %arg5+5] : memref<64x64xf32> - %31 = affine.load %arg1[%arg5+5, %arg7] : memref<64x64xf32> - %32 = arith.mulf %30, %31 : f32 - %33 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %34 = arith.addf %33, %32 : f32 - affine.store %34, %arg2[%arg6, %arg7] : memref<64x64xf32> - %36 = affine.load %arg0[%arg6, %arg5+6] : memref<64x64xf32> - %37 = affine.load %arg1[%arg5+6, %arg7] : memref<64x64xf32> - %38 = arith.mulf %36, %37 : f32 - %39 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %40 = arith.addf %39, %38 : f32 - affine.store %40, %arg2[%arg6, %arg7] : memref<64x64xf32> - %42 = affine.load %arg0[%arg6, %arg5+7] : memref<64x64xf32> -%43 = affine.load %arg1[%arg5+7, %arg7] : memref<64x64xf32> - %44 = arith.mulf %42, %43 : f32 - %45 = affine.load %arg2[%arg6, %arg7] : memref<64x64xf32> - %46 = arith.addf %45, %44 : f32 - affine.store %46, %arg2[%arg6, %arg7] : memref<64x64xf32> - } - } - } - } - } - return - } -} - -// CHECK: #map = affine_map<(d0) -> (d0)> -// CHECK: #map1 = affine_map<(d0) -> (d0 + 4)> -// CHECK: #map2 = affine_map<(d0, d1) -> (0)> -// CHECK: #map3 = affine_map<(d0) -> (d0 + 1)> -// CHECK: #map4 = affine_map<(d0) -> (d0 + 2)> -// CHECK: #map5 = affine_map<(d0) -> (d0 + 3)> -// 
CHECK: #map6 = affine_map<(d0) -> (d0 + 5)> -// CHECK: #map7 = affine_map<(d0) -> (d0 + 6)> -// CHECK: #map8 = affine_map<(d0) -> (d0 + 7)> -// CHECK: module { -// CHECK: func.func @matmul(%[[VAL_0:.*]]: memref<64x64xf32>, %[[VAL_1:.*]]: memref<64x64xf32>, %[[VAL_2:.*]]: memref<64x64xf32>) { -// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: affine.for %[[VAL_4:.*]] = 0 to 64 step 4 { -// CHECK: affine.for %[[VAL_5:.*]] = 0 to 64 step 4 { -// CHECK: affine.for %[[VAL_6:.*]] = 0 to 64 step 8 { -// CHECK: affine.for %[[VAL_7:.*]] = #map(%[[VAL_4]]) to #map1(%[[VAL_4]]) { -// CHECK: affine.for %[[VAL_8:.*]] = #map(%[[VAL_5]]) to #map1(%[[VAL_5]]) step 8 { -// CHECK: %[[VAL_9:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_6]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_10:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_11:.*]] = arith.mulf %[[VAL_9]], %[[VAL_10]] : vector<8xf32> -// CHECK: %[[VAL_12:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_7]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_13:.*]] = arith.addf %[[VAL_12]], %[[VAL_11]] : vector<8xf32> -// CHECK: %[[VAL_14:.*]] = affine.apply #map3(%[[VAL_6]]) -// CHECK: %[[VAL_15:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_14]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_16:.*]] = affine.apply #map3(%[[VAL_6]]) -// CHECK: %[[VAL_17:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_16]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_18:.*]] = arith.mulf %[[VAL_15]], %[[VAL_17]] : vector<8xf32> -// CHECK: %[[VAL_19:.*]] = arith.addf %[[VAL_13]], %[[VAL_18]] : vector<8xf32> -// CHECK: %[[VAL_20:.*]] = affine.apply 
#map4(%[[VAL_6]]) -// CHECK: %[[VAL_21:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_20]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_22:.*]] = affine.apply #map4(%[[VAL_6]]) -// CHECK: %[[VAL_23:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_22]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_24:.*]] = arith.mulf %[[VAL_21]], %[[VAL_23]] : vector<8xf32> -// CHECK: %[[VAL_25:.*]] = arith.addf %[[VAL_19]], %[[VAL_24]] : vector<8xf32> -// CHECK: %[[VAL_26:.*]] = affine.apply #map5(%[[VAL_6]]) -// CHECK: %[[VAL_27:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_26]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_28:.*]] = affine.apply #map5(%[[VAL_6]]) -// CHECK: %[[VAL_29:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_28]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] : vector<8xf32> -// CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_25]], %[[VAL_30]] : vector<8xf32> -// CHECK: %[[VAL_32:.*]] = affine.apply #map1(%[[VAL_6]]) -// CHECK: %[[VAL_33:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_32]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_34:.*]] = affine.apply #map1(%[[VAL_6]]) -// CHECK: %[[VAL_35:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_34]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_36:.*]] = arith.mulf %[[VAL_33]], %[[VAL_35]] : vector<8xf32> -// CHECK: %[[VAL_37:.*]] = arith.addf %[[VAL_31]], %[[VAL_36]] : vector<8xf32> -// CHECK: %[[VAL_38:.*]] = affine.apply #map6(%[[VAL_6]]) -// CHECK: %[[VAL_39:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_38]]], %[[VAL_3]] {in_bounds = [true], 
permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_40:.*]] = affine.apply #map6(%[[VAL_6]]) -// CHECK: %[[VAL_41:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_40]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_42:.*]] = arith.mulf %[[VAL_39]], %[[VAL_41]] : vector<8xf32> -// CHECK: %[[VAL_43:.*]] = arith.addf %[[VAL_37]], %[[VAL_42]] : vector<8xf32> -// CHECK: %[[VAL_44:.*]] = affine.apply #map7(%[[VAL_6]]) -// CHECK: %[[VAL_45:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_44]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_46:.*]] = affine.apply #map7(%[[VAL_6]]) -// CHECK: %[[VAL_47:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_46]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_48:.*]] = arith.mulf %[[VAL_45]], %[[VAL_47]] : vector<8xf32> -// CHECK: %[[VAL_49:.*]] = arith.addf %[[VAL_43]], %[[VAL_48]] : vector<8xf32> -// CHECK: %[[VAL_50:.*]] = affine.apply #map8(%[[VAL_6]]) -// CHECK: %[[VAL_51:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_50]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map2} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_52:.*]] = affine.apply #map8(%[[VAL_6]]) -// CHECK: %[[VAL_53:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_52]], %[[VAL_8]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_54:.*]] = arith.mulf %[[VAL_51]], %[[VAL_53]] : vector<8xf32> -// CHECK: %[[VAL_55:.*]] = arith.addf %[[VAL_49]], %[[VAL_54]] : vector<8xf32> -// CHECK: vector.transfer_write %[[VAL_55]], %[[VAL_2]]{{\[}}%[[VAL_7]], %[[VAL_8]]] {in_bounds = [true]} : vector<8xf32>, memref<64x64xf32> -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: return -// CHECK: } -// CHECK: } \ No newline at end of file diff --git 
a/test/aievec/gemm64_unroll4_unaligned_loads.mlir b/test/aievec/gemm64_unroll4_unaligned_loads.mlir deleted file mode 100644 index 57f92044f8..0000000000 --- a/test/aievec/gemm64_unroll4_unaligned_loads.mlir +++ /dev/null @@ -1,86 +0,0 @@ -// This test case will directly return the result generated from -affine-super-vectorize -// because in transfer_read %arg0[%arg3, %arg5], the lowest dim(%arg5)'s corresponding -// loop step(4) is not divisible by the vector lanes(8). - -// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize 2>&1 | FileCheck %s - -// CHECK-LABEL: Loop step of inner index of vector.transfer_read is not divisible by number of vector lanes. -// CHECK-LABEL: Cannot apply aie-vectorize to func.func because alignment check has failed. - -module { - func.func @matmul(%arg0: memref<64x64xf32>, %arg1: memref<64x64xf32>, %arg2: memref<64x64xf32>) { - affine.for %arg3 = 0 to 64 { - affine.for %arg4 = 0 to 64 { - affine.for %arg5 = 0 to 64 step 4 { - %0 = affine.load %arg0[%arg3, %arg5] : memref<64x64xf32> - %1 = affine.load %arg1[%arg5, %arg4] : memref<64x64xf32> - %2 = arith.mulf %0, %1 : f32 - %3 = affine.load %arg2[%arg3, %arg4] : memref<64x64xf32> - %4 = arith.addf %3, %2 : f32 - affine.store %4, %arg2[%arg3, %arg4] : memref<64x64xf32> - %5 = affine.load %arg0[%arg3, %arg5 + 1] : memref<64x64xf32> - %6 = affine.load %arg1[%arg5 + 1, %arg4] : memref<64x64xf32> - %7 = arith.mulf %5, %6 : f32 - %8 = affine.load %arg2[%arg3, %arg4] : memref<64x64xf32> - %9 = arith.addf %8, %7 : f32 - affine.store %9, %arg2[%arg3, %arg4] : memref<64x64xf32> - %10 = affine.load %arg0[%arg3, %arg5 + 2] : memref<64x64xf32> - %11 = affine.load %arg1[%arg5 + 2, %arg4] : memref<64x64xf32> - %12 = arith.mulf %10, %11 : f32 - %13 = affine.load %arg2[%arg3, %arg4] : memref<64x64xf32> - %14 = arith.addf %13, %12 : f32 - affine.store %14, %arg2[%arg3, %arg4] : memref<64x64xf32> - %15 = affine.load %arg0[%arg3, %arg5 + 3] : memref<64x64xf32> - %16 = 
affine.load %arg1[%arg5 + 3, %arg4] : memref<64x64xf32> - %17 = arith.mulf %15, %16 : f32 - %18 = affine.load %arg2[%arg3, %arg4] : memref<64x64xf32> - %19 = arith.addf %18, %17 : f32 - affine.store %19, %arg2[%arg3, %arg4] : memref<64x64xf32> - } - } - } - return - } -} - - -// CHECK: #map = affine_map<(d0, d1) -> (0)> -// CHECK: #map1 = affine_map<(d0) -> (d0 + 1)> -// CHECK: #map2 = affine_map<(d0) -> (d0 + 2)> -// CHECK: #map3 = affine_map<(d0) -> (d0 + 3)> -// CHECK: module { -// CHECK: func.func @matmul(%[[VAL_0:.*]]: memref<64x64xf32>, %[[VAL_1:.*]]: memref<64x64xf32>, %[[VAL_2:.*]]: memref<64x64xf32>) { -// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: affine.for %[[VAL_4:.*]] = 0 to 64 { -// CHECK: affine.for %[[VAL_5:.*]] = 0 to 64 step 8 { -// CHECK: affine.for %[[VAL_6:.*]] = 0 to 64 step 4 { -// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_4]], %[[VAL_6]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_5]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_9:.*]] = arith.mulf %[[VAL_7]], %[[VAL_8]] : vector<8xf32> -// CHECK: %[[VAL_10:.*]] = vector.transfer_read %[[VAL_2]]{{\[}}%[[VAL_4]], %[[VAL_5]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_11:.*]] = arith.addf %[[VAL_10]], %[[VAL_9]] : vector<8xf32> -// CHECK: %[[VAL_12:.*]] = affine.apply #map1(%[[VAL_6]]) -// CHECK: %[[VAL_13:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_4]], %[[VAL_12]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_14:.*]] = affine.apply #map1(%[[VAL_6]]) -// CHECK: %[[VAL_15:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_14]], %[[VAL_5]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], 
%[[VAL_15]] : vector<8xf32> -// CHECK: %[[VAL_17:.*]] = arith.addf %[[VAL_11]], %[[VAL_16]] : vector<8xf32> -// CHECK: %[[VAL_18:.*]] = affine.apply #map2(%[[VAL_6]]) -// CHECK: %[[VAL_19:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_4]], %[[VAL_18]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_20:.*]] = affine.apply #map2(%[[VAL_6]]) -// CHECK: %[[VAL_21:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_20]], %[[VAL_5]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_22:.*]] = arith.mulf %[[VAL_19]], %[[VAL_21]] : vector<8xf32> -// CHECK: %[[VAL_23:.*]] = arith.addf %[[VAL_17]], %[[VAL_22]] : vector<8xf32> -// CHECK: %[[VAL_24:.*]] = affine.apply #map3(%[[VAL_6]]) -// CHECK: %[[VAL_25:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_4]], %[[VAL_24]]], %[[VAL_3]] {in_bounds = [true], permutation_map = #map} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_26:.*]] = affine.apply #map3(%[[VAL_6]]) -// CHECK: %[[VAL_27:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_26]], %[[VAL_5]]], %[[VAL_3]] {in_bounds = [true]} : memref<64x64xf32>, vector<8xf32> -// CHECK: %[[VAL_28:.*]] = arith.mulf %[[VAL_25]], %[[VAL_27]] : vector<8xf32> -// CHECK: %[[VAL_29:.*]] = arith.addf %[[VAL_23]], %[[VAL_28]] : vector<8xf32> -// CHECK: vector.transfer_write %[[VAL_29]], %[[VAL_2]]{{\[}}%[[VAL_4]], %[[VAL_5]]] {in_bounds = [true]} : vector<8xf32>, memref<64x64xf32> -// CHECK: } -// CHECK: } -// CHECK: } -// CHECK: return -// CHECK: } -// CHECK: } \ No newline at end of file diff --git a/test/aievec/linalg_conv2d_f32.mlir b/test/aievec/linalg_conv2d_f32.mlir deleted file mode 100644 index 9c508ed8f7..0000000000 --- a/test/aievec/linalg_conv2d_f32.mlir +++ /dev/null @@ -1,91 +0,0 @@ -// RUN: aie-opt %s --convert-linalg-to-affine-loops --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicalize -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize 
-unaligned-loads-check=false -split-input-file | FileCheck %s - -//The affine dialect code is generated from the following linalg operator, with the maps inlined. -//CHECK-LABEL: func.func @conv_2d(%arg0: memref<10x3x256x256xf32>, %arg1: memref<10x3x3x3xf32>, %arg2: memref<10x10x254x254xf32>) { -func.func @conv_2d(%input: memref<10x3x256x256xf32>, %filter: memref<10x3x3x3xf32>, %output: memref<10x10x254x254xf32>) { - linalg.conv_2d_nchw_fchw{dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} - ins (%input, %filter: memref<10x3x256x256xf32>, memref<10x3x3x3xf32>) - outs (%output: memref<10x10x254x254xf32>) - return -} - -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c10 = arith.constant 10 : index -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c10 step %c1_1 { -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c10_3 = arith.constant 10 : index -//CHECK-NEXT: %c1_4 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c10_3 step %c1_4 { -//CHECK-NEXT: %0 = aievec.upd %arg1[%arg4, %c0, %c0, %c0] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %1 = aievec.upd %arg1[%arg4, %c0, %c2, %c2] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %2 = aievec.upd %arg1[%arg4, %c1, %c2, %c1] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %3 = aievec.upd %arg1[%arg4, %c2, %c2, %c0] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %c0_5 = arith.constant 0 : index -//CHECK-NEXT: %c254 = arith.constant 254 : index -//CHECK-NEXT: %c1_6 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg5 = %c0_5 to %c254 step %c1_6 { -//CHECK-NEXT: %c1_7 = arith.constant 1 : index -//CHECK-NEXT: %4 = 
arith.addi %arg5, %c1_7 : index -//CHECK-NEXT: %c2_8 = arith.constant 2 : index -//CHECK-NEXT: %5 = arith.addi %arg5, %c2_8 : index -//CHECK-NEXT: %c0_9 = arith.constant 0 : index -//CHECK-NEXT: %c254_10 = arith.constant 254 : index -//CHECK-NEXT: %c8 = arith.constant 8 : index -//CHECK-NEXT: scf.for %arg6 = %c0_9 to %c254_10 step %c8 { -//CHECK-NEXT: %6 = aievec.upd %arg0[%arg3, %c0, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %7 = aievec.upd %arg2[%arg3, %arg4, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x10x254x254xf32>, vector<8xf32> -//CHECK-NEXT: %8 = aievec.mac %6, %0, %7 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %c1_11 = arith.constant 1 : index -//CHECK-NEXT: %9 = arith.addi %arg6, %c1_11 : index -//CHECK-NEXT: %10 = aievec.upd %arg0[%arg3, %c0, %arg5, %9], %6 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %11 = aievec.mac %10, %0, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %12 = aievec.mac %10, %0, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %13 = aievec.upd %arg0[%arg3, %c0, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %14 = aievec.mac %13, %0, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %15 = aievec.upd %arg0[%arg3, %c0, %4, %9], %13 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %16 = aievec.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, 
vector<8xf32> -//CHECK-NEXT: %17 = aievec.mac %15, %0, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %18 = aievec.upd %arg0[%arg3, %c0, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %19 = aievec.mac %18, %0, %17 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %20 = aievec.upd %arg0[%arg3, %c0, %5, %9], %18 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %21 = aievec.mac %20, %0, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %22 = aievec.mac %20, %1, %21 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %23 = aievec.upd %arg0[%arg3, %c1, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %24 = aievec.mac %23, %1, %22 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %25 = aievec.upd %arg0[%arg3, %c1, %arg5, %9], %23 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %26 = aievec.mac %25, %1, %24 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %27 = aievec.mac %25, %1, %26 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %28 = aievec.upd %arg0[%arg3, %c1, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %29 = aievec.mac %28, %1, %27 {xoffsets = "0x76543210", xstart = 
"0", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %30 = aievec.upd %arg0[%arg3, %c1, %4, %9], %28 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %31 = aievec.mac %30, %1, %29 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %32 = aievec.mac %30, %1, %31 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %33 = aievec.upd %arg0[%arg3, %c1, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %34 = aievec.mac %33, %1, %32 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %35 = aievec.upd %arg0[%arg3, %c1, %5, %9], %33 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %36 = aievec.mac %35, %2, %34 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %37 = aievec.mac %35, %2, %36 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %38 = aievec.upd %arg0[%arg3, %c2, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %39 = aievec.mac %38, %2, %37 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %40 = aievec.upd %arg0[%arg3, %c2, %arg5, %9], %38 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %41 = aievec.mac %40, %2, %39 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, 
vector<8xf32> -//CHECK-NEXT: %42 = aievec.mac %40, %2, %41 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %43 = aievec.upd %arg0[%arg3, %c2, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %44 = aievec.mac %43, %2, %42 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %45 = aievec.upd %arg0[%arg3, %c2, %4, %9], %43 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %46 = aievec.mac %45, %2, %44 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %47 = aievec.mac %45, %2, %46 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %48 = aievec.upd %arg0[%arg3, %c2, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %49 = aievec.mac %48, %3, %47 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %50 = aievec.upd %arg0[%arg3, %c2, %5, %9], %48 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %51 = aievec.mac %50, %3, %49 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %52 = aievec.mac %50, %3, %51 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: vector.transfer_write %52, %arg2[%arg3, %arg4, %arg5, %arg6] {in_bounds = [true]} : vector<8xf32>, memref<10x10x254x254xf32> -//CHECK-NEXT: } -//CHECK-NEXT: } -//CHECK-NEXT: } -//CHECK-NEXT: } diff --git 
a/test/aievec/mul_mul_f32.mlir b/test/aievec/mul_mul_f32.mlir index 5c51793d1b..38c5ec9e9f 100644 --- a/test/aievec/mul_mul_f32.mlir +++ b/test/aievec/mul_mul_f32.mlir @@ -5,7 +5,7 @@ func @mul_mul (%A: memref<2048xf32>, %B: memref<2048xf32>, %C: memref<2048xf32>, // CHECK-LABEL: func @mul_mul // CHECK: %2 = aievec.upd %arg0[%arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048xf32>, vector<8xf32> // CHECK: %3 = aievec.upd %arg1[%arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048xf32>, vector<8xf32> -// CHECK: %4 = aievec.mul %1, %2 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32> +// CHECK: %4 = aievec_aie1.mul %1, %2 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32> // CHECK: %5 = aievec.srs %4 {shift = 10 : i8} : !aievec.acc<8xf32>, vector<8xf32> // CHECK: %6 = arith.mulf %5, %3 : vector<8xf32> // CHECK: vector.transfer_write %6, %arg2[%arg4] {in_bounds = [true]} : vector<8xf32>, memref<2048xf32> diff --git a/test/aievec/pointwise_mult_f32.mlir b/test/aievec/pointwise_mult_f32.mlir index fb6cd07955..3bf648e49f 100644 --- a/test/aievec/pointwise_mult_f32.mlir +++ b/test/aievec/pointwise_mult_f32.mlir @@ -6,7 +6,7 @@ func.func @pointwise_mult (%A: memref<2048xf32>, %B: memref<2048xf32>, %C: memre %a = affine.load %A[%arg0] : memref<2048xf32> %b = affine.load %B[%arg0] : memref<2048xf32> //CHECK: %2 = aievec.concat %0, %0 : vector<8xf32>, vector<16xf32> - //CHECK: %3 = aievec.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + //CHECK: %3 = aievec_aie1.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> %c = arith.mulf %a, %b : f32 affine.store %c, %C[%arg0] : memref<2048xf32> } diff --git a/test/aievec/pointwise_mult_i16.mlir 
b/test/aievec/pointwise_mult_i16.mlir index 75044af989..9f51b82eee 100644 --- a/test/aievec/pointwise_mult_i16.mlir +++ b/test/aievec/pointwise_mult_i16.mlir @@ -5,7 +5,7 @@ func.func @pointwise_mult (%A: memref<2048xi16>, %B: memref<2048xi16>, %C: memre affine.for %arg0 = 0 to 2048 { %a = affine.load %A[%arg0] : memref<2048xi16> %b = affine.load %B[%arg0] : memref<2048xi16> - //CHECK: %2 = aievec.mul %0, %1 : vector<16xi16>, vector<16xi16>, vector<16xi48> + //CHECK: %2 = aievec_aie1.mul %0, %1 : vector<16xi16>, vector<16xi16>, vector<16xi48> %c = arith.muli %a, %b : i16 affine.store %c, %C[%arg0] : memref<2048xi16> } diff --git a/test/aievec/test_add_i16.mlir b/test/aievec/test_add_i16.mlir index 1e1f7ffd01..5647afb388 100644 --- a/test/aievec/test_add_i16.mlir +++ b/test/aievec/test_add_i16.mlir @@ -5,7 +5,7 @@ func.func @conv2d (%A: memref<256xi16>, %B: memref<1xi16>, %C: memref<256xi16>) affine.for %arg0 = 0 to 256 { %a1 = affine.load %A[%arg0] : memref<256xi16> %b1 = affine.load %B[0] : memref<1xi16> - //CHECK: %4 = aievec.add %3, %1 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x3210", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zsquare = "0", zstart = "0"} : vector<32xi16>, vector<32xi16>, vector<32xi16> + //CHECK: %4 = aievec_aie1.add %3, %1 {xoffsets = "0x03020100", xoffsets_hi = "0x07060504", xsquare = "0x3210", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zsquare = "0", zstart = "0"} : vector<32xi16>, vector<32xi16>, vector<32xi16> %d1 = arith.addi %a1, %b1 : i16 affine.store %d1, %C[%arg0] : memref<256xi16> } diff --git a/test/aievec/test_linalg_conv2d.mlir b/test/aievec/test_linalg_conv2d.mlir deleted file mode 100644 index 873ebecf85..0000000000 --- a/test/aievec/test_linalg_conv2d.mlir +++ /dev/null @@ -1,260 +0,0 @@ -// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=8" --aie-vectorize -unaligned-loads-check=false -split-input-file | FileCheck %s - -//CHECK-LABEL: func.func @conv_2d(%arg0: 
memref<10x3x256x256xf32>, %arg1: memref<10x3x3x3xf32>, %arg2: memref<10x10x254x254xf32>) { -func.func @conv_2d(%arg0: memref<10x3x256x256xf32>, %arg1: memref<10x3x3x3xf32>, %arg2: memref<10x10x254x254xf32>) { - %c0 = arith.constant 0 : index - %c0_0 = arith.constant 0 : index - %c0_1 = arith.constant 0 : index - affine.for %arg3 = 0 to 10 { - affine.for %arg4 = 0 to 10 { - affine.for %arg5 = 0 to 254 { - affine.for %arg6 = 0 to 254 { - %2 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0, %arg6+%c0_1] : memref<10x3x256x256xf32> - %3 = affine.load %arg1[%arg4, %c0, %c0_0, %c0_1] : memref<10x3x3x3xf32> - %4 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %5 = arith.mulf %2, %3 : f32 - %6 = arith.addf %4, %5 : f32 - affine.store %6, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %10 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %11 = affine.load %arg1[%arg4, %c0, %c0_0, %c0_1+1] : memref<10x3x3x3xf32> - %12 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %13 = arith.mulf %10, %11 : f32 - %14 = arith.addf %12, %13 : f32 - affine.store %14, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %18 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %19 = affine.load %arg1[%arg4, %c0, %c0_0, %c0_1+2] : memref<10x3x3x3xf32> - %20 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %21 = arith.mulf %18, %19 : f32 - %22 = arith.addf %20, %21 : f32 - affine.store %22, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %26 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+1, %arg6+%c0_1] : memref<10x3x256x256xf32> - %27 = affine.load %arg1[%arg4, %c0, %c0_0+1, %c0_1] : memref<10x3x3x3xf32> - %28 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %29 = arith.mulf %26, %27 : f32 - %30 = arith.addf %28, %29 : f32 - affine.store %30, %arg2[%arg3, %arg4, %arg5, 
%arg6] : memref<10x10x254x254xf32> - %34 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+1, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %35 = affine.load %arg1[%arg4, %c0, %c0_0+1, %c0_1+1] : memref<10x3x3x3xf32> - %36 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %37 = arith.mulf %34, %35 : f32 - %38 = arith.addf %36, %37 : f32 - affine.store %38, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %42 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+1, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %43 = affine.load %arg1[%arg4, %c0, %c0_0+1, %c0_1+2] : memref<10x3x3x3xf32> - %44 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %45 = arith.mulf %42, %43 : f32 - %46 = arith.addf %44, %45 : f32 - affine.store %46, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %50 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+2, %arg6+%c0_1] : memref<10x3x256x256xf32> - %51 = affine.load %arg1[%arg4, %c0, %c0_0+2, %c0_1] : memref<10x3x3x3xf32> - %52 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %53 = arith.mulf %50, %51 : f32 - %54 = arith.addf %52, %53 : f32 - affine.store %54, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %58 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+2, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %59 = affine.load %arg1[%arg4, %c0, %c0_0+2, %c0_1+1] : memref<10x3x3x3xf32> - %60 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %61 = arith.mulf %58, %59 : f32 - %62 = arith.addf %60, %61 : f32 - affine.store %62, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %66 = affine.load %arg0[%arg3, %c0, %arg5+%c0_0+2, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %67 = affine.load %arg1[%arg4, %c0, %c0_0+2, %c0_1+2] : memref<10x3x3x3xf32> - %68 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %69 = arith.mulf %66, %67 : f32 - %70 = arith.addf %68, %69 : f32 - affine.store %70, 
%arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %74 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0, %arg6+%c0_1] : memref<10x3x256x256xf32> - %75 = affine.load %arg1[%arg4, %c0+1, %c0_0, %c0_1] : memref<10x3x3x3xf32> - %76 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %77 = arith.mulf %74, %75 : f32 - %78 = arith.addf %76, %77 : f32 - affine.store %78, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %82 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %83 = affine.load %arg1[%arg4, %c0+1, %c0_0, %c0_1+1] : memref<10x3x3x3xf32> - %84 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %85 = arith.mulf %82, %83 : f32 - %86 = arith.addf %84, %85 : f32 - affine.store %86, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %90 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %91 = affine.load %arg1[%arg4, %c0+1, %c0_0, %c0_1+2] : memref<10x3x3x3xf32> - %92 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %93 = arith.mulf %90, %91 : f32 - %94 = arith.addf %92, %93 : f32 - affine.store %94, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %98 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+1, %arg6+%c0_1] : memref<10x3x256x256xf32> - %99 = affine.load %arg1[%arg4, %c0+1, %c0_0+1, %c0_1] : memref<10x3x3x3xf32> - %100 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %101 = arith.mulf %98, %99 : f32 - %102 = arith.addf %100, %101 : f32 - affine.store %102, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %106 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+1, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %107 = affine.load %arg1[%arg4, %c0+1, %c0_0+1, %c0_1+1] : memref<10x3x3x3xf32> - %108 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %109 = arith.mulf %106, %107 : f32 - %110 = 
arith.addf %108, %109 : f32 - affine.store %110, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %114 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+1, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %115 = affine.load %arg1[%arg4, %c0+1, %c0_0+1, %c0_1+2] : memref<10x3x3x3xf32> - %116 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %117 = arith.mulf %114, %115 : f32 - %118 = arith.addf %116, %117 : f32 - affine.store %118, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %122 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+2, %arg6+%c0_1] : memref<10x3x256x256xf32> - %123 = affine.load %arg1[%arg4, %c0+1, %c0_0+2, %c0_1] : memref<10x3x3x3xf32> - %124 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %125 = arith.mulf %122, %123 : f32 - %126 = arith.addf %124, %125 : f32 - affine.store %126, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %130 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+2, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %131 = affine.load %arg1[%arg4, %c0+1, %c0_0+2, %c0_1+1] : memref<10x3x3x3xf32> - %132 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %133 = arith.mulf %130, %131 : f32 - %134 = arith.addf %132, %133 : f32 - affine.store %134, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %138 = affine.load %arg0[%arg3, %c0+1, %arg5+%c0_0+2, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %139 = affine.load %arg1[%arg4, %c0+1, %c0_0+2, %c0_1+2] : memref<10x3x3x3xf32> - %140 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %141 = arith.mulf %138, %139 : f32 - %142 = arith.addf %140, %141 : f32 - affine.store %142, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %146 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0, %arg6+%c0_1] : memref<10x3x256x256xf32> - %147 = affine.load %arg1[%arg4, %c0+2, %c0_0, %c0_1] : memref<10x3x3x3xf32> - %148 = affine.load %arg2[%arg3, %arg4, 
%arg5, %arg6] : memref<10x10x254x254xf32> - %149 = arith.mulf %146, %147 : f32 - %150 = arith.addf %148, %149 : f32 - affine.store %150, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %154 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %155 = affine.load %arg1[%arg4, %c0+2, %c0_0, %c0_1+1] : memref<10x3x3x3xf32> - %156 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %157 = arith.mulf %154, %155 : f32 - %158 = arith.addf %156, %157 : f32 - affine.store %158, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %162 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %163 = affine.load %arg1[%arg4, %c0+2, %c0_0, %c0_1+2] : memref<10x3x3x3xf32> - %164 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %165 = arith.mulf %162, %163 : f32 - %166 = arith.addf %164, %165 : f32 - affine.store %166, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %170 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+1, %arg6+%c0_1] : memref<10x3x256x256xf32> - %171 = affine.load %arg1[%arg4, %c0+2, %c0_0+1, %c0_1] : memref<10x3x3x3xf32> - %172 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %173 = arith.mulf %170, %171 : f32 - %174 = arith.addf %172, %173 : f32 - affine.store %174, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %178 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+1, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %179 = affine.load %arg1[%arg4, %c0+2, %c0_0+1, %c0_1+1] : memref<10x3x3x3xf32> - %180 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %181 = arith.mulf %178, %179 : f32 - %182 = arith.addf %180, %181 : f32 - affine.store %182, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %186 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+1, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %187 = affine.load %arg1[%arg4, 
%c0+2, %c0_0+1, %c0_1+2] : memref<10x3x3x3xf32> - %188 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %189 = arith.mulf %186, %187 : f32 - %190 = arith.addf %188, %189 : f32 - affine.store %190, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %194 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+2, %arg6+%c0_1] : memref<10x3x256x256xf32> - %195 = affine.load %arg1[%arg4, %c0+2, %c0_0+2, %c0_1] : memref<10x3x3x3xf32> - %196 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %197 = arith.mulf %194, %195 : f32 - %198 = arith.addf %196, %197 : f32 - affine.store %198, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %202 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+2, %arg6+%c0_1+1] : memref<10x3x256x256xf32> - %203 = affine.load %arg1[%arg4, %c0+2, %c0_0+2, %c0_1+1] : memref<10x3x3x3xf32> - %204 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %205 = arith.mulf %202, %203 : f32 - %206 = arith.addf %204, %205 : f32 - affine.store %206, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %210 = affine.load %arg0[%arg3, %c0+2, %arg5+%c0_0+2, %arg6+%c0_1+2] : memref<10x3x256x256xf32> - %211 = affine.load %arg1[%arg4, %c0+2, %c0_0+2, %c0_1+2] : memref<10x3x3x3xf32> - %212 = affine.load %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - %213 = arith.mulf %210, %211 : f32 - %214 = arith.addf %212, %213 : f32 - affine.store %214, %arg2[%arg3, %arg4, %arg5, %arg6] : memref<10x10x254x254xf32> - } - } - } - } - return -} - -//CHECK-NEXT: %c2 = arith.constant 2 : index -//CHECK-NEXT: %c1 = arith.constant 1 : index -//CHECK-NEXT: %c0 = arith.constant 0 : index -//CHECK-NEXT: %c0_0 = arith.constant 0 : index -//CHECK-NEXT: %c10 = arith.constant 10 : index -//CHECK-NEXT: %c1_1 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg3 = %c0_0 to %c10 step %c1_1 { -//CHECK-NEXT: %c0_2 = arith.constant 0 : index -//CHECK-NEXT: %c10_3 = 
arith.constant 10 : index -//CHECK-NEXT: %c1_4 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg4 = %c0_2 to %c10_3 step %c1_4 { -//CHECK-NEXT: %0 = aievec.upd %arg1[%arg4, %c0, %c0, %c0] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %1 = aievec.upd %arg1[%arg4, %c0, %c2, %c2] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %2 = aievec.upd %arg1[%arg4, %c1, %c2, %c1] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %3 = aievec.upd %arg1[%arg4, %c2, %c2, %c0] {index = 0 : i8, offset = 0 : i32} : memref<10x3x3x3xf32>, vector<8xf32> -//CHECK-NEXT: %c0_5 = arith.constant 0 : index -//CHECK-NEXT: %c254 = arith.constant 254 : index -//CHECK-NEXT: %c1_6 = arith.constant 1 : index -//CHECK-NEXT: scf.for %arg5 = %c0_5 to %c254 step %c1_6 { -//CHECK-NEXT: %c1_7 = arith.constant 1 : index -//CHECK-NEXT: %4 = arith.addi %arg5, %c1_7 : index -//CHECK-NEXT: %c2_8 = arith.constant 2 : index -//CHECK-NEXT: %5 = arith.addi %arg5, %c2_8 : index -//CHECK-NEXT: %c0_9 = arith.constant 0 : index -//CHECK-NEXT: %c254_10 = arith.constant 254 : index -//CHECK-NEXT: %c8 = arith.constant 8 : index -//CHECK-NEXT: scf.for %arg6 = %c0_9 to %c254_10 step %c8 { -//CHECK-NEXT: %6 = aievec.upd %arg0[%arg3, %c0, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %7 = aievec.upd %arg2[%arg3, %arg4, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x10x254x254xf32>, vector<8xf32> -//CHECK-NEXT: %8 = aievec.mac %6, %0, %7 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %c1_11 = arith.constant 1 : index -//CHECK-NEXT: %9 = arith.addi %arg6, %c1_11 : index -//CHECK-NEXT: %10 = aievec.upd %arg0[%arg3, %c0, %arg5, %9], %6 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %11 = 
aievec.mac %10, %0, %8 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %12 = aievec.mac %10, %0, %11 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %13 = aievec.upd %arg0[%arg3, %c0, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %14 = aievec.mac %13, %0, %12 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %15 = aievec.upd %arg0[%arg3, %c0, %4, %9], %13 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %16 = aievec.mac %15, %0, %14 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %17 = aievec.mac %15, %0, %16 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %18 = aievec.upd %arg0[%arg3, %c0, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %19 = aievec.mac %18, %0, %17 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %20 = aievec.upd %arg0[%arg3, %c0, %5, %9], %18 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %21 = aievec.mac %20, %0, %19 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %22 = aievec.mac %20, %1, %21 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %23 = aievec.upd %arg0[%arg3, %c1, %arg5, %arg6] {index = 0 : i8, offset = 
0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %24 = aievec.mac %23, %1, %22 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %25 = aievec.upd %arg0[%arg3, %c1, %arg5, %9], %23 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %26 = aievec.mac %25, %1, %24 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %27 = aievec.mac %25, %1, %26 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %28 = aievec.upd %arg0[%arg3, %c1, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %29 = aievec.mac %28, %1, %27 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %30 = aievec.upd %arg0[%arg3, %c1, %4, %9], %28 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %31 = aievec.mac %30, %1, %29 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %32 = aievec.mac %30, %1, %31 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %33 = aievec.upd %arg0[%arg3, %c1, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %34 = aievec.mac %33, %1, %32 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %35 = aievec.upd %arg0[%arg3, %c1, %5, %9], %33 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %36 = aievec.mac %35, %2, 
%34 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %37 = aievec.mac %35, %2, %36 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %38 = aievec.upd %arg0[%arg3, %c2, %arg5, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %39 = aievec.mac %38, %2, %37 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %40 = aievec.upd %arg0[%arg3, %c2, %arg5, %9], %38 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %41 = aievec.mac %40, %2, %39 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "3"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %42 = aievec.mac %40, %2, %41 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "4"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %43 = aievec.upd %arg0[%arg3, %c2, %4, %arg6] {index = 0 : i8, offset = 0 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %44 = aievec.mac %43, %2, %42 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "5"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %45 = aievec.upd %arg0[%arg3, %c2, %4, %9], %43 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %46 = aievec.mac %45, %2, %44 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "6"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %47 = aievec.mac %45, %2, %46 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "7"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %48 = aievec.upd %arg0[%arg3, %c2, %5, %arg6] {index = 0 : i8, offset = 0 : i32} : 
memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %49 = aievec.mac %48, %3, %47 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %50 = aievec.upd %arg0[%arg3, %c2, %5, %9], %48 {index = 1 : i8, offset = 224 : i32} : memref<10x3x256x256xf32>, vector<16xf32> -//CHECK-NEXT: %51 = aievec.mac %50, %3, %49 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: %52 = aievec.mac %50, %3, %51 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xf32>, vector<8xf32>, vector<8xf32> -//CHECK-NEXT: vector.transfer_write %52, %arg2[%arg3, %arg4, %arg5, %arg6] {in_bounds = [true]} : vector<8xf32>, memref<10x10x254x254xf32> -//CHECK-NEXT: } -//CHECK-NEXT: } -//CHECK-NEXT: } -//CHECK-NEXT: } diff --git a/test/aievec/test_reassoc.mlir b/test/aievec/test_reassoc.mlir index aaea7b8dab..a2a2c64ea7 100644 --- a/test/aievec/test_reassoc.mlir +++ b/test/aievec/test_reassoc.mlir @@ -91,21 +91,21 @@ func.func @conv2d (%A: memref<2048x2048xi32>, %B: memref<9xi32>, %C: memref<2046 //CHECK-NEXT: %4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2046x2046xi32>, vector<8xi32> //CHECK-NEXT: %5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> //CHECK-NEXT: %6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %c1_5 = arith.constant 1 : index //CHECK-NEXT: %8 = arith.addi %arg4, %c1_5 : index //CHECK-NEXT: %9 = aievec.upd 
%arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets 
= "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : i32} : memref<2048x2048xi32>, vector<16xi32> -//CHECK-NEXT: %20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> -//CHECK-NEXT: %21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %20 = aievec_aie1.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, vector<8xi80> +//CHECK-NEXT: %21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> //CHECK-NEXT: %22 = aievec.srs %21, %c0_i32 : vector<8xi80>, i32, vector<8xi32> -//CHECK-NEXT: vector.transfer_write %22, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32> +//CHECK-NEXT: vector.transfer_write %22, %arg2[%arg3, %arg4] : vector<8xi32>, memref<2046x2046xi32> diff --git a/test/aievec/test_reassoc_add.mlir b/test/aievec/test_reassoc_add.mlir index f970e3310d..f771a008e3 100644 --- a/test/aievec/test_reassoc_add.mlir +++ b/test/aievec/test_reassoc_add.mlir @@ -7,7 +7,7 @@ func.func @conv2d (%A: memref<256xi32>, %B: memref<1xi32>, %C: memref<256xi32>) %b1 = affine.load %B[0] : 
memref<1xi32> //CHECK: %2 = aievec.ups %1 {shift = 0 : i8} : vector<8xi32>, vector<8xi80> //CHECK: %3 = aievec.concat %1, %1 : vector<8xi32>, vector<16xi32> - //CHECK: %4 = aievec.mac %3, %0, %2 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + //CHECK: %4 = aievec_aie1.mac %3, %0, %2 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %c1 = arith.muli %b1, %a1 : i32 %d1 = arith.addi %c1, %a1 : i32 affine.store %d1, %C[%arg0] : memref<256xi32> diff --git a/test/aievec/test_reassoc_mult.mlir b/test/aievec/test_reassoc_mult.mlir index a51583aa41..c932556ad6 100644 --- a/test/aievec/test_reassoc_mult.mlir +++ b/test/aievec/test_reassoc_mult.mlir @@ -6,7 +6,7 @@ func.func @conv2d (%A: memref<256xi32>, %B: memref<1xi32>, %C: memref<256xi32>) %a1 = affine.load %A[%arg0] : memref<256xi32> %b1 = affine.load %B[0] : memref<1xi32> //CHECK: %2 = aievec.concat %1, %1 : vector<8xi32>, vector<16xi32> - //CHECK: %3 = aievec.mul %2, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + //CHECK: %3 = aievec_aie1.mul %2, %0 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %p1 = arith.muli %b1, %a1 : i32 affine.store %p1, %C[%arg0] : memref<256xi32> } diff --git a/test/aievec/test_srs.mlir b/test/aievec/test_srs.mlir index 87fccd5f7c..2124270eba 100644 --- a/test/aievec/test_srs.mlir +++ b/test/aievec/test_srs.mlir @@ -17,13 +17,13 @@ func.func @conv2d (%A: memref<128xi32>, %B: memref<8xi32>, %C: memref<126xi32>) %a = affine.load %A[%arg3] : memref<128xi32> %b = affine.load %B[0] : memref<8xi32> //CHECK-NEXT: %4 = aievec.concat %2, %2 : vector<8xi32>, vector<16xi32> - //CHECK-NEXT: %5 = aievec.mac %4, %0, %3 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = 
"0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> + //CHECK-NEXT: %5 = aievec_aie1.mac %4, %0, %3 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, vector<8xi80> %p = arith.muli %a, %b : i32 %co = arith.addi %ci, %p : i32 //CHECK-NEXT: %6 = aievec.srs %5, %c0_i32 : vector<8xi80>, i32, vector<8xi32> - //CHECK-NEXT: %7 = aievec.add %6, %6 : vector<8xi32>, vector<8xi32>, vector<8xi32> + //CHECK-NEXT: %7 = aievec_aie1.add %6, %6 : vector<8xi32>, vector<8xi32>, vector<8xi32> %co1 = arith.addi %co, %co : i32 - //CHECK-NEXT: vector.transfer_write %7, %arg2[%arg3] {in_bounds = [true]} : vector<8xi32>, memref<126xi32> + //CHECK-NEXT: vector.transfer_write %7, %arg2[%arg3] : vector<8xi32>, memref<126xi32> affine.store %co1, %C[%arg3] : memref<126xi32> } return diff --git a/test/create-flows/memtile_routing_constraints.mlir b/test/create-flows/memtile_routing_constraints.mlir index 9806089ff2..12f50b74ca 100644 --- a/test/create-flows/memtile_routing_constraints.mlir +++ b/test/create-flows/memtile_routing_constraints.mlir @@ -15,30 +15,18 @@ // CHECK: %[[T22:.*]] = aie.tile(2, 2) // CHECK: %[[T21:.*]] = aie.tile(2, 1) // CHECK: %[[T20:.*]] = aie.tile(2, 0) -// CHECK: aie.switchbox(%[[T21]]) { -// CHECK: aie.connect -// CHECK: aie.connect -// CHECK: } -// CHECK: aie.switchbox(%[[T22]]) { -// CHECK: aie.connect -// CHECK: aie.connect -// CHECK: } -// CHECK: aie.switchbox(%[[T20]]) { -// CHECK: aie.connect -// CHECK: } -// CHECK: aie.switchbox(%[[T23]]) { -// CHECK: aie.connect -// CHECK: } +// CHECK: aie.flow(%[[T23]], DMA : 0, %[[T20]], DMA : 0) +// CHECK: aie.flow(%[[T22]], DMA : 0, %[[T21]], DMA : 0) module { aie.device(xcve2802) { - %t04 = aie.tile(2, 4) - %t03 = aie.tile(2, 3) - %t02 = aie.tile(2, 2) - %t01 = aie.tile(2, 1) - %t00 = aie.tile(2, 0) + %t24 = aie.tile(2, 4) + %t23 = aie.tile(2, 3) + %t22 = aie.tile(2, 2) + %t21 = aie.tile(2, 1) + %t20 = aie.tile(2, 0) - aie.flow(%t02, DMA : 0, %t01, DMA : 0) - 
aie.flow(%t03, DMA : 0, %t00, DMA : 0) + aie.flow(%t22, DMA : 0, %t21, DMA : 0) + aie.flow(%t23, DMA : 0, %t20, DMA : 0) } } diff --git a/test/create-flows/unit_broadcast.mlir b/test/create-flows/unit_broadcast.mlir index cf580b234f..45ff1526d5 100644 --- a/test/create-flows/unit_broadcast.mlir +++ b/test/create-flows/unit_broadcast.mlir @@ -247,4 +247,3 @@ module { aie.flow(%t60, DMA : 0, %t31, DMA : 1) } } - diff --git a/test/create-flows/unit_more_flows_shim.mlir b/test/create-flows/unit_more_flows_shim.mlir index 3e25b09a70..eb7480a20a 100644 --- a/test/create-flows/unit_more_flows_shim.mlir +++ b/test/create-flows/unit_more_flows_shim.mlir @@ -12,7 +12,7 @@ // These tests verify pathfinder routing flows to/from PLIO in shim tiles. // -// RUN: aie-opt --split-input-file --aie-create-pathfinder-flows -split-input-file %s | FileCheck %s +// RUN: aie-opt --aie-create-pathfinder-flows -split-input-file %s | FileCheck %s // CHECK-LABEL: test70 // CHECK: %[[T70:.*]] = aie.tile(7, 0) diff --git a/test/dialect/AIE/memtiledma.mlir b/test/dialect/AIE/memtiledma.mlir index d740b09ee6..8e95cbaae1 100644 --- a/test/dialect/AIE/memtiledma.mlir +++ b/test/dialect/AIE/memtiledma.mlir @@ -65,13 +65,13 @@ aie.device(xcve2802) { aie.dma_bd(%buf : memref<256xi32>, 0, 256) aie.next_bd ^bd14 ^bd14: - aie.dma_bd(%buf : memref<256xi32>, 0, 256) + aie.dma_bd(%buf : memref<256xi32>, 0, 256, []) aie.next_bd ^bd15 ^bd15: - aie.dma_bd(%buf : memref<256xi32>, 0, 256) + aie.dma_bd(%buf : memref<256xi32>, 0, 256, [], []) aie.next_bd ^bd16 ^bd16: - aie.dma_bd(%buf : memref<256xi32>, 0, 256) + aie.dma_bd(%buf : memref<256xi32>, 0, 256, [], [], pad_value = 0) aie.end } } diff --git a/test/dialect/AIE/nd-dma-bad-pad.mlir b/test/dialect/AIE/nd-dma-bad-pad.mlir new file mode 100644 index 0000000000..384040ce86 --- /dev/null +++ b/test/dialect/AIE/nd-dma-bad-pad.mlir @@ -0,0 +1,30 @@ +//===- nd-dma-bad-pad.mlir -------------------------------------*- MLIR -*-===// +// +// This file is licensed under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --verify-diagnostics %s + +// CHECK-LABEL: module { +// CHECK: } + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xi8> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Inner-most padding-before count must result in padding in 32-bit words.}} + aie.dma_bd(%buf : memref<256xi8>, 0, 8, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +} diff --git a/test/dialect/AIE/nd-dma-pad-exceeds-len.mlir b/test/dialect/AIE/nd-dma-pad-exceeds-len.mlir new file mode 100644 index 0000000000..56886b2d14 --- /dev/null +++ b/test/dialect/AIE/nd-dma-pad-exceeds-len.mlir @@ -0,0 +1,30 @@ +//===- nd-dma-pad-exceeds-len.mlir -----------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --verify-diagnostics %s + +// CHECK-LABEL: module { +// CHECK: } + +module { + aie.device(xcve2802) { + %t1 = aie.tile(1, 1) + %buf = aie.buffer(%t1) : memref<256xi32> + %mem = aie.memtile_dma(%t1) { + aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Data exceeds len after padding.}} + aie.dma_bd(%buf : memref<256xi32>, 0, 4, [], [], pad_value = 0) + aie.next_bd ^end + ^end: + aie.end + } + } +} diff --git a/test/dialect/AIEVec/AIE1/roundtrip.mlir b/test/dialect/AIEVec/AIE1/roundtrip.mlir new file mode 100644 index 0000000000..d269c95013 --- /dev/null +++ b/test/dialect/AIEVec/AIE1/roundtrip.mlir @@ -0,0 +1,53 @@ +// RUN: aie-opt %s -split-input-file -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @add_i32 +// CHECK-SAME: %[[A:.*]]: vector<8xi32>, +// CHECK-SAME: %[[B:.*]]: vector<8xi32> +// CHECK: %[[RES:.*]] = aievec_aie1.add %[[A]], %[[B]] : +// CHECK-SAME: vector<8xi32>, vector<8xi32>, vector<8xi32> +// CHECK: return %[[RES]] : vector<8xi32> +func.func @add_i32(%A : vector<8xi32>, %B : vector<8xi32>) -> vector<8xi32> { + %0 = aievec_aie1.add %A, %B : vector<8xi32>, vector<8xi32>, vector<8xi32> + return %0 : vector<8xi32> +} + +// ----- + +// CHECK-LABEL: @sub_i32 +// CHECK-SAME: %[[A:.*]]: vector<8xi32>, +// CHECK-SAME: %[[B:.*]]: vector<8xi32> +// CHECK: %[[RES:.*]] = aievec_aie1.sub %[[A]], %[[B]] : +// CHECK-SAME: vector<8xi32>, vector<8xi32>, vector<8xi32> +// CHECK: return %[[RES]] : vector<8xi32> +func.func @sub_i32(%A : vector<8xi32>, %B : vector<8xi32>) -> vector<8xi32> { + %0 = aievec_aie1.sub %A, %B : vector<8xi32>, vector<8xi32>, vector<8xi32> + return %0 : vector<8xi32> +} + +// ----- + +// CHECK-LABEL: @mul_i16 +// CHECK-SAME: %[[A:.*]]: vector<16xi16>, +// CHECK-SAME: %[[B:.*]]: vector<16xi16> +// CHECK: %[[RES:.*]] = aievec_aie1.mul %[[A]], %[[B]] : +// CHECK-SAME: vector<16xi16>, vector<16xi16>, 
vector<16xi48> +// CHECK: return %[[RES]] : vector<16xi48> +func.func @mul_i16(%A : vector<16xi16>, %B : vector<16xi16>) -> vector<16xi48> { + %0 = aievec_aie1.mul %A, %B : vector<16xi16>, vector<16xi16>, vector<16xi48> + return %0 : vector<16xi48> +} + +// ----- + +// CHECK-LABEL: @mac_f32 +// CHECK-SAME: %[[A:.*]]: vector<16xf32>, +// CHECK-SAME: %[[B:.*]]: vector<8xf32>, +// CHECK-SAME: %[[ACC:.*]]: vector<8xf32> +// CHECK: %[[RES:.*]] = aievec_aie1.mac %[[A]], %[[B]], %[[ACC]] +// CHECK-SAME: {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : +// CHECK-SAME: vector<16xf32>, vector<8xf32>, vector<8xf32> +// CHECK: return %[[RES]] : vector<8xf32> +func.func @mac_f32(%A : vector<16xf32>, %B : vector<8xf32>, %ACC : vector<8xf32>) -> vector<8xf32> { + %0 = aievec_aie1.mac %A, %B, %ACC {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> + return %0 : vector<8xf32> +} diff --git a/test/dialect/AIEVec/mac-optimization.mlir b/test/dialect/AIEVec/mac-optimization.mlir index 9c5e5f9277..3b9d5c7387 100644 --- a/test/dialect/AIEVec/mac-optimization.mlir +++ b/test/dialect/AIEVec/mac-optimization.mlir @@ -9,7 +9,7 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %C : vector<16xi16>) -> vector<16xi48> { // CHECK: %[[ACC:.*]] = arith.constant dense<0> : vector<16xi48> // CHECK-NEXT: %[[VAB:.*]] = aievec.concat %[[VA]], %[[VB]] : vector<16xi16>, vector<32xi16> - // CHECK-NEXT: %[[MAC:.*]] = aievec.mac %[[VAB]], %[[VC]], %[[ACC]] { + // CHECK-NEXT: %[[MAC:.*]] = aievec_aie1.mac %[[VAB]], %[[VC]], %[[ACC]] { // CHECK-SAME: xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", // CHECK-SAME: zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} // CHECK-SAME: : vector<32xi16>, vector<16xi16>, vector<16xi48> @@ -17,14 +17,14 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %acc = arith.constant dense<0> : vector<16xi48> 
%zvec = arith.constant dense<0> : vector<16xi16> %la = aievec.concat %A, %zvec : vector<16xi16>, vector<32xi16> - %mac0 = aievec.mac %la, %C, %acc {xoffsets = "0x73727170", + %mac0 = aievec_aie1.mac %la, %C, %acc {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "0", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %lb = aievec.concat %B, %zvec : vector<16xi16>, vector<32xi16> - %mac1 = aievec.mac %lb, %C, %mac0 {xoffsets = "0x73727170", + %mac1 = aievec_aie1.mac %lb, %C, %mac0 {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", @@ -44,7 +44,7 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %C : vector<16xi16>) -> vector<16xi48> { // CHECK: %[[ACC:.*]] = arith.constant dense<0> : vector<16xi48> // CHECK-NEXT: %[[VAB:.*]] = aievec.concat %[[VA]], %[[VB]] : vector<16xi16>, vector<32xi16> - // CHECK-NEXT: %[[MAC:.*]] = aievec.mac %[[VAB]], %[[VC]], %[[ACC]] { + // CHECK-NEXT: %[[MAC:.*]] = aievec_aie1.mac %[[VAB]], %[[VC]], %[[ACC]] { // CHECK-SAME: xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", // CHECK-SAME: zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "4"} // CHECK-SAME: : vector<32xi16>, vector<16xi16>, vector<16xi48> @@ -52,14 +52,14 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %acc = arith.constant dense<0> : vector<16xi48> %zvec = arith.constant dense<0> : vector<16xi16> %la = aievec.concat %A, %zvec : vector<16xi16>, vector<32xi16> - %mac0 = aievec.mac %la, %C, %acc {xoffsets = "0x73727170", + %mac0 = aievec_aie1.mac %la, %C, %acc {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "3", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %lb = aievec.concat %B, %zvec : vector<16xi16>, vector<32xi16> - %mac1 = aievec.mac %lb, %C, 
%mac0 {xoffsets = "0x73727170", + %mac1 = aievec_aie1.mac %lb, %C, %mac0 {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", @@ -79,7 +79,7 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %C : vector<16xi16>) -> vector<16xi48> { // CHECK: %[[ACC:.*]] = arith.constant dense<0> : vector<16xi48> // CHECK-NEXT: %[[VBA:.*]] = aievec.concat %[[VB]], %[[VA]] : vector<16xi16>, vector<32xi16> - // CHECK-NEXT: %[[MAC:.*]] = aievec.mac %[[VBA]], %[[VC]], %[[ACC]] { + // CHECK-NEXT: %[[MAC:.*]] = aievec_aie1.mac %[[VBA]], %[[VC]], %[[ACC]] { // CHECK-SAME: xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", // CHECK-SAME: zoffsets = "0", zoffsets_hi = "0", zstart = "4", zstep = "5"} // CHECK-SAME: : vector<32xi16>, vector<16xi16>, vector<16xi48> @@ -87,14 +87,14 @@ func.func @merge_single_column_mac(%A : vector<16xi16>, %acc = arith.constant dense<0> : vector<16xi48> %zvec = arith.constant dense<0> : vector<16xi16> %la = aievec.concat %A, %zvec : vector<16xi16>, vector<32xi16> - %mac0 = aievec.mac %la, %C, %acc {xoffsets = "0x73727170", + %mac0 = aievec_aie1.mac %la, %C, %acc {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", zstart = "9", zstep = "1"} : vector<32xi16>, vector<16xi16>, vector<16xi48> %lb = aievec.concat %B, %zvec : vector<16xi16>, vector<32xi16> - %mac1 = aievec.mac %lb, %C, %mac0 {xoffsets = "0x73727170", + %mac1 = aievec_aie1.mac %lb, %C, %mac0 {xoffsets = "0x73727170", xoffsets_hi = "0x77767574", xsquare = "0x3120", xstart = "0", zoffsets = "0", zoffsets_hi = "0", diff --git a/test/dialect/AIEVec/precanonicalization.mlir b/test/dialect/AIEVec/precanonicalization.mlir index 823b7a6269..2163027a54 100644 --- a/test/dialect/AIEVec/precanonicalization.mlir +++ b/test/dialect/AIEVec/precanonicalization.mlir @@ -10,7 +10,7 @@ func.func @splat(%m : memref, %pos : index) -> 
vector<8xi32> { // CHECK: %[[V:.*]] = vector.transfer_read %[[MEM]][%[[POS]]], %[[C0]] : memref, vector<8xi32> // CHECK: %[[E:.*]] = vector.extract %[[V]][5] : i32 from vector<8xi32> // CHECK: %[[S:.*]] = vector.broadcast %[[E]] : i32 to vector<8xi32> - %v = vector.transfer_read %m[%i], %c0_i32 {permutation_map = affine_map<(d0) -> (0)>} : memref, vector<8xi32> + %v = vector.transfer_read %m[%i], %c0_i32 {in_bounds = [true], permutation_map = affine_map<(d0) -> (0)>} : memref, vector<8xi32> // CHECK: return %[[S]] : vector<8xi32> return %v : vector<8xi32> } @@ -29,7 +29,7 @@ func.func @far_splat(%m : memref, %pos : index) -> vector<8xi32> { // CHECK: %[[V:.*]] = vector.transfer_read %[[MEM]][%[[IDX]]], %[[C0]] : memref, vector<8xi32> // CHECK: %[[E:.*]] = vector.extract %[[V]][5] : i32 from vector<8xi32> // CHECK: %[[S:.*]] = vector.broadcast %[[E]] : i32 to vector<8xi32> - %v = vector.transfer_read %m[%i], %c0_i32 {permutation_map = affine_map<(d0) -> (0)>} : memref, vector<8xi32> + %v = vector.transfer_read %m[%i], %c0_i32 {in_bounds = [true], permutation_map = affine_map<(d0) -> (0)>} : memref, vector<8xi32> // CHECK: return %[[S]] : vector<8xi32> return %v : vector<8xi32> } @@ -62,7 +62,7 @@ func.func @rank_zero_transfer_read(%m : memref) -> vector<16xi16> { // CHECK: %[[LV:.*]] = vector.transfer_read %[[EXPMEM]][%[[C0idx]]], %[[C0i16]] : memref<1xi16>, vector<16xi16> // CHECK: %[[E:.*]] = vector.extract %[[LV]][0] : i16 from vector<16xi16> // CHECK: %[[S:.*]] = vector.broadcast %[[E]] : i16 to vector<16xi16> - %v = vector.transfer_read %m[], %c0_i16 {permutation_map = affine_map<()->(0)>} : memref, vector<16xi16> + %v = vector.transfer_read %m[], %c0_i16 {in_bounds = [true], permutation_map = affine_map<()->(0)>} : memref, vector<16xi16> // CHECK: return %[[S]] : vector<16xi16> return %v : vector<16xi16> } @@ -106,4 +106,4 @@ func.func @extsi_hoisting_through_extract_strided_slice(%m : memref) vector<32xi8> to vector<16xi8> %vi32 = arith.extsi %slice : 
vector<16xi8> to vector<16xi32> return %vi32 : vector<16xi32> -} \ No newline at end of file +} diff --git a/test/dialect/AIEX/bad_npu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir index ed5e5daf28..fc0dea7a26 100644 --- a/test/dialect/AIEX/bad_npu_nd.mlir +++ b/test/dialect/AIEX/bad_npu_nd.mlir @@ -13,14 +13,13 @@ module { aie.device(npu1_4col) { - func.func @bad_npu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { + aiex.runtime_sequence(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> - return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -30,7 +29,7 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -41,7 +40,6 @@ module { %c128 = arith.constant 128 : i64 // expected-error@+1 {{Size 3 exceeds the [1:64] range}} aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> - return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -51,14 +49,13 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { + aiex.runtime_sequence(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 
1 : i64 %c2 = arith.constant 2 : i64 %c2097152 = arith.constant 2097152 : i64 // expected-error@+1 {{Stride 1 exceeds the [1:1048576] range}} aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152,%c1]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> - return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) } @@ -70,14 +67,13 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd_stride(%a : memref<8xi8>) { + aiex.runtime_sequence(%a : memref<8xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Offset must be 4-byte-aligned}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c1][%c1,%c1,%c1,%c8][%c0,%c0,%c1,%c1]) { metadata = @fifo, id = 0 : i64 } : memref<8xi8> - return } aie.shim_dma_allocation @fifo (MM2S, 0, 0) } @@ -90,7 +86,7 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi8>) { + aiex.runtime_sequence(%a : memref<8xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -100,7 +96,6 @@ module { // Although 2048 exceeds the 0:1023 limit for size 0, since the elements are i8s, // this should be a size of 512 in address granularity (4 bytes) and hence pass the test. 
aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> - return } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -110,7 +105,7 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi16>) { + aiex.runtime_sequence(%a : memref<8xi16>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -119,7 +114,6 @@ module { %c2048 = arith.constant 2048 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2048][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> - return } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -132,14 +126,13 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi8>) { + aiex.runtime_sequence(%a : memref<8xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 // Stride of 2 i8s = 2 bytes < 4 byte granularity, should not be possible %c8 = arith.constant 8 : i64 // expected-error@+1 {{Stride 1 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c2,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> - return } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -149,7 +142,7 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi8>) { + aiex.runtime_sequence(%a : memref<8xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -157,7 +150,6 @@ module { %c8 = arith.constant 8 : i64 // expected-error@+1 {{2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c0,%c0,%c4,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> - return } aie.shim_dma_allocation 
@objectfifo (MM2S, 0, 0) } @@ -169,7 +161,7 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi8>) { + aiex.runtime_sequence(%a : memref<8xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -177,7 +169,6 @@ module { %c8 = arith.constant 8 : i64 // expected-error@+1 {{Stride 0 is 2 elements * 1 bytes = 2 bytes, which is not divisible by 4}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c8][%c0,%c0,%c0,%c2]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi8> - return } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -189,14 +180,13 @@ module { module { aie.device(npu1_4col) { - func.func @bad_npu_nd(%a : memref<8xi16>) { + aiex.runtime_sequence(%a : memref<8xi16>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c3 = arith.constant 3 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{3 elements at 2 bytes each equal 6 bytes, which is not divisible by 4}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c3][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> - return } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } @@ -208,14 +198,34 @@ module { module { aie.device(npu1) { - func.func @bad_npu_nd(%a : memref<8xi16>) { + aiex.runtime_sequence(%a : memref<8xi16>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c4 = arith.constant 4 : i64 %c8 = arith.constant 8 : i64 // expected-error@+1 {{Unsupported tile type at (0, 0) Must be ShimNOC, Mem or Core.}} aiex.npu.dma_memcpy_nd (0, 0, %a[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c4][%c0,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi16> - return + } + aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) + } +} + +// ----- + +// first (highest-dimension) stride can go beyond the limit, as long as the corresponding wrap is 1 + +module { + aie.device(npu1_4col) { + aiex.runtime_sequence(%a : memref<8xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = 
arith.constant 1 : i64 + %c2 = arith.constant 2 : i64 + %c3 = arith.constant 3 : i64 + %c8 = arith.constant 8 : i64 + %c1572864 = arith.constant 1572864 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c1,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 0 : i64 } : memref<8xi32> + // expected-error@+1 {{Stride 3 exceeds the [1:1048576] range.}} + aiex.npu.dma_memcpy_nd (0, 0, %a[%c1,%c0,%c0,%c0][%c2,%c1,%c1,%c2][%c1572864,%c0,%c0,%c1]) { metadata = @objectfifo, id = 1 : i64 } : memref<8xi32> } aie.shim_dma_allocation @objectfifo (MM2S, 0, 0) } diff --git a/test/dialect/AIEX/bad_npu_push_queue.mlir b/test/dialect/AIEX/bad_npu_push_queue.mlir index c6e66d37aa..9ed061a8a8 100644 --- a/test/dialect/AIEX/bad_npu_push_queue.mlir +++ b/test/dialect/AIEX/bad_npu_push_queue.mlir @@ -13,10 +13,9 @@ module { aie.device(npu1_4col) { - func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} aiex.npu.push_queue (0, 0, MM2S:0) {issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } - return } } } @@ -25,10 +24,9 @@ module { module { aie.device(npu1_4col) { - func.func @bad_repeat_count(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Repeat count exceeds the [0:255] range.}} aiex.npu.push_queue (0, 0, MM2S:0) {issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } - return } } } \ No newline at end of file diff --git a/test/dialect/AIEX/bad_npu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir index 37b0ada3f5..383f6ac567 100644 --- a/test/dialect/AIEX/bad_npu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -13,10 +13,9 @@ module { aie.device(npu1_4col) { - 
func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} aiex.npu.writebd {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - return } } } @@ -25,10 +24,9 @@ module { module { aie.device(npu1_4col) { - func.func @bad_iteration_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - return } } } @@ -37,10 +35,9 @@ module { module { aie.device(npu1_4col) { - func.func @bad_stride(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : 
memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} aiex.npu.writebd {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - return } } } @@ -49,10 +46,9 @@ module { module { aie.device(npu1_4col) { - func.func @bad_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aiex.runtime_sequence(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} aiex.npu.writebd {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, row = 0 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - return } } } \ No newline at end of file diff --git a/test/dialect/AIEX/invalid.mlir b/test/dialect/AIEX/invalid.mlir index 77b5d4ec33..f4573bf7d6 100644 --- a/test/dialect/AIEX/invalid.mlir +++ b/test/dialect/AIEX/invalid.mlir @@ -11,9 +11,8 @@ // RUN: 
aie-opt --split-input-file --verify-diagnostics %s aie.device(npu1_4col) { - func.func @npu_dma_wait_no_symbol() { + aiex.runtime_sequence() { // expected-error@+1 {{'aiex.npu.dma_wait' op couldn't find symbol in parent device}} aiex.npu.dma_wait {symbol = @out0} - return } } diff --git a/test/dialect/AIEX/roundtrip.mlir b/test/dialect/AIEX/roundtrip.mlir index bbab413598..4dc3b313f9 100644 --- a/test/dialect/AIEX/roundtrip.mlir +++ b/test/dialect/AIEX/roundtrip.mlir @@ -10,30 +10,43 @@ // RUN: aie-opt --split-input-file %s | FileCheck %s -// CHECK-LABEL: func.func @npu_dma_wait +// CHECK: aie.device // CHECK: aiex.npu.dma_wait {symbol = @out0} aie.device(npu1_4col) { memref.global "public" @out0 : memref<16xi32> - func.func @npu_dma_wait() { + aiex.runtime_sequence() { aiex.npu.dma_wait {symbol = @out0} - return } } // ----- -// CHECK-LABEL: func.func @npu_dma_wait_no_device +// CHECK: aie.device // CHECK: aiex.npu.dma_wait {symbol = @out0} -func.func @npu_dma_wait_no_device() { - aiex.npu.dma_wait {symbol = @out0} - return +aie.device(npu1_4col) { + memref.global "public" @out0 : memref<16xi32> + aiex.runtime_sequence() { + aiex.npu.dma_wait {symbol = @out0} + } } // ----- -// CHECK-LABEL: func.func @npu_addr_patch +// CHECK: aie.device // CHECK: aiex.npu.address_patch {addr = 123 : ui32, arg_idx = 3 : i32, arg_plus = 0 : i32} -func.func @npu_addr_patch() { - aiex.npu.address_patch {addr = 123 : ui32, arg_idx = 3 : i32, arg_plus = 0 : i32} - return +aie.device(npu1_4col) { + aiex.runtime_sequence() { + aiex.npu.address_patch {addr = 123 : ui32, arg_idx = 3 : i32, arg_plus = 0 : i32} + } +} + +// ----- + +// CHECK: aie.device +// CHECK: runtime_sequence @seq(%arg0: memref<1xi32>) +// CHECK: aiex.npu.write32 {address = 432 : ui32, value = 1 : ui32} +aie.device(npu1_4col) { + aiex.runtime_sequence @seq(%arg0 : memref<1xi32>) { + aiex.npu.write32 {address = 432 : ui32, value = 1 : ui32} + } } \ No newline at end of file diff --git 
a/test/generate-mmap/allocation_error.mlir b/test/generate-mmap/allocation_error.mlir index c21b61dbe4..602960bd57 100644 --- a/test/generate-mmap/allocation_error.mlir +++ b/test/generate-mmap/allocation_error.mlir @@ -8,16 +8,14 @@ // //===----------------------------------------------------------------------===// -// RUN: not aiecc.py --xchesscc --xbridge %s 2>&1 | FileCheck %s --check-prefix=CHESS -// RUN: not aiecc.py --no-xchesscc --no-xbridge %s 2>&1 | FileCheck %s --check-prefix=PEANO - -// CHESS: Error: could not find free space for SpaceSymbol x in memory DMb +// REQUIRES: peano +// RUN: not aiecc.py --basic-alloc-scheme --no-xchesscc --no-xbridge %s 2>&1 | FileCheck %s --check-prefix=PEANO // PEANO: ld.lld: error: section '.bss' will not fit in region 'data': overflowed by 4 bytes -// REQUIRES: chess -// REQUIRES: peano // If we use all of the local memory, then linking the AIE executable should fail. - +The fundamental problem here is that we can stuff things in the executable that +aren't visible at the MLIR level, so the assign-buffer-addresses pass can't generate +a good error message. module @example0 { aie.device(xcvc1902) { memref.global @x : memref<4xi8> = uninitialized diff --git a/test/generate-mmap/allocation_error_chess.mlir b/test/generate-mmap/allocation_error_chess.mlir new file mode 100644 index 0000000000..6bb1a97699 --- /dev/null +++ b/test/generate-mmap/allocation_error_chess.mlir @@ -0,0 +1,42 @@ +//===- allocation_error_chess.mlir -----------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// REQUIRES: chess +// RUN: not aiecc.py --basic-alloc-scheme --xchesscc --xbridge %s 2>&1 | FileCheck %s --check-prefix=CHESS +// CHESS: Error: could not find free space for SpaceSymbol x in memory DMb + +// If we use all of the local memory, then linking the AIE executable should fail. +// The fundamental problem here is that we can stuff things in the executable that +// aren't visible at the MLIR level, so the assign-buffer-addresses pass can't generate +// a good error message. +module @example0 { + aie.device(xcvc1902) { + memref.global @x : memref<4xi8> = uninitialized + func.func @test (%i: index, %v: i8) -> i8 { + %x = memref.get_global @x : memref<4xi8> + memref.store %v, %x[%i] : memref<4xi8> + %r = memref.load %x[%i] : memref<4xi8> + func.return %r : i8 + } + + %t33 = aie.tile(3, 3) + + // Use all the local memory for buffers, combined with the 1024 byte stack size. + %buf33 = aie.buffer(%t33) : memref<31744xi8> + + %c33 = aie.core(%t33) { + %idx1 = arith.constant 3 : index + %val1 = arith.constant 7 : i8 + memref.store %val1, %buf33[%idx1] : memref<31744xi8> + func.call @test(%idx1, %val1) : (index, i8) -> i8 + aie.end + } + } +} diff --git a/test/lower-to-standard/aiex_standard_lowering.mlir b/test/lower-to-standard/aiex_standard_lowering.mlir index 4ba2c6dc22..4545259d3e 100644 --- a/test/lower-to-standard/aiex_standard_lowering.mlir +++ b/test/lower-to-standard/aiex_standard_lowering.mlir @@ -10,16 +10,14 @@ // RUN: aie-opt --split-input-file --aiex-standard-lowering %s | FileCheck %s -// CHECK-LABEL: dma_and_wait // CHECK-NOT: aiex.npu.dma_memcpy_nd // CHECK-NOT: aiex.npu.dma_wait module { aie.device(npu1_4col) { memref.global "public" @toMem : memref<16xi32> - func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { + aiex.runtime_sequence(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 
16][0, 0, 64, 1]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> aiex.npu.dma_wait {symbol = @toMem} - return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) } diff --git a/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir new file mode 100644 index 0000000000..f4743e3ced --- /dev/null +++ b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/aie.mlir @@ -0,0 +1,118 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates +// Copyright (C) 2020-2022, Xilinx Inc. +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + memref.global "public" @objFifo_in0 : memref<56x56xi8> + memref.global "public" @objFifo_out0 : memref<64x64xi8> + + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + + %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> + %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> + %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> + %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> + + %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 1 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 1 : i32, sym_name = "objFifo_out1_prod_lock"} + %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 
: i32, sym_name = "objFifo_out1_cons_lock"} + + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %objFifo_out1_buff_0[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + aie.end + } + + aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + + aiex.runtime_sequence(%arg0: memref<61x56xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c56_i64 = arith.constant 56 : i64 + %c61_i64 = arith.constant 61 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c61_i64, %c56_i64][%c0_i64, %c0_i64, %c56_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<61x56xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : 
memref<64x64xi8> + %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<64x64xi8> + %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<64x64xi8> + %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<64x64xi8> + %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 1 : i32, sym_name = "objFifo_in0_cons_prod_lock"} + %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} + %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 1 : i32, sym_name = "objFifo_out0_prod_lock"} + %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"} + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<64x64xi8>, 0, 3416) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<64x64xi8>, 0, 4096, [, ], [, ]) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }] + %2 = aie.dma(MM2S, 1) [{ + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }] + %3 = aie.dma(S2MM, 1) [{ + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }] + aie.end + } + + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + 
aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }] + aie.end + } + } +} diff --git a/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/run.lit b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/run.lit new file mode 100644 index 0000000000..5329b2789e --- /dev/null +++ b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/run.lit @@ -0,0 +1,14 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python aiecc.py %S/aie.mlir +// RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir +// RUN: cp *.elf aie.mlir.prj/ +// RUN: cp *.bin aie.mlir.prj/ +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s +// CHECK: PASS! + diff --git a/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/test.cpp b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/test.cpp new file mode 100644 index 0000000000..bb95ddfb4e --- /dev/null +++ b/test/npu-xrt/add_12_i8_using_2d_dma_op_with_padding/test.cpp @@ -0,0 +1,134 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 61 * 56; +constexpr int OUT_SIZE = 64 * 64; + +#define IN_DATATYPE int8_t +#define OUT_DATATYPE int8_t + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + std::vector instr_v = load_instr_sequence("insts.txt"); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + auto xclbin = xrt::xclbin("aie.xclbin"); + + std::string Node = "MLIR_AIE"; + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * 
sizeof(OUT_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + IN_DATATYPE *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + OUT_DATATYPE *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < 64; i++) { + for (uint32_t j = 0; j < 64; j++) { + uint32_t ref = 1 + 12; + // Zero padding at first and last element for every 14 inputs. + if ((i < 2) || (i >= 63)) + ref = 0 + 12; + else if ((j < 4) || (j >= 60)) + ref = 0 + 12; + if (*(bufOut + i * 64 + j) != ref) { + std::cout << "Error in output " << std::to_string(bufOut[i * 64 + j]) + << " != " << ref << std::endl; + errors++; + } else + std::cout << "Correct output " << std::to_string(bufOut[i * 64 + j]) + << " == " << ref << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } + + std::cout << "\nfailed.\n\n"; + return 1; +} diff --git a/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir new file mode 100644 index 0000000000..48af170aa8 --- /dev/null +++ b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/aie.mlir @@ -0,0 +1,149 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates +// Copyright (C) 2020-2022, Xilinx Inc. +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + memref.global "public" @objFifo_in0 : memref<16xi8> + memref.global "public" @objFifo_out0 : memref<16xi8> + + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + + %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi8> + %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi8> + %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi8> + %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi8> + + %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c21_i8 = arith.constant 21 : i8 + %c2 = arith.constant 2 : index + scf.for %arg0 = %c0 to %c8 step %c2 { + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load 
%objFifo_in1_cons_buff_0[%arg1] : memref<8xi8> + %1 = arith.addi %0, %c21_i8 : i8 + memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi8> + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi8> + %1 = arith.addi %0, %c21_i8 : i8 + memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi8> + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + } + aie.end + } + + aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + + aiex.runtime_sequence(%arg0: memref<64xi8>, %arg1: memref<32xi8>, %arg2: memref<64xi8>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c32_i64 = arith.constant 32 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c32_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi8> + %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi8> + %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi8> + %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi8> + %objFifo_in0_cons_prod_lock = 
aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"} + %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} + %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"} + %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"} + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi8>, 0, 8) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi8>, 0, 8) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi8>, 0, 16, [], []) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi8>, 0, 16, [], []) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }] + %2 = aie.dma(MM2S, 1) [{ + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi8>) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi8>) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }] + %3 = aie.dma(S2MM, 1) [{ + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi8>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi8>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }] + aie.end + } 
+ + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi8>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi8>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi8>) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi8>) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }] + aie.end + } + } +} diff --git a/test/npu-xrt/add_21_i8_using_dma_op_with_padding/run.lit b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/run.lit new file mode 100644 index 0000000000..5329b2789e --- /dev/null +++ b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/run.lit @@ -0,0 +1,14 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python aiecc.py %S/aie.mlir +// RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir +// RUN: cp *.elf aie.mlir.prj/ +// RUN: cp *.bin aie.mlir.prj/ +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s +// CHECK: PASS! 
+ diff --git a/test/npu-xrt/add_21_i8_using_dma_op_with_padding/test.cpp b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/test.cpp new file mode 100644 index 0000000000..8467923480 --- /dev/null +++ b/test/npu-xrt/add_21_i8_using_dma_op_with_padding/test.cpp @@ -0,0 +1,135 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 32; +constexpr int OUT_SIZE = 64; + +#define IN_DATATYPE int8_t +#define OUT_DATATYPE int8_t + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + std::vector instr_v = load_instr_sequence("insts.txt"); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + auto xclbin = xrt::xclbin("aie.xclbin"); + + std::string Node = "MLIR_AIE"; + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto 
kernelName = xkernel.get_name(); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + IN_DATATYPE *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(i + 1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + OUT_DATATYPE *bufOut = bo_out.map(); + + int errors = 0; + + int idx = 0; + for (uint32_t i = 0; i < 4; i++) { + for (uint32_t j = 0; j < 16; j++) { + uint32_t ref = idx + 1 + 21; + // Zero padding at first and last element for every 14 inputs. 
+ if ((j < 4) || (j >= 12)) + ref = 0 + 21; + else + idx++; + if (*(bufOut + i * 16 + j) != ref) { + std::cout << "Error in output " << std::to_string(bufOut[i * 16 + j]) + << " != " << ref << std::endl; + errors++; + } else + std::cout << "Correct output " << std::to_string(bufOut[i * 16 + j]) + << " == " << ref << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } + + std::cout << "\nfailed.\n\n"; + return 1; +} diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir index 9f7fe997da..1fc7df0961 100644 --- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir @@ -98,15 +98,14 @@ module { } // the absolutely only thing that's relevant here is (MM2S, 0, 0) and (S2MM, 0, 0) - memref.global "public" @this_just_creates_a_symbol_and_the_type_means_nothing_in : memref<1xi32> - memref.global "public" @this_just_creates_a_symbol_and_the_type_means_nothing_out : memref<1xi32> - aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_in(MM2S, 0, 0) - aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_out(S2MM, 0, 0) - func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + memref.global "public" @data_in : memref<1xi32> + memref.global "public" @data_out : memref<1xi32> + aie.shim_dma_allocation 
@data_in(MM2S, 0, 0) + aie.shim_dma_allocation @data_out(S2MM, 0, 0) + aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 0 : i64, metadata = @data_in} : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0, 1]) {id = 1 : i64, metadata = @data_out, issue_token = true} : memref<64xi32> + aiex.npu.dma_wait {symbol = @data_out} } } } diff --git a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp index 5c2048546a..06c072b47a 100644 --- a/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp @@ -91,7 +91,11 @@ int main(int argc, const char *argv[]) { unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir index 7d4f1183b6..544f4fc837 100644 --- a/test/npu-xrt/add_314_using_dma_op/aie.mlir +++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir @@ -64,14 +64,13 @@ module { aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) - func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_wait { symbol = @objFifo_out0 } } %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { diff --git a/test/npu-xrt/add_314_using_dma_op/test.cpp b/test/npu-xrt/add_314_using_dma_op/test.cpp index f98179be32..d2d192b5b5 100644 --- a/test/npu-xrt/add_314_using_dma_op/test.cpp +++ b/test/npu-xrt/add_314_using_dma_op/test.cpp @@ -91,7 +91,11 @@ int main(int argc, const char *argv[]) { unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + ert_cmd_state r = 
run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir new file mode 100644 index 0000000000..0c058cd4d1 --- /dev/null +++ b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/aie.mlir @@ -0,0 +1,149 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2022-2024 Advanced Micro Devices, Inc. or its affiliates +// Copyright (C) 2020-2022, Xilinx Inc. +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + memref.global "public" @objFifo_in0 : memref<16xi32> + memref.global "public" @objFifo_out0 : memref<16xi32> + + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + + %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<8xi32> + %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<8xi32> + %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<8xi32> + %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<8xi32> + + %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 2 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 2 : i32, sym_name = "objFifo_out1_prod_lock"} + %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : 
i32, sym_name = "objFifo_out1_cons_lock"} + + aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0) + aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_1, DMA : 1, %tile_0_0, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 1) + + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c378_i32 = arith.constant 378 : i32 + %c2 = arith.constant 2 : index + scf.for %arg0 = %c0 to %c8 step %c2 { + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_0[%arg1] : memref<8xi32> + %1 = arith.addi %0, %c378_i32 : i32 + memref.store %1, %objFifo_out1_buff_0[%arg1] : memref<8xi32> + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) + scf.for %arg1 = %c0 to %c8 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_1[%arg1] : memref<8xi32> + %1 = arith.addi %0, %c378_i32 : i32 + memref.store %1, %objFifo_out1_buff_1[%arg1] : memref<8xi32> + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + } + aie.end + } + + aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + + aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c52_i64 = arith.constant 52 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c52_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, 
%c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_wait {symbol = @objFifo_out0} + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<16xi32> + %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<16xi32> + %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<16xi32> + %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<16xi32> + %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32, sym_name = "objFifo_in0_cons_prod_lock"} + %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} + %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 2 : i32, sym_name = "objFifo_out0_prod_lock"} + %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"} + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 13) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 13) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<16xi32>, 0, 16, [], []) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_1 : memref<16xi32>, 0, 16, [], []) + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }] + %2 = aie.dma(MM2S, 1) [{ + 
aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>) + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }] + %3 = aie.dma(S2MM, 1) [{ + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<16xi32>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_1 : memref<16xi32>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }] + aie.end + } + + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<8xi32>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }, { + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_1 : memref<8xi32>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_0 : memref<8xi32>) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }, { + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_1 : memref<8xi32>) + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }] + aie.end + } + } +} diff --git a/test/npu-xrt/add_378_i32_using_dma_op_with_padding/run.lit b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/run.lit new file mode 100644 index 0000000000..5329b2789e --- /dev/null +++ b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/run.lit @@ -0,0 +1,14 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: %python aiecc.py %S/aie.mlir +// RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir +// RUN: cp *.elf aie.mlir.prj/ +// RUN: cp *.bin aie.mlir.prj/ +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s +// CHECK: PASS! + diff --git a/test/npu-xrt/add_378_i32_using_dma_op_with_padding/test.cpp b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/test.cpp new file mode 100644 index 0000000000..31384ac275 --- /dev/null +++ b/test/npu-xrt/add_378_i32_using_dma_op_with_padding/test.cpp @@ -0,0 +1,132 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int IN_SIZE = 52; +constexpr int OUT_SIZE = 64; + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + std::vector instr_v = load_instr_sequence("insts.txt"); + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + auto xclbin = xrt::xclbin("aie.xclbin"); + + std::string Node = "MLIR_AIE"; + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + uint32_t *bufInA = 
bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < IN_SIZE; i++) + srcVecA.push_back(i + 1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + int idx = 0; + for (uint32_t i = 0; i < 4; i++) { + for (uint32_t j = 0; j < 16; j++) { + uint32_t ref = idx + 1 + 378; + // Zero padding at first and last element for every 14 inputs. + if (j == 0 || j == 1 || j == 15) + ref = 0 + 378; + else + idx++; + if (*(bufOut + i * 16 + j) != ref) { + std::cout << "Error in output " << *(bufOut + i * 16 + j) + << " != " << ref << std::endl; + errors++; + } else + std::cout << "Correct output " << *(bufOut + i * 16 + j) + << " == " << ref << std::endl; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } + + std::cout << "\nfailed.\n\n"; + return 1; +} diff --git a/test/npu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir index d0056ffebe..9ceb45f842 100644 --- a/test/npu-xrt/add_one_objFifo/aie.mlir +++ b/test/npu-xrt/add_one_objFifo/aie.mlir @@ -43,14 +43,13 @@ module { } aie.end } - func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + aiex.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = 
@objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } - return + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64, issue_token = true } : memref<64xi32> + aiex.npu.dma_wait { symbol = @objFifo_out0 } } } } diff --git a/test/npu-xrt/add_one_objFifo/test.cpp b/test/npu-xrt/add_one_objFifo/test.cpp index a3e2795480..4163cdb542 100644 --- a/test/npu-xrt/add_one_objFifo/test.cpp +++ b/test/npu-xrt/add_one_objFifo/test.cpp @@ -160,7 +160,11 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/add_one_two/aie1.mlir b/test/npu-xrt/add_one_two/aie1.mlir index 93d47dbc69..0d0c7f39cc 100644 --- a/test/npu-xrt/add_one_two/aie1.mlir +++ b/test/npu-xrt/add_one_two/aie1.mlir @@ -40,14 +40,13 @@ module { } aie.end } - func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + aiex.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } - return + aiex.npu.dma_wait { symbol = @objFifo_out0 } } } } diff --git a/test/npu-xrt/add_one_two/aie2.mlir b/test/npu-xrt/add_one_two/aie2.mlir index 538f84c8e1..8aa02271ce 100644 --- a/test/npu-xrt/add_one_two/aie2.mlir +++ b/test/npu-xrt/add_one_two/aie2.mlir @@ -40,14 +40,13 @@ module { } aie.end } - func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + aiex.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, 
%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } - return + aiex.npu.dma_wait { symbol = @objFifo_out0 } } } } diff --git a/test/npu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir index f8524edef6..28c3e8b49c 100644 --- a/test/npu-xrt/add_one_using_dma/aie.mlir +++ b/test/npu-xrt/add_one_using_dma/aie.mlir @@ -75,14 +75,13 @@ module { aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) - func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { + aiex.runtime_sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64xi32> + aiex.npu.dma_wait {symbol = @objFifo_out0} } %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { diff --git 
a/test/npu-xrt/add_one_using_dma/test.cpp b/test/npu-xrt/add_one_using_dma/test.cpp index a3e2795480..4163cdb542 100644 --- a/test/npu-xrt/add_one_using_dma/test.cpp +++ b/test/npu-xrt/add_one_using_dma/test.cpp @@ -160,7 +160,11 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir index 73e76c1a29..c10bab0a23 100644 --- a/test/npu-xrt/cascade_flows/aie.mlir +++ b/test/npu-xrt/cascade_flows/aie.mlir @@ -59,14 +59,13 @@ module { aie.end } { link_with="kernel3.o" } - func.func @sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { + aiex.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<64xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } - return + aiex.npu.dma_wait { symbol = @objFifo_out0 } } } } diff --git a/test/npu-xrt/cascade_flows/test.cpp b/test/npu-xrt/cascade_flows/test.cpp index db5cb25086..2a3affc09c 100644 --- 
a/test/npu-xrt/cascade_flows/test.cpp +++ b/test/npu-xrt/cascade_flows/test.cpp @@ -160,7 +160,11 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/e2e/tiled_matrix_add.ipynb b/test/npu-xrt/e2e/tiled_matrix_add.ipynb index 0c9a2247ac..9e90eb894b 100644 --- a/test/npu-xrt/e2e/tiled_matrix_add.ipynb +++ b/test/npu-xrt/e2e/tiled_matrix_add.ipynb @@ -375,7 +375,7 @@ " aie.flow(%tile_0_1, DMA : 1, %tile_0_2, DMA : 1)\n", " aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2)\n", " aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0)\n", - " func.func @bobsyouruncle() {\n", + " aiex.runtime_sequence() {\n", " aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n", " aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 
: i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir index 6e604e8d5c..a88877e659 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir @@ -568,7 +568,7 @@ module { memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> - func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + aiex.runtime_sequence(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { // aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) @@ -640,9 +640,8 @@ module { memref.assume_alignment %arg2, 64 : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : 
i32, row_num = 1 : i32} - return + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} } diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir index defdf5337d..09f1eac02f 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir @@ -416,7 +416,7 @@ module { memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> - func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + aiex.runtime_sequence(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { // aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) @@ -488,9 +488,8 @@ module { memref.assume_alignment %arg2, 64 : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + 
aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} } diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir index e0ea07f680..be13d7523c 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir @@ -162,7 +162,7 @@ module { memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> - func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + aiex.runtime_sequence(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { // aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) @@ -193,9 +193,8 @@ module { memref.assume_alignment %arg2, 64 : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 
16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_wait { symbol = @airMemcpyId12} } } {sym_name = "segment_0"} } diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir index 19d48cc7e6..1a78398e26 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir +++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir @@ -442,7 +442,7 @@ module { memref.global "public" @airMemcpyId4 : memref<16x16xi32, 1 : i32> aie.shim_dma_allocation @airMemcpyId5(MM2S, 1, 0) memref.global "public" @airMemcpyId5 : memref<16x16xi32, 1 : i32> - func.func @matmul_16x16_16xi32__dispatch_0_matmul_16x16x16_i32(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { + aiex.runtime_sequence(%arg0: memref<16x16xi32>, %arg1: memref<16x16xi32>, %arg2: memref<16x16xi32>) { // aiex.npu.write32 {address = 212992 : ui32, column = 3 : i32, row = 2 : i32, value = 31232 : ui32} // [14:8] reset event: 122(BROADCAST_15) aiex.npu.write32 {address = 213200 : ui32, column = 3 : i32, row = 2 : i32, value = 7995392 : ui32} // [22:16] start event: 122(BROADCAST_15) @@ -524,9 +524,8 @@ module { memref.assume_alignment %arg2, 64 : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32> aiex.npu.dma_memcpy_nd (0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + aiex.npu.dma_memcpy_nd (0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16, 1]) {id = 2 : i64, metadata = 
@airMemcpyId12, issue_token = true} : memref<16x16xi32> + aiex.npu.dma_wait {symbol = @airMemcpyId12} } } {sym_name = "segment_0"} } diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp index 70090933ef..c3440ef956 100644 --- a/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp +++ b/test/npu-xrt/matrix_multiplication_using_cascade/test.cpp @@ -159,7 +159,11 @@ int main(int argc, const char *argv[]) { auto start = std::chrono::high_resolution_clock::now(); unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_a, bo_b, bo_c); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } auto stop = std::chrono::high_resolution_clock::now(); bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE); memcpy(CVec.data(), bufC, (CVec.size() * sizeof(C_DATATYPE))); diff --git a/test/npu-xrt/matrix_transpose/aie2.py b/test/npu-xrt/matrix_transpose/aie2.py index 1d5efd8721..e95db67d5b 100644 --- a/test/npu-xrt/matrix_transpose/aie2.py +++ b/test/npu-xrt/matrix_transpose/aie2.py @@ -68,7 +68,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - @FuncOp.from_py_func(matrix_memref, matrix_memref) + @runtime_sequence(matrix_memref, matrix_memref) def sequence(inp, out): npu_dma_memcpy_nd( metadata=fifo_in.sym_name.value, diff --git a/test/npu-xrt/nd_memcpy_transforms/aie2.py b/test/npu-xrt/nd_memcpy_transforms/aie2.py index d09efc9884..1dc30ad47f 100644 --- a/test/npu-xrt/nd_memcpy_transforms/aie2.py +++ b/test/npu-xrt/nd_memcpy_transforms/aie2.py @@ -84,7 +84,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - @FuncOp.from_py_func(memref_a, memref_b, memref_c) + @runtime_sequence(memref_a, memref_b, memref_c) def sequence(A, B, C): npu_dma_memcpy_nd( metadata=fifo_a.sym_name.value, diff --git a/test/npu-xrt/sync_task_complete_token/aie2.py 
b/test/npu-xrt/sync_task_complete_token/aie2.py new file mode 100644 index 0000000000..333fdbb668 --- /dev/null +++ b/test/npu-xrt/sync_task_complete_token/aie2.py @@ -0,0 +1,103 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 AMD Inc. + +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test -std=c++11 -Wall %xrt_flags -lrt -lstdc++ +# RUN: %run_on_npu ./test | FileCheck %s +# CHECK: PASS! + +from aie.extras.context import mlir_mod_ctx + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * + + +dtype = T.i32 +output_sz = 16 + +# This design produces `n_tiles` output tiles of size tile_sz. +# For each output tile, it reads the next contiguous 16 input tiles of size tile_sz, adds the values at each index together, and writes it to the output tile. +# In other words, this design produces: +# output[i] = input[16*i] + input[16*i + 1] + ... + input[16*i + 15] +# and the processing occurs in chuncks of tile_sz, i.e. one core call produces output[i], output[i+1], ... 
output[i+tile_sz-1] + + +def design(): + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.npu1_4col) + def device_body(): + memref_t = T.memref(1, dtype()) + + # Tile declarations as tile[row][col] + tiles = [[tile(col, row) for col in range(0, 4)] for row in range(0, 6)] + # Shim tiles: tiles[0][0..3] + # Mem tiles: tiles[1][0..3] + # Cores: tiles[2..5][0..3] + + fifo_input = object_fifo( + "fifo_input", tiles[0][0], tiles[2][0], 1, memref_t + ) + fifo_output = object_fifo( + "fifo_output", tiles[2][0], tiles[0][0], 1, memref_t + ) + + # Core + @core(tiles[2][0]) + def core_body(): + for _ in for_(0xFFFFFFFF): + elem_output = fifo_output.acquire(ObjectFifoPort.Produce, 1) + zero = constant(T.i32(), 0) + memref.store(zero, elem_output, [0]) + for _ in for_(16): + elem_input = fifo_input.acquire(ObjectFifoPort.Consume, 1) + a = memref.load(elem_output, [0]) + b = memref.load(elem_input, [0]) + c = a + b + memref.store(c, elem_output, [0]) + fifo_input.release(ObjectFifoPort.Consume, 1) + yield_([]) + fifo_output.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + @runtime_sequence(memref_t, memref_t) + def sequence(input, output): + for i in range(output_sz): + # Configure and start, and wait for 16 BDs, each transferring the next contiguous input tile. 
+ for j in range(16): + npu_dma_memcpy_nd( + metadata=fifo_input.sym_name.value, + bd_id=j, + mem=input, + offsets=[0, 0, 0, i * 16 + j], + sizes=[1, 1, 1, 1], + strides=[0, 0, 0, 1], + issue_token=True, + ) + npu_dma_wait(fifo_input.sym_name.value) + # After transferring 16 input tiles, one output tile will be produced; + # issue a BD to transfer it back + npu_dma_memcpy_nd( + metadata=fifo_output.sym_name.value, + bd_id=0, + mem=output, + offsets=[0, 0, 0, i], + sizes=[1, 1, 1, 1], + strides=[0, 0, 0, 1], + ) + npu_dma_wait(fifo_output.sym_name.value) + + print(ctx.module) + + +design() diff --git a/test/npu-xrt/sync_task_complete_token/test.cpp b/test/npu-xrt/sync_task_complete_token/test.cpp new file mode 100644 index 0000000000..d2906461ec --- /dev/null +++ b/test/npu-xrt/sync_task_complete_token/test.cpp @@ -0,0 +1,134 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +#ifndef XCLBIN +#define XCLBIN "final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define DTYPE int32_t + +#define OUTPUT_LEN (4 * 4) +#define INPUT_LEN (16 * OUTPUT_LEN) + +int main(int argc, const char *argv[]) { + + std::vector instr_v = load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = xrt::bo(device, INPUT_LEN * sizeof(DTYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = xrt::bo(device, OUTPUT_LEN * sizeof(DTYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + DTYPE 
*buf_input = bo_input.map(); + for (int i = 0; i < INPUT_LEN; i++) { + buf_input[i] = i; + } + DTYPE *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_LEN * sizeof(DTYPE)); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + DTYPE ref[OUTPUT_LEN] = {}; + for (int i = 0; i < OUTPUT_LEN; i++) { + for (int j = 0; j < 16; j++) { + ref[i] += buf_input[i * 16 + j]; + } + } + std::cout << "Reference: "; + for (int i = 0; i < OUTPUT_LEN; i++) { + std::cout << std::setw(4) << (long)ref[i] << " "; + } + std::cout << std::endl; + + std::cout << "Output: "; + for (int i = 0; i < OUTPUT_LEN; i++) { + std::cout << std::setw(4) << (long)buf_output[i] << " "; + } + std::cout << std::endl; + + if (memcmp(ref, buf_output, sizeof(ref)) == 0) { + std::cout << "PASS!" << std::endl; + } else { + std::cout << "FAIL." << std::endl; + } + + return 0; +} diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py new file mode 100644 index 0000000000..3cdce92088 --- /dev/null +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/aie2.py @@ -0,0 +1,147 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 AMD Inc. 
+ +# REQUIRES: ryzen_ai +# +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.txt ./aie2.mlir +# RUN: clang %S/test.cpp -o test -std=c++11 -Wall %xrt_flags -lrt -lstdc++ +# RUN: %run_on_npu ./test | FileCheck %s +# CHECK: PASS! + +from aie.extras.context import mlir_mod_ctx + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * + + +dtype = T.i32 +output_sz = 16 + +# This design produces `n_tiles` output tiles of size tile_sz. +# For each output tile, it reads the next contiguous 16 input tiles of size tile_sz, adds the values at each index together, and writes it to the output tile. +# In other words, this design produces: +# output[i] = input[16*i] + input[16*i + 1] + ... + input[16*i + 15] +# and the processing occurs in chuncks of tile_sz, i.e. one core call produces output[i], output[i+1], ... 
output[i+tile_sz-1] + + +def design(): + + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.npu1_4col) + def device_body(): + memref_t = T.memref(1, dtype()) + + # Tile declarations as tile[row][col] + tiles = [[tile(col, row) for col in range(0, 4)] for row in range(0, 6)] + # Shim tiles: tiles[0][0..3] + # Mem tiles: tiles[1][0..3] + # Cores: tiles[2..5][0..3] + + fifo_input = object_fifo( + "fifo_input", tiles[0][0], tiles[2][0], 1, memref_t + ) + fifo_output = object_fifo( + "fifo_output", tiles[2][0], tiles[0][0], 1, memref_t + ) + + # Core + @core(tiles[2][0]) + def core_body(): + for _ in for_(0xFFFFFFFF): + elem_output = fifo_output.acquire(ObjectFifoPort.Produce, 1) + zero = constant(T.i32(), 0) + memref.store(zero, elem_output, [0]) + for _ in for_(16): + elem_input = fifo_input.acquire(ObjectFifoPort.Consume, 1) + a = memref.load(elem_output, [0]) + b = memref.load(elem_input, [0]) + c = a + b + memref.store(c, elem_output, [0]) + fifo_input.release(ObjectFifoPort.Consume, 1) + yield_([]) + fifo_output.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + @runtime_sequence(memref_t, memref_t) + def sequence(input, output): + for i in range(output_sz): + + # Configure 16 BDs, each transferring the next contiguous input tile. + for j in range(16): + # Configure BDs 1-16. The configuration is as generated from the following npu_dma_memcpy_nd instruction, except for the BD chaining. 
+ # npu_dma_memcpy_nd( metadata=fifo_input.sym_name.value, bd_id=j, mem=input, offsets=[0, 0, 0, i * 16 + j], sizes=[1, 1, 1, 1], strides=[0, 0, 0, 1], issue_token=True,) + use_next_bd = 1 if j < 15 else 0 + next_bd = j + 1 if j < 15 else 0 + buffer_offset = (i * 16 + j) * 4 + npu_writebd( + column=0, + row=0, + bd_id=j, + buffer_length=1, + buffer_offset=buffer_offset, + use_next_bd=use_next_bd, + next_bd=next_bd, + valid_bd=1, + iteration_current=0, + iteration_size=0, + iteration_stride=0, + d0_size=0, + d0_stride=0, + d1_size=0, + d1_stride=0, + d2_stride=0, + enable_packet=0, + out_of_order_id=0, + packet_id=0, + packet_type=0, + lock_acq_enable=0, + lock_acq_val=0, + lock_acq_id=0, + lock_rel_val=0, + lock_rel_id=0, + ) + # aiex.npu.address_patch writes the pointer to argument 0 (input, arg_idx=0) to the respective BD0, BD1, ... address plus an offset + # 0x1D004 DMA_BD0_1 Base_Address_Low <- buffer address for bd 0 + # 0x1D024 DMA_BD1_1 Base_Address_Low <- buffer address for bd 1 + # ... 
and so forth + npu_address_patch( + addr=(0x1D004 + j * 0x20), arg_idx=0, arg_plus=buffer_offset + ) + # npu_push_queue adds BD0 to be executed next; BD1 is chained to execute after BD0 and so forth, so this sets off all BDs sequentially on MM2S channel 0 + # 0x1D214 DMA_MM2S_0_Task_Queue <- Enqueue Buffer Descriptor on MM2S Channel 0 + npu_push_queue( + bd_id=0, + column=0, + row=0, + direction=1, + channel=0, + issue_token=1, + repeat_count=0, + ) + # Wait for the task completion token of the previously set off chain of BDs + npu_dma_wait(fifo_input.sym_name.value) + + # After transferring 16 input tiles, one output tile will be produced; + # issue a BD to transfer it back + npu_dma_memcpy_nd( + metadata=fifo_output.sym_name.value, + bd_id=0, + mem=output, + offsets=[0, 0, 0, i], + sizes=[1, 1, 1, 1], + strides=[0, 0, 0, 1], + ) + npu_dma_wait(fifo_output.sym_name.value) + + print(ctx.module) + + +design() diff --git a/test/npu-xrt/sync_task_complete_token_bd_chaining/test.cpp b/test/npu-xrt/sync_task_complete_token_bd_chaining/test.cpp new file mode 100644 index 0000000000..d2906461ec --- /dev/null +++ b/test/npu-xrt/sync_task_complete_token_bd_chaining/test.cpp @@ -0,0 +1,134 @@ +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 AMD Inc. 
+ +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +#ifndef XCLBIN +#define XCLBIN "final.xclbin" +#endif + +#ifndef INSTS_TXT +#define INSTS_TXT "insts.txt" +#endif + +#ifndef KERNEL_NAME +#define KERNEL_NAME "MLIR_AIE" +#endif + +#define DTYPE int32_t + +#define OUTPUT_LEN (4 * 4) +#define INPUT_LEN (16 * OUTPUT_LEN) + +int main(int argc, const char *argv[]) { + + std::vector instr_v = load_instr_sequence(INSTS_TXT); + assert(instr_v.size() > 0); + + // Get a device handle + unsigned int device_index = 0; + xrt::device device = xrt::device(device_index); + + // Load the xclbin + xrt::xclbin xclbin = xrt::xclbin(XCLBIN); + + // Get the kernel from the xclbin + std::vector xkernels = xclbin.get_kernels(); + xrt::xclbin::kernel xkernel = *std::find_if( + xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel &k) { + return k.get_name().rfind(KERNEL_NAME, 0) == 0; + }); + std::string kernel_name = xkernel.get_name(); + assert(strcmp(kernel_name.c_str(), KERNEL_NAME) == 0); + + device.register_xclbin(xclbin); + + // get a hardware context + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + auto kernel = xrt::kernel(context, kernel_name); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_input = xrt::bo(device, INPUT_LEN * sizeof(DTYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_output = xrt::bo(device, OUTPUT_LEN * sizeof(DTYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + DTYPE 
*buf_input = bo_input.map(); + for (int i = 0; i < INPUT_LEN; i++) { + buf_input[i] = i; + } + DTYPE *buf_output = bo_output.map(); + memset(buf_output, 0, OUTPUT_LEN * sizeof(DTYPE)); + + // Instruction buffer for DMA configuration + void *buf_instr = bo_instr.map(); + memcpy(buf_instr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_input.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_output.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_input, bo_output); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_output.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + DTYPE ref[OUTPUT_LEN] = {}; + for (int i = 0; i < OUTPUT_LEN; i++) { + for (int j = 0; j < 16; j++) { + ref[i] += buf_input[i * 16 + j]; + } + } + std::cout << "Reference: "; + for (int i = 0; i < OUTPUT_LEN; i++) { + std::cout << std::setw(4) << (long)ref[i] << " "; + } + std::cout << std::endl; + + std::cout << "Output: "; + for (int i = 0; i < OUTPUT_LEN; i++) { + std::cout << std::setw(4) << (long)buf_output[i] << " "; + } + std::cout << std::endl; + + if (memcmp(ref, buf_output, sizeof(ref)) == 0) { + std::cout << "PASS!" << std::endl; + } else { + std::cout << "FAIL." 
<< std::endl; + } + + return 0; +} diff --git a/test/npu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir index ef9fef706c..5518e0e392 100644 --- a/test/npu-xrt/two_col/aie.mlir +++ b/test/npu-xrt/two_col/aie.mlir @@ -128,7 +128,7 @@ module { } aie.end } {link_with = "threshold.o"} - func.func @sequence(%in : memref<2048xi32>, %buf : memref<32xi32>, %out : memref<2048xi32>) { + aiex.runtime_sequence(%in : memref<2048xi32>, %buf : memref<32xi32>, %out : memref<2048xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2048 = arith.constant 2048 : i64 @@ -140,10 +140,9 @@ module { aiex.npu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" } aiex.npu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" } aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } - aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_out0, id = 1 : i64, issue_token = true } : memref<2048xi32> aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0, %c1]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> - aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } - return + aiex.npu.dma_wait {symbol = @objFifo_out0} } } } diff --git a/test/npu-xrt/two_col/test.cpp b/test/npu-xrt/two_col/test.cpp index ef26e7960b..5563c860f1 100644 --- a/test/npu-xrt/two_col/test.cpp +++ b/test/npu-xrt/two_col/test.cpp @@ -162,7 +162,11 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, debug, bo_out); - run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/npu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir index 4e6dd6627a..a390823633 100644 --- a/test/npu-xrt/vector_scalar_using_dma/aie.mlir +++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir @@ -65,14 +65,13 @@ module { aie.shim_dma_allocation @in(MM2S, 0, 0) - func.func @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { + aiex.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 0 : i64, metadata = @out, issue_token = true} : memref<4096xi32> aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64, %c1_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return + aiex.npu.dma_wait { symbol = @out } } aie.shim_dma_allocation @out(S2MM, 0, 0) diff --git a/test/npu-xrt/vector_scalar_using_dma/test.cpp b/test/npu-xrt/vector_scalar_using_dma/test.cpp index 3b055931fa..598d87ca69 100644 --- a/test/npu-xrt/vector_scalar_using_dma/test.cpp +++ b/test/npu-xrt/vector_scalar_using_dma/test.cpp @@ -161,7 +161,11 @@ int main(int argc, const char *argv[]) { std::cout << "Running Kernel.\n"; unsigned int opcode = 3; auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); 
- run.wait(); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); diff --git a/test/python/aievec.py b/test/python/aievec.py index da7b30bbaf..dcc0d73d83 100644 --- a/test/python/aievec.py +++ b/test/python/aievec.py @@ -46,39 +46,23 @@ def demo_fun1(): @construct_and_print_module def test_aievec(): @func - def mul_mul( - A: T.memref(2048, T.f32()), - B: T.memref(2048, T.f32()), - C: T.memref(2048, T.f32()), - d: T.f32(), + def mul_elem( + A: T.memref(2048, T.i16()), + B: T.memref(2048, T.i16()), + C: T.memref(2048, T.i16()), ): - v0 = vector.broadcast(T.vector(8, T.f32()), d) - v1 = aievec.concat([v0, v0]) - for i in scf.for_(0, 2048, 8): - v2 = aievec.upd(T.vector(8, T.f32()), A, [i]) - v3 = aievec.upd(T.vector(8, T.f32()), B, [i]) - v4 = aievec.mul( - T.vector(8, T.f32()), + for i in scf.for_(0, 2048, 32): + v0 = aievec.upd(T.vector(32, T.i16()), A, [i]) + v1 = aievec.upd(T.vector(32, T.i16()), B, [i]) + v2 = aievec.mul_elem( + T.vector(32, T.i32()), + v0, v1, - v2, - xoffsets="0x76543210", - xstart="0", - zoffsets="0x76543210", - zstart="0", - ) - v5 = aievec.concat([v4, v4]) - v6 = aievec.mul( - T.vector(8, T.f32()), - v5, - v3, - xoffsets="0x76543210", - xstart="0", - zoffsets="0x76543210", - zstart="0", ) + v3 = aievec.srs(T.vector(32, T.i16()), v2, arith.constant(0)) vector.transfer_write( None, - v6, + v3, C, [i], AffineMap.get_identity(1), @@ -87,24 +71,22 @@ def mul_mul( scf.yield_([]) - # CHECK-LABEL: func.func @mul_mul( - # CHECK-SAME: %[[VAL_0:.*]]: memref<2048xf32>, %[[VAL_1:.*]]: memref<2048xf32>, %[[VAL_2:.*]]: memref<2048xf32>, %[[VAL_3:.*]]: f32) { - # CHECK: %[[VAL_4:.*]] = vector.broadcast %[[VAL_3]] : f32 to vector<8xf32> - # CHECK: %[[VAL_5:.*]] = aievec.concat %[[VAL_4]], %[[VAL_4]] : vector<8xf32>, vector<16xf32> - # CHECK: %[[VAL_6:.*]] = arith.constant 0 : index - # CHECK: %[[VAL_7:.*]] = 
arith.constant 2048 : index - # CHECK: %[[VAL_8:.*]] = arith.constant 8 : index - # CHECK: scf.for %[[VAL_9:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] { - # CHECK: %[[VAL_10:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_9]]] {index = 0 : i8, offset = 0 : i32} : memref<2048xf32>, vector<8xf32> - # CHECK: %[[VAL_11:.*]] = aievec.upd %[[VAL_1]]{{\[}}%[[VAL_9]]] {index = 0 : i8, offset = 0 : i32} : memref<2048xf32>, vector<8xf32> - # CHECK: %[[VAL_12:.*]] = aievec.mul %[[VAL_5]], %[[VAL_10]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - # CHECK: %[[VAL_13:.*]] = aievec.concat %[[VAL_12]], %[[VAL_12]] : vector<8xf32>, vector<16xf32> - # CHECK: %[[VAL_14:.*]] = aievec.mul %[[VAL_13]], %[[VAL_11]] {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, vector<8xf32> - # CHECK: vector.transfer_write %[[VAL_14]], %[[VAL_2]]{{\[}}%[[VAL_9]]] {in_bounds = [true]} : vector<8xf32>, memref<2048xf32> + # CHECK-LABEL: func.func @mul_elem( + # CHECK-SAME: %[[VAL_0:.*]]: memref<2048xi16>, %[[VAL_1:.*]]: memref<2048xi16>, %[[VAL_2:.*]]: memref<2048xi16>) { + # CHECK: %[[VAL_3:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_4:.*]] = arith.constant 2048 : index + # CHECK: %[[VAL_5:.*]] = arith.constant 32 : index + # CHECK: scf.for %[[VAL_6:.*]] = %[[VAL_3]] to %[[VAL_4]] step %[[VAL_5]] { + # CHECK: %[[VAL_7:.*]] = aievec.upd %[[VAL_0]]{{\[}}%[[VAL_6]]] {index = 0 : i8, offset = 0 : i32} : memref<2048xi16>, vector<32xi16> + # CHECK: %[[VAL_8:.*]] = aievec.upd %[[VAL_1]]{{\[}}%[[VAL_6]]] {index = 0 : i8, offset = 0 : i32} : memref<2048xi16>, vector<32xi16> + # CHECK: %[[VAL_9:.*]] = aievec.mul_elem %[[VAL_7]], %[[VAL_8]] : vector<32xi16>, vector<32xi16>, vector<32xi32> + # CHECK: %[[VAL_10:.*]] = arith.constant 0 : i32 + # CHECK: %[[VAL_11:.*]] = aievec.srs %[[VAL_9]], %[[VAL_10]] : vector<32xi32>, i32, vector<32xi16> + # CHECK: 
vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]]] {in_bounds = [true]} : vector<32xi16>, memref<2048xi16> # CHECK: } # CHECK: return # CHECK: } - mul_mul.emit() + mul_elem.emit() @construct_and_print_module @@ -309,7 +291,7 @@ def dut(A: T.tensor(1024, T.i8()), B: T.tensor(1024, T.i8())): # CHECK: %0 = aievec.upd %arg0[%arg3] {index = 0 : i8, offset = 0 : i32} : memref<1024xi8>, vector<64xi8> # CHECK: %1 = aievec.upd %arg1[%arg3] {index = 0 : i8, offset = 0 : i32} : memref<1024xi8>, vector<64xi8> # CHECK: %2 = aievec.add_elem %0, %1 : vector<64xi8> - # CHECK: vector.transfer_write %2, %arg2[%arg3] {in_bounds = [true]} : vector<64xi8>, memref<1024xi8> + # CHECK: vector.transfer_write %2, %arg2[%arg3] : vector<64xi8>, memref<1024xi8> # CHECK: } # CHECK: return # CHECK: } diff --git a/test/python/npu.py b/test/python/npu.py index 228fcd10f3..7ea96af716 100644 --- a/test/python/npu.py +++ b/test/python/npu.py @@ -23,7 +23,7 @@ object_fifo_link, tile, ) -from aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd +from aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd, runtime_sequence from aie.dialects.func import FuncOp from aie.dialects.scf import for_ from aie.dialects.scf import yield_ @@ -75,7 +75,7 @@ def core_body(): yield_([]) yield_([]) - @FuncOp.from_py_func( + @runtime_sequence( T.memref(N, T.i32()), T.memref(N, T.i32()), T.memref(N, T.i32()) ) def sequence(A, B, C): @@ -177,7 +177,7 @@ def core_body(): yield_([]) yield_([]) - @FuncOp.from_py_func( + @runtime_sequence( T.memref(A_sz_in_i32s, T.i32()), T.memref(B_sz_in_i32s, T.i32()), T.memref(C_sz_in_i32s, T.i32()), @@ -437,7 +437,7 @@ def core_body(): outOF_L1L2.release(ObjectFifoPort.Produce, 1) yield_([]) - @FuncOp.from_py_func( + @runtime_sequence( T.memref(2304, T.i32()), T.memref(2304, T.i32()), T.memref(2304, T.i32()) ) def sequence(I, B, O): @@ -492,7 +492,7 @@ def core_body(): of_out1.release(ObjectFifoPort.Produce, 1) yield_([]) - @FuncOp.from_py_func( + @runtime_sequence( 
T.memref(64, T.i32()), T.memref(32, T.i32()), T.memref(64, T.i32()) ) def sequence(inTensor, notUsed, outTensor): diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 68814971f1..c9432cc1dd 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -81,7 +81,7 @@ def core_body(): tensorSizeInInt32s = tensorSize // 4 tensor_ty = T.memref(lineWidthInInt32s, T.i32()) - @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + @runtime_sequence(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, outTensor, notUsed): if enableTrace: configure_simple_tracing_aie2( diff --git a/test/unit_tests/aievec_tests/bf16_max_reduce/bf16_max_reduce-llvm.mlir b/test/unit_tests/aievec_tests/bf16_max_reduce/bf16_max_reduce-llvm.mlir index 1cdef347c0..3acfa8fa98 100644 --- a/test/unit_tests/aievec_tests/bf16_max_reduce/bf16_max_reduce-llvm.mlir +++ b/test/unit_tests/aievec_tests/bf16_max_reduce/bf16_max_reduce-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2024, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm-scalar.mlir index 359932941f..823d263843 100644 --- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm.mlir index ea0ddfe410..1e0e3ba99d 100644 --- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem/bf16xbf16_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/bf16xbf16_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/bf16xbf16_mul_elem-llvm.mlir index 1945da8978..41b73ac156 100644 --- a/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/bf16xbf16_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/bf16xbf16_mul_elem_2/bf16xbf16_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm-scalar.mlir index b48686883e..17e8be81e5 100644 --- a/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2024, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm.mlir index 94a57e9261..b15fc9adf7 100644 --- a/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/floatxfloat_mul_elem/floatxfloat_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2024, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm-scalar.mlir index 39f161867f..f2af352513 100644 --- a/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm.mlir index dbcf445f3c..821e6318df 100644 --- a/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_mul_elem/i16xi16_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm-scalar.mlir index c41e069510..b96688c20e 100644 --- a/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm.mlir index 49ba35e81b..045020aee4 100644 --- a/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/i16xi16_mul_elem_2/i16xi16_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm-scalar.mlir index e35536af7b..8b1ebe0523 100644 --- a/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm.mlir index 8a32609a10..71695ab586 100644 --- a/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/i32xi32_mul_elem/i32xi32_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm-scalar.mlir index 9472f79b22..2eaaac179d 100644 --- a/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm.mlir index a48d0ca243..72a14d2e74 100644 --- a/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/i8xi8_mul_elem/i8xi8_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm-scalar.mlir b/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm-scalar.mlir index 8179f35958..598e7d2292 100644 --- a/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm-scalar.mlir +++ b/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm-scalar.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. // REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s %vector-to-generic-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm.mlir b/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm.mlir index 599937d2a6..4d8b066559 100644 --- a/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm.mlir +++ b/test/unit_tests/aievec_tests/i8xi8_mul_elem_2/i8xi8_mul_elem-llvm.mlir @@ -2,7 +2,7 @@ // Copyright (C) 2023, Advanced Micro Devices, Inc. 
// REQUIRES: valid_xchess_license -// REQUIRES: peano +// REQUIRES: peano, peano_and_chess // RUN: mkdir -p %t/data; cd %t // RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=32" %vector-to-llvmir% -o llvmir.mlir // RUN: aie-translate llvmir.mlir %llvmir-to-ll% -o dut.ll diff --git a/tools/aie-opt/CMakeLists.txt b/tools/aie-opt/CMakeLists.txt index 115ec9a51c..53b08c2eea 100644 --- a/tools/aie-opt/CMakeLists.txt +++ b/tools/aie-opt/CMakeLists.txt @@ -31,6 +31,7 @@ set(LIBS AIEXTransforms AIEXUtils MLIRAIEVecDialect + MLIRAIEVecAIE1Dialect MLIRAIEVecTransformOps MLIRAIEVecTransforms MLIRAIEVecToLLVM diff --git a/tools/aie-translate/CMakeLists.txt b/tools/aie-translate/CMakeLists.txt index 61ed9cba43..84d2df142e 100644 --- a/tools/aie-translate/CMakeLists.txt +++ b/tools/aie-translate/CMakeLists.txt @@ -30,6 +30,8 @@ target_link_libraries(aie-translate AIEXTransforms AIEXUtils AIETargets + MLIRAIEVecDialect + MLIRAIEVecAIE1Dialect MLIRXLLVMToLLVMIRTranslation MLIRIR MLIRParser diff --git a/tools/aie-visualize/CMakeLists.txt b/tools/aie-visualize/CMakeLists.txt index 6c7ac17cd4..249508decb 100644 --- a/tools/aie-visualize/CMakeLists.txt +++ b/tools/aie-visualize/CMakeLists.txt @@ -37,6 +37,7 @@ target_link_libraries(aie-visualize AIEX AIEXTransforms MLIRAIEVecDialect + MLIRAIEVecAIE1Dialect MLIRXLLVMDialect) install(TARGETS aie-visualize diff --git a/utils/clone-llvm.sh b/utils/clone-llvm.sh index 26fcbc5f2b..74bd9771f9 100755 --- a/utils/clone-llvm.sh +++ b/utils/clone-llvm.sh @@ -13,8 +13,8 @@ ##===----------------------------------------------------------------------===## # The LLVM commit to use. 
-LLVM_PROJECT_COMMIT=ae570d82e8c021f45209830db8c9c7bb79bed394 -DATETIME=2024070115 +LLVM_PROJECT_COMMIT=db1d88137212fec6c884dcb0f76a8dfab4fcab98 +DATETIME=2024072223 WHEEL_VERSION=19.0.0.$DATETIME+${LLVM_PROJECT_COMMIT:0:8} ############################################################################################ diff --git a/utils/quick_setup.sh b/utils/quick_setup.sh index 445aed4f26..97a5ca6a78 100755 --- a/utils/quick_setup.sh +++ b/utils/quick_setup.sh @@ -65,11 +65,15 @@ if test -f "$VPP"; then unzip -q mlir_aie-*_x86_64.whl pip download mlir -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/mlir-distro/ unzip -q mlir-*_x86_64.whl - pip install https://github.com/makslevental/mlir-python-extras/archive/d84f05582adb2eed07145dabce1e03e13d0e29a6.zip + pip -q download llvm-aie -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly + unzip -q llvm_aie*.whl rm -rf mlir*.whl - export PATH=`realpath mlir_aie/bin`:`realpath mlir/bin`:$PATH - export LD_LIBRARY_PATH=`realpath mlir_aie/lib`:`realpath mlir/lib`:$LD_LIBRARY_PATH + rm -rf llvm_aie*.whl + pip install https://github.com/makslevental/mlir-python-extras/archive/d84f05582adb2eed07145dabce1e03e13d0e29a6.zip + export PATH=`realpath llvm-aie/bin`:`realpath mlir_aie/bin`:`realpath mlir/bin`:$PATH + export LD_LIBRARY_PATH=`realpath llvm-aie/lib`:`realpath mlir_aie/lib`:`realpath mlir/lib`:$LD_LIBRARY_PATH export PYTHONPATH=`realpath mlir_aie/python`:$PYTHONPATH + export PEANO_DIR=`realpath llvm-aie` popd python3 -m pip install --upgrade --force-reinstall --no-cache-dir -r python/requirements.txt python3 -m pip install --upgrade --force-reinstall --no-cache-dir -r python/requirements_ml.txt