Xilinx · AndraBisca · Mar 7, 2024 · Mar 8, 2024 · Mar 8, 2024 · Mar 8, 2024
@@ -419,6 +419,46 @@ struct DmaWaitToNpuPattern : OpConversionPattern<NpuDmaWaitOp> {
   }
 };
 
+/// Find the shim DMA allocation op whose symbol name equals `sym_name`.
+///
+/// \param shimDmaAllocOps candidate allocation ops to search, in order.
+/// \param sym_name symbol name to match against each op's `getSymName()`.
+/// \returns the first op in `shimDmaAllocOps` with a matching symbol name, or
+///          std::nullopt when none matches.
+std::optional<AIE::ShimDMAAllocationOp> getAllocOpForSymbol(
+    const SmallVector<AIE::ShimDMAAllocationOp> &shimDmaAllocOps,
+    StringRef sym_name) {
+  // The list is taken by const reference so it is not copied on every lookup.
+  // The loop variable is deliberately a by-value op handle so that accessors
+  // are invoked on a non-const object.
+  for (AIE::ShimDMAAllocationOp shimDmaAllocOp : shimDmaAllocOps)
+    if (shimDmaAllocOp.getSymName() == sym_name)
+      return shimDmaAllocOp;
+  return std::nullopt;
+}
+
+/// Insert an NpuSyncOp for every DMA memcpy that copies results to the host.
+///
+/// Collects all ShimDMAAllocationOps in `device`. Then, for each func.func in
+/// `device`, every NpuDmaMemcpyNdOp whose metadata symbol resolves to an
+/// allocation with an S2MM channel direction gets one NpuSyncOp created
+/// immediately before the function's return op. Functions in which no
+/// func.return is found are left untouched.
+void insertNpuSyncOpForResults(AIE::DeviceOp device) {
+  SmallVector<AIE::ShimDMAAllocationOp> shimDmaAllocOps;
+  device.walk([&](AIE::ShimDMAAllocationOp shimDmaAllocOp) {
+    shimDmaAllocOps.push_back(shimDmaAllocOp);
+  });
+  device.walk([&](mlir::func::FuncOp f) {
+    // If the function holds several return ops, the last one visited wins.
+    Operation *returnOp = nullptr;
+    f.walk([&](mlir::func::ReturnOp op) { returnOp = op.getOperation(); });
+    // Without a return op (e.g. a function declaration with no body) there is
+    // no insertion point; bail out instead of dereferencing a null pointer.
+    if (!returnOp)
+      return;
+
+    SmallVector<AIEX::NpuDmaMemcpyNdOp> dmas;
+    f.walk([&](AIEX::NpuDmaMemcpyNdOp dma) { dmas.push_back(dma); });
+
+    // All sync ops are emitted right before the return op, in DMA walk order.
+    OpBuilder builder(returnOp);
+    for (auto dma : dmas) {
+      auto infoOp = getAllocOpForSymbol(shimDmaAllocOps, dma.getMetadata());
+      if (!infoOp || infoOp->getChannelDir() != AIE::DMAChannelDir::S2MM)
+        continue;
+
+      // Found dma op copying results to host
+      auto col = builder.getI32IntegerAttr(infoOp->getCol());
+      auto row = builder.getI32IntegerAttr(0);
+      // NOTE(review): direction 0 presumably encodes S2MM for the sync op;
+      // confirm against the NpuSyncOp definition.
+      auto dir = builder.getI32IntegerAttr(0);
+      auto chan = builder.getI32IntegerAttr(infoOp->getChannelIndex());
+      auto col_num = builder.getI32IntegerAttr(1);
+      auto row_num = builder.getI32IntegerAttr(1);
+      builder.create<AIEX::NpuSyncOp>(dma->getLoc(), col, row, dir, chan,
+                                      col_num, row_num);
+    }
+  });
+}
+
 struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
   void runOnOperation() override {
 
@@ -441,6 +481,9 @@ struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
     patterns.insert<PushToNpuPattern>(&getContext(), cachingGetter);
     patterns.insert<RtpToNpuPattern>(&getContext());
 
+    // Insert sync op after copying data out to host
+    insertNpuSyncOpForResults(device);
+
     if (failed(applyPartialConversion(device, target, std::move(patterns))))
       signalPassFailure();
   }

@@ -59,7 +59,6 @@ def sequence(A, B, C):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 

@@ -204,9 +204,6 @@ def sequence(A, B, C):
                         strides=[0, 0, 0],
                     )
 
-                for i in range(n_cores):
-                    npu_sync(column=i, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 

@@ -227,8 +227,6 @@ def sequence(A, B, C):
                             strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
                         )
 
-                    npu_sync(column=0, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 

@@ -366,8 +366,6 @@ def sequence(A, B, C):
                                 sizes=[N_div_n_div_n_cols, K_div_k, k, n_in_i32s],
                                 strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s],
                             )
-                    for i in range(n_cols):
-                        npu_sync(column=i, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)

@@ -94,7 +94,6 @@ def sequence(inTensor, notUsed, outTensor):
                 sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                 strides=[1, 1, IMAGE_WIDTH],
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -64,7 +64,6 @@ def core_body():
             def sequence(A, B, C):
                 npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
                 npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 

@@ -85,7 +85,6 @@ def sequence(inTensor, outTensor, notUsed):
                 mem=outTensor,
                 sizes=[1, 1, 1, tensorSizeInInt32s],
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -113,7 +113,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -70,7 +70,6 @@ def core_body():
         def sequence(A, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -71,7 +71,6 @@ def sequence(inTensor, outTensor):
             npu_dma_memcpy_nd(
                 metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, PROBLEM_SIZE]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 # Declares that subsequent code is in mlir-aie context

@@ -101,7 +101,6 @@ def sequence(A, F, C):
             )
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
             npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -81,7 +81,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
             npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -631,8 +631,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b],
                 )
 
-                npu_sync(column=0, row=0, direction=0, channel=0)
-
     print(ctx.module)
 
 

@@ -168,7 +168,6 @@ def sequence(I, W, O):
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)

@@ -254,7 +254,6 @@ def sequence(I, W, O):
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)

@@ -152,7 +152,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -153,7 +153,6 @@ def sequence(A, B, C):
             npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -124,7 +124,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -986,8 +986,6 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b_rest],
                 )
 
-                npu_sync(column=1, row=0, direction=0, channel=0)
-
     res = ctx.module.operation.verify()
     if res == True:
         print(ctx.module)

@@ -128,7 +128,6 @@ def sequence(A, C):
             npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:

@@ -266,7 +266,6 @@ def sequence(I, B, O):
                     mem=O,
                     sizes=[1, 1, 1, height * lineWidthInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 

@@ -284,7 +284,6 @@ def sequence(inTensor, notUsed, outTensor):
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)

@@ -312,7 +312,6 @@ def sequence(I, B, O):
                     mem=I,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)

@@ -165,7 +165,6 @@ def sequence(inTensor, notUsed, outTensor):
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 

@@ -55,7 +55,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }

@@ -56,7 +56,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }

@@ -55,7 +55,6 @@ module @passThroughLine_aie2 {
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
             aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32>
             aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }

diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
@@ -60,7 +60,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     res = ctx.module.operation.verify()
     if res == True:

diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
@@ -64,7 +64,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     res = ctx.module.operation.verify()
     if res == True:

diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
@@ -101,7 +101,6 @@ def sequence(inTensor, notUsed, outTensor):
                 npu_dma_memcpy_nd(
                     metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
                 )
-                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 

diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py
@@ -82,7 +82,6 @@ def sequence(A, F, C):
             npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
             npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
             npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
-            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:

@@ -105,7 +105,6 @@ module {
     func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   }

@@ -70,7 +70,6 @@ module {
       %c64_i64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 

@@ -49,7 +49,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }

@@ -46,7 +46,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }

@@ -81,7 +81,6 @@ module {
       %c64_i64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 

@@ -65,7 +65,6 @@ module {
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
       aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }

@@ -641,7 +641,6 @@ module {
       aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 0 : i64, metadata = @airMemcpyId4} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 1 : i64, metadata = @airMemcpyId5} : memref<16x16xi32>
       aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 16, 16][0, 0, 16]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<16x16xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   } {sym_name = "segment_0"}
-Original file line number
+Diff line change
@@ Expand Up / @@ -227,8 +227,6 @@ def sequence(A, B, C): @@
                                 strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
                             )
-                        npu_sync(column=0, row=0, direction=0, channel=0)
         print(ctx.module)
@@ Expand Down @@