@@ -455,10 +455,10 @@ module {
455
455
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 64, 0][4, 8, 64, 256][0, 256, 2048]) {id = 1 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
456
456
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 128, 0][4, 8, 64, 256][0, 256, 2048]) {id = 2 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
457
457
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 192, 0][4, 8, 64, 256][0, 256, 2048]) {id = 3 : i64, metadata = @airMemcpyId20} : memref<2048x2048xi32>
458
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 512, 4 , 64][64, 8192 , 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
459
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 512, 4 , 64][64, 8192 , 2048]) {id = 5 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
460
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 512, 4 , 64][64, 8192 , 2048]) {id = 6 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
461
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 512, 4 , 64][64, 8192 , 2048]) {id = 7 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
458
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512 , 64][64, 1048576 , 2048]) {id = 4 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
459
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512 , 64][64, 1048576 , 2048]) {id = 5 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
460
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512 , 64][64, 1048576 , 2048]) {id = 6 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
461
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][4, 4, 512 , 64][64, 1048576 , 2048]) {id = 7 : i64, metadata = @airMemcpyId21} : memref<2048x2048xi32>
462
462
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG2]][0, 0, 0, 0][4, 4, 64, 64][131072, 64, 2048]) {id = 8 : i64, metadata = @airMemcpyId26} : memref<2048x2048xi32>
463
463
464
464
#map = affine_map <()[s0 ] -> (s0 * 64 )>
@@ -701,8 +701,8 @@ module {
701
701
// CHECK-SAME: %[[VAL_0:.*]]: memref<262144xi32>, %[[VAL_1:.*]]: memref<262144xi32>, %[[VAL_2:.*]]: memref<131072xi32>) {
702
702
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_0]][0, 0, 0, 0][2, 4, 256, 128][0, 128, 512]) {id = 0 : i64, metadata = @airMemcpyId7} : memref<262144xi32>
703
703
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_0]][0, 0, 0, 131072][2, 4, 256, 128][0, 128, 512]) {id = 1 : i64, metadata = @airMemcpyId7} : memref<262144xi32>
704
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_1]][0, 0, 0, 0][2, 512, 2 , 128][128, 512 , 256]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<262144xi32>
705
- // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_1]][0, 0, 0, 0][2, 512, 2 , 128][128, 512 , 256]) {id = 3 : i64, metadata = @airMemcpyId12} : memref<262144xi32>
704
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_1]][0, 0, 0, 0][2, 2, 512 , 128][128, 131072 , 256]) {id = 2 : i64, metadata = @airMemcpyId12} : memref<262144xi32>
705
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_1]][0, 0, 0, 0][2, 2, 512 , 128][128, 131072 , 256]) {id = 3 : i64, metadata = @airMemcpyId12} : memref<262144xi32>
706
706
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_2]][0, 0, 0, 0][2, 2, 64, 128][65536, 128, 256]) {id = 4 : i64, metadata = @airMemcpyId45} : memref<131072xi32>
707
707
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_2]][0, 0, 0, 16384][2, 2, 64, 128][65536, 128, 256]) {id = 5 : i64, metadata = @airMemcpyId46} : memref<131072xi32>
708
708
// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[VAL_2]][0, 0, 0, 32768][2, 2, 64, 128][65536, 128, 256]) {id = 0 : i64, metadata = @airMemcpyId47} : memref<131072xi32>
@@ -930,3 +930,43 @@ module {
930
930
return
931
931
}
932
932
}
933
+
934
+ // -----
935
+
936
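+ // The outermost wrap must be in range [1:64] for AIE2, so the 152-wide outermost dimension of the input memcpy is expected to be split into four transfers with an outermost wrap of 38 each.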
937
+
938
+ // CHECK-LABEL: func21
939
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][38, 2, 64, 32][77824, 32, 1216]) {id = 0 : i64, metadata = @airMemcpyId10} : memref<11829248xi32>
940
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 2957312][38, 2, 64, 32][77824, 32, 1216]) {id = 1 : i64, metadata = @airMemcpyId10} : memref<11829248xi32>
941
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 5914624][38, 2, 64, 32][77824, 32, 1216]) {id = 2 : i64, metadata = @airMemcpyId10} : memref<11829248xi32>
942
+ // CHECK: aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 8871936][38, 2, 64, 32][77824, 32, 1216]) {id = 3 : i64, metadata = @airMemcpyId10} : memref<11829248xi32>
943
+ // CHECK: return
944
+
945
+ #map = affine_map <()[s0 ] -> (s0 * 128 )>
946
+ module { // Input for the func21 expectations above: a single airrt.dma_memcpy_nd whose outermost size (152) is larger than 64.
947
+ aie.device (npu1_4col ) {
948
+ aie.shim_dma_allocation @airMemcpyId10 (MM2S , 1 , 0 ) // Declares the symbol that the memcpy's metadata attribute refers to.
949
+ memref.global " public" @airMemcpyId10 : memref <1 x2 x64 x64 xbf16 , 1 : i32 >
950
+ } {sym_name = " matmul_bf16_large_dispatch_0_matmul_308x2432x9728_bf16_0" }
951
+ airrt.module_metadata {
952
+ }
953
+ func.func @func21 (%arg0: memref <9728 x2432 xbf16 >) {
954
+ %c2_i64 = arith.constant 2 : i64
955
+ %c2432_i64 = arith.constant 2432 : i64
956
+ %c155648_i64 = arith.constant 155648 : i64
957
+ %c152_i64 = arith.constant 152 : i64 // Outermost size of the memcpy; 152 = 4 * 38, matching the four expected transfers.
958
+ %c64_i64 = arith.constant 64 : i64
959
+ %c10_i32 = arith.constant 10 : i32
960
+ %c0_i64 = arith.constant 0 : i64
961
+ affine.for %arg3 = 0 to 1 { // Single-iteration loop (bounds 0 to 1).
962
+ affine.for %arg4 = 0 to 1 { // Single-iteration loop (bounds 0 to 1).
963
+ %0 = affine.apply #map ()[%arg4 ] // #map multiplies its symbol by 128; the result becomes the innermost offset below.
964
+ %1 = arith.index_cast %arg3 : index to i64
965
+ %2 = arith.index_cast %arg4 : index to i64
966
+ %3 = arith.index_cast %0 : index to i64
967
+ %4 = airrt.dma_memcpy_nd (%c10_i32 , %1 , %2 , %arg0 [%c0_i64 , %c0_i64 , %c0_i64 , %3 ], [%c152_i64 , %c2_i64 , %c64_i64 , %c64_i64 ], [%c155648_i64 , %c64_i64 , %c2432_i64 ]) {metadata = @airMemcpyId10 } : (i32 , i64 , i64 , memref <9728 x2432 xbf16 >, [i64 , i64 , i64 , i64 ], [i64 , i64 , i64 , i64 ], [i64 , i64 , i64 ]) : !airrt.event // Offsets [0, 0, 0, %3], sizes [152, 2, 64, 64], strides [155648, 64, 2432]. NOTE(review): the expected output halves the innermost size and every stride (64->32, 155648->77824, 2432->1216), presumably because two bf16 values are packed per i32 -- confirm.
968
+ }
969
+ }
970
+ return
971
+ }
972
+ }
0 commit comments