Skip to content

Commit

Permalink
Merge branch 'main' into peano
Browse files Browse the repository at this point in the history
  • Loading branch information
jgmelber authored Jul 30, 2024
2 parents cb827bb + b9cdb91 commit 0b59052
Show file tree
Hide file tree
Showing 265 changed files with 5,339 additions and 3,465 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
.vscode
__pycache__
.DS_Store
**/CMakeFiles
**.cmake
include/**/Makefile
lib/**/Makefile


/platforms/vck190_bare/petalinux/build
/platforms/vck190_bare/petalinux/components
Expand Down
18 changes: 18 additions & 0 deletions aie_kernels/aie2/mm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,23 @@ void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA,
pC);
}

// Matrix multiplication wrapper for int16 inputs and int32 output that
// forwards to matmul_vectorized with a 4x4x4 (r x s x t) tile shape.
//
// Template parameters m, k and n are divided by the tile shape before being
// passed on; pA and pB are the int16 inputs and pC is the int32 output.
template <unsigned m, unsigned k, unsigned n>
void matmul_vectorized_4x4x4_i16_i32(const int16 *__restrict pA,
                                     const int16 *__restrict pB,
                                     int32 *__restrict pC) {
  // Tile shape forwarded to matmul_vectorized as its r, s and t arguments.
  constexpr int tile_r = 4;
  constexpr int tile_s = 4;
  constexpr int tile_t = 4;

  // matmul_vectorized works on two 4x4 blocks of A and two 4x4 blocks of B
  // per iteration, so every dimension has to be a non-zero multiple of two
  // tiles.
  static_assert(m % (2 * tile_r) == 0 && m / (2 * tile_r) > 0);
  static_assert(k % (2 * tile_s) == 0 && k / (2 * tile_s) > 0);
  static_assert(n % (2 * tile_t) == 0 && n / (2 * tile_t) > 0);

  return matmul_vectorized<int16, int32, m / tile_r, k / tile_s, n / tile_t,
                           tile_r, tile_s, tile_t>(pA, pB, pC);
}

template <unsigned m, unsigned k, unsigned n>
void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
const bfloat16 *__restrict pB,
Expand Down Expand Up @@ -416,6 +433,7 @@ extern "C" {

// X-macro table of matmul kernel variants: the macro argument X is applied
// once to each row below.
// NOTE(review): the columns appear to be (input type, input name suffix,
// output type, output name suffix, r, s, t), matching wrapper names such as
// matmul_vectorized_4x4x4_i16_i32 -- confirm against the definitions of X.
#define combos(X) \
X(int16, i16, int16, i16, 4, 4, 4) \
X(int16, i16, int32, i32, 4, 4, 4) \
X(bfloat16, bf16, bfloat16, bf16, 4, 8, 4) \
X(bfloat16, bf16, float, f32, 4, 8, 4)

Expand Down
22 changes: 11 additions & 11 deletions docs/AIEVectorization.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ aie-opt -affine-super-vectorize="virtual-vector-size=8 vectorize-reductions" --a
%0 = aievec.upd %arg0[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xf32>, vector<8xf32>
%1 = aievec.upd %arg1[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xf32>, vector<8xf32>
%2 = aievec.concat %0, %0 : vector<8xf32>, vector<16xf32>
%3 = aievec.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32>
%3 = aievec_aie1.mul %2, %1 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x76543210", zstart = "0"} : vector<16xf32>, vector<8xf32>, !aievec.acc<8xf32>
%4 = aievec.srs %3 {shift = 0 : i8} : !aievec.acc<8xf32>, vector<8xf32>
vector.transfer_write %4, %arg2[%arg3] {in_bounds = [true]} : vector<8xf32>, memref<2048xf32>
}
Expand Down Expand Up @@ -114,7 +114,7 @@ Results in:
scf.for %arg3 = %c0 to %c2048 step %c16 {
%0 = aievec.upd %arg0[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xi16>, vector<16xi16>
%1 = aievec.upd %arg1[%arg3] {index = 0 : i8, offset = 0 : si32} : memref<2048xi16>, vector<16xi16>
%2 = aievec.mul %0, %1 : vector<16xi16>, vector<16xi16>, !aievec.acc<16xi48>
%2 = aievec_aie1.mul %0, %1 : vector<16xi16>, vector<16xi16>, !aievec.acc<16xi48>
%3 = aievec.srs %2 {shift = 0 : i8} : !aievec.acc<16xi48>, vector<16xi16>
vector.transfer_write %3, %arg2[%arg3] {in_bounds = [true]} : vector<16xi16>, memref<2048xi16>
}
Expand Down Expand Up @@ -150,22 +150,22 @@ aie-opt --affine-loop-unroll="unroll-full unroll-full-threshold=3" --canonicaliz
%4 = aievec.upd %arg2[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2046x2046xi32>, vector<8xi32>
%5 = aievec.upd %arg0[%arg3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2048x2048xi32>, vector<16xi32>
%6 = aievec.ups %4 {shift = 0 : i8} : vector<8xi32>, !aievec.acc<8xi80>
%7 = aievec.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%7 = aievec_aie1.mac %5, %0, %6 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%c1_5 = arith.constant 1 : index
%8 = arith.addi %arg4, %c1_5 : index
%9 = aievec.upd %arg0[%arg3, %8], %5 {index = 1 : i8, offset = 224 : si32} : memref<2048x2048xi32>, vector<16xi32>
%10 = aievec.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%11 = aievec.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%10 = aievec_aie1.mac %9, %0, %7 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "1"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%11 = aievec_aie1.mac %9, %0, %10 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "2"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%12 = aievec.upd %arg0[%2, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2048x2048xi32>, vector<16xi32>
%13 = aievec.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%13 = aievec_aie1.mac %12, %0, %11 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "3"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%14 = aievec.upd %arg0[%2, %8], %12 {index = 1 : i8, offset = 224 : si32} : memref<2048x2048xi32>, vector<16xi32>
%15 = aievec.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%16 = aievec.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%15 = aievec_aie1.mac %14, %0, %13 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "4"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%16 = aievec_aie1.mac %14, %0, %15 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "5"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%17 = aievec.upd %arg0[%3, %arg4] {index = 0 : i8, offset = 0 : si32} : memref<2048x2048xi32>, vector<16xi32>
%18 = aievec.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%18 = aievec_aie1.mac %17, %0, %16 {xoffsets = "0x76543210", xstart = "0", zoffsets = "0x00000000", zstart = "6"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%19 = aievec.upd %arg0[%3, %8], %17 {index = 1 : i8, offset = 224 : si32} : memref<2048x2048xi32>, vector<16xi32>
%20 = aievec.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%21 = aievec.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%20 = aievec_aie1.mac %19, %0, %18 {xoffsets = "0x76543210", xstart = "1", zoffsets = "0x00000000", zstart = "7"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%21 = aievec_aie1.mac %19, %1, %20 {xoffsets = "0x76543210", xstart = "2", zoffsets = "0x00000000", zstart = "0"} : vector<16xi32>, vector<8xi32>, !aievec.acc<8xi80>
%22 = aievec.srs %21 {shift = 0 : i8} : !aievec.acc<8xi80>, vector<8xi32>
vector.transfer_write %22, %arg2[%arg3, %arg4] {in_bounds = [true]} : vector<8xi32>, memref<2046x2046xi32>
}
Expand Down
50 changes: 50 additions & 0 deletions docs/conferenceDescriptions/micro24TutorialDescription.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# MICRO 2024 Tutorial: Leveraging the IRON AI Engine API to program the Ryzen™ AI NPU

## Introduction

The NPU of AMD Ryzen™ AI devices includes an AI Engine array comprised of a set of VLIW vector processors, data movement accelerators (DMAs) and adaptable interconnect. This tutorial is targeted at performance engineers who are looking to develop designs targeting the NPU with open-source design tools. We provide a close-to-metal Python API: Interface Representation for hands-ON (IRON) AIE-array programming. IRON is an open-access toolkit enabling performance engineers to build fast and efficient, often specialized, designs through a set of Python language bindings around the mlir-aie dialect. Participants will first get insight into the AI Engine compute and data movement capabilities. Through small design examples expressed in the IRON API and executed on a Ryzen™ AI device, participants will leverage AI Engine features for optimizing performance of increasingly complex designs. The labs will be done on Ryzen™ AI-enabled mini-PCs, giving participants the ability to execute their own designs on real hardware.

This tutorial will cover the following key topics:
1. AI Engine architecture introduction
1. AIE core, array configuration, and host application code compilation
1. Data movement and communication abstraction layers
1. Tracing for performance monitoring
1. Putting it all together on larger examples: matrix multiplication, convolutions as building blocks for ML and computer vision examples

## Agenda

Date: Sunday, November 3rd, 2024 (morning)
Location: Austin, Texas, USA (with MICRO-57)
Prerequisite: Please bring your laptop so that you can SSH into our Ryzen™ AI-enabled miniPCs for the hands-on exercises.

### Contents and Timeline (tentative)

| Time | Topic | Presenter | Slides or Code |
|------|-------|-----------|----------------|
| 08:30am | Intro to spatial compute and explicit data movement | Kristof | [Programming Guide](../../programming_guide/) |
| 08:45am | "Hello World" from Ryzen™ AI | Joe | [AI Engine Basic Building Blocks](../../programming_guide/section-1/) |
| 09:00am | Data movement on Ryzen™ AI with objectFIFOs | Joe | [Data Movement](../../programming_guide/section-2/) |
| 09:30am | Your First Program | Kristof | [My First Program](../../programming_guide/section-3) |
| 09:50am | Exercise 1: Build and run your first program | All | [Passthrough](../../programming_examples/basic/passthrough_kernel/) |
| 10:00am | Break | | |
| 10:30am | Exercise 2: Vector-Scalar Mul | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
| 10:40am | Tracing and performance analysis | Jack | [Timers](../../programming_guide/section-4/section-4a/) and [Tracing](../../programming_guide/section-4/section-4b/) |
| 11:10am | Exercise 3: Tracing vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
| 11:30am | Vectorizing on AIE | Jack | [Kernel Vectorization](../../programming_guide/section-4/section-4c/) |
| 11:40am | Exercise 4: Vectorized vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
| 12:00pm | Dataflow and larger designs | Joe | [Example Vector Designs](../../programming_guide/section-5/) and [Large Example Designs](../../programming_guide/section-6/) |
| 12:15pm | Exercises | All | [Programming Examples](../../programming_examples/) |
| 12:30pm | Close Tutorial | All | |


## Organizers

*Jack Lo* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. At AMD, he is focused on developing tool frameworks and optimizing applications for current and future AMD devices, particularly in the area of adaptive computing and AI processing.

*Joseph Melber* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. At AMD, he is working on hardware architectures and compiler technologies for current and future AMD devices. He received a BS in electrical engineering from the University at Buffalo, as well as MS and PhD degrees from the electrical and computer engineering department at Carnegie Mellon University. His research interests include runtime systems, compiler abstractions for data movement, and hardware prototypes for future adaptive heterogeneous computing architectures.

*Andrew Schmidt* is a Senior Member of Technical Staff in the AMD University Program. At AMD, he provides tutorials, training workshops and engages with universities across undergraduate and graduate curricula, as well as research projects. The AMD University Program offers researchers access to state-of-the-art hardware through various programs including the HPC Fund, HACC program, and donation program and offers professors and lecturers free software licenses and educational resources to support classroom teaching. He has an extensive background in adaptive computing acceleration with heterogeneous architectures, previously working at the University of Southern California’s Information Sciences Institute where his focus was on reconfigurable computing, computer architecture, and hardware assurance. He received his BS and MS in Computer Engineering from the University of Kansas and his PhD in Electrical Engineering from the University of North Carolina at Charlotte where his focus was on efficient utilization of heterogeneous resources for High Performance Reconfigurable Computing.

*Kristof Denolf* is a Fellow in AMD's Research and Advanced Development group where he is working on energy-efficient computer vision and video processing applications to shape future AMD devices. He earned an M.Eng. in electronics from the Katholieke Hogeschool Brugge-Oostende (1998), now part of KULeuven, an M.Sc. in electronic system design from Leeds Beckett University (2000), and a Ph.D. from the Technical University Eindhoven (2007). He has over 25 years of combined research and industry experience at IMEC, Philips, Barco, Apple, Xilinx, and AMD. His main research interests are all aspects of the cost-efficient and dataflow-oriented design of video, vision, and graphics systems.

*Phil James-Roxby* is a Senior Fellow in AMD’s Research and Advanced Development group, working on compilers and runtimes to support current and future AMD devices, particularly in the domain of AI processing. In the past, he has been responsible for a number of software enablement activities for hardware devices, including SDNet and SDAccel at Xilinx, and the original development environment for the AI Engines. He holds a PhD from the University of Manchester on hardware acceleration of embedded machine learning applications, and his main research interest continues to be how to enable users to efficiently use diverse hardware in heterogeneous systems.
26 changes: 24 additions & 2 deletions include/aie/Dialect/AIE/IR/AIEAttrs.td
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,37 @@ def BDDimLayoutAttr : AttrDef<AIE_Dialect, "BDDimLayout", []> {
def BDDimLayoutArrayAttr : ArrayOfAttr<
/*dialect*/AIE_Dialect,
/*attrName*/"BDDimLayoutArray",
/*attrMnemonic*/"bd_dim_layout_arr",
/*attrMnemonic*/"bd_dim_layout_array",
/*eltName*/BDDimLayoutAttr.cppClassName
>;

def BDDimLayoutArrayArrayAttr : ArrayOfAttr<
/*dialect*/AIE_Dialect,
/*attrName*/"BDDimLayoutArrayArray",
/*attrMnemonic*/"bd_dim_layout_arr_arr",
/*attrMnemonic*/"bd_dim_layout_array_array",
/*eltName*/BDDimLayoutArrayAttr.cppClassName
>;

// Attribute of the AIE dialect carrying a pair of padding counts (before and
// after) for one dimension of a buffer descriptor, as stated in the summary.
def BDPadLayoutAttr : AttrDef<AIE_Dialect, "BDPadLayout", []> {
// Mnemonic under which this attribute is declared.
let mnemonic = "bd_pad_layout";
let summary = [{
Tuple encoding number of zeros before and after on that dimension in an AIE2
n-dimensional buffer descriptor;
}];

// Two uint16_t parameters: the pad count before and the pad count after.
let parameters = (ins
"uint16_t" : $const_pad_before,
"uint16_t" : $const_pad_after
);

// Assembly form: the parameters, via the struct(params) directive, wrapped in
// angle brackets.
let assemblyFormat = "`<` struct(params) `>`";
}

// Array attribute whose element type is BDPadLayoutAttr's C++ class, declared
// with the mnemonic "bd_pad_layout_array".
def BDPadLayoutArrayAttr : ArrayOfAttr<
/*dialect*/AIE_Dialect,
/*attrName*/"BDPadLayoutArray",
/*attrMnemonic*/"bd_pad_layout_array",
/*eltName*/BDPadLayoutAttr.cppClassName
>;

#endif // AIE_ATTRS
Loading

0 comments on commit 0b59052

Please sign in to comment.