-
Couldn't load subscription status.
- Fork 29
E2E Linux Example
This is a minimal guide to executing an E2E test of llvm-aie on an AIE2 device (all environment specifics will be described below).
- Working AIE2 device with a driver and runtime installed (more info at https://github.com/amd/xdna-driver);
- This doc was written against the following system configuration:
System Configuration OS Name : Linux Release : 6.8.8 Version : #2 SMP PREEMPT_DYNAMIC Fri May 3 14:13:56 CDT 2024 Machine : x86_64 CPU Cores : 16 Memory : 94278 MB Distribution : Ubuntu 22.04.3 LTS GLIBC : 2.35 Model : F7BSC BIOS vendor : American Megatrends International, LLC. BIOS version : 1.04 XRT Version : 2.18.0 Branch : HEAD Hash : c678a9469f9b20fcb9a04bbedb5c51f8473faec0 Hash Date : 2024-05-24 18:16:53 XOCL : unknown, unknown XCLMGMT : unknown, unknown WARNING: xclmgmt version is unknown. Is xclmgmt driver loaded? Or is MSD/MPD running? AMDXDNA : 2.18.0_20240524, 4ef6d95ad37a2de0aa22264c950dec8ec1bd9f52 Firmware Version : N/A Devices present BDF : Name --------------------------------- [0000:c5:00.1] : RyzenAI-npu1
- This doc was written against the following system configuration:
- Distro install of llvm-aie (tested on commit 70703e80a6ecf8f8cf3fa724191dd1f36951dea3);
  - Here is a plausible CMake configure:
-C $LLVM_AIE_REPO_ROOT/clang/cmake/caches/Peano-AIE.cmake \ -DCMAKE_INSTALL_PREFIX=$LLVM_AIE_REPO_ROOT/install
- Here is a plausible CMake configure:
- A Python environment with xaiepy installed;
  - A one-liner:
pip install xaiepy==0.0.1 -f https://github.com/nod-ai/prototype-aie-toolchain/releases/expanded_assets/release
- A one-liner:
All programs/scripts are "attached" below.
The example program is very simple and does exactly one thing:
#include "aiev2_locks.h"
#define ACQ_LOCK 48
#define REL_LOCK 49
extern float _anonymous0[1];
int main() {
acquire_greater_equal(ACQ_LOCK, 1);
_anonymous0[0] = 5 * 3.14159;
release(REL_LOCK, 1);
return 0;
}

That is, it stores 5 * 3.14159 == 15.70795 to a global array. To go along with this brilliant program you will need the following linker script:
MEMORY
{
program (RX) : ORIGIN = 0, LENGTH = 0x0020000
data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
. = 0x0;
.text : {
*me_basic.o(.text)
. = 0x200;
_ctors_start = .;
_init_array_start = .;
KEEP(SORT(*.init_array))
_ctors_end = .;
_init_array_end = .;
_dtors_start = .;
_dtors_end = .;
*(.text)
} > program
.data : {
*(.data*);
*(.rodata*)
} > data
. = 0x70000;
_sp_start_value_DM_stack = .;
. += 0x400; /* stack */
. = 0x40000;
. += 0x10000;
. = 0x50000;
. += 0x10000;
. = 0x70400;
_anonymous0 = .;
. += 0x4;
.bss : { *(.bss) } > data
.bss.DMb.4 : { *(.bss.DMb.4) } > data
}
PROVIDE(_main = main);
Writing this is beyond the scope of this intro.
Get all of your ducks in a row (turn the above code into a main.cpp, save the linker script as main.ld.script, find Peano and set PEANO_INSTALL_DIR=...) and then incant the following magical incantations:
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
-S --target=aie2-none-unknown-elf main.cpp -emit-llvm
me@mydesk: $PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
-ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
-o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf

If everything went according to plan, you will see roughly the following as verification that your ELF file is fully baked:
There are 8 section headers, starting at offset 0x1544:
Section Headers:
[Nr] Name Type Address Off Size ES Flg Lk Inf Al
[ 0] NULL 00000000 000000 000000 00 0 0 0
[ 1] .text PROGBITS 00000000 001000 000260 00 AX 0 0 16
[ 2] .text._Exit PROGBITS 00000260 001260 000020 00 AX 0 0 16
[ 3] .text._main_init PROGBITS 00000280 001280 000050 00 AX 0 0 16
[ 4] .comment PROGBITS 00000000 0012d0 00007f 01 MS 0 0 1
[ 5] .symtab SYMTAB 00000000 001350 000100 10 7 3 4
[ 6] .shstrtab STRTAB 00000000 001450 000047 00 0 0 1
[ 7] .strtab STRTAB 00000000 001497 0000ad 00 0 0 1
Symbol table '.symtab' contains 16 entries:
Num: Value Size Type Bind Vis Ndx Name
0: 00000000 0 NOTYPE LOCAL DEFAULT UND
1: 00000000 0 FILE LOCAL DEFAULT ABS main.cpp
2: 00000000 0 FILE LOCAL DEFAULT ABS crt1.cc
3: 00000200 64 FUNC GLOBAL DEFAULT 1 main
4: 00070400 0 NOTYPE GLOBAL DEFAULT 1 _anonymous0
5: 00000240 0 FUNC GLOBAL DEFAULT 1 __start
6: 00070000 0 NOTYPE GLOBAL DEFAULT 1 _sp_start_value_DM_stack
7: 00000280 80 FUNC GLOBAL DEFAULT 3 _main_init
8: 00000260 32 FUNC GLOBAL DEFAULT 2 _Exit
9: 00000200 0 FUNC GLOBAL DEFAULT 1 _main
10: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _ctors_start
11: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _init_array_start
12: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _ctors_end
13: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _init_array_end
14: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _dtors_start
15: 00000200 0 NOTYPE GLOBAL DEFAULT 1 _dtors_end
There are two example scripts in xaiepy that demonstrate how to configure the Phoenix device and run the program using XRT APIs:
- gen_example.py, which generates an .xclbin that can be loaded/run by XRT;
- xrt.py, which loads and runs the aforementioned .xclbin.
Run both of these et voila you should see something resembling 15.70795.
In the next episode we'll explain what all of these things actually do...
#include "aiev2_locks.h"
#define ACQ_LOCK 48
#define REL_LOCK 49
extern float _anonymous0[1];
int main() {
acquire_greater_equal(ACQ_LOCK, 1);
_anonymous0[0] = 5 * 3.14159;
release(REL_LOCK, 1);
return 0;
}

MEMORY
{
program (RX) : ORIGIN = 0, LENGTH = 0x0020000
data (!RX) : ORIGIN = 0x70404, LENGTH = 0xFBFC
}
ENTRY(_main_init)
SECTIONS
{
. = 0x0;
.text : {
/* the _main_init symbol from me_basic.o has to come at address zero. */
*me_basic.o(.text)
. = 0x200;
_ctors_start = .;
_init_array_start = .;
KEEP(SORT(*.init_array))
_ctors_end = .;
_init_array_end = .;
_dtors_start = .;
_dtors_end = .;
*(.text)
} > program
.data : {
*(.data*);
*(.rodata*)
} > data
. = 0x70000;
_sp_start_value_DM_stack = .;
. += 0x400; /* stack */
/* No tile with memory exists to the south. */
. = 0x40000;
. += 0x10000;
/* No tile with memory exists to the west. */
. = 0x50000;
. += 0x10000;
. = 0x70400;
_anonymous0 = .;
. += 0x4;
.bss : { *(.bss) } > data
.bss.DMb.4 : { *(.bss.DMb.4) } > data
}
PROVIDE(_main = main);
PEANO_INSTALL_DIR=<fill me in.................>
$PEANO_INSTALL_DIR/bin/clang -O2 -I$PEANO_INSTALL_DIR/lib/clang/18/include \
-S --target=aie2-none-unknown-elf main.cpp -emit-llvm
$PEANO_INSTALL_DIR/bin/clang -O2 --target=aie2-none-unknown-elf main.ll \
-ccc-install-dir $PEANO_INSTALL_DIR/bin -Wl,-T $PWD/main.ld.script \
-o fivepi.elf && $PEANO_INSTALL_DIR/bin/llvm-readelf -Ss fivepi.elf

#! /usr/bin/env python
import argparse
import json
import logging
import platform
from pathlib import Path
from xaiepy import bootgen, xclbinutil
from xaiepy.cdo import (
startCDOFileStream,
FileHeader,
configureHeader,
endCurrentCDOFileStream,
EnAXIdebug,
setEndianness,
Little_Endian,
)
# Configure the root logger: DEBUG level, bare message text only.
# NOTE: `datefmt` has no visible effect here because `format` contains no
# %(asctime)s field for it to apply to.
logging.basicConfig(
level=logging.DEBUG,
format="%(message)s",
datefmt="%H:%M:%S",
)
from xaiepy import (
XAie_Config,
XAie_BackendType,
XAie_PartitionProp,
XAie_DevInst,
XAie_CfgInitialize,
XAie_LocType,
XAie_LoadElf,
XAie_SetupPartitionConfig,
XAie_UpdateNpiAddr,
XAie_CoreReset,
XAie_CoreUnreset,
XAie_LockSetValue,
XAie_Lock,
XAie_DmaDescInit,
XAie_DmaSetAddrLen,
XAie_DmaEnableBd,
XAie_DmaWriteBd,
XAie_DmaChannelSetStartQueue,
XAie_DmaChannelEnable,
XAie_StrmConnCctEnable,
XAie_CoreEnable,
StrmSwPortType,
XAie_EnableAieToShimDmaStrmPort,
XAie_DmaDesc,
)
if platform.system() != "Windows":
from xaiepy import XAie_ErrorHandlingInit
# Device description constants. The first six are passed positionally to
# XAie_Config() inside build_cdo(); their exact meaning is defined by the
# xaiepy / aie-rt API, not by this script.
XAIE_DEV_GEN_AIEML = 2
XAIE_BASE_ADDR = 0x40000000
XAIE_COL_SHIFT = 25
XAIE_ROW_SHIFT = 20
XAIE_SHIM_ROW = 0
XAIE_MEM_TILE_ROW_START = 1
# NOTE(review): the next three constants are not referenced anywhere in the
# visible script — presumably kept for parity with other examples; confirm
# before removing.
XAIE_PARTITION_BASE_ADDR = 0x0
XAIE_TRANSACTION_DISABLE_AUTO_FLUSH = 0b0
DDR_AIE_ADDR_OFFSET = 0x80000000
# Second argument of every XAie_LocType() built in build_cdo().
col = 0
def build_cdo(which_pi):
    """Generate ``{which_pi}.xclbin`` and its intermediates next to this script.

    Reads ``{which_pi}.elf`` from the directory containing this file, records
    the device configuration calls below into ``{which_pi}_cdo.bin``, and then
    hands the following paths to the xaiepy ``bootgen`` / ``xclbinutil``
    helpers: ``{which_pi}.bif``, ``{which_pi}.pdi``,
    ``{which_pi}_mem_topology.json``, ``{which_pi}_aie_partition.json``,
    ``{which_pi}_kernel.json`` and ``{which_pi}.xclbin``.

    Args:
        which_pi: Base name of the program. It names the input ELF, every
            generated file, and the kernel inside the xclbin. If the name
            contains ``"two"`` the kernel id is ``"0x902"``, otherwise
            ``"0x901"``.

    Raises:
        FileNotFoundError: If ``{which_pi}.elf`` does not exist. Checked up
            front so that no partial output is written.
    """
    out_dir = Path(__file__).parent.absolute()

    # Validate the only external input before any stream or file is opened.
    # (An `assert` here would vanish under `python -O`.)
    elf_path = out_dir / f"{which_pi}.elf"
    if not elf_path.exists():
        raise FileNotFoundError(f"ELF to load not found: {elf_path}")

    # NOTE(review): variable names read as tile_<col>_<row>, so XAie_LocType
    # presumably takes (row, col) — confirm against xaiepy.
    tile_0_0 = XAie_LocType(0, col)
    tile_0_1 = XAie_LocType(1, col)
    tile_0_2 = XAie_LocType(2, col)

    # Positional device description; the literal arguments (6, 5, 1, ...) are
    # kept exactly as in the original example.
    configPtr = XAie_Config(
        XAIE_DEV_GEN_AIEML,
        XAIE_BASE_ADDR,
        XAIE_COL_SHIFT,
        XAIE_ROW_SHIFT,
        6,
        5,
        XAIE_SHIM_ROW,
        XAIE_MEM_TILE_ROW_START,
        1,
        (XAIE_MEM_TILE_ROW_START + 1),
        (6 - 1 - 1),
        XAie_PartitionProp(),
        XAie_BackendType.XAIE_IO_BACKEND_CDO,
    )
    devInst = XAie_DevInst()
    XAie_SetupPartitionConfig(devInst, 0, 1, 1)
    XAie_CfgInitialize(devInst, configPtr)
    XAie_UpdateNpiAddr(devInst, 0)

    # Open the CDO output stream; the XAie_* calls between here and
    # endCurrentCDOFileStream() use the CDO backend selected above.
    EnAXIdebug()
    setEndianness(Little_Endian)
    cdo_fp = out_dir / f"{which_pi}_cdo.bin"
    startCDOFileStream(str(cdo_fp))
    FileHeader()

    # XAie_ErrorHandlingInit is only imported on non-Windows platforms.
    if platform.system() != "Windows":
        XAie_ErrorHandlingInit(devInst)

    # Load the program into tile_0_2 and cycle its reset.
    XAie_LoadElf(devInst, tile_0_2, str(elf_path), False)
    XAie_CoreReset(devInst, tile_0_2)
    XAie_CoreUnreset(devInst, tile_0_2)

    # Initial lock values on tile_0_2.
    # NOTE(review): presumably lock 0 := 1 and lock 1 := 0, pairing with the
    # ACQ_LOCK / REL_LOCK used by main.cpp — confirm the id mapping.
    XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(0, 1))
    XAie_LockSetValue(devInst, tile_0_2, XAie_Lock(1, 0))

    # One DMA buffer descriptor on tile_0_2.
    dmaTileBd = XAie_DmaDesc()
    XAie_DmaDescInit(devInst, dmaTileBd, tile_0_2)
    dmaTileBd.DmaMod.contents.SetLock(
        dmaTileBd, XAie_Lock(1, -1), XAie_Lock(0, 1), 1, 0
    )
    # NOTE(review): address 1024 (0x400) with length 4 presumably addresses
    # `_anonymous0`, which the linker script places at 0x70400 — confirm.
    XAie_DmaSetAddrLen(dmaTileBd, 1024, 4)
    XAie_DmaEnableBd(dmaTileBd)
    XAie_DmaWriteBd(devInst, dmaTileBd, tile_0_2, 0)
    XAie_DmaChannelSetStartQueue(devInst, tile_0_2, 0, 1, 0, 1, 0)
    XAie_DmaChannelEnable(devInst, tile_0_2, 0, 1)

    # Stream-switch connections on each of the three tiles.
    XAie_StrmConnCctEnable(
        devInst, tile_0_0, StrmSwPortType.CTRL, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_0, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 2
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_1, StrmSwPortType.NORTH, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_StrmConnCctEnable(
        devInst, tile_0_2, StrmSwPortType.DMA, 0, StrmSwPortType.SOUTH, 0
    )
    XAie_EnableAieToShimDmaStrmPort(devInst, tile_0_0, 2)
    XAie_CoreEnable(devInst, tile_0_2)

    configureHeader()
    endCurrentCDOFileStream()

    # CDO -> BIF -> PDI.
    bif_fp = out_dir / f"{which_pi}.bif"
    with open(bif_fp, "w") as f:
        f.write(bootgen.emit_design_bif([cdo_fp]))
    pdi_fp = out_dir / f"{which_pi}.pdi"
    bootgen.make_design_pdi(str(bif_fp), str(pdi_fp))

    # The three JSON descriptions passed to make_xclbin().
    mem_top_json_fp = out_dir / f"{which_pi}_mem_topology.json"
    with open(mem_top_json_fp, "w") as f:
        json.dump(xclbinutil.mem_topology, f, indent=2)

    aie_part_json_fp = out_dir / f"{which_pi}_aie_partition.json"
    kernel_id = "0x902" if "two" in which_pi else "0x901"
    pdi_spec = xclbinutil.pdi_spec(pdi_fp, kernel_ids=[kernel_id])
    with open(aie_part_json_fp, "w") as f:
        json.dump(xclbinutil.emit_partition([pdi_spec], num_cols=1), f, indent=2)

    kernels_json_fp = out_dir / f"{which_pi}_kernel.json"
    kernel_spec = xclbinutil.kernel_spec(
        kernel_name=which_pi, kernel_id=kernel_id, buffer_args=["c0"]
    )
    with open(kernels_json_fp, "w") as f:
        json.dump(xclbinutil.emit_design_kernel_json([kernel_spec]), f, indent=2)

    pi_xclbin_fp = out_dir / f"{which_pi}.xclbin"
    xclbinutil.make_xclbin(
        str(mem_top_json_fp),
        str(aie_part_json_fp),
        str(kernels_json_fp),
        str(pi_xclbin_fp),
    )
if __name__ == "__main__":
    build_cdo("fivepi")


# NOTE(review): the import below appears to be the first line of the second
# attached script (xrt.py); the page extraction fused it onto the end of the
# previous statement, which is a syntax error as written.
from pathlib import Path
import numpy as np
from xaiepy import pyxrt
from xaiepy.pyxrt import ert_cmd_state
def init_xrt_load_kernel(xclbin: Path):
    """Open XRT device 0 and register the xclbin found at ``xclbin`` with it.

    Args:
        xclbin: Filesystem path of the ``.xclbin`` to load.

    Returns:
        A ``(device, xclbin)`` tuple: the ``pyxrt.device`` for index 0 and the
        ``pyxrt.xclbin`` object that was registered on it.
    """
    npu = pyxrt.device(0)
    # pyxrt.xclbin is given the path as a plain string.
    loaded = pyxrt.xclbin(str(xclbin))
    npu.register_xclbin(loaded)
    return npu, loaded
# Fixed 17-word header placed in front of shim_instr_v to form the instruction
# stream. The word layout is not documented in this file; what can be read
# off the data itself:
#   * word 0 is 0x11 == 17, the number of words in this list
#     (NOTE(review): presumably a length field — confirm);
#   * words 7-14, read as little-endian bytes, spell the ASCII string
#     "_ZN11__cling_N5911instr_wordsE" followed by two NUL bytes.
# Do not reformat or reorder: the words are consumed verbatim.
_PROLOG = [
0x00000011,
0x01000405,
0x01000100,
0x0B590100,
0x000055FF,
0x00000001,
0x00000010,
0x314E5A5F,
0x635F5F31,
0x676E696C,
0x39354E5F,
0x6E693131,
0x5F727473,
0x64726F77,
0x00004573,
0x07BD9630,
0x000055FF,
]
# 15 raw instruction words appended after _PROLOG to build instr_v.
# NOTE(review): the encoding is not documented here — presumably this
# programs the shim DMA that moves the 4-byte result to the host buffer;
# confirm against the NPU instruction format before editing any word.
shim_instr_v = [
0x06000100,
0x00000000,
0x00000001,
0x00000000,
0x00000000,
0x00000000,
0x80000000,
0x00000000,
0x00000000,
0x02000000,
0x02000000,
0x0001D204,
0x80000000,
0x03000000,
0x00010100,
]
# Base name shared by the kernel looked up in go() and the xclbin file loaded
# below.
whichpi = "fivepi"
# Full instruction stream: prolog words followed by the shim words, as uint32.
instr_v = _PROLOG + shim_instr_v
instr_v = np.array(instr_v, dtype=np.uint32)
# One zero-initialised float32; go() writes it into the in/out buffer object
# before the run.
inout0 = np.zeros((1,), dtype=np.float32)
# Module-level side effect: opens device 0 and registers the xclbin that sits
# next to this script, at import time.
device, xclbin = init_xrt_load_kernel(Path(__file__).parent.absolute() / f"{whichpi}.xclbin")
def go():
    """Run the ``whichpi`` kernel once and verify the float it writes back.

    Uses the module-level ``device``, ``xclbin``, ``instr_v`` and ``inout0``.
    Prints the value read back from the in/out buffer.

    Raises:
        StopIteration: If the xclbin contains no kernel named ``whichpi``.
        AssertionError: If the run does not finish in the COMPLETED state, or
            if the value read back is not a float close to ``5 * 3.14159``.
    """
    context = pyxrt.hw_context(device, xclbin.get_uuid())
    xkernel = next(k for k in xclbin.get_kernels() if k.get_name() == whichpi)
    kernel = pyxrt.kernel(context, xkernel.get_name())

    # Instruction buffer: 4 bytes per uint32 word.
    bo_instr = pyxrt.bo(
        device, len(instr_v) * 4, pyxrt.bo.cacheable, kernel.group_id(0)
    )
    # In/out buffer: a single 4-byte float32.
    bo_inout0 = pyxrt.bo(device, 1 * 4, pyxrt.bo.host_only, kernel.group_id(2))

    bo_instr.write(instr_v, 0)
    bo_inout0.write(inout0, 0)
    bo_instr.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
    bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)

    h = kernel(bo_instr, len(instr_v), bo_inout0)
    # Call wait() outside the assert: under `python -O` asserts are stripped,
    # and the wait itself must still happen before the buffer is read back.
    state = h.wait()
    assert state == ert_cmd_state.ERT_CMD_STATE_COMPLETED, (
        f"kernel run ended in state {state}"
    )

    bo_inout0.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
    entire_buffer = bo_inout0.read(4, 0).view(np.float32)
    print(entire_buffer[0])

    v = entire_buffer[0].item()
    assert isinstance(v, float)
    # main.cpp stores 5 * 3.14159 (== 15.70795) into the output word, so that
    # is the value to expect here — not 3.14.
    expected = 5 * 3.14159
    assert np.isclose(v, expected), f"expected ~{expected}, got {v}"
# Runs unconditionally: there is no `if __name__ == "__main__":` guard, so
# importing this module also executes the kernel.
go()