Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
e0c5114
added tuned gemms for r9700
big-yellow-duck Jan 5, 2026
e532f3a
Merge branch 'ROCm:main' into main
vllmellm Jan 6, 2026
1a286e8
Merge branch 'ROCm:main' into main
tjtanaa Jan 8, 2026
bdab40d
Added gemm_a8w8_blockscale support for gfx1201 with tuning script
big-yellow-duck Jan 13, 2026
c7664b8
Merge branch 'main' into support_gfx1201_min
big-yellow-duck Jan 13, 2026
c162331
added gfx1201 to types.py
big-yellow-duck Jan 14, 2026
fd925f4
Merge branch 'support_gfx1201_min' of https://github.com/EmbeddedLLM/…
big-yellow-duck Jan 14, 2026
2afe833
Merge branch 'ROCm:main' into support_gfx1201_min
big-yellow-duck Jan 14, 2026
a9f329a
Merge branch 'ROCm:main' into support_gfx1201_min
big-yellow-duck Jan 14, 2026
0ef32a3
Merge branch 'support_gfx1201_min' of https://github.com/EmbeddedLLM/…
big-yellow-duck Jan 14, 2026
897fd62
Add readme file and rename base to utils
iAmir97 Jan 15, 2026
622fd33
add fp8 dtype
iAmir97 Jan 15, 2026
ab93b43
added gemm_a8w8_blocscale_shuffle
big-yellow-duck Jan 16, 2026
5ff3029
Merge branch 'support_gfx1201_min' of https://github.com/EmbeddedLLM/…
big-yellow-duck Jan 16, 2026
7f09f13
Merge branch 'main' into support_gfx1201_min
big-yellow-duck Jan 16, 2026
aea5797
update tuned gemm_a8w8_blockscale
big-yellow-duck Jan 16, 2026
60ee427
Merge branch 'support_gfx1201_min' of https://github.com/EmbeddedLLM/…
big-yellow-duck Jan 16, 2026
1d53884
Merge branch 'main' into support_gfx1201_min
big-yellow-duck Jan 16, 2026
879c2c5
Add readme for tuning Co-authored-by: Jeff Aw <jeffaw99@hotmail.com>
iAmir97 Jan 21, 2026
c285687
Add readme for tuning Co-authored-by: Jeff Aw <jeffaw99@hotmail.com>
iAmir97 Jan 21, 2026
05f9ea7
update tuning readme
big-yellow-duck Jan 21, 2026
971dcd8
Revert "Add readme for tuning Co-authored-by: Jeff Aw <jeffaw99@hotma…
iAmir97 Jan 22, 2026
b645bcb
Merge branch 'ROCm:main' into main
tjtanaa Jan 22, 2026
47bba80
rebase and revert the submodule changes
tjtanaa Jan 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions aiter/ops/triton/configs/gemm/gfx1201-BATCHED_GEMM-A8W8.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"small": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"kpack": 2,
"matrix_instr_nonkdim": 16
},
"large": {
"BLOCK_SIZE_M": 256,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 1,
"waves_per_eu": 2,
"kpack": 2,
"matrix_instr_nonkdim": 16
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"small": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"medium_M32": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"waves_per_eu": 8,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"medium_M64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"medium_M128": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"large": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"xlarge": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
},
"any": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 1024
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"small": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 5,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"medium_M32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 4,
"num_stages": 3,
"waves_per_eu": 8,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"medium_M64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"medium_M128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"large": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"xlarge": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
},
"any": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 2048
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"small": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"medium_M32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"medium_M64": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 8,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"medium_M128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"large": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"xlarge": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 8,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 4,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
},
"any": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 64,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 2,
"matrix_instr_nonkdim": 16,
"cache_modifier": "",
"NUM_KSPLIT": 1,
"SPLITK_BLOCK_SIZE": 3072
}
}
Loading