Changes from all commits
30 commits
ba8af53
Enable xattention on xe1
WeldonWangwang Mar 26, 2023
5f9b15e
Update 1st token
WeldonWangwang Mar 26, 2023
799d619
Configure BLOCK_SG_M/N for different platforms
WeldonWangwang Mar 26, 2023
82a95c7
support xe2
WeldonWangwang Mar 26, 2023
e0f7b57
Remove debug messages
WeldonWangwang Dec 19, 2025
f3f8f51
fix WG_N_size=128 mismatch issue
WeldonWangwang Dec 29, 2025
80e4c13
Fix cm_ptr_store failure by stabilizing quantization path on A770
WeldonWangwang Jan 16, 2026
62c4327
pass KV cache quant mode into paged attention JIT constants
WeldonWangwang Jan 16, 2026
52d16ca
Fix 2nd token with by-token mode
WeldonWangwang Jan 16, 2026
c97de30
Fix in M tail of smallest query on xe1
WeldonWangwang Jan 16, 2026
f49d7aa
Update 1st token
WeldonWangwang Jan 16, 2026
451fc2a
Fix 32k input out-of-resource on xe1
WeldonWangwang Jan 16, 2026
447c20f
remove debug messages
WeldonWangwang Jan 16, 2026
1d1f4d2
Standardized API
WeldonWangwang Jan 16, 2026
ca62a33
Merge branch 'master' into ww/enable_xattention_xe1
WeldonWangwang Jan 21, 2026
3c642ef
modify API
WeldonWangwang Jan 21, 2026
bee93ee
Fix merge_q_num
WeldonWangwang Jan 22, 2026
337c868
Fix build error
WeldonWangwang Jan 22, 2026
a525ed5
Merge branch 'master' into ww/enable_xattention_xe1
WeldonWangwang Jan 26, 2026
73f6ddc
Update src/plugins/intel_gpu/src/graph/impls/cm/include/cm_pa_common.hpp
WeldonWangwang Jan 27, 2026
86499d1
Fix kv cache update on A770
WeldonWangwang Jan 28, 2026
ea39043
Some updates
WeldonWangwang Jan 28, 2026
be5632c
xe1 non-lsc by-token reduce SVM reads via 4-way K/V tile loads
WeldonWangwang Jan 28, 2026
a53f55a
Merge branch 'master' into ww/enable_xattention_xe1
ceciliapeng2011 Feb 2, 2026
087f050
Apply suggestion from @ceciliapeng2011
ceciliapeng2011 Feb 2, 2026
4bab916
Apply suggestions from code review
ceciliapeng2011 Feb 2, 2026
3e0ed5f
Apply stateful api to access v_cache for u8
WeldonWangwang Feb 3, 2026
9ee5662
Refactor sparse mask logic with unified shift-based check for 128/256…
WeldonWangwang Feb 3, 2026
70dae17
use if constexpr instead of macro guards for better readability and s…
WeldonWangwang Feb 3, 2026
9e18fe6
Some updates
WeldonWangwang Feb 3, 2026
@@ -338,65 +338,64 @@ vector<float, cols> online_softmax_update(matrix_ref<T, rows, cols> St, vector_r
#define cm_load_transpose cm_load<lsc::Transpose>
#define cm_load_vnni cm_load<lsc::VNNI>
#define cm_store_normal cm_store
#else
// simulation of LSC API using SVM API
template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
inline void cm_load_normal(vector_ref<T, NBlocks*BlockH*BlockW> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
static_assert(NBlocks == 1);
auto pitch = Desc.get_pitch() + 1;
auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
#pragma unroll
for(int i = 0; i < BlockH; i++) {
cm_svm_block_read(base + i * pitch, Res.select<BlockW, 1>(i*BlockW));
}
}

template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
inline void cm_load_transpose(vector_ref<T, NBlocks*BlockW*BlockH> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
static_assert(NBlocks == 1);
auto pitch = Desc.get_pitch() + 1;
auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
matrix<T, BlockH, BlockW> temp;
#pragma unroll
for(int i = 0; i < BlockH; i++) {
cm_svm_block_read(base + i * pitch, temp[i]);
}
Transpose2DMatrix(temp, Res.format<T, BlockW, BlockH>());
}
// // simulation of LSC API using SVM API
// template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
// inline void cm_load_normal(vector_ref<T, NBlocks*BlockH*BlockW> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
// static_assert(NBlocks == 1);
// auto pitch = Desc.get_pitch() + 1;
// auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
// #pragma unroll
// for(int i = 0; i < BlockH; i++) {
// cm_svm_block_read(base + i * pitch, Res.select<BlockW, 1>(i*BlockW));
// }
// }

// in VNNI case, NBlocks is increasing along X dimension (increase cache-line usage)
template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
inline void cm_load_vnni(vector_ref<T, NBlocks*BlockW*BlockH> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
static_assert(NBlocks == 1 || NBlocks == 2);
// each block must be a full XMX B matrix
static_assert(BlockH == REG_K);
static_assert(BlockW == REG_N);
auto pitch = Desc.get_pitch() + 1;
auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
matrix<T, BlockH, NBlocks * BlockW> temp;
#pragma unroll
for(int i = 0; i < BlockH; i++) {
cm_svm_block_read(base + i * pitch, temp[i]);
}
// template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
// inline void cm_load_transpose(vector_ref<T, NBlocks*BlockW*BlockH> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
// static_assert(NBlocks == 1);
// auto pitch = Desc.get_pitch() + 1;
// auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
// matrix<T, BlockH, BlockW> temp;
// #pragma unroll
// for(int i = 0; i < BlockH; i++) {
// cm_svm_block_read(base + i * pitch, temp[i]);
// }
// Transpose2DMatrix(temp, Res.format<T, BlockW, BlockH>());
// }

auto out_vnni = Res.format<T, NBlocks * (BlockH/2), 2*BlockW>();
#pragma unroll
for(int i = 0; i < NBlocks; i ++) {
out_vnni.select<BlockH/2, 1, BlockW, 2>(i*(BlockH/2), 0) = temp.select<BlockH/2, 2, BlockW, 1>(0, i*BlockW);
out_vnni.select<BlockH/2, 1, BlockW, 2>(i*(BlockH/2), 1) = temp.select<BlockH/2, 2, BlockW, 1>(1, i*BlockW);
}
}
// // in VNNI case, NBlocks is increasing along X dimension (increase cache-line usage)
// template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
// inline void cm_load_vnni(vector_ref<T, NBlocks*BlockW*BlockH> Res, const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, int16_t Pred = 1) {
// static_assert(NBlocks == 1 || NBlocks == 2);
// // each block must be a full XMX B matrix
// static_assert(BlockH == REG_K);
// static_assert(BlockW == REG_N);
// auto pitch = Desc.get_pitch() + 1;
// auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
// matrix<T, BlockH, NBlocks * BlockW> temp;
// #pragma unroll
// for(int i = 0; i < BlockH; i++) {
// cm_svm_block_read(base + i * pitch, temp[i]);
// }

// auto out_vnni = Res.format<T, NBlocks * (BlockH/2), 2*BlockW>();
// #pragma unroll
// for(int i = 0; i < NBlocks; i ++) {
// out_vnni.select<BlockH/2, 1, BlockW, 2>(i*(BlockH/2), 0) = temp.select<BlockH/2, 2, BlockW, 1>(0, i*BlockW);
// out_vnni.select<BlockH/2, 1, BlockW, 2>(i*(BlockH/2), 1) = temp.select<BlockH/2, 2, BlockW, 1>(1, i*BlockW);
// }
// }

template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
inline void cm_store_normal(const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, vector_ref<T, NBlocks*BlockW*BlockH> Res) {
static_assert(NBlocks == 1);
auto pitch = Desc.get_pitch() + 1;
auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
#pragma unroll
for(int i = 0; i < BlockH; i++) {
cm_svm_block_write(base + i * pitch, Res.select<BlockW, 1>(i*BlockW));
}
}
// template <typename T = int, unsigned NBlocks = 1, unsigned BlockH = 1, unsigned BlockW = 1>
// inline void cm_store_normal(const lsc::block_2d_desc<T, NBlocks, BlockH, BlockW> &Desc, vector_ref<T, NBlocks*BlockW*BlockH> Res) {
// static_assert(NBlocks == 1);
// auto pitch = Desc.get_pitch() + 1;
// auto base = reinterpret_cast<svmptr_t>(Desc.get_base() + Desc.get_block_y()*pitch + Desc.get_block_x() * sizeof(T));
// #pragma unroll
// for(int i = 0; i < BlockH; i++) {
// cm_svm_block_write(base + i * pitch, Res.select<BlockW, 1>(i*BlockW));
// }
// }
#endif
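// Illustration (not part of the diff): the SVM fallback above addresses each
// row of a 2D block as base + (block_y + r) * pitch + block_x * sizeof(T),
// where pitch is Desc.get_pitch() + 1 bytes. A plain C++ model of that
// row-by-row load over a row-major surface; the element copy stands in for
// cm_svm_block_read:
template <typename T, int BlockH, int BlockW>
inline void load_block_2d_model(T (&out)[BlockH][BlockW],
                                const unsigned char* base, int pitch_bytes,
                                int block_x, int block_y) {
    const unsigned char* row = base + block_y * pitch_bytes + block_x * (int)sizeof(T);
    for (int r = 0; r < BlockH; ++r) {
        const T* src = reinterpret_cast<const T*>(row);
        for (int c = 0; c < BlockW; ++c)
            out[r][c] = src[c];      // one "block read" of BlockW elements
        row += pitch_bytes;          // advance to the next surface row
    }
}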

//===============================================================================================
@@ -416,4 +415,4 @@ inline void prepackAsVNNIWidth2(matrix_ref<T1, K, N> input, matrix_ref<T2, K/2,
out.row(r).select<N, 2>(0) = input.row(r*2);
out.row(r).select<N, 2>(1) = input.row(r*2+1);
}
}
}
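The prepackAsVNNIWidth2 loop above interleaves consecutive pairs of input rows into the width-2 VNNI layout used for the XMX B operand: out[r][2*c] comes from in[2*r][c] and out[r][2*c+1] from in[2*r+1][c]. A scalar C++ model of that index mapping, purely illustrative (the real code operates on matrix_ref with select<> views):

template <typename T, int K, int N>
inline void prepack_vnni_w2_model(const T (&in)[K][N], T (&out)[K / 2][2 * N]) {
    static_assert(K % 2 == 0, "width-2 VNNI packing pairs up rows, so K must be even");
    for (int r = 0; r < K / 2; ++r) {
        for (int c = 0; c < N; ++c) {
            out[r][2 * c]     = in[2 * r][c];      // even lanes take row 2r
            out[r][2 * c + 1] = in[2 * r + 1][c];  // odd lanes take row 2r + 1
        }
    }
}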