Skip to content

Commit ebee310

Browse files
author
Dmitry Razdoburdin
committed
l1 optimization dispatching
1 parent f82c9b0 commit ebee310

File tree

1 file changed

+8
-45
lines changed

1 file changed

+8
-45
lines changed

plugin/sycl/tree/hist_dispatcher.h

Lines changed: 8 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -41,54 +41,15 @@ class HistDispatcher {
4141
size_t eu_per_core =
4242
device.get_info<::sycl::ext::intel::info::device::gpu_eu_count_per_subslice>();
4343
switch (arch) {
44-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g10:
45-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g11:
46-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g12: {
47-
LOG(INFO) << "Xe-HPG (Alchemist) Architecture";
48-
size_t l1_size = 128 * 1024;
49-
size_t registers_size = 128 * 1024;
50-
sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
51-
break;
52-
}
53-
54-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_mtl_u:
55-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_mtl_h:
56-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_arl_h: {
57-
LOG(INFO) << "Xe-LPG (MTL) and Xe-LPG+ (ARL) Architectures";
58-
size_t l1_size = 192 * 1024;
59-
size_t registers_size = 128 * 1024;
60-
sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
61-
break;
62-
}
63-
64-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_lnl_m: {
65-
LOG(INFO) << "Xe2-LPG (Lunar Lake) Architecture";
66-
size_t l1_size = 192 * 1024;
67-
// Xe2 share the registers and L1
68-
size_t registers_size = 0;
69-
sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
70-
break;
71-
}
72-
73-
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21: {
74-
LOG(INFO) << "Xe2-HPG (Battlemage) Architecture";
75-
size_t l1_size = 256 * 1024;
76-
// Xe2 share the registers and L1
77-
size_t registers_size = 0;
78-
sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
79-
break;
80-
}
81-
8244
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc: {
83-
LOG(INFO) << "Xe-HPC (Ponte Vecchio) Architecture";
45+
LOG(INFO) << "Xe-HPC (Ponte Vecchio) Architecture. L1 friendly optimization enabled.";
8446
size_t l1_size = 512 * 1024;
85-
size_t registers_size = 256 * 1024;
86-
sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
47+
size_t registers_size = 64 * 1024;
48+
sram_size_per_eu = l1_size / eu_per_core + registers_size;
8749
break;
8850
}
89-
default:
90-
LOG(WARNING) << "Unknown SYCL GPU architecture. Performance may be suboptimal.";
91-
sram_size_per_eu = 0;
51+
default:
52+
sram_size_per_eu = 0;
9253
}
9354
}
9455

@@ -125,6 +86,8 @@ class HistDispatcher {
12586
constexpr static size_t KMaxEffectiveBlockSize = 1u << 11;
12687
// Maximal number of bins acceptable for local histograms
12788
constexpr static size_t KMaxNumBins = 256;
89+
// Amount of sram for local-histogram kernel launch
90+
constexpr static float KLocalHistSRAM = 32. * 1024;
12891
// Max workgroups size, used by atomic-based hist-building
12992
constexpr static size_t kMaxWorkGroupSizeAtomic = 32;
13093
// Max workgroups size, used for local histograms
@@ -165,7 +128,7 @@ class HistDispatcher {
165128
* most part of buffer isn't used and perf suffers.
166129
*/
167130
const size_t th_block_size = max_num_bins;
168-
build_params.use_local_hist = (buff_size < 0.8 * sram_size_per_eu)
131+
build_params.use_local_hist = (buff_size < sram_size_per_eu - KLocalHistSRAM)
169132
&& isDense
170133
&& (max_num_bins <= KMaxNumBins)
171134
&& (build_params.block.size >= th_block_size);

0 commit comments

Comments
 (0)