@@ -41,54 +41,15 @@ class HistDispatcher {
41
41
size_t eu_per_core =
42
42
device.get_info <::sycl::ext::intel::info::device::gpu_eu_count_per_subslice>();
43
43
switch (arch) {
44
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g10:
45
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g11:
46
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_acm_g12: {
47
- LOG (INFO) << " Xe-HPG (Alchemist) Architecture" ;
48
- size_t l1_size = 128 * 1024 ;
49
- size_t registers_size = 128 * 1024 ;
50
- sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
51
- break ;
52
- }
53
-
54
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_mtl_u:
55
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_mtl_h:
56
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_arl_h: {
57
- LOG (INFO) << " Xe-LPG (MTL) and Xe-LPG+ (ARL) Architectures" ;
58
- size_t l1_size = 192 * 1024 ;
59
- size_t registers_size = 128 * 1024 ;
60
- sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
61
- break ;
62
- }
63
-
64
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_lnl_m: {
65
- LOG (INFO) << " Xe2-LPG (Lunar Lake) Architecture" ;
66
- size_t l1_size = 192 * 1024 ;
67
- // Xe2 share the registers and L1
68
- size_t registers_size = 0 ;
69
- sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
70
- break ;
71
- }
72
-
73
- case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_bmg_g21: {
74
- LOG (INFO) << " Xe2-HPG (Battlemage) Architecture" ;
75
- size_t l1_size = 256 * 1024 ;
76
- // Xe2 share the registers and L1
77
- size_t registers_size = 0 ;
78
- sram_size_per_eu = (l1_size + registers_size) / eu_per_core;
79
- break ;
80
- }
81
-
82
44
case ::sycl::ext::oneapi::experimental::architecture::intel_gpu_pvc: {
83
- LOG (INFO) << " Xe-HPC (Ponte Vecchio) Architecture" ;
45
+ LOG (INFO) << " Xe-HPC (Ponte Vecchio) Architecture. L1 friendly optimization enabled. " ;
84
46
size_t l1_size = 512 * 1024 ;
85
- size_t registers_size = 256 * 1024 ;
86
- sram_size_per_eu = ( l1_size + registers_size) / eu_per_core;
47
+ size_t registers_size = 64 * 1024 ;
48
+ sram_size_per_eu = l1_size / eu_per_core + registers_size ;
87
49
break ;
88
50
}
89
- default :
90
- LOG (WARNING) << " Unknown SYCL GPU architecture. Performance may be suboptimal." ;
91
- sram_size_per_eu = 0 ;
51
+ default :
52
+ sram_size_per_eu = 0 ;
92
53
}
93
54
}
94
55
@@ -125,6 +86,8 @@ class HistDispatcher {
125
86
constexpr static size_t KMaxEffectiveBlockSize = 1u << 11 ;
126
87
// Maximal number of bins acceptable for local histograms
127
88
constexpr static size_t KMaxNumBins = 256 ;
89
+ // Amount of sram for local-histogram kernel launch
90
+ constexpr static float KLocalHistSRAM = 32 . * 1024 ;
128
91
// Max workgroups size, used by atomic-based hist-building
129
92
constexpr static size_t kMaxWorkGroupSizeAtomic = 32 ;
130
93
// Max workgroups size, used for local histograms
@@ -165,7 +128,7 @@ class HistDispatcher {
165
128
* most part of buffer isn't used and perf suffers.
166
129
*/
167
130
const size_t th_block_size = max_num_bins;
168
- build_params.use_local_hist = (buff_size < 0.8 * sram_size_per_eu )
131
+ build_params.use_local_hist = (buff_size < sram_size_per_eu - KLocalHistSRAM )
169
132
&& isDense
170
133
&& (max_num_bins <= KMaxNumBins)
171
134
&& (build_params.block .size >= th_block_size);
0 commit comments