#include <algorithm>

#include "../data/gradient_index.h"
+ #include "../tree/hist_dispatcher.h"
#include "hist_util.h"

#include <sycl/sycl.hpp>
@@ -91,28 +92,27 @@ template ::sycl::event SubtractionHist(::sycl::queue* qu,
                             const GHistRow<double, MemoryType::on_device>& src2,
                             size_t size, ::sycl::event event_priv);

- inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
-   struct _ {
-     size_t block_size, nblocks;
-   };
-
-   const size_t min_block_size = 32;
-   const size_t max_compute_units =
-       qu->get_device().get_info<::sycl::info::device::max_compute_units>();
-
-   size_t nblocks = max_compute_units;
-
-   size_t block_size = size / nblocks + !!(size % nblocks);
-   if (block_size > (1u << 12)) {
-     nblocks = max_nblocks;
-     block_size = size / nblocks + !!(size % nblocks);
-   }
-   if (block_size < min_block_size) {
-     block_size = min_block_size;
-     nblocks = size / block_size + !!(size % block_size);
-   }
-
-   return _{block_size, nblocks};
+ template <typename GradientPairT>
+ ::sycl::event ReduceHist(::sycl::queue* qu, GradientPairT* hist_data,
+                          GradientPairT* hist_buffer_data,
+                          size_t nblocks, size_t nbins,
+                          const ::sycl::event& event_main) {
+   auto event_save = qu->submit([&](::sycl::handler& cgh) {
+     cgh.depends_on(event_main);
+     cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
+       size_t idx_bin = pid.get_id(0);
+
+       GradientPairT gpair = {0, 0};
+
+       for (size_t j = 0; j < nblocks; ++j) {
+         gpair += hist_buffer_data[j * nbins + idx_bin];
+       }
+
+       hist_data[idx_bin] = gpair;
+     });
+   });
+
+   return event_save;
}
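The new `ReduceHist` helper folds the per-block partial histograms in `hist_buffer_data` into the final histogram: partial histogram `j` occupies bins `[j * nbins, (j + 1) * nbins)`, and one work-item is launched per bin. A minimal host-side sketch of the same reduction (simplified, illustrative types and names, not taken from the patch):

```cpp
#include <cstddef>
#include <vector>

// Simplified stand-in for xgboost::detail::GradientPairInternal<FPType>.
struct GradientPair {
  float grad = 0.f, hess = 0.f;
  GradientPair& operator+=(const GradientPair& o) {
    grad += o.grad;
    hess += o.hess;
    return *this;
  }
};

// hist_buffer holds nblocks partial histograms laid out back to back;
// the j-th partial histogram occupies [j * nbins, (j + 1) * nbins).
void ReduceHistHost(std::vector<GradientPair>* hist,
                    const std::vector<GradientPair>& hist_buffer,
                    std::size_t nblocks, std::size_t nbins) {
  for (std::size_t bin = 0; bin < nbins; ++bin) {  // one device work-item per bin
    GradientPair sum;
    for (std::size_t j = 0; j < nblocks; ++j) {
      sum += hist_buffer[j * nbins + bin];
    }
    (*hist)[bin] = sum;
  }
}
```

Factoring this out lets the existing buffered kernel and the new local-hist kernel share one reduction path.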
// Kernel with buffer using
@@ -123,6 +123,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const GHistIndexMatrix& gmat,
                             GHistRow<FPType, MemoryType::on_device>* hist,
                             GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                            const tree::HistDispatcher<FPType>& dispatcher,
                             ::sycl::event event_priv) {
  using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
  const size_t size = row_indices.Size();
@@ -133,18 +134,13 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
  const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
  const size_t nbins = gmat.nbins;

-  const size_t max_work_group_size =
-      qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
-  const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
-
-  // Captured structured bindings are a C++20 extension
-  const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
-  const size_t block_size = block_params.block_size;
-  const size_t nblocks = block_params.nblocks;
+  const size_t work_group_size = dispatcher.work_group_size;
+  const size_t block_size = dispatcher.block.size;
+  const size_t nblocks = dispatcher.block.nblocks;

  GradientPairT* hist_buffer_data = hist_buffer->Data();
  auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
-                            nblocks * nbins * 2, event_priv);
+                            nblocks * nbins, event_priv);
  auto event_main = qu->submit([&](::sycl::handler& cgh) {
    cgh.depends_on(event_fill);
    cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
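The buffered kernel no longer derives its own launch geometry: `work_group_size`, `block.size`, and `block.nblocks` are now read from the `tree::HistDispatcher` built by the caller. For reference, the retired `GetBlocksParameters` picked the row split roughly as sketched below; the dispatcher presumably applies a similar, device-aware policy, but its actual heuristics are not part of this patch:

```cpp
#include <cstddef>

struct BlockParams {
  std::size_t block_size;
  std::size_t nblocks;
};

// Sketch of the removed GetBlocksParameters logic. max_nblocks was the buffer
// capacity limit (the old call site passed hist_buffer->Size() / (nbins * 2)).
BlockParams SplitRowsIntoBlocks(std::size_t size, std::size_t max_nblocks,
                                std::size_t max_compute_units) {
  constexpr std::size_t kMinBlockSize = 32;
  std::size_t nblocks = max_compute_units;                       // one block per compute unit
  std::size_t block_size = size / nblocks + !!(size % nblocks);  // ceil(size / nblocks)
  if (block_size > (1u << 12)) {   // blocks too large: use as many blocks as the buffer allows
    nblocks = max_nblocks;
    block_size = size / nblocks + !!(size % nblocks);
  }
  if (block_size < kMinBlockSize) {  // blocks too small: clamp and recompute the count
    block_size = kMinBlockSize;
    nblocks = size / block_size + !!(size % block_size);
  }
  return {block_size, nblocks};
}
```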
@@ -179,20 +175,84 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
    });

  GradientPairT* hist_data = hist->Data();
-  auto event_save = qu->submit([&](::sycl::handler& cgh) {
-    cgh.depends_on(event_main);
-    cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
-      size_t idx_bin = pid.get_id(0);
-
-      GradientPairT gpair = {0, 0};
-
-      for (size_t j = 0; j < nblocks; ++j) {
-        gpair += hist_buffer_data[j * nbins + idx_bin];
-      }
-
-      hist_data[idx_bin] = gpair;
+  auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
+                               nbins, event_main);
+
+  return event_save;
+ }
+
+ // Kernel with buffer and local hist using
+ template <typename FPType, typename BinIdxType>
+ ::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
+                                    const HostDeviceVector<GradientPair>& gpair,
+                                    const RowSetCollection::Elem& row_indices,
+                                    const GHistIndexMatrix& gmat,
+                                    GHistRow<FPType, MemoryType::on_device>* hist,
+                                    GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                                    const tree::HistDispatcher<FPType>& dispatcher,
+                                    ::sycl::event event_priv) {
+   constexpr int kMaxNumBins = tree::HistDispatcher<FPType>::KMaxNumBins;
+   using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
+   const size_t size = row_indices.Size();
+   const size_t* rid = row_indices.begin;
+   const size_t n_columns = gmat.nfeatures;
+   const auto* pgh = gpair.ConstDevicePointer();
+   const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
+   const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
+   const size_t nbins = gmat.nbins;
+
+   const size_t work_group_size = dispatcher.work_group_size;
+   const size_t block_size = dispatcher.block.size;
+   const size_t nblocks = dispatcher.block.nblocks;
+
+   GradientPairT* hist_buffer_data = hist_buffer->Data();
+
+   auto event_main = qu->submit([&](::sycl::handler& cgh) {
+     cgh.depends_on(event_priv);
+     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
+                                            ::sycl::range<2>(1, work_group_size)),
+                        [=](::sycl::nd_item<2> pid) {
+       size_t block = pid.get_global_id(0);
+       size_t feat = pid.get_global_id(1);
+
+       // This buffer will be kept in L1/registers
+       GradientPairT hist_fast[kMaxNumBins];
+
+       GradientPairT* hist_local = hist_buffer_data + block * nbins;
+       for (size_t fid = feat; fid < n_columns; fid += work_group_size) {
+         size_t n_bins_feature = offsets[fid + 1] - offsets[fid];
+
+         // Not all elements of hist_fast are actually used: n_bins_feature <= kMaxNumBins.
+         // We initialize only the required elements to keep the unused ones out of the cache.
+         for (int bin = 0; bin < n_bins_feature; ++bin) {
+           hist_fast[bin] = {0, 0};
+         }
+
+         for (size_t idx = 0; idx < block_size; ++idx) {
+           size_t i = block * block_size + idx;
+           if (i < size) {
+             size_t row_id = rid[i];
+
+             const size_t icol_start = n_columns * row_id;
+             const GradientPairT pgh_row(pgh[row_id].GetGrad(),
+                                         pgh[row_id].GetHess());
+
+             const BinIdxType* gr_index_local = gradient_index + icol_start;
+             uint32_t idx_bin = gr_index_local[fid];
+
+             hist_fast[idx_bin] += pgh_row;
+           }
+         }
+         for (int bin = 0; bin < n_bins_feature; ++bin) {
+           hist_local[bin + offsets[fid]] = hist_fast[bin];
+         }
+       }
    });
  });
+
+   GradientPairT* hist_data = hist->Data();
+   auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
+                                nbins, event_main);

  return event_save;
}
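In `BuildHistKernelLocal`, each work-item walks its features one at a time, accumulating into `hist_fast`, a fixed-size array (at most `KMaxNumBins` bins per feature) that is expected to stay in registers or L1, and only flushes it to the block's slice of the global buffer once the feature is done, so bin updates avoid global memory inside the hot loop. A CPU-side sketch of that inner pattern (illustrative types and names; `kMaxBinsPerFeature` stands in for `HistDispatcher<FPType>::KMaxNumBins`):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

struct GradientPair {
  float grad = 0.f, hess = 0.f;
  GradientPair& operator+=(const GradientPair& o) {
    grad += o.grad;
    hess += o.hess;
    return *this;
  }
};

constexpr int kMaxBinsPerFeature = 256;  // illustrative bound on bins per feature

// Accumulate one feature over one block of rows into block_hist, this block's
// nbins-wide slice of the shared buffer. bin_index is a dense, row-major gradient
// index with n_columns entries per row; its values are assumed to be feature-local
// bin ids in [0, n_bins_feature), matching how the device kernel indexes hist_fast.
void BuildFeatureBlockHist(const std::vector<std::size_t>& block_rows,
                           const std::vector<std::uint8_t>& bin_index,
                           const std::vector<std::uint32_t>& offsets,
                           const std::vector<GradientPair>& gpair,
                           std::size_t n_columns, std::size_t fid,
                           std::vector<GradientPair>* block_hist) {
  GradientPair hist_fast[kMaxBinsPerFeature];  // fast per-feature accumulator
  const std::size_t n_bins_feature = offsets[fid + 1] - offsets[fid];
  for (std::size_t bin = 0; bin < n_bins_feature; ++bin) {
    hist_fast[bin] = {0.f, 0.f};  // initialize only the bins this feature uses
  }

  for (std::size_t row_id : block_rows) {
    const std::uint32_t idx_bin = bin_index[row_id * n_columns + fid];
    hist_fast[idx_bin] += gpair[row_id];
  }

  for (std::size_t bin = 0; bin < n_bins_feature; ++bin) {
    (*block_hist)[offsets[fid] + bin] = hist_fast[bin];  // single flush per feature
  }
}
```

The per-block buffers are then merged by the shared `ReduceHist` step, exactly as in the plain buffered kernel.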
@@ -203,6 +263,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const RowSetCollection::Elem& row_indices,
                             const GHistIndexMatrix& gmat,
                             GHistRow<FPType, MemoryType::on_device>* hist,
+                            const tree::HistDispatcher<FPType>& dispatcher,
                             ::sycl::event event_priv) {
  const size_t size = row_indices.Size();
  const size_t* rid = row_indices.begin;
@@ -214,7 +275,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
  FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
  const size_t nbins = gmat.nbins;

-  constexpr size_t work_group_size = 32;
+  size_t work_group_size = dispatcher.work_group_size;
  const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);

  auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
@@ -260,34 +321,47 @@ ::sycl::event BuildHistDispatchKernel(
                                       GHistRow<FPType, MemoryType::on_device>* hist,
                                       bool isDense,
                                       GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                                      const tree::DeviceProperties& device_prop,
                                       ::sycl::event events_priv,
                                       bool force_atomic_use) {
  const size_t size = row_indices.Size();
  const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
  const size_t nbins = gmat.nbins;
+  const size_t max_num_bins = gmat.max_num_bins;
+  const size_t min_num_bins = gmat.min_num_bins;

-  // TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
-  bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
+  size_t max_n_blocks = hist_buffer->Size() / nbins;
+  auto dispatcher = tree::HistDispatcher<FPType>
+                    (device_prop, isDense, size, max_n_blocks, nbins,
+                     n_columns, max_num_bins, min_num_bins);

  // force_atomic_use flag is used only for testing
-  use_atomic = use_atomic || force_atomic_use;
+  bool use_atomic = dispatcher.use_atomics || force_atomic_use;
  if (!use_atomic) {
    if (isDense) {
-      return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
-                                                       gmat, hist, hist_buffer,
-                                                       events_priv);
+      if (dispatcher.use_local_hist) {
+        return BuildHistKernelLocal<FPType, BinIdxType>(qu, gpair, row_indices,
+                                                        gmat, hist, hist_buffer,
+                                                        dispatcher, events_priv);
+      } else {
+        return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
+                                                         gmat, hist, hist_buffer,
+                                                         dispatcher, events_priv);
+      }
    } else {
      return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
                                                      gmat, hist, hist_buffer,
-                                                     events_priv);
+                                                     dispatcher, events_priv);
    }
  } else {
    if (isDense) {
      return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
-                                                      gmat, hist, events_priv);
+                                                      gmat, hist,
+                                                      dispatcher, events_priv);
    } else {
      return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
-                                                     gmat, hist, events_priv);
+                                                     gmat, hist,
+                                                     dispatcher, events_priv);
    }
  }
}
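The old inline criterion, `use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns)`, is replaced by flags computed in `tree::HistDispatcher`, and a third strategy (the local-hist kernel) becomes available for the dense case. The shape of the decision this function now applies is sketched below; the actual thresholds behind `use_atomics` and `use_local_hist` live in the dispatcher and are not shown in this patch, and the comment on density is an inference from the branch structure rather than documented behaviour:

```cpp
// Illustrative decision shape only; tree::HistDispatcher computes the flags.
enum class HistKernelKind { kAtomics, kBuffered, kBufferedLocalHist };

struct DispatchFlags {
  bool use_atomics;
  bool use_local_hist;
};

HistKernelKind SelectHistKernel(DispatchFlags flags, bool is_dense, bool force_atomic_use) {
  // force_atomic_use is a testing-only override, as in the original code.
  if (flags.use_atomics || force_atomic_use) return HistKernelKind::kAtomics;
  // The register-local variant is only dispatched for the dense layout here,
  // presumably because it relies on the fixed per-feature bin bound.
  if (is_dense && flags.use_local_hist) return HistKernelKind::kBufferedLocalHist;
  return HistKernelKind::kBuffered;
}
```

Also worth noting: the buffer accounting drops its factor of two consistently. `max_n_blocks` is now `hist_buffer->Size() / nbins` where the old call site used `hist_buffer->Size() / (nbins * 2)`, matching the fill-size change from `nblocks * nbins * 2` to `nblocks * nbins` in the buffered kernel.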
@@ -299,23 +373,27 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const GHistIndexMatrix& gmat, const bool isDense,
                             GHistRow<FPType, MemoryType::on_device>* hist,
                             GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                            const tree::DeviceProperties& device_prop,
                             ::sycl::event event_priv,
                             bool force_atomic_use) {
  const bool is_dense = isDense;
  switch (gmat.index.GetBinTypeSize()) {
    case BinTypeSize::kUint8BinsTypeSize:
      return BuildHistDispatchKernel<FPType, uint8_t>(qu, gpair, row_indices,
                                                      gmat, hist, is_dense, hist_buffer,
+                                                     device_prop,
                                                      event_priv, force_atomic_use);
      break;
    case BinTypeSize::kUint16BinsTypeSize:
      return BuildHistDispatchKernel<FPType, uint16_t>(qu, gpair, row_indices,
                                                       gmat, hist, is_dense, hist_buffer,
+                                                      device_prop,
                                                       event_priv, force_atomic_use);
      break;
    case BinTypeSize::kUint32BinsTypeSize:
      return BuildHistDispatchKernel<FPType, uint32_t>(qu, gpair, row_indices,
                                                       gmat, hist, is_dense, hist_buffer,
+                                                      device_prop,
                                                       event_priv, force_atomic_use);
      break;
    default:
@@ -331,10 +409,12 @@ ::sycl::event GHistBuilder<GradientSumT>::BuildHist(
    GHistRowT<MemoryType::on_device>* hist,
    bool isDense,
    GHistRowT<MemoryType::on_device>* hist_buffer,
+   const tree::DeviceProperties& device_prop,
    ::sycl::event event_priv,
    bool force_atomic_use) {
  return BuildHistKernel<GradientSumT>(qu_, gpair, row_indices, gmat,
-                                      isDense, hist, hist_buffer, event_priv,
+                                      isDense, hist, hist_buffer,
+                                      device_prop, event_priv,
                                       force_atomic_use);
}
@@ -346,6 +426,7 @@ ::sycl::event GHistBuilder<float>::BuildHist(
    GHistRow<float, MemoryType::on_device>* hist,
    bool isDense,
    GHistRow<float, MemoryType::on_device>* hist_buffer,
+   const tree::DeviceProperties& device_prop,
    ::sycl::event event_priv,
    bool force_atomic_use);
template
@@ -356,6 +437,7 @@ ::sycl::event GHistBuilder<double>::BuildHist(
    GHistRow<double, MemoryType::on_device>* hist,
    bool isDense,
    GHistRow<double, MemoryType::on_device>* hist_buffer,
+   const tree::DeviceProperties& device_prop,
    ::sycl::event event_priv,
    bool force_atomic_use);