@@ -92,6 +92,29 @@ template ::sycl::event SubtractionHist(::sycl::queue* qu,
92
92
const GHistRow<double , MemoryType::on_device>& src2,
93
93
size_t size, ::sycl::event event_priv);
94
94
95
+ template <typename GradientPairT>
96
+ ::sycl::event ReduceHist (::sycl::queue* qu, GradientPairT* hist_data,
97
+ GradientPairT* hist_buffer_data,
98
+ size_t nblocks, size_t nbins,
99
+ const ::sycl::event& event_main) {
100
+ auto event_save = qu->submit ([&](::sycl::handler& cgh) {
101
+ cgh.depends_on (event_main);
102
+ cgh.parallel_for <>(::sycl::range<1 >(nbins), [=](::sycl::item<1 > pid) {
103
+ size_t idx_bin = pid.get_id (0 );
104
+
105
+ GradientPairT gpair = {0 , 0 };
106
+
107
+ for (size_t j = 0 ; j < nblocks; ++j) {
108
+ gpair += hist_buffer_data[j * nbins + idx_bin];
109
+ }
110
+
111
+ hist_data[idx_bin] = gpair;
112
+ });
113
+ });
114
+
115
+ return event_save;
116
+ }
117
+
95
118
// Kernel with buffer using
96
119
template <typename FPType, typename BinIdxType, bool isDense>
97
120
::sycl::event BuildHistKernel (::sycl::queue* qu,
@@ -100,7 +123,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
100
123
const GHistIndexMatrix& gmat,
101
124
GHistRow<FPType, MemoryType::on_device>* hist,
102
125
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
103
- const tree::HistBuildParameters& params ,
126
+ const tree::HistDispatcher<FPType>& dispatcher ,
104
127
::sycl::event event_priv) {
105
128
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
106
129
const size_t size = row_indices.Size ();
@@ -111,9 +134,9 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
111
134
const uint32_t * offsets = gmat.cut .cut_ptrs_ .ConstDevicePointer ();
112
135
const size_t nbins = gmat.nbins ;
113
136
114
- const size_t work_group_size = params .work_group_size ;
115
- const size_t block_size = params .block .size ;
116
- const size_t nblocks = params .block .nblocks ;
137
+ const size_t work_group_size = dispatcher .work_group_size ;
138
+ const size_t block_size = dispatcher .block .size ;
139
+ const size_t nblocks = dispatcher .block .nblocks ;
117
140
118
141
GradientPairT* hist_buffer_data = hist_buffer->Data ();
119
142
auto event_fill = qu->fill (hist_buffer_data, GradientPairT (0 , 0 ),
@@ -152,20 +175,9 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
152
175
});
153
176
154
177
GradientPairT* hist_data = hist->Data ();
155
- auto event_save = qu->submit ([&](::sycl::handler& cgh) {
156
- cgh.depends_on (event_main);
157
- cgh.parallel_for <>(::sycl::range<1 >(nbins), [=](::sycl::item<1 > pid) {
158
- size_t idx_bin = pid.get_id (0 );
159
-
160
- GradientPairT gpair = {0 , 0 };
161
-
162
- for (size_t j = 0 ; j < nblocks; ++j) {
163
- gpair += hist_buffer_data[j * nbins + idx_bin];
164
- }
178
+ auto event_save = ReduceHist (qu, hist_data, hist_buffer_data, nblocks,
179
+ nbins, event_main);
165
180
166
- hist_data[idx_bin] = gpair;
167
- });
168
- });
169
181
return event_save;
170
182
}
171
183
@@ -177,9 +189,9 @@ ::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
177
189
const GHistIndexMatrix& gmat,
178
190
GHistRow<FPType, MemoryType::on_device>* hist,
179
191
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
180
- const tree::HistBuildParameters& params ,
192
+ const tree::HistDispatcher<FPType>& dispatcher ,
181
193
::sycl::event event_priv) {
182
- constexpr int kMaxNumBins = tree::HistDispatcher::KMaxNumBins;
194
+ constexpr int kMaxNumBins = tree::HistDispatcher<FPType> ::KMaxNumBins;
183
195
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
184
196
const size_t size = row_indices.Size ();
185
197
const size_t * rid = row_indices.begin ;
@@ -189,9 +201,9 @@ ::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
189
201
const uint32_t * offsets = gmat.cut .cut_ptrs_ .ConstDevicePointer ();
190
202
const size_t nbins = gmat.nbins ;
191
203
192
- const size_t work_group_size = params .work_group_size ;
193
- const size_t block_size = params .block .size ;
194
- const size_t nblocks = params .block .nblocks ;
204
+ const size_t work_group_size = dispatcher .work_group_size ;
205
+ const size_t block_size = dispatcher .block .size ;
206
+ const size_t nblocks = dispatcher .block .nblocks ;
195
207
196
208
GradientPairT* hist_buffer_data = hist_buffer->Data ();
197
209
@@ -239,20 +251,8 @@ ::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
239
251
});
240
252
241
253
GradientPairT* hist_data = hist->Data ();
242
- auto event_save = qu->submit ([&](::sycl::handler& cgh) {
243
- cgh.depends_on (event_main);
244
- cgh.parallel_for <>(::sycl::range<1 >(nbins), [=](::sycl::item<1 > pid) {
245
- size_t idx_bin = pid.get_id (0 );
246
-
247
- GradientPairT gpair = {0 , 0 };
248
-
249
- for (size_t j = 0 ; j < nblocks; ++j) {
250
- gpair += hist_buffer_data[j * nbins + idx_bin];
251
- }
252
-
253
- hist_data[idx_bin] = gpair;
254
- });
255
- });
254
+ auto event_save = ReduceHist (qu, hist_data, hist_buffer_data, nblocks,
255
+ nbins, event_main);
256
256
return event_save;
257
257
}
258
258
@@ -263,7 +263,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
263
263
const RowSetCollection::Elem& row_indices,
264
264
const GHistIndexMatrix& gmat,
265
265
GHistRow<FPType, MemoryType::on_device>* hist,
266
- const tree::HistBuildParameters& params ,
266
+ const tree::HistDispatcher<FPType>& dispatcher ,
267
267
::sycl::event event_priv) {
268
268
const size_t size = row_indices.Size ();
269
269
const size_t * rid = row_indices.begin ;
@@ -275,7 +275,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
275
275
FPType* hist_data = reinterpret_cast <FPType*>(hist->Data ());
276
276
const size_t nbins = gmat.nbins ;
277
277
278
- size_t work_group_size = params .work_group_size ;
278
+ size_t work_group_size = dispatcher .work_group_size ;
279
279
const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0 );
280
280
281
281
auto event_fill = qu->fill (hist_data, FPType (0 ), nbins * 2 , event_priv);
@@ -321,47 +321,47 @@ ::sycl::event BuildHistDispatchKernel(
321
321
GHistRow<FPType, MemoryType::on_device>* hist,
322
322
bool isDense,
323
323
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
324
- const tree::HistDispatcher& dispatcher ,
324
+ const tree::DeviceProperties& device_prop ,
325
325
::sycl::event events_priv,
326
326
bool force_atomic_use) {
327
327
const size_t size = row_indices.Size ();
328
328
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride ;
329
329
const size_t nbins = gmat.nbins ;
330
330
const size_t max_num_bins = gmat.max_num_bins ;
331
331
const size_t min_num_bins = gmat.min_num_bins ;
332
- using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
333
332
334
- size_t max_n_blocks = hist_buffer->Size () / (nbins * 2 );
335
- auto build_params = dispatcher.GetHistBuildParameters <GradientPairT>
336
- (isDense, size, max_n_blocks, nbins, n_columns, max_num_bins, min_num_bins);
333
+ size_t max_n_blocks = hist_buffer->Size () / nbins;
334
+ auto dispatcher = tree::HistDispatcher<FPType>
335
+ (device_prop, isDense, size, max_n_blocks, nbins,
336
+ n_columns, max_num_bins, min_num_bins);
337
337
338
338
// force_atomic_use flag is used only for testing
339
- bool use_atomic = build_params .use_atomics || force_atomic_use;
339
+ bool use_atomic = dispatcher .use_atomics || force_atomic_use;
340
340
if (!use_atomic) {
341
341
if (isDense) {
342
- if (build_params .use_local_hist ) {
342
+ if (dispatcher .use_local_hist ) {
343
343
return BuildHistKernelLocal<FPType, BinIdxType>(qu, gpair, row_indices,
344
344
gmat, hist, hist_buffer,
345
- build_params , events_priv);
345
+ dispatcher , events_priv);
346
346
} else {
347
347
return BuildHistKernel<FPType, BinIdxType, true >(qu, gpair, row_indices,
348
348
gmat, hist, hist_buffer,
349
- build_params , events_priv);
349
+ dispatcher , events_priv);
350
350
}
351
351
} else {
352
352
return BuildHistKernel<FPType, uint32_t , false >(qu, gpair, row_indices,
353
353
gmat, hist, hist_buffer,
354
- build_params , events_priv);
354
+ dispatcher , events_priv);
355
355
}
356
356
} else {
357
357
if (isDense) {
358
358
return BuildHistKernel<FPType, BinIdxType, true >(qu, gpair, row_indices,
359
359
gmat, hist,
360
- build_params , events_priv);
360
+ dispatcher , events_priv);
361
361
} else {
362
362
return BuildHistKernel<FPType, uint32_t , false >(qu, gpair, row_indices,
363
363
gmat, hist,
364
- build_params , events_priv);
364
+ dispatcher , events_priv);
365
365
}
366
366
}
367
367
}
@@ -373,27 +373,27 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
373
373
const GHistIndexMatrix& gmat, const bool isDense,
374
374
GHistRow<FPType, MemoryType::on_device>* hist,
375
375
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
376
- const tree::HistDispatcher& dispatcher ,
376
+ const tree::DeviceProperties& device_prop ,
377
377
::sycl::event event_priv,
378
378
bool force_atomic_use) {
379
379
const bool is_dense = isDense;
380
380
switch (gmat.index .GetBinTypeSize ()) {
381
381
case BinTypeSize::kUint8BinsTypeSize :
382
382
return BuildHistDispatchKernel<FPType, uint8_t >(qu, gpair, row_indices,
383
383
gmat, hist, is_dense, hist_buffer,
384
- dispatcher ,
384
+ device_prop ,
385
385
event_priv, force_atomic_use);
386
386
break ;
387
387
case BinTypeSize::kUint16BinsTypeSize :
388
388
return BuildHistDispatchKernel<FPType, uint16_t >(qu, gpair, row_indices,
389
389
gmat, hist, is_dense, hist_buffer,
390
- dispatcher ,
390
+ device_prop ,
391
391
event_priv, force_atomic_use);
392
392
break ;
393
393
case BinTypeSize::kUint32BinsTypeSize :
394
394
return BuildHistDispatchKernel<FPType, uint32_t >(qu, gpair, row_indices,
395
395
gmat, hist, is_dense, hist_buffer,
396
- dispatcher ,
396
+ device_prop ,
397
397
event_priv, force_atomic_use);
398
398
break ;
399
399
default :
@@ -409,12 +409,12 @@ ::sycl::event GHistBuilder<GradientSumT>::BuildHist(
409
409
GHistRowT<MemoryType::on_device>* hist,
410
410
bool isDense,
411
411
GHistRowT<MemoryType::on_device>* hist_buffer,
412
- const tree::HistDispatcher& dispatcher ,
412
+ const tree::DeviceProperties& device_prop ,
413
413
::sycl::event event_priv,
414
414
bool force_atomic_use) {
415
415
return BuildHistKernel<GradientSumT>(qu_, gpair, row_indices, gmat,
416
416
isDense, hist, hist_buffer,
417
- dispatcher , event_priv,
417
+ device_prop , event_priv,
418
418
force_atomic_use);
419
419
}
420
420
@@ -426,7 +426,7 @@ ::sycl::event GHistBuilder<float>::BuildHist(
426
426
GHistRow<float , MemoryType::on_device>* hist,
427
427
bool isDense,
428
428
GHistRow<float , MemoryType::on_device>* hist_buffer,
429
- const tree::HistDispatcher& dispatcher ,
429
+ const tree::DeviceProperties& device_prop ,
430
430
::sycl::event event_priv,
431
431
bool force_atomic_use);
432
432
template
@@ -437,7 +437,7 @@ ::sycl::event GHistBuilder<double>::BuildHist(
437
437
GHistRow<double , MemoryType::on_device>* hist,
438
438
bool isDense,
439
439
GHistRow<double , MemoryType::on_device>* hist_buffer,
440
- const tree::HistDispatcher& dispatcher ,
440
+ const tree::DeviceProperties& device_prop ,
441
441
::sycl::event event_priv,
442
442
bool force_atomic_use);
443
443
0 commit comments