Skip to content

Commit b11d452

Browse files
authored
[sycl] Improve L1 cache locality for histogram building. (#11555)
--------- Co-authored-by: Dmitry Razdoburdin <>
1 parent 804c5e8 commit b11d452

File tree

9 files changed

+379
-59
lines changed

9 files changed

+379
-59
lines changed

ops/conda_env/linux_sycl_test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,6 @@ dependencies:
2020
- pytest-cov
2121
- dask=2024.11
2222
- ninja
23-
- dpcpp_linux-64
23+
- dpcpp_linux-64>=2024.2.1
2424
- onedpl-devel
2525
- intel-openmp

plugin/sycl/common/hist_util.cc

Lines changed: 129 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <algorithm>
88

99
#include "../data/gradient_index.h"
10+
#include "../tree/hist_dispatcher.h"
1011
#include "hist_util.h"
1112

1213
#include <sycl/sycl.hpp>
@@ -91,28 +92,27 @@ template ::sycl::event SubtractionHist(::sycl::queue* qu,
9192
const GHistRow<double, MemoryType::on_device>& src2,
9293
size_t size, ::sycl::event event_priv);
9394

94-
inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
95-
struct _ {
96-
size_t block_size, nblocks;
97-
};
95+
template <typename GradientPairT>
96+
::sycl::event ReduceHist(::sycl::queue* qu, GradientPairT* hist_data,
97+
GradientPairT* hist_buffer_data,
98+
size_t nblocks, size_t nbins,
99+
const ::sycl::event& event_main) {
100+
auto event_save = qu->submit([&](::sycl::handler& cgh) {
101+
cgh.depends_on(event_main);
102+
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
103+
size_t idx_bin = pid.get_id(0);
98104

99-
const size_t min_block_size = 32;
100-
const size_t max_compute_units =
101-
qu->get_device().get_info<::sycl::info::device::max_compute_units>();
105+
GradientPairT gpair = {0, 0};
102106

103-
size_t nblocks = max_compute_units;
107+
for (size_t j = 0; j < nblocks; ++j) {
108+
gpair += hist_buffer_data[j * nbins + idx_bin];
109+
}
104110

105-
size_t block_size = size / nblocks + !!(size % nblocks);
106-
if (block_size > (1u << 12)) {
107-
nblocks = max_nblocks;
108-
block_size = size / nblocks + !!(size % nblocks);
109-
}
110-
if (block_size < min_block_size) {
111-
block_size = min_block_size;
112-
nblocks = size / block_size + !!(size % block_size);
113-
}
111+
hist_data[idx_bin] = gpair;
112+
});
113+
});
114114

115-
return _{block_size, nblocks};
115+
return event_save;
116116
}
117117

118118
// Kernel with buffer using
@@ -123,6 +123,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
123123
const GHistIndexMatrix& gmat,
124124
GHistRow<FPType, MemoryType::on_device>* hist,
125125
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
126+
const tree::HistDispatcher<FPType>& dispatcher,
126127
::sycl::event event_priv) {
127128
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
128129
const size_t size = row_indices.Size();
@@ -133,18 +134,13 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
133134
const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
134135
const size_t nbins = gmat.nbins;
135136

136-
const size_t max_work_group_size =
137-
qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
138-
const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
139-
140-
// Captured structured bindings are a C++20 extension
141-
const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
142-
const size_t block_size = block_params.block_size;
143-
const size_t nblocks = block_params.nblocks;
137+
const size_t work_group_size = dispatcher.work_group_size;
138+
const size_t block_size = dispatcher.block.size;
139+
const size_t nblocks = dispatcher.block.nblocks;
144140

145141
GradientPairT* hist_buffer_data = hist_buffer->Data();
146142
auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
147-
nblocks * nbins * 2, event_priv);
143+
nblocks * nbins, event_priv);
148144
auto event_main = qu->submit([&](::sycl::handler& cgh) {
149145
cgh.depends_on(event_fill);
150146
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
@@ -179,20 +175,84 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
179175
});
180176

181177
GradientPairT* hist_data = hist->Data();
182-
auto event_save = qu->submit([&](::sycl::handler& cgh) {
183-
cgh.depends_on(event_main);
184-
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
185-
size_t idx_bin = pid.get_id(0);
178+
auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
179+
nbins, event_main);
186180

187-
GradientPairT gpair = {0, 0};
181+
return event_save;
182+
}
188183

189-
for (size_t j = 0; j < nblocks; ++j) {
190-
gpair += hist_buffer_data[j * nbins + idx_bin];
191-
}
184+
// Kernel with buffer and local hist using
185+
template<typename FPType, typename BinIdxType>
186+
::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
187+
const HostDeviceVector<GradientPair>& gpair,
188+
const RowSetCollection::Elem& row_indices,
189+
const GHistIndexMatrix& gmat,
190+
GHistRow<FPType, MemoryType::on_device>* hist,
191+
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
192+
const tree::HistDispatcher<FPType>& dispatcher,
193+
::sycl::event event_priv) {
194+
constexpr int kMaxNumBins = tree::HistDispatcher<FPType>::KMaxNumBins;
195+
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
196+
const size_t size = row_indices.Size();
197+
const size_t* rid = row_indices.begin;
198+
const size_t n_columns = gmat.nfeatures;
199+
const auto* pgh = gpair.ConstDevicePointer();
200+
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
201+
const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
202+
const size_t nbins = gmat.nbins;
192203

193-
hist_data[idx_bin] = gpair;
204+
const size_t work_group_size = dispatcher.work_group_size;
205+
const size_t block_size = dispatcher.block.size;
206+
const size_t nblocks = dispatcher.block.nblocks;
207+
208+
GradientPairT* hist_buffer_data = hist_buffer->Data();
209+
210+
auto event_main = qu->submit([&](::sycl::handler& cgh) {
211+
cgh.depends_on(event_priv);
212+
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
213+
::sycl::range<2>(1, work_group_size)),
214+
[=](::sycl::nd_item<2> pid) {
215+
size_t block = pid.get_global_id(0);
216+
size_t feat = pid.get_global_id(1);
217+
218+
// This buffer will be kept in L1/registers
219+
GradientPairT hist_fast[kMaxNumBins];
220+
221+
GradientPairT* hist_local = hist_buffer_data + block * nbins;
222+
for (size_t fid = feat; fid < n_columns; fid += work_group_size) {
223+
size_t n_bins_feature = offsets[fid+1] - offsets[fid];
224+
225+
// Not all elements of hist_fast are actually used: n_bins_feature <= kMaxNumBins
226+
// We initialize only the required elements to prevent the unused ones from going to cache.
227+
for (int bin = 0; bin < n_bins_feature; ++bin) {
228+
hist_fast[bin] = {0, 0};
229+
}
230+
231+
for (size_t idx = 0; idx < block_size; ++idx) {
232+
size_t i = block * block_size + idx;
233+
if (i < size) {
234+
size_t row_id = rid[i];
235+
236+
const size_t icol_start = n_columns * row_id;
237+
const GradientPairT pgh_row(pgh[row_id].GetGrad(),
238+
pgh[row_id].GetHess());
239+
240+
const BinIdxType* gr_index_local = gradient_index + icol_start;
241+
uint32_t idx_bin = gr_index_local[fid];
242+
243+
hist_fast[idx_bin] += pgh_row;
244+
}
245+
}
246+
for (int bin = 0 ; bin < n_bins_feature; ++bin) {
247+
hist_local[bin + offsets[fid]] = hist_fast[bin];
248+
}
249+
}
194250
});
195251
});
252+
253+
GradientPairT* hist_data = hist->Data();
254+
auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
255+
nbins, event_main);
196256
return event_save;
197257
}
198258

@@ -203,6 +263,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
203263
const RowSetCollection::Elem& row_indices,
204264
const GHistIndexMatrix& gmat,
205265
GHistRow<FPType, MemoryType::on_device>* hist,
266+
const tree::HistDispatcher<FPType>& dispatcher,
206267
::sycl::event event_priv) {
207268
const size_t size = row_indices.Size();
208269
const size_t* rid = row_indices.begin;
@@ -214,7 +275,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
214275
FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
215276
const size_t nbins = gmat.nbins;
216277

217-
constexpr size_t work_group_size = 32;
278+
size_t work_group_size = dispatcher.work_group_size;
218279
const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);
219280

220281
auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
@@ -260,34 +321,47 @@ ::sycl::event BuildHistDispatchKernel(
260321
GHistRow<FPType, MemoryType::on_device>* hist,
261322
bool isDense,
262323
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
324+
const tree::DeviceProperties& device_prop,
263325
::sycl::event events_priv,
264326
bool force_atomic_use) {
265327
const size_t size = row_indices.Size();
266328
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
267329
const size_t nbins = gmat.nbins;
330+
const size_t max_num_bins = gmat.max_num_bins;
331+
const size_t min_num_bins = gmat.min_num_bins;
268332

269-
// TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
270-
bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
333+
size_t max_n_blocks = hist_buffer->Size() / nbins;
334+
auto dispatcher = tree::HistDispatcher<FPType>
335+
(device_prop, isDense, size, max_n_blocks, nbins,
336+
n_columns, max_num_bins, min_num_bins);
271337

272338
// force_atomic_use flag is used only for testing
273-
use_atomic = use_atomic || force_atomic_use;
339+
bool use_atomic = dispatcher.use_atomics || force_atomic_use;
274340
if (!use_atomic) {
275341
if (isDense) {
276-
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
277-
gmat, hist, hist_buffer,
278-
events_priv);
342+
if (dispatcher.use_local_hist) {
343+
return BuildHistKernelLocal<FPType, BinIdxType>(qu, gpair, row_indices,
344+
gmat, hist, hist_buffer,
345+
dispatcher, events_priv);
346+
} else {
347+
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
348+
gmat, hist, hist_buffer,
349+
dispatcher, events_priv);
350+
}
279351
} else {
280352
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
281353
gmat, hist, hist_buffer,
282-
events_priv);
354+
dispatcher, events_priv);
283355
}
284356
} else {
285357
if (isDense) {
286358
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
287-
gmat, hist, events_priv);
359+
gmat, hist,
360+
dispatcher, events_priv);
288361
} else {
289362
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
290-
gmat, hist, events_priv);
363+
gmat, hist,
364+
dispatcher, events_priv);
291365
}
292366
}
293367
}
@@ -299,23 +373,27 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
299373
const GHistIndexMatrix& gmat, const bool isDense,
300374
GHistRow<FPType, MemoryType::on_device>* hist,
301375
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
376+
const tree::DeviceProperties& device_prop,
302377
::sycl::event event_priv,
303378
bool force_atomic_use) {
304379
const bool is_dense = isDense;
305380
switch (gmat.index.GetBinTypeSize()) {
306381
case BinTypeSize::kUint8BinsTypeSize:
307382
return BuildHistDispatchKernel<FPType, uint8_t>(qu, gpair, row_indices,
308383
gmat, hist, is_dense, hist_buffer,
384+
device_prop,
309385
event_priv, force_atomic_use);
310386
break;
311387
case BinTypeSize::kUint16BinsTypeSize:
312388
return BuildHistDispatchKernel<FPType, uint16_t>(qu, gpair, row_indices,
313389
gmat, hist, is_dense, hist_buffer,
390+
device_prop,
314391
event_priv, force_atomic_use);
315392
break;
316393
case BinTypeSize::kUint32BinsTypeSize:
317394
return BuildHistDispatchKernel<FPType, uint32_t>(qu, gpair, row_indices,
318395
gmat, hist, is_dense, hist_buffer,
396+
device_prop,
319397
event_priv, force_atomic_use);
320398
break;
321399
default:
@@ -331,10 +409,12 @@ ::sycl::event GHistBuilder<GradientSumT>::BuildHist(
331409
GHistRowT<MemoryType::on_device>* hist,
332410
bool isDense,
333411
GHistRowT<MemoryType::on_device>* hist_buffer,
412+
const tree::DeviceProperties& device_prop,
334413
::sycl::event event_priv,
335414
bool force_atomic_use) {
336415
return BuildHistKernel<GradientSumT>(qu_, gpair, row_indices, gmat,
337-
isDense, hist, hist_buffer, event_priv,
416+
isDense, hist, hist_buffer,
417+
device_prop, event_priv,
338418
force_atomic_use);
339419
}
340420

@@ -346,6 +426,7 @@ ::sycl::event GHistBuilder<float>::BuildHist(
346426
GHistRow<float, MemoryType::on_device>* hist,
347427
bool isDense,
348428
GHistRow<float, MemoryType::on_device>* hist_buffer,
429+
const tree::DeviceProperties& device_prop,
349430
::sycl::event event_priv,
350431
bool force_atomic_use);
351432
template
@@ -356,6 +437,7 @@ ::sycl::event GHistBuilder<double>::BuildHist(
356437
GHistRow<double, MemoryType::on_device>* hist,
357438
bool isDense,
358439
GHistRow<double, MemoryType::on_device>* hist_buffer,
440+
const tree::DeviceProperties& device_prop,
359441
::sycl::event event_priv,
360442
bool force_atomic_use);
361443

plugin/sycl/common/hist_util.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "../../src/common/hist_util.h"
1616
#include "../data/gradient_index.h"
17+
#include "../tree/hist_dispatcher.h"
1718

1819
#include <sycl/sycl.hpp>
1920

@@ -123,7 +124,7 @@ class ParallelGHistBuilder {
123124
}
124125

125126
void Reset(size_t nblocks) {
126-
hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2);
127+
hist_device_buffer_.Resize(qu_, nblocks * nbins_);
127128
}
128129

129130
GHistRowT& GetDeviceBuffer() {
@@ -161,6 +162,7 @@ class GHistBuilder {
161162
GHistRowT<MemoryType::on_device>* HistCollection,
162163
bool isDense,
163164
GHistRowT<MemoryType::on_device>* hist_buffer,
165+
const tree::DeviceProperties& device_prop,
164166
::sycl::event event,
165167
bool force_atomic_use = false);
166168

plugin/sycl/data/gradient_index.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,14 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
121121
max_num_bins = max_bins;
122122
nbins = cut.Ptrs().back();
123123

124+
min_num_bins = nbins;
125+
const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
126+
for (unsigned fid = 0; fid < n_offsets; ++fid) {
127+
auto ibegin = cut.cut_ptrs_.ConstHostVector()[fid];
128+
auto iend = cut.cut_ptrs_.ConstHostVector()[fid + 1];
129+
min_num_bins = std::min<size_t>(min_num_bins, iend - ibegin);
130+
}
131+
124132
hit_count.SetDevice(ctx->Device());
125133
hit_count.Resize(nbins, 0);
126134

@@ -141,7 +149,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
141149
row_stride = nfeatures;
142150
n_rows = dmat->Info().num_row_;
143151
}
144-
const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
152+
145153
const size_t n_index = n_rows * row_stride;
146154
ResizeIndex(qu, n_index);
147155

plugin/sycl/data/gradient_index.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ struct GHistIndexMatrix {
8686
/*! \brief The corresponding cuts */
8787
xgboost::common::HistogramCuts cut;
8888
size_t max_num_bins;
89+
size_t min_num_bins;
8990
size_t nbins;
9091
size_t nfeatures;
9192
size_t row_stride;

0 commit comments

Comments
 (0)