21
21
22
22
#include < cudf/column/column_device_view.cuh>
23
23
#include < cudf/column/column_view.hpp>
24
- #include < cudf/detail/utilities/integer_utils.hpp >
24
+ #include < cudf/detail/utilities/cuda.cuh >
25
25
#include < cudf/unary.hpp>
26
26
27
27
#include < rmm/cuda_stream_view.hpp>
@@ -253,16 +253,11 @@ struct binary_op_double_device_dispatcher {
253
253
template <typename Functor>
254
254
CUDF_KERNEL void for_each_kernel (cudf::size_type size, Functor f)
255
255
{
256
- int tid = threadIdx .x ;
257
- int blkid = blockIdx .x ;
258
- int blksz = blockDim .x ;
259
- int gridsz = gridDim .x ;
260
-
261
- int start = tid + blkid * blksz;
262
- int step = blksz * gridsz;
256
+ auto start = cudf::detail::grid_1d::global_thread_id ();
257
+ auto const stride = cudf::detail::grid_1d::grid_stride ();
263
258
264
259
#pragma unroll
265
- for (cudf::size_type i = start; i < size; i += step ) {
260
+ for (auto i = start; i < size; i += stride ) {
266
261
f (i);
267
262
}
268
263
}
@@ -282,9 +277,9 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
282
277
int min_grid_size;
283
278
CUDF_CUDA_TRY (
284
279
cudaOccupancyMaxPotentialBlockSize (&min_grid_size, &block_size, for_each_kernel<decltype (f)>));
285
- // 2 elements per thread.
286
- int const grid_size = util::div_rounding_up_safe (size, 2 * block_size);
287
- for_each_kernel <<<grid_size, block_size, 0 , stream.value()>>> ( size, std::forward<Functor&&>(f));
280
+ auto grid = cudf::detail::grid_1d (size, block_size, 2 /* elements_per_thread */ );
281
+ for_each_kernel <<<grid.num_blocks, grid.num_threads_per_block, 0 , stream.value()>>> (
282
+ size, std::forward<Functor&&>(f));
288
283
}
289
284
290
285
template <class BinaryOperator >
0 commit comments