diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 93401f01026..f7984b29d6b 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
+#include <cudf/detail/utilities/cuda.cuh>
 #include
 #include
 #include
@@ -34,7 +35,7 @@
 CUDF_KERNEL void init_curand(curandState* state, int const nstates)
 {
-  int ithread = threadIdx.x + blockIdx.x * blockDim.x;
+  int ithread = cudf::detail::grid_1d::global_thread_id();
 
   if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
 }
@@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
                                curandState* state,
                                int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
-  for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     double const x = curand_uniform_double(&localState);
 
     build_tbl[idx] = static_cast<key_type>(x * (build_tbl_size / multiplicity));
@@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
                                curandState* state,
                                int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
-  for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     key_type val;
     double x = curand_uniform_double(&localState);
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index 161328ae088..3aff75d840e 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -60,13 +60,15 @@ constexpr int block_size = 256;
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
 {
-  using F               = Functor<T, functor_type>;
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < n_rows) {
+  using F           = Functor<T, functor_type>;
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (int c = 0; c < n_cols; c++) {
       A[c][index] = F::f(A[c][index]);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }
 }
 
@@ -74,12 +76,14 @@ CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
 {
-  using F               = Functor<T, functor_type>;
-  T* A                  = source_column.data<T>();
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < source_column.size()) {
-    A[index] = F::f(A[index]);
-    index += blockDim.x * gridDim.x;
+  using F           = Functor<T, functor_type>;
+  T* A              = source_column.data<T>();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < source_column.size()) {
+    auto const index = static_cast<cudf::size_type>(tidx);
+    A[index]         = F::f(A[index]);
+    tidx += stride;
   }
 }
 
@@ -127,14 +131,15 @@ template <FunctorType functor_type>
 CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
 {
   cudf::size_type const n_rows = source.num_rows();
-  cudf::size_type index        = threadIdx.x + blockIdx.x * blockDim.x;
-
-  while (index < n_rows) {
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride =
cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (cudf::size_type i = 0; i < source.num_columns(); i++) {
       cudf::type_dispatcher(
         source.column(i).type(), RowHandle<functor_type>{}, source.column(i), index);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }  // while
 }
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index ac5cb0ad141..8418e279ce7 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -45,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL
                          mutable_column_device_view out,
                          size_type* __restrict__ const valid_count)
 {
-  size_type const tid            = threadIdx.x + blockIdx.x * block_size;
-  int const warp_id              = tid / warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / warp_size;
+  auto tidx                      = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride              = cudf::detail::grid_1d::grid_stride<block_size>();
+  int const warp_id              = tidx / cudf::detail::warp_size;
+  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
 
   // begin/end indices for the column data
-  size_type begin = 0;
-  size_type end   = out.size();
+  size_type const begin = 0;
+  size_type const end   = out.size();
 
   // warp indices.  since 1 warp == 32 threads == sizeof(bitmask_type) * 8,
   // each warp will process one (32 bit) of the validity mask via
   // __ballot_sync()
-  size_type warp_begin = cudf::word_index(begin);
-  size_type warp_end   = cudf::word_index(end - 1);
+  size_type const warp_begin = cudf::word_index(begin);
+  size_type const warp_end   = cudf::word_index(end - 1);
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % warp_size;
+  int const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
   // current warp.
   size_type warp_cur = warp_begin + warp_id;
-  size_type index    = tid;
   while (warp_cur <= warp_end) {
+    auto const index = static_cast<size_type>(tidx);
     auto const opt_value =
       (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
     if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
@@ -85,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
     // next grid
     warp_cur += warps_per_grid;
-    index += block_size * gridDim.x;
+    tidx += stride;
   }
 
   if (has_nulls) {
@@ -159,7 +160,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
   using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
 
   size_type size           = std::distance(lhs_begin, lhs_end);
-  size_type num_els        = cudf::util::round_up_safe(size, warp_size);
+  size_type num_els        = cudf::util::round_up_safe(size, cudf::detail::warp_size);
   constexpr int block_size = 256;
   cudf::detail::grid_1d grid{num_els, block_size, 1};
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 86c85ca8d06..f1775c6d6d7 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -93,6 +93,19 @@ class grid_1d {
     return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
   }
 
+  /**
+   * @brief Returns the global thread index of the current thread in a 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The global thread index
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type global_thread_id()
+  {
+    return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
+  }
+
   /**
    * @brief Returns the stride of a 1D grid.
    *
@@ -115,6 +128,19 @@ class grid_1d {
    * @return thread_index_type The number of threads in the grid.
    */
   static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type grid_stride()
+  {
+    return grid_stride(num_threads_per_block, gridDim.x);
+  }
 };
 
 /**
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 66163d6059a..64a3c4edf78 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  auto i            = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto i            = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 4da2e502ce6..d0faeea8336 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
-  thread_index_type const tid    = grid_1d::global_thread_id();
-  thread_index_type const stride = grid_1d::grid_stride();
+  thread_index_type const tid    = grid_1d::global_thread_id<block_size>();
+  thread_index_type const stride = grid_1d::grid_stride<block_size>();
   thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b1136a9eeb3..47e74a5cb48 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -121,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
                                           size_type number_of_mask_bits,
                                           size_type* out_valid_count)
 {
-  auto tidx         = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits);
   size_type warp_valid_count = 0;
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 5e190eb2b27..1e16c451f5a 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
   std::size_t thread_counter{0};
-  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
-  auto const stride    = cudf::detail::grid_1d::grid_stride();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride    = cudf::detail::grid_1d::grid_stride<block_size>();
 
   cudf::thread_index_type const left_num_rows  = left_table.num_rows();
   cudf::thread_index_type const right_num_rows = right_table.num_rows();
@@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
 
   __syncwarp();
 
-  auto outer_row_index = cudf::detail::grid_1d::global_thread_id();
+  auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>();
 
   unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows);
 
@@ -295,8 +295,8 @@ CUDF_KERNEL void conditional_join_anti_semi(
   int const lane_id = threadIdx.x % detail::warp_size;
   cudf::thread_index_type const outer_num_rows = left_table.num_rows();
   cudf::thread_index_type const inner_num_rows = right_table.num_rows();
-  auto const stride    = cudf::detail::grid_1d::grid_stride();
-  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride<block_size>();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
   if (0 == lane_id) { current_idx_shared[warp_id] = 0; }
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 459c3e88a4e..d9920be045f 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -202,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
   __shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
 
   char* in_chars_shared = temporary_buffer[local_warp_id];
@@ -287,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   __shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
   __shared__ size_type out_idx[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const
global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane     = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
 
   char* in_chars_shared = temporary_buffer[local_warp_id];