Fix integer overflow in compiled binaryop (#17354)

wence- · web-flow · commit c7bfa779b6b6 · 2024-11-19T17:44:51.000Z
For large columns, the computed stride might end up overflowing size_type. To fix this, use the grid_1d helper. See also #10368. - Closes #17353 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Muhammad Haseeb (https://github.com/mhaseeb123) - Nghia Truong (https://github.com/ttnghia) URL: #17354
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -21,7 +21,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/unary.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -253,16 +253,11 @@ struct binary_op_double_device_dispatcher {
 template <typename Functor>
 CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f)
 {
-  int tid    = threadIdx.x;
-  int blkid  = blockIdx.x;
-  int blksz  = blockDim.x;
-  int gridsz = gridDim.x;
-
-  int start = tid + blkid * blksz;
-  int step  = blksz * gridsz;
+  auto start        = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
 #pragma unroll
-  for (cudf::size_type i = start; i < size; i += step) {
+  for (auto i = start; i < size; i += stride) {
     f(i);
   }
 }
@@ -282,9 +277,9 @@ void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
   int min_grid_size;
   CUDF_CUDA_TRY(
     cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel<decltype(f)>));
-  // 2 elements per thread.
-  int const grid_size = util::div_rounding_up_safe(size, 2 * block_size);
-  for_each_kernel<<<grid_size, block_size, 0, stream.value()>>>(size, std::forward<Functor&&>(f));
+  auto grid = cudf::detail::grid_1d(size, block_size, 2 /* elements_per_thread */);
+  for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    size, std::forward<Functor&&>(f));
 }
 
 template <class BinaryOperator>
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -23,9 +23,11 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/aggregation.hpp>
 #include <cudf/binaryop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
@@ -820,4 +822,24 @@ TEST_F(BinaryOperationCompiledTest_NullOpsString, NullMin_Vector_Vector)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result->view());
 }
 
+TEST(BinaryOperationCompiledTest, LargeColumnNoOverflow)
+{
+  cudf::size_type num_rows{1'799'989'091};
+  auto big = cudf::make_column_from_scalar(
+    cudf::numeric_scalar<cudf::id_to_type<cudf::type_id::INT8>>{10, true}, num_rows);
+  auto small = cudf::make_column_from_scalar(
+    cudf::numeric_scalar<cudf::id_to_type<cudf::type_id::INT8>>{1, true}, num_rows);
+
+  auto mask = cudf::binary_operation(big->view(),
+                                     small->view(),
+                                     cudf::binary_operator::GREATER,
+                                     cudf::data_type{cudf::type_id::BOOL8});
+
+  auto agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto result =
+    cudf::reduce(mask->view(), *agg, cudf::data_type{cudf::type_to_id<cudf::size_type>()});
+  auto got = static_cast<cudf::numeric_scalar<cudf::size_type>*>(result.get())->value();
+  EXPECT_EQ(num_rows, got);
+}
+
 CUDF_TEST_PROGRAM_MAIN()