From 1777c29840b0d8fce1799cee249fb5d44e7ddf74 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 7 Nov 2024 19:34:38 -0500
Subject: [PATCH] Allow generating large strings in benchmarks (#17224)

Updates the benchmark utility `create_random_utf8_string_column` to support large strings.
Replaces the hardcoded `size_type` offsets with the offsetalator and related utilities.

Reference #16948

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/17224
---
 cpp/benchmarks/common/generate_input.cu | 37 +++++++++++++------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index bdce8a31176..8bce718c7d8 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -23,11 +23,13 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -540,7 +542,7 @@ struct string_generator {
   // range 32-127 is ASCII; 127-136 will be multi-byte UTF-8
   {
   }
-  __device__ void operator()(thrust::tuple str_begin_end)
+  __device__ void operator()(thrust::tuple str_begin_end)
   {
     auto begin = thrust::get<0>(str_begin_end);
     auto end = thrust::get<1>(str_begin_end);
@@ -569,6 +571,9 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons
     distribution_params{1. - profile.get_null_probability().value_or(0)});
   auto lengths = len_dist(engine, num_rows + 1);
   auto null_mask = valid_dist(engine, num_rows + 1);
+  auto stream = cudf::get_default_stream();
+  auto mr = cudf::get_current_device_resource_ref();
+
   thrust::transform_if(
     thrust::device,
     lengths.begin(),
@@ -580,28 +585,26 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons
   auto valid_lengths = thrust::make_transform_iterator(
     thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())),
     valid_or_zero{});
-  rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream());
-  thrust::exclusive_scan(
-    thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin());
-  // offsets are ready.
-  auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1);
+
+  // offsets are created as INT32 or INT64 as appropriate
+  auto [offsets, chars_length] = cudf::strings::detail::make_offsets_child_column(
+    valid_lengths, valid_lengths + num_rows, stream, mr);
+  // use the offsetalator to normalize the offset values for use by the string_generator
+  auto offsets_itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
   rmm::device_uvector chars(chars_length, cudf::get_default_stream());
   thrust::for_each_n(thrust::device,
-                     thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1),
+                     thrust::make_zip_iterator(offsets_itr, offsets_itr + 1),
                      num_rows,
                      string_generator{chars.data(), engine});
+
   auto [result_bitmask, null_count] =
-    cudf::detail::valid_if(null_mask.begin(),
-                           null_mask.end() - 1,
-                           thrust::identity{},
-                           cudf::get_default_stream(),
-                           cudf::get_current_device_resource_ref());
+    profile.get_null_probability().has_value()
+      ? cudf::detail::valid_if(
+          null_mask.begin(), null_mask.end() - 1, thrust::identity{}, stream, mr)
+      : std::pair{rmm::device_buffer{}, 0};
+
   return cudf::make_strings_column(
-    num_rows,
-    std::make_unique(std::move(offsets), rmm::device_buffer{}, 0),
-    chars.release(),
-    null_count,
-    profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{});
+    num_rows, std::move(offsets), chars.release(), null_count, std::move(result_bitmask));
 }

 /**