diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index d6fc5dc6039..e61a8e6e1e6 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -384,6 +384,7 @@ ConfigureNVBench(
   string/join_strings.cpp
   string/lengths.cpp
   string/like.cpp
+  string/make_strings_column.cu
   string/replace_re.cpp
   string/reverse.cpp
   string/slice.cpp
diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu
new file mode 100644
index 00000000000..e86824b9f40
--- /dev/null
+++ b/cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+
+namespace {
+
+constexpr int min_row_width = 0;
+constexpr int max_row_width = 50;
+
+using string_index_pair = thrust::pair<char const*, cudf::size_type>;
+
+template <bool batch_construction>
+std::vector<std::unique_ptr<cudf::column>> make_strings_columns(
+  std::vector<cudf::device_span<string_index_pair const>> const& input,
+  rmm::cuda_stream_view stream)
+{
+  if constexpr (batch_construction) {
+    return cudf::make_strings_column_batch(input, stream);
+  } else {
+    std::vector<std::unique_ptr<cudf::column>> output;
+    output.reserve(input.size());
+    for (auto const& column_input : input) {
+      output.emplace_back(cudf::make_strings_column(column_input, stream));
+    }
+    return output;
+  }
+}
+
+}  // namespace
+
+static void BM_make_strings_column_batch(nvbench::state& state)
+{
+  auto const num_rows   = static_cast(state.get_int64("num_rows"));
+  auto const batch_size = static_cast(state.get_int64("batch_size"));
+  auto const has_nulls  = true;
+
+  data_profile const table_profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
+      .null_probability(has_nulls ? std::optional{0.1} : std::nullopt);
+  auto const data_table = create_random_table(
+    cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);
+
+  auto const stream = cudf::get_default_stream();
+  auto input_data   = std::vector<rmm::device_uvector<string_index_pair>>{};
+  auto input        = std::vector<cudf::device_span<string_index_pair const>>{};
+  input_data.reserve(batch_size);
+  input.reserve(batch_size);
+  for (auto const& cv : data_table->view()) {
+    auto const d_data_ptr = cudf::column_device_view::create(cv, stream);
+    auto batch_input      = rmm::device_uvector<string_index_pair>(cv.size(), stream);
+    thrust::tabulate(rmm::exec_policy(stream),
+                     batch_input.begin(),
+                     batch_input.end(),
+                     [data_col = *d_data_ptr] __device__(auto const idx) {
+                       if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+                       auto const row = data_col.element<cudf::string_view>(idx);
+                       return string_index_pair{row.data(), row.size_bytes()};
+                     });
+    input_data.emplace_back(std::move(batch_input));
+    input.emplace_back(input_data.back());
+  }
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto const output = make_strings_columns(input, stream);
+  });
+}
+
+NVBENCH_BENCH(BM_make_strings_column_batch)
+  .set_name("make_strings_column_batch")
+  .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000})
+  .add_int64_axis("batch_size", {10, 20, 50, 100});
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index c3b68b52c36..6bbe32de134 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size
+ * pairs.
+ *
+ * This function has input/output expectations similar to the `make_strings_column()` API that
+ * accepts only one device span of pointer/size pairs. The difference is that this function is
+ * designed to create many strings columns at once with minimal overhead from multiple kernel
+ * launches and stream synchronizations.
+ *
+ * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
+ * address or `nullptr` (indicating a null string), and each size is the string length (in bytes)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for memory allocation of the output columns
+ * @return Array of constructed strings columns
+ */
+std::vector<std::unique_ptr<column>> make_strings_column_batch(
+  std::vector<device_span<thrust::pair<char const*, size_type> const>> const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Construct a STRING type column given a device span of string_view.
* diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 1283226879b..fb0b25cf9f1 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include +#include #include #include @@ -38,6 +41,61 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Gather characters to create a strings column using the given string-index pair iterator + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param offsets The offsets for the output strings column + * @param chars_size The size (in bytes) of the chars data + * @param begin Iterator to the first string-index pair + * @param strings_count The number of strings + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return An array of chars gathered from the input string-index pair iterator + */ +template +rmm::device_uvector make_chars_buffer(column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. + // This should be fine as long as we only read but not write anything to the source. + return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = cudf::detail::make_counting_transform_iterator( + 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + return begin[idx].second; + })); + auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( + 0u, + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + strings_count, + stream.value())); + + return chars_data; +} + /** * @brief Create an offsets column to be a child of a compound column * diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..03240f418fe 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -49,16 +49,6 @@ namespace detail { */ using string_index_pair = thrust::pair; -/** - * @brief Average string byte-length threshold for deciding character-level - * vs. row-level parallel algorithm. 
- * - * This value was determined by running the factory_benchmark against different - * string lengths and observing the point where the performance is faster for - * long strings. - */ -constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; - /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +78,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +87,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto chars_data = + make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..8e00a29f8e9 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -16,36 +16,171 @@ #include #include +#include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include +#include #include #include +#include +#include namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? 
thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector> offsets_columns(num_columns); + rmm::device_uvector chars_sizes(num_columns, stream); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const& string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + null_masks.emplace_back( + cudf::create_null_mask(string_count, mask_state::UNINITIALIZED, stream, mr)); + + if (string_count == 0) { continue; } + + constexpr size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(string_count), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + string_pairs.data(), + string_count, + [] __device__(string_index_pair const pair) -> bool { return pair.first != nullptr; }, + d_valid_counts.data() + idx); + } + + auto const chars_sizes = cudf::detail::make_std_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_std_vector_async(d_valid_counts, stream); + + // Except for other stream syncs in `CUB` that we cannot control, + // this should be the only stream sync we need in the entire API. 
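+  // The synchronization guarantees that `chars_sizes` and `valid_counts`, which were copied to
+  // the host asynchronously above, are ready to be read on the host below.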
+ stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + if (chars_sizes[idx] >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. + for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> output(num_columns); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + + auto chars_data = make_chars_buffer( + offsets_cols[idx]->view(), chars_size, input[idx].data(), strings_count, stream, mr); + + auto const null_count = strings_count - valid_count; + output[idx] = make_strings_column( + strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + null_count, + null_count ? std::move(null_masks[idx]) : rmm::device_buffer{0, stream, mr}); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -53,10 +188,32 @@ std::unique_ptr make_strings_column( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? 
thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 799a84cbc37..a4213dcbe94 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -726,6 +726,7 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/factory_test.cpp streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp new file mode 100644 index 00000000000..36e595ab9fa --- /dev/null +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +class StringsFactoryTest : public cudf::test::BaseFixture {}; + +using string_pair = thrust::pair; + +TEST_F(StringsFactoryTest, StringConstructionFromPairs) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + auto const input = cudf::device_span{d_input.data(), d_input.size()}; + cudf::make_strings_column(input, stream); +} + +TEST_F(StringsFactoryTest, StringBatchConstruction) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + + std::vector> input( + 10, cudf::device_span{d_input.data(), d_input.size()}); + cudf::make_strings_column_batch(input, stream); +} diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..7eb429da7d9 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,8 @@ struct StringsFactoriesTest : public cudf::test::BaseFixture {}; +using string_pair = thrust::pair; + TEST_F(StringsFactoriesTest, CreateColumnFromPair) { std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", @@ -61,7 +64,7 @@ 
TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector h_buffer(memsize); rmm::device_uvector d_buffer(memsize, cudf::get_default_stream()); - thrust::host_vector> strings(count); + thrust::host_vector strings(count); thrust::host_vector h_offsets(count + 1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -69,12 +72,12 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) for (cudf::size_type idx = 0; idx < count; ++idx) { char const* str = h_test_strings[idx]; if (!str) { - strings[idx] = thrust::pair{nullptr, 0}; + strings[idx] = string_pair{nullptr, 0}; nulls++; } else { auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); - strings[idx] = thrust::pair{d_buffer.data() + offset, length}; + strings[idx] = string_pair{d_buffer.data() + offset, length}; offset += length; } h_offsets[idx + 1] = offset; @@ -201,14 +204,13 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); - rmm::device_uvector> d_strings{ - 0, cudf::get_default_stream()}; + rmm::device_uvector d_strings{0, cudf::get_default_stream()}; results = cudf::make_strings_column(d_strings); cudf::test::expect_column_empty(results->view()); } namespace { -using string_pair = thrust::pair; + struct string_view_to_pair { __device__ string_pair operator()(thrust::pair const& p) { @@ -234,3 +236,198 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) auto result = cudf::make_strings_column(pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); } + +struct StringsBatchConstructionTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsBatchConstructionTest, EmptyColumns) +{ + auto constexpr num_columns = 10; + auto const stream = cudf::get_default_stream(); + + auto const d_string_pairs = rmm::device_uvector{0, stream}; + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +TEST_F(StringsBatchConstructionTest, AllNullsColumns) +{ + auto constexpr num_columns = 10; + auto constexpr num_rows = 100; + auto const stream = cudf::get_default_stream(); + + auto d_string_pairs = rmm::device_uvector{num_rows, stream}; + thrust::uninitialized_fill_n(rmm::exec_policy(stream), + d_string_pairs.data(), + d_string_pairs.size(), + string_pair{nullptr, 0}); + auto const input = std::vector>( + num_columns, {d_string_pairs.data(), d_string_pairs.size()}); + auto const output = cudf::make_strings_column_batch(input, stream); + + auto const expected_col = cudf::make_strings_column(d_string_pairs); + for (auto const& col : output) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view()); + } +} + +namespace { + +struct index_to_pair { + int const num_test_strings; + char const* d_chars; + std::size_t const* d_offsets; + int const* is_null; + + __device__ string_pair operator()(cudf::size_type idx) + { + auto const data_idx = idx % num_test_strings; + return {is_null[data_idx] ? 
nullptr : d_chars + d_offsets[data_idx], + static_cast(d_offsets[data_idx + 1] - d_offsets[data_idx])}; + } +}; + +} // namespace + +TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs) +{ + auto constexpr num_columns = 10; + auto constexpr max_num_rows = 1000; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Columns have sizes increase from `max_num_rows / num_columns` to `max_num_rows`. + auto const num_rows = + static_cast(static_cast(col_idx + 1) / num_columns * max_num_rows); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +} + +// The test below requires a huge amount of memory, thus it is disabled by default. +TEST_F(StringsBatchConstructionTest, DISABLED_CreateLongStringsColumns) +{ + auto constexpr num_columns = 2; + auto const stream = cudf::get_default_stream(); + auto const mr = cudf::get_current_device_resource_ref(); + + std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", + "the fat cat lays next to the other accénted cat", + "a slow moving turtlé cannot catch the bird", + "which can be composéd together to form a more complete", + "thé result does not include the value in the sum in", + "", + nullptr, + "absent stop words"}; + auto const num_test_strings = static_cast(h_test_strings.size()); + + std::vector h_offsets(num_test_strings + 1, 0); + for (int i = 0; i < num_test_strings; ++i) { + h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? 
strlen(h_test_strings[i]) : 0); + } + + std::vector h_chars(h_offsets.back()); + std::vector is_null(num_test_strings, 0); + for (int i = 0; i < num_test_strings; ++i) { + if (h_test_strings[i]) { + memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i])); + } else { + is_null[i] = 1; + } + } + + auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); + auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr); + auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr); + + // If we create a column by repeating h_test_strings by `max_cycles` times, + // we will have it size around (1.5*INT_MAX) bytes. + auto const max_cycles = static_cast(static_cast(std::numeric_limits::max()) * + 1.5 / h_offsets.back()); + + std::vector> d_input; + std::vector> input; + d_input.reserve(num_columns); + input.reserve(num_columns); + + for (int col_idx = 0; col_idx < num_columns; ++col_idx) { + // Columns have sizes increase from `max_cycles * num_test_strings / num_columns` to + // `max_cycles * num_test_strings`. + auto const num_rows = static_cast(static_cast(col_idx + 1) / num_columns * + max_cycles * num_test_strings); + + auto string_pairs = rmm::device_uvector(num_rows, stream); + thrust::tabulate( + rmm::exec_policy_nosync(stream), + string_pairs.begin(), + string_pairs.end(), + index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()}); + + d_input.emplace_back(std::move(string_pairs)); + input.emplace_back(d_input.back()); + } + + auto const output = cudf::make_strings_column_batch(input, stream, mr); + + for (std::size_t i = 0; i < num_columns; ++i) { + auto const string_pairs = input[i]; + auto const expected = cudf::make_strings_column(string_pairs, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view()); + } +}
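For reference, a minimal usage sketch of the new `cudf::make_strings_column_batch` API follows, modeled on the `StringBatchConstruction` stream test above. The helper name `build_two_column_batch`, the header list, and the three-character input are illustrative assumptions, not part of this change.

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <thrust/pair.h>

#include <memory>
#include <vector>

using string_pair = thrust::pair<char const*, cudf::size_type>;

// Build two strings columns in a single batch call. The device buffer holds the characters
// "abc"; each pair points into that buffer, and {nullptr, 0} marks a null row.
std::vector<std::unique_ptr<cudf::column>> build_two_column_batch()
{
  auto const stream = cudf::get_default_stream();
  auto const mr     = cudf::get_current_device_resource_ref();

  auto const h_chars = std::vector<char>{'a', 'b', 'c'};
  auto const d_chars = cudf::detail::make_device_uvector_async(h_chars, stream, mr);

  auto const h_pairs = std::vector<string_pair>{
    {d_chars.data(), 1}, {nullptr, 0}, {d_chars.data() + 1, 2}};
  auto const d_pairs = cudf::detail::make_device_uvector_async(h_pairs, stream, mr);

  // Both batch entries view the same pair array here, just to keep the sketch short.
  auto const pairs_span = cudf::device_span<string_pair const>{d_pairs.data(), d_pairs.size()};
  std::vector<cudf::device_span<string_pair const>> input(2, pairs_span);

  // One call constructs every column in the batch with a single stream synchronization.
  return cudf::make_strings_column_batch(input, stream, mr);
}

The `BM_make_strings_column_batch` benchmark above exercises this same call pattern over larger batches, comparing it against constructing the columns one at a time in a loop.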