Skip to content

Commit

Permalink
Implement batch construction for strings columns (#17035)
Browse files Browse the repository at this point in the history
This implements batch construction of strings columns, allowing to create a large number of strings columns at once with minimal overhead of kernel launch and stream synchronization. There should be only one stream sync in the entire column construction process.

Benchmark: #17035 (comment)

Closes #16486.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #17035
  • Loading branch information
ttnghia authored Oct 17, 2024
1 parent c9202a0 commit 5f863a5
Show file tree
Hide file tree
Showing 9 changed files with 622 additions and 63 deletions.
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ ConfigureNVBench(
string/join_strings.cpp
string/lengths.cpp
string/like.cpp
string/make_strings_column.cu
string/replace_re.cpp
string/reverse.cpp
string/slice.cpp
Expand Down
100 changes: 100 additions & 0 deletions cpp/benchmarks/string/make_strings_column.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/pair.h>
#include <thrust/tabulate.h>

#include <nvbench/nvbench.cuh>

#include <vector>

namespace {

constexpr int min_row_width = 0;
constexpr int max_row_width = 50;

using string_index_pair = thrust::pair<char const*, cudf::size_type>;

template <bool batch_construction>
std::vector<std::unique_ptr<cudf::column>> make_strings_columns(
std::vector<cudf::device_span<string_index_pair const>> const& input,
rmm::cuda_stream_view stream)
{
if constexpr (batch_construction) {
return cudf::make_strings_column_batch(input, stream);
} else {
std::vector<std::unique_ptr<cudf::column>> output;
output.reserve(input.size());
for (auto const& column_input : input) {
output.emplace_back(cudf::make_strings_column(column_input, stream));
}
return output;
}
}

} // namespace

static void BM_make_strings_column_batch(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));
auto const has_nulls = true;

data_profile const table_profile =
data_profile_builder()
.distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
.null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
auto const data_table = create_random_table(
cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);

auto const stream = cudf::get_default_stream();
auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{};
auto input = std::vector<cudf::device_span<string_index_pair const>>{};
input_data.reserve(batch_size);
input.reserve(batch_size);
for (auto const& cv : data_table->view()) {
auto const d_data_ptr = cudf::column_device_view::create(cv, stream);
auto batch_input = rmm::device_uvector<string_index_pair>(cv.size(), stream);
thrust::tabulate(rmm::exec_policy(stream),
batch_input.begin(),
batch_input.end(),
[data_col = *d_data_ptr] __device__(auto const idx) {
if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; }
auto const row = data_col.element<cudf::string_view>(idx);
return string_index_pair{row.data(), row.size_bytes()};
});
input_data.emplace_back(std::move(batch_input));
input.emplace_back(input_data.back());
}

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
[[maybe_unused]] auto const output = make_strings_columns<true>(input, stream);
});
}

NVBENCH_BENCH(BM_make_strings_column_batch)
.set_name("make_strings_column_batch")
.add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000})
.add_int64_axis("batch_size", {10, 20, 50, 100});
20 changes: 20 additions & 0 deletions cpp/include/cudf/column/column_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a batch of STRING type columns given an array of device spans of pointer/size
* pairs.
*
* This function has input/output expectation similar to the `make_strings_column()` API that
* accepts only one device span of pointer/size pairs. The difference is that, this is designed to
* create many strings columns at once with minimal overhead of multiple kernel launches and
* stream synchronizations.
*
* @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
* address or `nullptr` (indicating a null string), and size is string length (in bytes)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for memory allocation of the output columns
* @return Array of constructed strings columns
*/
std::vector<std::unique_ptr<column>> make_strings_column_batch(
std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Construct a STRING type column given a device span of string_view.
*
Expand Down
58 changes: 58 additions & 0 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
Expand All @@ -29,6 +30,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cub/device/device_memcpy.cuh>
#include <cuda/functional>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>

Expand All @@ -38,6 +41,61 @@ namespace cudf {
namespace strings {
namespace detail {

/**
* @brief Gather characters to create a strings column using the given string-index pair iterator
*
* @tparam IndexPairIterator iterator over type `pair<char const*,size_type>` values
*
* @param offsets The offsets for the output strings column
* @param chars_size The size (in bytes) of the chars data
* @param begin Iterator to the first string-index pair
* @param strings_count The number of strings
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return An array of chars gathered from the input string-index pair iterator
*/
template <typename IndexPairIterator>
rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
int64_t chars_size,
IndexPairIterator begin,
size_type strings_count,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto chars_data = rmm::device_uvector<char>(chars_size, stream, mr);
auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);

auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
// Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
// we have to use `const_cast` to remove `const` qualifier from the source pointer.
// This should be fine as long as we only read but not write anything to the source.
return reinterpret_cast<void*>(const_cast<char*>(begin[idx].first));
}));
auto const src_sizes = cudf::detail::make_counting_transform_iterator(
0u, cuda::proclaim_return_type<size_type>([begin] __device__(uint32_t idx) {
return begin[idx].second;
}));
auto const dst_ptrs = cudf::detail::make_counting_transform_iterator(
0u,
cuda::proclaim_return_type<char*>([offsets = d_offsets, output = chars_data.data()] __device__(
uint32_t idx) { return output + offsets[idx]; }));

size_t temp_storage_bytes = 0;
CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(
nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value()));
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(),
temp_storage_bytes,
src_ptrs,
dst_ptrs,
src_sizes,
strings_count,
stream.value()));

return chars_data;
}

/**
* @brief Create an offsets column to be a child of a compound column
*
Expand Down
46 changes: 2 additions & 44 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,6 @@ namespace detail {
*/
using string_index_pair = thrust::pair<char const*, size_type>;

/**
* @brief Average string byte-length threshold for deciding character-level
* vs. row-level parallel algorithm.
*
* This value was determined by running the factory_benchmark against different
* string lengths and observing the point where the performance is faster for
* long strings.
*/
constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;

/**
* @brief Create a strings-type column from iterators of pointer/size pairs
*
Expand Down Expand Up @@ -88,8 +78,6 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
auto const d_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// create null mask
auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
Expand All @@ -99,38 +87,8 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
(null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};

// build chars column
auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
// use a character-parallel kernel for long string lengths
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const str_begin = thrust::make_transform_iterator(
begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
return string_view{ip.first, ip.second};
}));

return gather_chars(str_begin,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
d_offsets,
bytes,
stream,
mr);
} else {
// this approach is 2-3x faster for a large number of smaller string lengths
auto chars_data = rmm::device_uvector<char>(bytes, stream, mr);
auto d_chars = chars_data.data();
auto copy_chars = [d_chars] __device__(auto item) {
string_index_pair const str = thrust::get<0>(item);
int64_t const offset = thrust::get<1>(item);
if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
};
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
strings_count,
copy_chars);
return chars_data;
}
}();
auto chars_data =
make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr);

return make_strings_column(strings_count,
std::move(offsets_column),
Expand Down
Loading

0 comments on commit 5f863a5

Please sign in to comment.