Implement batch construction for strings columns (#17035)

This implements batch construction of strings columns, allowing to create a large number of strings columns at once with minimal overhead of kernel launch and stream synchronization. There should be only one stream sync in the entire column construction process. Benchmark: #17035 (comment) Closes #16486. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: #17035
rapidsai · Oct 17, 2024 · 5f863a5 · 5f863a5
1 parent c9202a0
commit 5f863a5
Show file tree

Hide file tree

Showing 9 changed files with 622 additions and 63 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -384,6 +384,7 @@ ConfigureNVBench(
   string/join_strings.cpp
   string/lengths.cpp
   string/like.cpp
+  string/make_strings_column.cu
   string/replace_re.cpp
   string/reverse.cpp
   string/slice.cpp

diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/pair.h>
+#include <thrust/tabulate.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+namespace {
+
+constexpr int min_row_width = 0;
+constexpr int max_row_width = 50;
+
+using string_index_pair = thrust::pair<char const*, cudf::size_type>;
+
+template <bool batch_construction>
+std::vector<std::unique_ptr<cudf::column>> make_strings_columns(
+  std::vector<cudf::device_span<string_index_pair const>> const& input,
+  rmm::cuda_stream_view stream)
+{
+  if constexpr (batch_construction) {
+    return cudf::make_strings_column_batch(input, stream);
+  } else {
+    std::vector<std::unique_ptr<cudf::column>> output;
+    output.reserve(input.size());
+    for (auto const& column_input : input) {
+      output.emplace_back(cudf::make_strings_column(column_input, stream));
+    }
+    return output;
+  }
+}
+
+}  // namespace
+
+static void BM_make_strings_column_batch(nvbench::state& state)
+{
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size"));
+  auto const has_nulls  = true;
+
+  data_profile const table_profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width)
+      .null_probability(has_nulls ? std::optional<double>{0.1} : std::nullopt);
+  auto const data_table = create_random_table(
+    cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile);
+
+  auto const stream = cudf::get_default_stream();
+  auto input_data   = std::vector<rmm::device_uvector<string_index_pair>>{};
+  auto input        = std::vector<cudf::device_span<string_index_pair const>>{};
+  input_data.reserve(batch_size);
+  input.reserve(batch_size);
+  for (auto const& cv : data_table->view()) {
+    auto const d_data_ptr = cudf::column_device_view::create(cv, stream);
+    auto batch_input      = rmm::device_uvector<string_index_pair>(cv.size(), stream);
+    thrust::tabulate(rmm::exec_policy(stream),
+                     batch_input.begin(),
+                     batch_input.end(),
+                     [data_col = *d_data_ptr] __device__(auto const idx) {
+                       if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+                       auto const row = data_col.element<cudf::string_view>(idx);
+                       return string_index_pair{row.data(), row.size_bytes()};
+                     });
+    input_data.emplace_back(std::move(batch_input));
+    input.emplace_back(input_data.back());
+  }
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto const output = make_strings_columns<true>(input, stream);
+  });
+}
+
+NVBENCH_BENCH(BM_make_strings_column_batch)
+  .set_name("make_strings_column_batch")
+  .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000})
+  .add_int64_axis("batch_size", {10, 20, 50, 100});
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
@@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size
+ * pairs.
+ *
+ * This function has input/output expectation similar to the `make_strings_column()` API that
+ * accepts only one device span of pointer/size pairs. The difference is that, this is designed to
+ * create many strings columns at once with minimal overhead of multiple kernel launches and
+ * stream synchronizations.
+ *
+ * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory
+ *        address or `nullptr` (indicating a null string), and size is string length (in bytes)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for memory allocation of the output columns
+ * @return Array of constructed strings columns
+ */
+std::vector<std::unique_ptr<column>> make_strings_column_batch(
+  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Construct a STRING type column given a device span of string_view.
  *

diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -29,6 +30,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cub/device/device_memcpy.cuh>
+#include <cuda/functional>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 
@@ -38,6 +41,61 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
+/**
+ * @brief Gather characters to create a strings column using the given string-index pair iterator
+ *
+ * @tparam IndexPairIterator iterator over type `pair<char const*,size_type>` values
+ *
+ * @param offsets The offsets for the output strings column
+ * @param chars_size The size (in bytes) of the chars data
+ * @param begin Iterator to the first string-index pair
+ * @param strings_count The number of strings
+ * @param stream CUDA stream used for device memory operations
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return An array of chars gathered from the input string-index pair iterator
+ */
+template <typename IndexPairIterator>
+rmm::device_uvector<char> make_chars_buffer(column_view const& offsets,
+                                            int64_t chars_size,
+                                            IndexPairIterator begin,
+                                            size_type strings_count,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::device_async_resource_ref mr)
+{
+  auto chars_data      = rmm::device_uvector<char>(chars_size, stream, mr);
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
+
+  auto const src_ptrs = cudf::detail::make_counting_transform_iterator(
+    0u, cuda::proclaim_return_type<void*>([begin] __device__(uint32_t idx) {
+      // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586),
+      // we have to use `const_cast` to remove `const` qualifier from the source pointer.
+      // This should be fine as long as we only read but not write anything to the source.
+      return reinterpret_cast<void*>(const_cast<char*>(begin[idx].first));
+    }));
+  auto const src_sizes = cudf::detail::make_counting_transform_iterator(
+    0u, cuda::proclaim_return_type<size_type>([begin] __device__(uint32_t idx) {
+      return begin[idx].second;
+    }));
+  auto const dst_ptrs = cudf::detail::make_counting_transform_iterator(
+    0u,
+    cuda::proclaim_return_type<char*>([offsets = d_offsets, output = chars_data.data()] __device__(
+                                        uint32_t idx) { return output + offsets[idx]; }));
+
+  size_t temp_storage_bytes = 0;
+  CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(
+    nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value()));
+  rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
+  CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(),
+                                           temp_storage_bytes,
+                                           src_ptrs,
+                                           dst_ptrs,
+                                           src_sizes,
+                                           strings_count,
+                                           stream.value()));
+
+  return chars_data;
+}
+
 /**
  * @brief Create an offsets column to be a child of a compound column
  *

diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -49,16 +49,6 @@ namespace detail {
  */
 using string_index_pair = thrust::pair<char const*, size_type>;
 
-/**
- * @brief Average string byte-length threshold for deciding character-level
- * vs. row-level parallel algorithm.
- *
- * This value was determined by running the factory_benchmark against different
- * string lengths and observing the point where the performance is faster for
- * long strings.
- */
-constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64;
-
 /**
  * @brief Create a strings-type column from iterators of pointer/size pairs
  *
@@ -88,8 +78,6 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
   auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
   auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
     offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto const d_offsets =
-    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
 
   // create null mask
   auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
@@ -99,38 +87,8 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
     (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};
 
   // build chars column
-  auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
-    auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
-    // use a character-parallel kernel for long string lengths
-    if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
-      auto const str_begin = thrust::make_transform_iterator(
-        begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
-          return string_view{ip.first, ip.second};
-        }));
-
-      return gather_chars(str_begin,
-                          thrust::make_counting_iterator<size_type>(0),
-                          thrust::make_counting_iterator<size_type>(strings_count),
-                          d_offsets,
-                          bytes,
-                          stream,
-                          mr);
-    } else {
-      // this approach is 2-3x faster for a large number of smaller string lengths
-      auto chars_data = rmm::device_uvector<char>(bytes, stream, mr);
-      auto d_chars    = chars_data.data();
-      auto copy_chars = [d_chars] __device__(auto item) {
-        string_index_pair const str = thrust::get<0>(item);
-        int64_t const offset        = thrust::get<1>(item);
-        if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
-      };
-      thrust::for_each_n(rmm::exec_policy(stream),
-                         thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
-                         strings_count,
-                         copy_chars);
-      return chars_data;
-    }
-  }();
+  auto chars_data =
+    make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),