Skip to content

Commit 7e555e0

Browse files
authored
Add a public API for copying a table_view to device array (#18450)
Contributes to #16483. This PR adds a new libcudf API: `cudf::table_to_array`, which copies data from a table_view into a preallocated column-major device array using `cub::DeviceMemcpy::Batched`. The primary use case for this API is to accelerate the conversion of a cudf.DataFrame to a CuPy array when users access `DataFrame.values` in Python. In a follow-up PR, I'll integrate this API into the cudf Python layer. - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: #18450
1 parent 499fbe4 commit 7e555e0

File tree

8 files changed

+462
-4
lines changed

8 files changed

+462
-4
lines changed

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ add_library(
647647
src/replace/replace.cu
648648
src/reshape/byte_cast.cu
649649
src/reshape/interleave_columns.cu
650+
src/reshape/table_to_array.cu
650651
src/reshape/tile.cu
651652
src/rolling/detail/optimized_unbounded_window.cpp
652653
src/rolling/detail/rolling_collect_list.cu

cpp/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
433433
# ##################################################################################################
434434
# * reshape benchmark
435435
# ---------------------------------------------------------------------------------
436-
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
436+
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp reshape/table_to_array.cpp)
437437

438438
# ##################################################################################################
439439
# * rolling benchmark
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmarks/common/generate_input.hpp>
18+
19+
#include <cudf/reshape.hpp>
20+
#include <cudf/utilities/default_stream.hpp>
21+
#include <cudf/utilities/span.hpp>
22+
23+
#include <cuda/functional>
24+
25+
#include <nvbench/nvbench.cuh>
26+
27+
static void bench_table_to_array(nvbench::state& state)
28+
{
29+
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
30+
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));
31+
32+
data_profile profile = data_profile_builder()
33+
.distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 1000)
34+
.no_validity();
35+
std::vector<cudf::type_id> types(num_cols, cudf::type_id::INT32);
36+
auto input_table = create_random_table(types, row_count{num_rows}, profile);
37+
38+
auto input_view = input_table->view();
39+
auto stream = cudf::get_default_stream();
40+
41+
rmm::device_buffer output(num_rows * num_cols * sizeof(int32_t), stream);
42+
auto span = cudf::device_span<cuda::std::byte>(reinterpret_cast<cuda::std::byte*>(output.data()),
43+
output.size());
44+
45+
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
46+
state.add_global_memory_reads<int32_t>(num_rows * num_cols); // all bytes are read
47+
state.add_global_memory_writes<int32_t>(num_rows * num_cols); // all bytes are written
48+
49+
state.exec(nvbench::exec_tag::sync,
50+
[&](nvbench::launch& launch) { cudf::table_to_array(input_view, span, stream); });
51+
}
52+
53+
NVBENCH_BENCH(bench_table_to_array)
54+
.set_name("table_to_array")
55+
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
56+
.add_int64_axis("columns", {2, 10, 100});

cpp/include/cudf/detail/reshape.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
1919
#include <cudf/types.hpp>
2020
#include <cudf/utilities/default_stream.hpp>
2121
#include <cudf/utilities/memory_resource.hpp>
22+
#include <cudf/utilities/span.hpp>
2223

2324
#include <rmm/cuda_stream_view.hpp>
2425

@@ -41,5 +42,12 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
4142
rmm::cuda_stream_view,
4243
rmm::device_async_resource_ref mr);
4344

45+
/**
 * @copydoc cudf::table_to_array
 *
 * Detail variant used internally: identical contract to the public API but
 * without the NVTX range added by the public wrapper.
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream = cudf::get_default_stream());
51+
4452
} // namespace detail
4553
} // namespace CUDF_EXPORT cudf

cpp/include/cudf/reshape.hpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
2121
#include <cudf/types.hpp>
2222
#include <cudf/utilities/export.hpp>
2323
#include <cudf/utilities/memory_resource.hpp>
24+
#include <cudf/utilities/span.hpp>
25+
26+
#include <cuda/functional>
2427

2528
#include <memory>
2629

@@ -107,6 +110,28 @@ std::unique_ptr<column> byte_cast(
107110
rmm::cuda_stream_view stream = cudf::get_default_stream(),
108111
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
109112

113+
/**
 * @brief Copies a table into a contiguous column-major device array.
 *
 * This function copies a `table_view` with columns of the same fixed-width type
 * into a 2D device array stored in column-major order.
 *
 * The output buffer must be preallocated and passed as a `device_span` using
 * a `device_span<cuda::std::byte>`. It must be large enough to hold
 * `num_rows * num_columns * sizeof(dtype)` bytes.
 *
 * @throws cudf::data_type_error if columns do not all have the same type
 * @throws cudf::logic_error if the dtype of the columns is not a fixed-width type
 * @throws std::invalid_argument if the output span is too small
 * @throws std::invalid_argument if any column contains null values
 *
 * @param input A table with fixed-width, non-nullable columns of the same type
 * @param output A span representing preallocated device memory for the output
 * @param stream CUDA stream used for memory operations
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream = cudf::get_default_stream());
134+
110135
/** @} */ // end of group
111136

112137
} // namespace CUDF_EXPORT cudf

cpp/src/reshape/table_to_array.cu

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <cudf/detail/nvtx/ranges.hpp>
18+
#include <cudf/detail/reshape.hpp>
19+
#include <cudf/detail/utilities/batched_memcpy.hpp>
20+
#include <cudf/detail/utilities/vector_factories.hpp>
21+
#include <cudf/reshape.hpp>
22+
#include <cudf/types.hpp>
23+
#include <cudf/utilities/default_stream.hpp>
24+
#include <cudf/utilities/error.hpp>
25+
#include <cudf/utilities/span.hpp>
26+
#include <cudf/utilities/type_checks.hpp>
27+
#include <cudf/utilities/type_dispatcher.hpp>
28+
29+
#include <rmm/cuda_stream_view.hpp>
30+
#include <rmm/device_uvector.hpp>
31+
32+
#include <cub/device/device_memcpy.cuh>
33+
#include <cuda/functional>
34+
#include <cuda_runtime.h>
35+
#include <thrust/device_vector.h>
36+
#include <thrust/iterator/constant_iterator.h>
37+
#include <thrust/iterator/counting_iterator.h>
38+
#include <thrust/iterator/transform_iterator.h>
39+
40+
namespace cudf {
41+
namespace detail {
42+
namespace {
43+
44+
template <typename T>
45+
void table_to_array_impl(table_view const& input,
46+
device_span<cuda::std::byte> output,
47+
rmm::cuda_stream_view stream)
48+
{
49+
auto const num_columns = input.num_columns();
50+
auto const num_rows = input.num_rows();
51+
auto const item_size = sizeof(T);
52+
auto const total_bytes = static_cast<size_t>(num_columns) * num_rows * item_size;
53+
54+
CUDF_EXPECTS(output.size() >= total_bytes, "Output span is too small", std::invalid_argument);
55+
CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
56+
"All columns must have the same data type",
57+
cudf::data_type_error);
58+
CUDF_EXPECTS(!cudf::has_nulls(input), "All columns must contain no nulls", std::invalid_argument);
59+
60+
auto* base_ptr = output.data();
61+
62+
auto h_srcs = make_host_vector<T const*>(num_columns, stream);
63+
auto h_dsts = make_host_vector<T*>(num_columns, stream);
64+
65+
std::transform(input.begin(), input.end(), h_srcs.begin(), [](auto& col) {
66+
return const_cast<T*>(col.template data<T>());
67+
});
68+
69+
for (int i = 0; i < num_columns; ++i) {
70+
h_dsts[i] = reinterpret_cast<T*>(base_ptr + i * item_size * num_rows);
71+
}
72+
73+
auto const mr = cudf::get_current_device_resource_ref();
74+
75+
auto d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
76+
auto d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
77+
78+
thrust::constant_iterator<size_t> sizes(static_cast<size_t>(item_size * num_rows));
79+
80+
cudf::detail::batched_memcpy_async(
81+
d_srcs.begin(), d_dsts.begin(), sizes, num_columns, stream.value());
82+
}
83+
84+
struct table_to_array_dispatcher {
85+
table_view const& input;
86+
device_span<cuda::std::byte> output;
87+
rmm::cuda_stream_view stream;
88+
89+
template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
90+
void operator()() const
91+
{
92+
table_to_array_impl<T>(input, output, stream);
93+
}
94+
95+
template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
96+
void operator()() const
97+
{
98+
CUDF_FAIL("Unsupported dtype");
99+
}
100+
};
101+
102+
} // namespace
103+
104+
// Detail entry point: dispatch on the first column's storage type and copy.
// An empty table (zero columns) is a no-op.
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream)
{
  if (input.num_columns() > 0) {
    cudf::type_dispatcher<cudf::dispatch_storage_type>(
      input.column(0).type(), table_to_array_dispatcher{input, output, stream});
  }
}
115+
116+
} // namespace detail
117+
118+
// Public API: adds an NVTX profiling range, then forwards to the detail
// implementation, which performs validation and the batched device copy.
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();
  cudf::detail::table_to_array(input, output, stream);
}
125+
126+
} // namespace cudf

cpp/tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ ConfigureTest(
525525
# * reshape test ----------------------------------------------------------------------------------
526526
ConfigureTest(
527527
RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
528-
reshape/tile_tests.cpp
528+
reshape/table_to_array_tests.cpp reshape/tile_tests.cpp
529529
)
530530

531531
# ##################################################################################################

0 commit comments

Comments
 (0)