Skip to content

Commit 7e555e0

Browse files
authored
Add a public API for copying a table_view to device array (#18450)
Contributes to #16483. This PR adds a new libcudf API: `cudf::table_to_array`, which copies data from a table_view into a preallocated column-major device array using `cub::DeviceMemcpy::Batched`. The primary use case for this API is to accelerate the conversion of a cudf.DataFrame to a CuPy array when users access `DataFrame.values` in Python. In a follow-up PR, I'll integrate this API into the cudf Python layer. - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: #18450
1 parent 499fbe4 commit 7e555e0

File tree

8 files changed

+462
-4
lines changed

8 files changed

+462
-4
lines changed

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,7 @@ add_library(
647647
src/replace/replace.cu
648648
src/reshape/byte_cast.cu
649649
src/reshape/interleave_columns.cu
650+
src/reshape/table_to_array.cu
650651
src/reshape/tile.cu
651652
src/rolling/detail/optimized_unbounded_window.cpp
652653
src/rolling/detail/rolling_collect_list.cu

cpp/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
433433
# ##################################################################################################
434434
# * reshape benchmark
435435
# ---------------------------------------------------------------------------------
436-
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
436+
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp reshape/table_to_array.cpp)
437437

438438
# ##################################################################################################
439439
# * rolling benchmark
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmarks/common/generate_input.hpp>
18+
19+
#include <cudf/reshape.hpp>
20+
#include <cudf/utilities/default_stream.hpp>
21+
#include <cudf/utilities/span.hpp>
22+
23+
#include <cuda/functional>
24+
25+
#include <nvbench/nvbench.cuh>
26+
27+
static void bench_table_to_array(nvbench::state& state)
28+
{
29+
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
30+
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));
31+
32+
data_profile profile = data_profile_builder()
33+
.distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 1000)
34+
.no_validity();
35+
std::vector<cudf::type_id> types(num_cols, cudf::type_id::INT32);
36+
auto input_table = create_random_table(types, row_count{num_rows}, profile);
37+
38+
auto input_view = input_table->view();
39+
auto stream = cudf::get_default_stream();
40+
41+
rmm::device_buffer output(num_rows * num_cols * sizeof(int32_t), stream);
42+
auto span = cudf::device_span<cuda::std::byte>(reinterpret_cast<cuda::std::byte*>(output.data()),
43+
output.size());
44+
45+
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
46+
state.add_global_memory_reads<int32_t>(num_rows * num_cols); // all bytes are read
47+
state.add_global_memory_writes<int32_t>(num_rows * num_cols); // all bytes are written
48+
49+
state.exec(nvbench::exec_tag::sync,
50+
[&](nvbench::launch& launch) { cudf::table_to_array(input_view, span, stream); });
51+
}
52+
53+
NVBENCH_BENCH(bench_table_to_array)
54+
.set_name("table_to_array")
55+
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
56+
.add_int64_axis("columns", {2, 10, 100});

cpp/include/cudf/detail/reshape.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
1919
#include <cudf/types.hpp>
2020
#include <cudf/utilities/default_stream.hpp>
2121
#include <cudf/utilities/memory_resource.hpp>
22+
#include <cudf/utilities/span.hpp>
2223

2324
#include <rmm/cuda_stream_view.hpp>
2425

@@ -41,5 +42,12 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
4142
rmm::cuda_stream_view,
4243
rmm::device_async_resource_ref mr);
4344

45+
/**
 * @copydoc cudf::table_to_array
 *
 * Detail variant used internally: identical contract to the public API but
 * without the NVTX range added by the public wrapper.
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream = cudf::get_default_stream());
51+
4452
} // namespace detail
4553
} // namespace CUDF_EXPORT cudf

cpp/include/cudf/reshape.hpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
2121
#include <cudf/types.hpp>
2222
#include <cudf/utilities/export.hpp>
2323
#include <cudf/utilities/memory_resource.hpp>
24+
#include <cudf/utilities/span.hpp>
25+
26+
#include <cuda/functional>
2427

2528
#include <memory>
2629

@@ -107,6 +110,28 @@ std::unique_ptr<column> byte_cast(
107110
rmm::cuda_stream_view stream = cudf::get_default_stream(),
108111
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
109112

113+
/**
 * @brief Copies a table into a contiguous column-major device array.
 *
 * This function copies a `table_view` with columns of the same fixed-width type
 * into a 2D device array stored in column-major order.
 *
 * The output buffer must be preallocated and passed as a `device_span` using
 * a `device_span<cuda::std::byte>`. It must be large enough to hold
 * `num_rows * num_columns * sizeof(dtype)` bytes.
 *
 * @throws cudf::data_type_error if columns do not all have the same type
 * @throws cudf::logic_error if the dtype of the columns is not a fixed-width type
 * @throws std::invalid_argument if the output span is too small
 * @throws std::invalid_argument if any column contains null values
 *
 * @param input A table with fixed-width, non-nullable columns of the same type
 * @param output A span representing preallocated device memory for the output
 * @param stream CUDA stream used for memory operations
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream = cudf::get_default_stream());
134+
110135
/** @} */ // end of group
111136

112137
} // namespace CUDF_EXPORT cudf

cpp/src/reshape/table_to_array.cu

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <cudf/detail/nvtx/ranges.hpp>
18+
#include <cudf/detail/reshape.hpp>
19+
#include <cudf/detail/utilities/batched_memcpy.hpp>
20+
#include <cudf/detail/utilities/vector_factories.hpp>
21+
#include <cudf/reshape.hpp>
22+
#include <cudf/types.hpp>
23+
#include <cudf/utilities/default_stream.hpp>
24+
#include <cudf/utilities/error.hpp>
25+
#include <cudf/utilities/span.hpp>
26+
#include <cudf/utilities/type_checks.hpp>
27+
#include <cudf/utilities/type_dispatcher.hpp>
28+
29+
#include <rmm/cuda_stream_view.hpp>
30+
#include <rmm/device_uvector.hpp>
31+
32+
#include <cub/device/device_memcpy.cuh>
33+
#include <cuda/functional>
34+
#include <cuda_runtime.h>
35+
#include <thrust/device_vector.h>
36+
#include <thrust/iterator/constant_iterator.h>
37+
#include <thrust/iterator/counting_iterator.h>
38+
#include <thrust/iterator/transform_iterator.h>
39+
40+
namespace cudf {
41+
namespace detail {
42+
namespace {
43+
44+
template <typename T>
45+
void table_to_array_impl(table_view const& input,
46+
device_span<cuda::std::byte> output,
47+
rmm::cuda_stream_view stream)
48+
{
49+
auto const num_columns = input.num_columns();
50+
auto const num_rows = input.num_rows();
51+
auto const item_size = sizeof(T);
52+
auto const total_bytes = static_cast<size_t>(num_columns) * num_rows * item_size;
53+
54+
CUDF_EXPECTS(output.size() >= total_bytes, "Output span is too small", std::invalid_argument);
55+
CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
56+
"All columns must have the same data type",
57+
cudf::data_type_error);
58+
CUDF_EXPECTS(!cudf::has_nulls(input), "All columns must contain no nulls", std::invalid_argument);
59+
60+
auto* base_ptr = output.data();
61+
62+
auto h_srcs = make_host_vector<T const*>(num_columns, stream);
63+
auto h_dsts = make_host_vector<T*>(num_columns, stream);
64+
65+
std::transform(input.begin(), input.end(), h_srcs.begin(), [](auto& col) {
66+
return const_cast<T*>(col.template data<T>());
67+
});
68+
69+
for (int i = 0; i < num_columns; ++i) {
70+
h_dsts[i] = reinterpret_cast<T*>(base_ptr + i * item_size * num_rows);
71+
}
72+
73+
auto const mr = cudf::get_current_device_resource_ref();
74+
75+
auto d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
76+
auto d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
77+
78+
thrust::constant_iterator<size_t> sizes(static_cast<size_t>(item_size * num_rows));
79+
80+
cudf::detail::batched_memcpy_async(
81+
d_srcs.begin(), d_dsts.begin(), sizes, num_columns, stream.value());
82+
}
83+
84+
struct table_to_array_dispatcher {
85+
table_view const& input;
86+
device_span<cuda::std::byte> output;
87+
rmm::cuda_stream_view stream;
88+
89+
template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
90+
void operator()() const
91+
{
92+
table_to_array_impl<T>(input, output, stream);
93+
}
94+
95+
template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
96+
void operator()() const
97+
{
98+
CUDF_FAIL("Unsupported dtype");
99+
}
100+
};
101+
102+
} // namespace
103+
104+
// Detail entry point: dispatch on the first column's storage type and copy.
// An empty table (zero columns) is a no-op.
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream)
{
  if (input.num_columns() > 0) {
    cudf::type_dispatcher<cudf::dispatch_storage_type>(
      input.column(0).type(), table_to_array_dispatcher{input, output, stream});
  }
}
115+
116+
} // namespace detail
117+
118+
// Public API: adds an NVTX profiling range, then forwards to the detail
// implementation, which performs validation and the batched device copy.
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();
  cudf::detail::table_to_array(input, output, stream);
}
125+
126+
} // namespace cudf

cpp/tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -525,7 +525,7 @@ ConfigureTest(
525525
# * reshape test ----------------------------------------------------------------------------------
526526
ConfigureTest(
527527
RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
528-
reshape/tile_tests.cpp
528+
reshape/table_to_array_tests.cpp reshape/tile_tests.cpp
529529
)
530530

531531
# ##################################################################################################

0 commit comments

Comments
 (0)