Merged
Commits (27)

e201564  Add a public API for converting a table_view to device array (Matt711, Apr 7, 2025)
6e9289e  support decimals and add more tests (Matt711, Apr 7, 2025)
21a9201  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 8, 2025)
7eb3690  fallback if cuda version < 12.8 (Matt711, Apr 8, 2025)
854cbbf  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 8, 2025)
5a9d195  clean up (Matt711, Apr 8, 2025)
e5e65cc  address reviews (Matt711, Apr 8, 2025)
0343bb3  address review (Matt711, Apr 8, 2025)
7530ecf  use snake case (Matt711, Apr 8, 2025)
c7ab103  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 9, 2025)
eec23b5  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 10, 2025)
f027616  address reviews (Matt711, Apr 10, 2025)
b3251fe  clean up (Matt711, Apr 10, 2025)
f5a0096  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 14, 2025)
96ef619  address review (Matt711, Apr 14, 2025)
82c5b22  pass a device_span instead of a raw pointer (Matt711, Apr 14, 2025)
4399938  sort file names (Matt711, Apr 15, 2025)
737fa22  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 15, 2025)
36abd46  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 15, 2025)
1fb9563  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, Apr 21, 2025)
c6785c7  add other impl for benchmarking purposes (Matt711, Apr 21, 2025)
04c837f  clean up (Matt711, Apr 21, 2025)
1530b5f  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, May 5, 2025)
fd0506b  address reviews (Matt711, May 5, 2025)
db78162  address review (Matt711, May 5, 2025)
f5bf21c  Merge branch 'branch-25.06' into fea/cpp/table-to-device-array (Matt711, May 6, 2025)
b71ec3a  address review (Matt711, May 6, 2025)
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -643,6 +643,7 @@ add_library(
src/replace/replace.cu
src/reshape/byte_cast.cu
src/reshape/interleave_columns.cu
src/reshape/table_to_array.cu
src/reshape/tile.cu
src/rolling/detail/optimized_unbounded_window.cpp
src/rolling/detail/rolling_collect_list.cu
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
@@ -430,7 +430,7 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp reshape/table_to_array.cpp)

# ##################################################################################################
# * rolling benchmark
63 changes: 63 additions & 0 deletions cpp/benchmarks/reshape/table_to_array.cpp
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <nvbench/nvbench.cuh>

static void bench_table_to_array(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));

if (static_cast<std::size_t>(num_rows) * num_cols >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
state.skip("Input size exceeds cudf::size_type limit");
}

data_profile profile = data_profile_builder()
.distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 1000)
.no_validity();
std::vector<cudf::type_id> types(num_cols, cudf::type_id::INT32);
auto input_table = create_random_table(types, row_count{num_rows}, profile);

auto input_view = input_table->view();
auto stream = cudf::get_default_stream();
auto dtype = cudf::data_type{cudf::type_id::INT32};

rmm::device_buffer output(num_rows * num_cols * sizeof(int32_t), stream);
auto span = cudf::device_span<cuda::std::byte>(reinterpret_cast<cuda::std::byte*>(output.data()),
output.size());

state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
state.add_global_memory_reads<int32_t>(num_rows * num_cols); // all bytes are read
state.add_global_memory_writes<int32_t>(num_rows * num_cols); // all bytes are written

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::table_to_array(input_view, span, dtype, stream);
});
}

NVBENCH_BENCH(bench_table_to_array)
.set_name("table_to_array")
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("columns", {2, 10, 100});
10 changes: 9 additions & 1 deletion cpp/include/cudf/detail/reshape.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -41,5 +41,13 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
rmm::cuda_stream_view,
rmm::device_async_resource_ref mr);

/**
* @copydoc cudf::table_to_array
*/
void table_to_array(table_view const& input,
void* output,
data_type output_dtype,
rmm::cuda_stream_view stream);

} // namespace detail
} // namespace CUDF_EXPORT cudf
29 changes: 28 additions & 1 deletion cpp/include/cudf/reshape.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,9 @@
#include <cudf/types.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <memory>

@@ -107,6 +110,30 @@ std::unique_ptr<column> byte_cast(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Copies a table into a contiguous column-major device array.
*
* This function copies a `table_view` with columns of the same fixed-width type
* into a 2D device array stored in column-major order.
*
* The output buffer must be preallocated and passed as a
* `device_span<cuda::std::byte>`. It must be large enough to hold
* `num_rows * num_columns * size_of(output_dtype)` bytes.
*
* @throws cudf::logic_error if columns do not all have the same type as `output_dtype`
* @throws cudf::logic_error if `output_dtype` is not a fixed-width type
* @throws std::invalid_argument if the output span is too small
*
* @param input A table with fixed-width, non-nullable columns of the same type
* @param output A span representing preallocated device memory for the output
* @param output_dtype The data type of the output elements
* @param stream CUDA stream used for memory operations
*/
void table_to_array(table_view const& input,
device_span<cuda::std::byte> output,
cudf::data_type output_dtype,
rmm::cuda_stream_view stream = cudf::get_default_stream());

Contributor (on the output_dtype parameter): could we avoid passing this and derive this type from the column types, since we throw on mismatch anyway?

Matt711 (Contributor Author): Done b71ec3a

/** @} */ // end of group

} // namespace CUDF_EXPORT cudf
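
For orientation, a minimal caller-side sketch of the API as declared in this diff, mirroring the benchmark above. The helper name to_column_major is made up, and note that a later commit (b71ec3a) drops output_dtype in favor of deriving the element type from the columns.

// Sketch only: caller-side usage mirroring cpp/benchmarks/reshape/table_to_array.cpp.
#include <cudf/reshape.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_buffer.hpp>

#include <cuda/std/cstddef>

void to_column_major(cudf::table_view const& input)  // helper name is made up
{
  auto stream = cudf::get_default_stream();

  // All columns must share one fixed-width, non-nullable type; INT32 here.
  auto const dtype = cudf::data_type{cudf::type_id::INT32};

  // Preallocate num_rows * num_columns * sizeof(element) bytes of device memory.
  auto const n_bytes = static_cast<std::size_t>(input.num_rows()) *
                       static_cast<std::size_t>(input.num_columns()) * sizeof(int32_t);
  rmm::device_buffer output(n_bytes, stream);

  auto span = cudf::device_span<cuda::std::byte>(
    reinterpret_cast<cuda::std::byte*>(output.data()), output.size());

  // Column j occupies bytes [j * num_rows * sizeof(element), (j + 1) * num_rows * sizeof(element)).
  cudf::table_to_array(input, span, dtype, stream);
}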
142 changes: 142 additions & 0 deletions cpp/src/reshape/table_to_array.cu
@@ -0,0 +1,142 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/reshape.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/reshape.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/utilities/type_checks.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cub/device/device_memcpy.cuh>
#include <cuda/functional>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace cudf {
namespace detail {
namespace {

template <typename T>
void table_to_array_impl(table_view const& input,
device_span<cuda::std::byte> output,
rmm::cuda_stream_view stream)
{
auto const num_columns = input.num_columns();
auto const num_rows = input.num_rows();
auto const item_size = sizeof(T);
auto const total_bytes = num_columns * num_rows * item_size;

CUDF_EXPECTS(output.size() >= total_bytes, "Output span is too small", std::invalid_argument);
CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
"All columns must have the same data type",
cudf::data_type_error);
CUDF_EXPECTS(!cudf::has_nulls(input), "All columns must contain no nulls", std::invalid_argument);

auto* base_ptr = output.data();

auto h_srcs = make_host_vector<void*>(num_columns, stream);
auto h_dsts = make_host_vector<void*>(num_columns, stream);

std::transform(input.begin(), input.end(), h_srcs.begin(), [](auto& col) {
return const_cast<void*>(static_cast<void const*>(col.template data<T>()));
});

for (int i = 0; i < num_columns; ++i) {
h_dsts[i] = static_cast<void*>(base_ptr + i * item_size * num_rows);
}

auto const mr = cudf::get_current_device_resource_ref();

auto d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
auto d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);

thrust::constant_iterator<size_t> sizes(static_cast<size_t>(item_size * num_rows));

void* d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceMemcpy::Batched(d_temp_storage,
Contributor:

Please benchmark cub::DeviceMemcpy::Batched against cudaMemcpyBatchAsync on CUDA 12.8. I'd like to see if there is a benefit for using the new API where it is supported. I think it should be more efficient.

I also want to check the performance against a multi-stream copy like I implemented for gather in #14162, but that can be done for a follow-up.

Matt711 (Contributor Author), Apr 10, 2025:

I wasn't able to do the benchmark against cudaMemcpyBatchAsync. I get errors like...

cudaErrorInvalidValue invalid argument
cudaErrorInvalidDevice: invalid device ordinal
cudaErrorInvalidResourceHandle: invalid resource handle

Do you have any ideas on root causes? I've already checked for null pointers. And the next thing I'll try is changing the source access order in the attrs arg (currently set to cudaMemcpySrcAccessOrderStream).

Regardless, I did benchmark cub::DeviceMemcpy::Batched against num_columns cudaMemcpyAsync calls, and it generally underperforms when only two buffers are copied. Performance looks good when 10 and 100 buffers are copied.

$ python nvbench/scripts/nvbench_compare.py table_to_array_bench_cudaMemcpyBatchAsync.json table_to_array_bench_cub::DeviceMemcpy::Batched.json

table_to_array

[0] NVIDIA RTX 5880 Ada Generation

num_rows columns Ref Time Ref Noise Cmp Time Cmp Noise Diff %Diff Status
262144 2 11.118 us 11.28% 19.879 us 4.33% 8.761 us 78.80% SLOW
2097152 2 43.316 us 2.46% 59.083 us 2.05% 15.767 us 36.40% SLOW
16777216 2 321.465 us 0.54% 356.736 us 0.84% 35.272 us 10.97% SLOW
32768 10 27.827 us 4.91% 18.861 us 4.22% -8.966 us -32.22% FAST
262144 10 44.987 us 2.13% 41.143 us 2.56% -3.844 us -8.54% FAST
2097152 10 201.164 us 0.66% 228.313 us 1.05% 27.149 us 13.50% SLOW
16777216 10 1.589 ms 0.27% 1.668 ms 0.92% 79.189 us 4.98% SLOW
32768 100 229.088 us 1.71% 50.000 us 2.46% -179.088 us -78.17% FAST
262144 100 397.733 us 0.36% 283.406 us 0.91% -114.328 us -28.74% FAST
2097152 100 1.929 ms 0.25% 2.060 ms 1.36% 130.913 us 6.78% SLOW
16777216 100 15.836 ms 0.09% 16.030 ms 0.18% 193.286 us 1.22% SLOW
32768 2 7.285 us 9.26% 31.464 us 3.99% 24.179 us 331.92% SLOW
262144 2 11.670 us 6.06% 35.361 us 4.10% 23.690 us 203.00% SLOW
2097152 2 43.570 us 1.85% 245.818 us 3.46% 202.247 us 464.19% SLOW
16777216 2 321.276 us 0.52% 544.230 us 1.25% 222.954 us 69.40% SLOW
32768 10 25.450 us 3.65% 32.252 us 3.21% 6.802 us 26.73% SLOW
262144 10 44.983 us 1.88% 56.335 us 2.68% 11.352 us 25.24% SLOW
2097152 10 201.004 us 0.68% 417.523 us 2.45% 216.519 us 107.72% SLOW
16777216 10 1.595 ms 0.30% 1.872 ms 12.15% 276.805 us 17.35% SLOW
32768 100 230.065 us 1.79% 64.702 us 2.72% -165.363 us -71.88% FAST
262144 100 394.490 us 0.40% 298.147 us 0.95% -96.343 us -24.42% FAST
2097152 100 1.939 ms 0.32% 2.270 ms 5.17% 331.012 us 17.07% SLOW
16777216 100 15.895 ms 0.09% 16.164 ms 1.25% 269.368 us 1.69% SLOW

[1] NVIDIA RTX 5880 Ada Generation

num_rows columns Ref Time Ref Noise Cmp Time Cmp Noise Diff %Diff Status
262144 2 11.118 us 11.28% 19.879 us 4.33% 8.761 us 78.80% SLOW
2097152 2 43.316 us 2.46% 59.083 us 2.05% 15.767 us 36.40% SLOW
16777216 2 321.465 us 0.54% 356.736 us 0.84% 35.272 us 10.97% SLOW
32768 10 27.827 us 4.91% 18.861 us 4.22% -8.966 us -32.22% FAST
262144 10 44.987 us 2.13% 41.143 us 2.56% -3.844 us -8.54% FAST
2097152 10 201.164 us 0.66% 228.313 us 1.05% 27.149 us 13.50% SLOW
16777216 10 1.589 ms 0.27% 1.668 ms 0.92% 79.189 us 4.98% SLOW
32768 100 229.088 us 1.71% 50.000 us 2.46% -179.088 us -78.17% FAST
262144 100 397.733 us 0.36% 283.406 us 0.91% -114.328 us -28.74% FAST
2097152 100 1.929 ms 0.25% 2.060 ms 1.36% 130.913 us 6.78% SLOW
16777216 100 15.836 ms 0.09% 16.030 ms 0.18% 193.286 us 1.22% SLOW
32768 2 7.285 us 9.26% 31.464 us 3.99% 24.179 us 331.92% SLOW
262144 2 11.670 us 6.06% 35.361 us 4.10% 23.690 us 203.00% SLOW
2097152 2 43.570 us 1.85% 245.818 us 3.46% 202.247 us 464.19% SLOW
16777216 2 321.276 us 0.52% 544.230 us 1.25% 222.954 us 69.40% SLOW
32768 10 25.450 us 3.65% 32.252 us 3.21% 6.802 us 26.73% SLOW
262144 10 44.983 us 1.88% 56.335 us 2.68% 11.352 us 25.24% SLOW
2097152 10 201.004 us 0.68% 417.523 us 2.45% 216.519 us 107.72% SLOW
16777216 10 1.595 ms 0.30% 1.872 ms 12.15% 276.805 us 17.35% SLOW
32768 100 230.065 us 1.79% 64.702 us 2.72% -165.363 us -71.88% FAST
262144 100 394.490 us 0.40% 298.147 us 0.95% -96.343 us -24.42% FAST
2097152 100 1.939 ms 0.32% 2.270 ms 5.17% 331.012 us 17.07% SLOW
16777216 100 15.895 ms 0.09% 16.164 ms 1.25% 269.368 us 1.69% SLOW

[2] NVIDIA GeForce GT 1030

num_rows columns Ref Time Ref Noise Cmp Time Cmp Noise Diff %Diff Status
262144 2 11.118 us 11.28% 19.879 us 4.33% 8.761 us 78.80% SLOW
2097152 2 43.316 us 2.46% 59.083 us 2.05% 15.767 us 36.40% SLOW
16777216 2 321.465 us 0.54% 356.736 us 0.84% 35.272 us 10.97% SLOW
32768 10 27.827 us 4.91% 18.861 us 4.22% -8.966 us -32.22% FAST
262144 10 44.987 us 2.13% 41.143 us 2.56% -3.844 us -8.54% FAST
2097152 10 201.164 us 0.66% 228.313 us 1.05% 27.149 us 13.50% SLOW
16777216 10 1.589 ms 0.27% 1.668 ms 0.92% 79.189 us 4.98% SLOW
32768 100 229.088 us 1.71% 50.000 us 2.46% -179.088 us -78.17% FAST
262144 100 397.733 us 0.36% 283.406 us 0.91% -114.328 us -28.74% FAST
2097152 100 1.929 ms 0.25% 2.060 ms 1.36% 130.913 us 6.78% SLOW
16777216 100 15.836 ms 0.09% 16.030 ms 0.18% 193.286 us 1.22% SLOW
32768 2 7.285 us 9.26% 31.464 us 3.99% 24.179 us 331.92% SLOW
262144 2 11.670 us 6.06% 35.361 us 4.10% 23.690 us 203.00% SLOW
2097152 2 43.570 us 1.85% 245.818 us 3.46% 202.247 us 464.19% SLOW
16777216 2 321.276 us 0.52% 544.230 us 1.25% 222.954 us 69.40% SLOW
32768 10 25.450 us 3.65% 32.252 us 3.21% 6.802 us 26.73% SLOW
262144 10 44.983 us 1.88% 56.335 us 2.68% 11.352 us 25.24% SLOW
2097152 10 201.004 us 0.68% 417.523 us 2.45% 216.519 us 107.72% SLOW
16777216 10 1.595 ms 0.30% 1.872 ms 12.15% 276.805 us 17.35% SLOW
32768 100 230.065 us 1.79% 64.702 us 2.72% -165.363 us -71.88% FAST
262144 100 394.490 us 0.40% 298.147 us 0.95% -96.343 us -24.42% FAST
2097152 100 1.939 ms 0.32% 2.270 ms 5.17% 331.012 us 17.07% SLOW
16777216 100 15.895 ms 0.09% 16.164 ms 1.25% 269.368 us 1.69% SLOW

Summary

  • Total Matches: 69
    • Pass (diff <= min_noise): 0
    • Unknown (infinite noise): 0
    • Failure (diff > min_noise): 69

vuule (Contributor), Apr 10, 2025:

I haven't looked at the code yet, but maybe using our wrapper batched_memcpy_async would be helpful here; it simplifies the usage a bit.
My bad, I thought the issue was with cub::DeviceMemcpy::Batched.
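
For reference, a sketch of what the wrapper-based form mentioned above might look like. This is an assumption, not the PR's code: it presumes cudf::detail::batched_memcpy_async (cudf/detail/utilities/batched_memcpy.hpp) takes source/destination/size iterators, a buffer count, and a stream, and performs the CUB temp-storage query and allocation internally; the helper name copy_columns_batched is made up.

// Hypothetical: what the two cub::DeviceMemcpy::Batched calls below could collapse to,
// assuming the wrapper takes (src_iter, dst_iter, size_iter, num_buffs, stream) and
// allocates the CUB temp storage internally.
#include <cudf/detail/utilities/batched_memcpy.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <cstddef>

void copy_columns_batched(rmm::device_uvector<void*> const& d_srcs,
                          rmm::device_uvector<void*> const& d_dsts,
                          std::size_t bytes_per_column,
                          std::size_t num_columns,
                          rmm::cuda_stream_view stream)
{
  // One fixed size per column: item_size * num_rows bytes, as in table_to_array_impl.
  auto sizes = thrust::constant_iterator<std::size_t>(bytes_per_column);
  cudf::detail::batched_memcpy_async(d_srcs.begin(), d_dsts.begin(), sizes, num_columns, stream);
}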

Contributor:

@elstehle we're seeing a great perf boost when copying a hundred buffers using cub::DeviceMemcpy::Batched instead of a set of memcpys. However, the performance of the batched memcpy is significantly worse when only two buffers are copied. Is this expected?

elstehle (Contributor), Apr 16, 2025:

Sorry for joining the discussion a bit late.

> @elstehle we're seeing a great perf boost when copying a hundred buffers using cub::DeviceMemcpy::Batched instead of a set of memcpys. However, the performance of the batched memcpy is significantly worse when only two buffers are copied. Is this expected?

If both the number of columns and the total copy time are small (on the order of tens of microseconds), I think that slowdown comes from kernel launch overhead, since DeviceMemcpy::Batched launches a couple of kernels.

Another factor may be underutilized SMs. If the total number of bytes being copied is too small to saturate all SMs, I expect using the copy engines to be more efficient.

> I wasn't able to do the benchmark against cudaMemcpyBatchAsync. I get errors like...
>
> cudaErrorInvalidValue invalid argument
> cudaErrorInvalidDevice: invalid device ordinal
> cudaErrorInvalidResourceHandle: invalid resource handle
>
> Do you have any ideas on root causes? I've already checked for null pointers. And the next thing I'll try is changing the source access order in the attrs arg (currently set to cudaMemcpySrcAccessOrderStream).

I'm not sure if you had accounted for this, but iirc, cudaMemcpyBatchAsync expects a host array of device pointers (aka host-pointers-to-device-pointers). If that doesn't help, could you try running your benchmarks just on a single device to see if your issues relate to currentDevice? If these are nvbench, you can just pass mybench --device 0.

Matt711 (Contributor Author), Apr 21, 2025:

Thanks @elstehle

> I'm not sure if you had accounted for this, but iirc, cudaMemcpyBatchAsync expects a host array of device pointers (aka host-pointers-to-device-pointers).

Yup, I added the version of table_to_array_impl that uses cudaMemcpyBatchAsync. I'm passing a host array of device pointers.

> If that doesn't help, could you try running your benchmarks just on a single device to see if your issues relate to currentDevice? If these are nvbench, you can just pass mybench --device 0.

I tried running on a single device and I get the same errors.

bdice (Contributor), Apr 24, 2025:

These are very interesting benchmarks. Here is a plot from ChatGPT:
[plot image of the benchmark comparison]

(There is some overlap where "fast" and "slow" results were reported for the same rows/columns. I assume there is a data type difference that wasn't shown in the raw data.)

We could probably come up with a heuristic to choose between copying algorithms based on the number of bytes per column and number of columns in a follow-up PR. For now, I think either approach is viable. However, some of the common use cases I could see for this API are more square-ish matrices -- wide and not necessarily super long. For those cases, the DeviceMemcpy::Batched could be really compelling.

temp_storage_bytes,
d_srcs.begin(),
d_dsts.begin(),
sizes,
num_columns,
stream.value());

rmm::device_buffer temp_storage(temp_storage_bytes, stream);
cub::DeviceMemcpy::Batched(temp_storage.data(),
temp_storage_bytes,
d_srcs.begin(),
d_dsts.begin(),
sizes,
num_columns,
stream.value());
}

struct table_to_array_dispatcher {
table_view const& input;
device_span<cuda::std::byte> output;
rmm::cuda_stream_view stream;

template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
void operator()() const
{
table_to_array_impl<T>(input, output, stream);
}

template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
void operator()() const
{
CUDF_FAIL("Unsupported dtype");
}
};

} // namespace

void table_to_array(table_view const& input,
device_span<cuda::std::byte> output,
data_type output_dtype,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(
input.num_columns() > 0, "Input must have at least one column.", std::invalid_argument);

cudf::type_dispatcher<cudf::dispatch_storage_type>(
output_dtype, table_to_array_dispatcher{input, output, stream});
}

} // namespace detail

void table_to_array(table_view const& input,
device_span<cuda::std::byte> output,
data_type output_dtype,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
cudf::detail::table_to_array(input, output, output_dtype, stream);
}

} // namespace cudf
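
Following up on the heuristic idea raised in the review thread above (choosing between per-column copies and the batched device copy based on the number of columns and the bytes per column), here is an illustrative sketch. It is not part of the PR; the function names and thresholds are placeholders loosely read off the posted [0] benchmark table and would need real tuning.

// Illustrative only: a strategy chooser along the lines suggested in the review
// discussion. Thresholds are placeholders, not derived rigorously.
#include <rmm/cuda_stream_view.hpp>

#include <cuda_runtime.h>

#include <cstddef>

enum class copy_strategy { per_column_memcpy, batched_memcpy };

inline copy_strategy choose_copy_strategy(std::size_t num_columns, std::size_t bytes_per_column)
{
  // With only two large columns, per-column cudaMemcpyAsync won in the posted numbers;
  // with ~100 columns the batched device copy won; around 10 columns the crossover
  // depends on bytes per column (placeholder cutoff: 4 MiB).
  if (num_columns <= 2) { return copy_strategy::per_column_memcpy; }
  if (num_columns >= 100) { return copy_strategy::batched_memcpy; }
  return bytes_per_column <= (std::size_t{1} << 22) ? copy_strategy::batched_memcpy
                                                    : copy_strategy::per_column_memcpy;
}

// Per-column fallback: one device-to-device cudaMemcpyAsync per column, using
// host arrays of device pointers (h_srcs/h_dsts as built in table_to_array_impl).
inline void copy_per_column(void* const* h_srcs,
                            void* const* h_dsts,
                            std::size_t bytes_per_column,
                            std::size_t num_columns,
                            rmm::cuda_stream_view stream)
{
  for (std::size_t i = 0; i < num_columns; ++i) {
    cudaMemcpyAsync(
      h_dsts[i], h_srcs[i], bytes_per_column, cudaMemcpyDeviceToDevice, stream.value());
  }
}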
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
@@ -520,7 +520,7 @@ ConfigureTest(
# * reshape test ----------------------------------------------------------------------------------
ConfigureTest(
RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
reshape/tile_tests.cpp
reshape/table_to_array_tests.cpp reshape/tile_tests.cpp
)

# ##################################################################################################
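
The contents of reshape/table_to_array_tests.cpp are not shown in this diff view. Purely as an illustration of what such a test could look like (the fixture and test names are made up, and this assumes the signature shown in this diff), a minimal column-major round-trip check:

// Illustrative sketch only; not the PR's actual test file.
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/reshape.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_buffer.hpp>

#include <cuda/std/cstddef>
#include <cuda_runtime.h>

#include <vector>

struct TableToArrayTest : public cudf::test::BaseFixture {};

TEST_F(TableToArrayTest, CopiesColumnMajor)
{
  // Two INT32 columns of three rows each.
  cudf::test::fixed_width_column_wrapper<int32_t> col0{1, 2, 3};
  cudf::test::fixed_width_column_wrapper<int32_t> col1{4, 5, 6};
  auto const input = cudf::table_view{{col0, col1}};

  auto stream = cudf::get_default_stream();
  rmm::device_buffer output(2 * 3 * sizeof(int32_t), stream);
  auto span = cudf::device_span<cuda::std::byte>(
    reinterpret_cast<cuda::std::byte*>(output.data()), output.size());

  cudf::table_to_array(input, span, cudf::data_type{cudf::type_id::INT32}, stream);

  // Copy back to host and check column-major ordering: col0 first, then col1.
  std::vector<int32_t> h_result(6);
  cudaMemcpyAsync(
    h_result.data(), output.data(), output.size(), cudaMemcpyDeviceToHost, stream.value());
  stream.synchronize();
  EXPECT_EQ(h_result, (std::vector<int32_t>{1, 2, 3, 4, 5, 6}));
}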