Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ add_library(
src/replace/replace.cu
src/reshape/byte_cast.cu
src/reshape/interleave_columns.cu
src/reshape/table_to_array.cu
src/reshape/tile.cu
src/rolling/detail/optimized_unbounded_window.cpp
src/rolling/detail/rolling_collect_list.cu
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp reshape/table_to_array.cpp)

# ##################################################################################################
# * rolling benchmark
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/reshape/table_to_array.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <nvbench/nvbench.cuh>

#include <limits>

static void bench_table_to_array(nvbench::state& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));

  // Total element count, computed in std::size_t so the multiply cannot
  // overflow 32-bit arithmetic before the limit check below.
  auto const num_elements = static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(num_cols);

  if (num_elements >= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
    state.skip("Input size exceeds cudf::size_type limit");
    // nvbench's skip() only records the skip; without this return the
    // benchmark would still allocate the buffer and execute below.
    return;
  }

  // Non-nullable INT32 columns with uniformly distributed values.
  data_profile profile = data_profile_builder()
                           .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 1000)
                           .no_validity();
  std::vector<cudf::type_id> types(num_cols, cudf::type_id::INT32);
  auto input_table = create_random_table(types, row_count{num_rows}, profile);

  auto input_view = input_table->view();
  auto stream     = cudf::get_default_stream();
  auto dtype      = cudf::data_type{cudf::type_id::INT32};

  // Preallocated column-major destination buffer required by table_to_array.
  rmm::device_buffer output(num_elements * sizeof(int32_t), stream);
  auto span = cudf::device_span<cuda::std::byte>(reinterpret_cast<cuda::std::byte*>(output.data()),
                                                 output.size());

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  state.add_global_memory_reads<int32_t>(num_elements);   // all bytes are read
  state.add_global_memory_writes<int32_t>(num_elements);  // all bytes are written

  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) { cudf::table_to_array(input_view, span, dtype, stream); });
}

// Axes: 32Ki to 16Mi rows crossed with 2/10/100 columns; the largest
// combinations are skipped inside the benchmark when they would exceed
// the cudf::size_type element limit.
NVBENCH_BENCH(bench_table_to_array)
  .set_name("table_to_array")
  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
  .add_int64_axis("columns", {2, 10, 100});
10 changes: 9 additions & 1 deletion cpp/include/cudf/detail/reshape.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -41,5 +41,13 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
rmm::cuda_stream_view,
rmm::device_async_resource_ref mr);

/**
* @copydoc cudf::table_to_array
*/
void table_to_array(table_view const& input,
void* output,
data_type output_dtype,
rmm::cuda_stream_view stream);

} // namespace detail
} // namespace CUDF_EXPORT cudf
29 changes: 28 additions & 1 deletion cpp/include/cudf/reshape.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,9 @@
#include <cudf/types.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <memory>

Expand Down Expand Up @@ -107,6 +110,30 @@ std::unique_ptr<column> byte_cast(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Copies a table into a contiguous column-major device array.
*
* This function copies a `table_view` with columns of the same fixed-width type
* into a 2D device array stored in column-major order.
*
* The output buffer must be preallocated and passed as a `device_span` using
* a `device_span<cuda::std::byte>`. It must be large enough to hold
* `num_rows * num_columns * size_of(output_dtype)` bytes.
*
 * @throws cudf::data_type_error if the columns do not all have the same type
 * @throws cudf::logic_error if `output_dtype` is not a fixed-width type
 * @throws std::invalid_argument if `input` has no columns, if any column
 * contains nulls, or if the output span is too small
*
* @param input A table with fixed-width, non-nullable columns of the same type
* @param output A span representing preallocated device memory for the output
* @param output_dtype The data type of the output elements
* @param stream CUDA stream used for memory operations
*/
void table_to_array(table_view const& input,
device_span<cuda::std::byte> output,
cudf::data_type output_dtype,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/** @} */ // end of group

} // namespace CUDF_EXPORT cudf
142 changes: 142 additions & 0 deletions cpp/src/reshape/table_to_array.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/reshape.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/reshape.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/utilities/type_checks.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cub/device/device_memcpy.cuh>
#include <cuda/functional>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace cudf {
namespace detail {
namespace {

/**
 * @brief Copies every column of @p input into a contiguous column-major device
 * buffer using one batched device memcpy.
 *
 * @tparam T Fixed-width element type of all input columns
 * @param input Table of same-typed, non-nullable, fixed-width columns
 * @param output Preallocated device memory of at least
 *        `num_rows * num_columns * sizeof(T)` bytes
 * @param stream CUDA stream on which all allocations and copies are ordered
 */
template <typename T>
void table_to_array_impl(table_view const& input,
                         device_span<cuda::std::byte> output,
                         rmm::cuda_stream_view stream)
{
  auto const num_columns = input.num_columns();
  auto const num_rows    = input.num_rows();
  auto const item_size   = sizeof(T);
  // Compute byte counts in std::size_t from the start: `num_columns *
  // num_rows` is a size_type (int) multiply and can overflow for large tables
  // before any promotion to size_t happens.
  auto const column_bytes = static_cast<std::size_t>(num_rows) * item_size;
  auto const total_bytes  = static_cast<std::size_t>(num_columns) * column_bytes;

  CUDF_EXPECTS(output.size() >= total_bytes, "Output span is too small", std::invalid_argument);
  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
               "All columns must have the same data type",
               cudf::data_type_error);
  CUDF_EXPECTS(!cudf::has_nulls(input), "All columns must contain no nulls", std::invalid_argument);

  auto* base_ptr = output.data();

  // Per-column source and destination pointers for the batched copy; column i
  // lands at byte offset i * column_bytes (column-major layout).
  auto h_srcs = make_host_vector<void*>(num_columns, stream);
  auto h_dsts = make_host_vector<void*>(num_columns, stream);

  std::transform(input.begin(), input.end(), h_srcs.begin(), [](auto& col) {
    return const_cast<void*>(static_cast<void const*>(col.template data<T>()));
  });

  for (int i = 0; i < num_columns; ++i) {
    h_dsts[i] = static_cast<void*>(base_ptr + static_cast<std::size_t>(i) * column_bytes);
  }

  auto const mr = cudf::get_current_device_resource_ref();

  auto d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
  auto d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);

  // Every column copies the same number of bytes.
  thrust::constant_iterator<size_t> sizes(column_bytes);

  // Standard CUB two-phase pattern: first call (null temp storage) computes
  // the required scratch size, second call performs the batched memcpy.
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceMemcpy::Batched(d_temp_storage,
                             temp_storage_bytes,
                             d_srcs.begin(),
                             d_dsts.begin(),
                             sizes,
                             num_columns,
                             stream.value());

  rmm::device_buffer temp_storage(temp_storage_bytes, stream);
  cub::DeviceMemcpy::Batched(temp_storage.data(),
                             temp_storage_bytes,
                             d_srcs.begin(),
                             d_dsts.begin(),
                             sizes,
                             num_columns,
                             stream.value());
}

// Type-dispatch functor: routes fixed-width element types to the batched-copy
// implementation and rejects all other dtypes.
struct table_to_array_dispatcher {
  table_view const& input;
  device_span<cuda::std::byte> output;
  rmm::cuda_stream_view stream;

  template <typename T>
  void operator()() const
  {
    if constexpr (is_fixed_width<T>()) {
      table_to_array_impl<T>(input, output, stream);
    } else {
      CUDF_FAIL("Unsupported dtype");
    }
  }
};

} // namespace

// @copydoc cudf::table_to_array
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    data_type output_dtype,
                    rmm::cuda_stream_view stream)
{
  CUDF_EXPECTS(
    input.num_columns() > 0, "Input must have at least one column.", std::invalid_argument);

  // Dispatch on the storage type so fixed-point columns use their underlying
  // integer representation.
  auto dispatcher = table_to_array_dispatcher{input, output, stream};
  cudf::type_dispatcher<cudf::dispatch_storage_type>(output_dtype, dispatcher);
}

} // namespace detail

/**
 * @copydoc cudf::table_to_array
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    data_type output_dtype,
                    rmm::cuda_stream_view stream)
{
  // Public entry point: record an NVTX range, then forward to the detail
  // implementation (we are already inside namespace cudf).
  CUDF_FUNC_RANGE();
  detail::table_to_array(input, output, output_dtype, stream);
}

} // namespace cudf
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ ConfigureTest(
# * reshape test ----------------------------------------------------------------------------------
ConfigureTest(
RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
reshape/tile_tests.cpp
reshape/tile_tests.cpp reshape/table_to_array_tests.cpp
)

# ##################################################################################################
Expand Down
Loading
Loading