Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,7 @@ add_library(
src/replace/replace.cu
src/reshape/byte_cast.cu
src/reshape/interleave_columns.cu
src/reshape/table_to_array.cu
src/reshape/tile.cu
src/rolling/detail/optimized_unbounded_window.cpp
src/rolling/detail/rolling_collect_list.cu
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
# ##################################################################################################
# * reshape benchmark
# ---------------------------------------------------------------------------------
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp reshape/table_to_array.cpp)

# ##################################################################################################
# * rolling benchmark
Expand Down
63 changes: 63 additions & 0 deletions cpp/benchmarks/reshape/table_to_array.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/reshape.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <nvbench/nvbench.cuh>

#include <limits>

static void bench_table_to_array(nvbench::state& state)
{
  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("columns"));

  // Total element count, computed in std::size_t so the multiply cannot
  // overflow 32-bit arithmetic before the limit check below.
  auto const num_elements = static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(num_cols);

  if (num_elements >= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
    state.skip("Input size exceeds cudf::size_type limit");
    // nvbench's skip() only records the skip; without this return the
    // benchmark would still allocate the buffer and execute below.
    return;
  }

  // Non-nullable INT32 columns with uniformly distributed values.
  data_profile profile = data_profile_builder()
                           .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 1000)
                           .no_validity();
  std::vector<cudf::type_id> types(num_cols, cudf::type_id::INT32);
  auto input_table = create_random_table(types, row_count{num_rows}, profile);

  auto input_view = input_table->view();
  auto stream     = cudf::get_default_stream();
  auto dtype      = cudf::data_type{cudf::type_id::INT32};

  // Preallocated column-major destination buffer required by table_to_array.
  rmm::device_buffer output(num_elements * sizeof(int32_t), stream);
  auto span = cudf::device_span<cuda::std::byte>(reinterpret_cast<cuda::std::byte*>(output.data()),
                                                 output.size());

  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
  state.add_global_memory_reads<int32_t>(num_elements);   // all bytes are read
  state.add_global_memory_writes<int32_t>(num_elements);  // all bytes are written

  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) { cudf::table_to_array(input_view, span, dtype, stream); });
}

// Axes: 32Ki to 16Mi rows crossed with 2/10/100 columns; the largest
// combinations are skipped inside the benchmark when they would exceed
// the cudf::size_type element limit.
NVBENCH_BENCH(bench_table_to_array)
  .set_name("table_to_array")
  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
  .add_int64_axis("columns", {2, 10, 100});
10 changes: 9 additions & 1 deletion cpp/include/cudf/detail/reshape.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -41,5 +41,13 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
rmm::cuda_stream_view,
rmm::device_async_resource_ref mr);

/**
* @copydoc cudf::table_to_array
*/
void table_to_array(table_view const& input,
void* output,
data_type output_dtype,
rmm::cuda_stream_view stream);

} // namespace detail
} // namespace CUDF_EXPORT cudf
29 changes: 28 additions & 1 deletion cpp/include/cudf/reshape.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,6 +21,9 @@
#include <cudf/types.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <cuda/functional>

#include <memory>

Expand Down Expand Up @@ -107,6 +110,30 @@ std::unique_ptr<column> byte_cast(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Copies a table into a contiguous column-major device array.
*
* This function copies a `table_view` with columns of the same fixed-width type
* into a 2D device array stored in column-major order.
*
* The output buffer must be preallocated and passed as a `device_span` using
* a `device_span<cuda::std::byte>`. It must be large enough to hold
* `num_rows * num_columns * size_of(output_dtype)` bytes.
*
 * @throws cudf::data_type_error if the columns do not all have the same type
 * @throws cudf::logic_error if `output_dtype` is not a fixed-width type
 * @throws std::invalid_argument if `input` has no columns, if any column
 * contains nulls, or if the output span is too small
*
* @param input A table with fixed-width, non-nullable columns of the same type
* @param output A span representing preallocated device memory for the output
* @param output_dtype The data type of the output elements
* @param stream CUDA stream used for memory operations
*/
void table_to_array(table_view const& input,
device_span<cuda::std::byte> output,
cudf::data_type output_dtype,
rmm::cuda_stream_view stream = cudf::get_default_stream());

/** @} */ // end of group

} // namespace CUDF_EXPORT cudf
142 changes: 142 additions & 0 deletions cpp/src/reshape/table_to_array.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/reshape.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/reshape.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/utilities/type_checks.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cub/device/device_memcpy.cuh>
#include <cuda/functional>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

namespace cudf {
namespace detail {
namespace {

/**
 * @brief Copies every column of @p input into a contiguous column-major device
 * buffer using one batched device memcpy.
 *
 * @tparam T Fixed-width element type of all input columns
 * @param input Table of same-typed, non-nullable, fixed-width columns
 * @param output Preallocated device memory of at least
 *        `num_rows * num_columns * sizeof(T)` bytes
 * @param stream CUDA stream on which all allocations and copies are ordered
 */
template <typename T>
void table_to_array_impl(table_view const& input,
                         device_span<cuda::std::byte> output,
                         rmm::cuda_stream_view stream)
{
  auto const num_columns = input.num_columns();
  auto const num_rows    = input.num_rows();
  auto const item_size   = sizeof(T);
  // Compute byte counts in std::size_t from the start: `num_columns *
  // num_rows` is a size_type (int) multiply and can overflow for large tables
  // before any promotion to size_t happens.
  auto const column_bytes = static_cast<std::size_t>(num_rows) * item_size;
  auto const total_bytes  = static_cast<std::size_t>(num_columns) * column_bytes;

  CUDF_EXPECTS(output.size() >= total_bytes, "Output span is too small", std::invalid_argument);
  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
               "All columns must have the same data type",
               cudf::data_type_error);
  CUDF_EXPECTS(!cudf::has_nulls(input), "All columns must contain no nulls", std::invalid_argument);

  auto* base_ptr = output.data();

  // Per-column source and destination pointers for the batched copy; column i
  // lands at byte offset i * column_bytes (column-major layout).
  auto h_srcs = make_host_vector<void*>(num_columns, stream);
  auto h_dsts = make_host_vector<void*>(num_columns, stream);

  std::transform(input.begin(), input.end(), h_srcs.begin(), [](auto& col) {
    return const_cast<void*>(static_cast<void const*>(col.template data<T>()));
  });

  for (int i = 0; i < num_columns; ++i) {
    h_dsts[i] = static_cast<void*>(base_ptr + static_cast<std::size_t>(i) * column_bytes);
  }

  auto const mr = cudf::get_current_device_resource_ref();

  auto d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
  auto d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);

  // Every column copies the same number of bytes.
  thrust::constant_iterator<size_t> sizes(column_bytes);

  // Standard CUB two-phase pattern: first call (null temp storage) computes
  // the required scratch size, second call performs the batched memcpy.
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceMemcpy::Batched(d_temp_storage,
                             temp_storage_bytes,
                             d_srcs.begin(),
                             d_dsts.begin(),
                             sizes,
                             num_columns,
                             stream.value());

  rmm::device_buffer temp_storage(temp_storage_bytes, stream);
  cub::DeviceMemcpy::Batched(temp_storage.data(),
                             temp_storage_bytes,
                             d_srcs.begin(),
                             d_dsts.begin(),
                             sizes,
                             num_columns,
                             stream.value());
}

// Type-dispatch functor: routes fixed-width element types to the batched-copy
// implementation and rejects all other dtypes.
struct table_to_array_dispatcher {
  table_view const& input;
  device_span<cuda::std::byte> output;
  rmm::cuda_stream_view stream;

  template <typename T>
  void operator()() const
  {
    if constexpr (is_fixed_width<T>()) {
      table_to_array_impl<T>(input, output, stream);
    } else {
      CUDF_FAIL("Unsupported dtype");
    }
  }
};

} // namespace

// @copydoc cudf::table_to_array
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    data_type output_dtype,
                    rmm::cuda_stream_view stream)
{
  CUDF_EXPECTS(
    input.num_columns() > 0, "Input must have at least one column.", std::invalid_argument);

  // Dispatch on the storage type so fixed-point columns use their underlying
  // integer representation.
  auto dispatcher = table_to_array_dispatcher{input, output, stream};
  cudf::type_dispatcher<cudf::dispatch_storage_type>(output_dtype, dispatcher);
}

} // namespace detail

/**
 * @copydoc cudf::table_to_array
 */
void table_to_array(table_view const& input,
                    device_span<cuda::std::byte> output,
                    data_type output_dtype,
                    rmm::cuda_stream_view stream)
{
  // Public entry point: record an NVTX range, then forward to the detail
  // implementation (we are already inside namespace cudf).
  CUDF_FUNC_RANGE();
  detail::table_to_array(input, output, output_dtype, stream);
}

} // namespace cudf
2 changes: 1 addition & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ ConfigureTest(
# * reshape test ----------------------------------------------------------------------------------
ConfigureTest(
RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
reshape/tile_tests.cpp
reshape/tile_tests.cpp reshape/table_to_array_tests.cpp
)

# ##################################################################################################
Expand Down
Loading
Loading