rapidsai · rapids-bot · May 7, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 8, 2025
@@ -644,6 +644,7 @@ add_library(
   src/reshape/byte_cast.cu
   src/reshape/interleave_columns.cu
   src/reshape/tile.cu
+  src/reshape/table_to_array.cu
   src/rolling/detail/optimized_unbounded_window.cpp
   src/rolling/detail/rolling_collect_list.cu
   src/rolling/detail/rolling_fixed_window.cu

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -107,6 +107,29 @@ std::unique_ptr<column> byte_cast(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Copies a table into a contiguous column-major device array.
+ *
+ * This function converts a table_view with columns of the same type
+ * into a 2D device array in column-major order. The output buffer must be
+ * preallocated and large enough to hold `num_rows * num_columns` values of `output_dtype`.
+ *
+ * @throws cudf::logic_error if column types do not match `output_dtype`
+ * @throws cudf::logic_error if `output_dtype` is not fixed-width or is a fixed-point type
+ *
+ * @param input A table with fixed-width, non-nullable columns of the same type
+ * @param output Pointer to device memory large enough to hold `num_rows * num_columns` values
+ * @param output_dtype The logical data type of the output array
+ * @param stream CUDA stream used for memory operations
+ * @param mr Memory resource used for device allocations (currently unused)
+ */
+void table_to_device_array(
+  table_view const& input,
+  void* output,
+  cudf::data_type output_dtype,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 
 }  // namespace CUDF_EXPORT cudf
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/reshape.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <cub/device/device_memcpy.cuh>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+namespace cudf {
+namespace {
+
+// template <typename T>
+// void _table_to_device_array(cudf::table_view const& input,
+//                             void* output,
+//                             rmm::cuda_stream_view stream)
+// {
+//   auto const num_columns = input.num_columns();
+//   auto const num_rows    = input.num_rows();
+//   auto const item_size   = sizeof(T);
+
+//   std::vector<void*> dsts(num_columns);
+//   std::vector<void const*> srcs(num_columns);
+//   std::vector<size_t> sizes(num_columns, item_size * num_rows);
+
+//   auto* base_ptr = static_cast<uint8_t*>(output);
+
+//   for (int i = 0; i < num_columns; ++i) {
+//     auto const& col = input.column(i);
+//     CUDF_EXPECTS(col.type() == input.column(0).type(), "All columns must have the same dtype");
+
+//     auto* src_ptr = static_cast<void const*>(col.data<T>());
+//     auto* dst_ptr = base_ptr + i * item_size * num_rows;
+
+//     srcs[i] = src_ptr;
+//     dsts[i] = dst_ptr;
+//   }
+
+//   cudaMemcpyAttributes attr{};
+//   attr.srcAccessOrder = cudaMemcpySrcAccessOrderStream;
+//   std::vector<cudaMemcpyAttributes> attrs{attr};
+//   std::vector<size_t> attr_idxs{0};
+//   size_t fail_idx = SIZE_MAX;
+
+//   CUDF_CUDA_TRY(cudaMemcpyBatchAsync(dsts.data(),
+//                                      const_cast<void**>(srcs.data()),
+//                                      sizes.data(),
+//                                      num_columns,
+//                                      attrs.data(),
+//                                      attr_idxs.data(),
+//                                      attrs.size(),
+//                                      &fail_idx,
+//                                      stream.value()));
+// }
+
+template <typename T>
+void _table_to_device_array(cudf::table_view const& input,
+                            void* output,
+                            rmm::cuda_stream_view stream)
+{
+  auto const num_columns = input.num_columns();
+  auto const num_rows    = input.num_rows();
+  auto const item_size   = sizeof(T);
+
+  std::vector<void*> dsts(num_columns);
+  std::vector<void const*> srcs(num_columns);
+  std::vector<size_t> sizes(num_columns, item_size * num_rows);
+
+  auto* base_ptr = static_cast<uint8_t*>(output);
+
+  for (int i = 0; i < num_columns; ++i) {
+    auto const& col = input.column(i);
+    CUDF_EXPECTS(col.type() == input.column(0).type(), "All columns must have the same dtype");
+
+    auto* src_ptr = static_cast<void const*>(col.data<T>());
+    auto* dst_ptr = base_ptr + i * item_size * num_rows;
+
+    srcs[i] = src_ptr;
+    dsts[i] = dst_ptr;
+  }
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  cudaMemcpyAttributes attr{};
+  attr.srcAccessOrder = cudaMemcpySrcAccessOrderStream;
+  std::vector<cudaMemcpyAttributes> attrs{attr};
+  std::vector<size_t> attr_idxs{0};
+  size_t fail_idx = SIZE_MAX;
+
+  CUDF_CUDA_TRY(cudaMemcpyBatchAsync(dsts.data(),
+                                     const_cast<void**>(srcs.data()),
+                                     sizes.data(),
+                                     num_columns,
+                                     attrs.data(),
+                                     attr_idxs.data(),
+                                     attrs.size(),
+                                     &fail_idx,
+                                     stream.value()));
+#else
+  for (int i = 0; i < num_columns; ++i) {
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dsts[i], srcs[i], sizes[i], cudaMemcpyDeviceToDevice, stream.value()));
+  }
+#endif
+}
+
+
+struct TableToArrayDispatcher {
+  table_view const& input;
+  void* output;
+  rmm::cuda_stream_view stream;
+
+  template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>() || is_fixed_point<T>())>
+  void operator()() const
+  {
+    if constexpr (is_fixed_point<T>()) {
+      using StorageType = cudf::device_storage_type_t<T>;
+      _table_to_device_array<StorageType>(input, output, stream);
+    } else {
+      _table_to_device_array<T>(input, output, stream);
+    }
+  }
+
+  template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>() && !is_fixed_point<T>())>
+  void operator()() const
+  {
+    CUDF_FAIL("Unsupported dtype");
+  }
+};
+
+}  // namespace
+
+void table_to_device_array(table_view const& input,
+                           void* output,
+                           data_type output_dtype,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref)
+{
+  CUDF_FUNC_RANGE();
+  cudf::type_dispatcher(output_dtype, TableToArrayDispatcher{input, output, stream});
+}
+
+}  // namespace cudf
@@ -520,7 +520,7 @@ ConfigureTest(
 # * reshape test ----------------------------------------------------------------------------------
 ConfigureTest(
   RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp
-  reshape/tile_tests.cpp
+  reshape/tile_tests.cpp reshape/table_to_array.cpp
 )
 
 # ##################################################################################################

diff --git a/cpp/tests/reshape/table_to_array.cpp b/cpp/tests/reshape/table_to_array.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_list_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/reshape.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+
+template <typename T>
+struct TableToDeviceArrayTypedTest : public cudf::test::BaseFixture {};
+
+using SupportedTypes = cudf::test::Types<int8_t,
+                                         int16_t,
+                                         int32_t,
+                                         int64_t,
+                                         uint8_t,
+                                         uint16_t,
+                                         uint32_t,
+                                         uint64_t,
+                                         float,
+                                         double,
+                                         cudf::timestamp_D,
+                                         cudf::timestamp_s,
+                                         cudf::timestamp_ms,
+                                         cudf::timestamp_us,
+                                         cudf::timestamp_ns,
+                                         cudf::duration_D,
+                                         cudf::duration_s,
+                                         cudf::duration_ms,
+                                         cudf::duration_us,
+                                         cudf::duration_ns>;
+
+TYPED_TEST_SUITE(TableToDeviceArrayTypedTest, SupportedTypes);
+
+TYPED_TEST(TableToDeviceArrayTypedTest, SupportedTypes)
+{
+  using T     = TypeParam;
+  auto stream = cudf::get_default_stream();
+
+  auto const dtype = cudf::data_type{cudf::type_to_id<T>()};
+
+  auto const col0 = cudf::test::make_type_param_vector<T>({1, 2, 3});
+  auto const col1 = cudf::test::make_type_param_vector<T>({4, 5, 6});
+  auto const col2 = cudf::test::make_type_param_vector<T>({7, 8, 9});
+  auto const col3 = cudf::test::make_type_param_vector<T>({10, 11, 12});
+
+  std::vector<std::unique_ptr<cudf::column>> cols;
+  auto make_col = [&](auto const& data) {
+    return std::make_unique<cudf::column>(
+      cudf::test::fixed_width_column_wrapper<T>(data.begin(), data.end()));
+  };
+
+  cols.push_back(make_col(col0));
+  cols.push_back(make_col(col1));
+  cols.push_back(make_col(col2));
+  cols.push_back(make_col(col3));
+
+  cudf::table_view input({cols[0]->view(), cols[1]->view(), cols[2]->view(), cols[3]->view()});
+  size_t num_elements = 3 * 4;
+  rmm::device_buffer output(num_elements * sizeof(T), stream);
+
+  cudf::table_to_device_array(
+    input, output.data(), dtype, stream, rmm::mr::get_current_device_resource());
+
+  std::vector<T> host_result(num_elements);
+  CUDF_CUDA_TRY(cudaMemcpy(
+    host_result.data(), output.data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost));
+
+  auto const expected_data =
+    cudf::test::make_type_param_vector<T>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  std::vector<T> expected(expected_data.begin(), expected_data.end());
+
+  EXPECT_EQ(host_result, expected);
+}
+
+template <typename T>
+struct FixedPointTableToDeviceArrayTest : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(FixedPointTableToDeviceArrayTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(FixedPointTableToDeviceArrayTest, SupportedFixedPointTypes)
+{
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  auto stream = cudf::get_default_stream();
+  auto scale  = numeric::scale_type{-2};
+  auto dtype  = cudf::data_type{cudf::type_to_id<decimalXX>(), scale};
+
+  fp_wrapper col0({123, 456, 789}, scale);
+  fp_wrapper col1({321, 654, 987}, scale);
+
+  cudf::table_view input({col0, col1});
+  rmm::device_buffer output(2 * 3 * sizeof(RepType), stream);
+
+  cudf::table_to_device_array(
+    input, output.data(), dtype, stream, rmm::mr::get_current_device_resource());
+
+  std::vector<RepType> host_result(6);
+  CUDF_CUDA_TRY(cudaMemcpy(host_result.data(),
+                           output.data(),
+                           host_result.size() * sizeof(RepType),
+                           cudaMemcpyDeviceToHost));
+
+  std::vector<RepType> expected{123, 456, 789, 321, 654, 987};
+  EXPECT_EQ(host_result, expected);
+}
+
+struct TableToDeviceArrayTest : public cudf::test::BaseFixture {};
+
+TEST(TableToDeviceArrayTest, UnsupportedStringType)
+{
+  auto stream = cudf::get_default_stream();
+  auto col    = cudf::test::strings_column_wrapper({"a", "b", "c"});
+  cudf::table_view input_table({col});
+  rmm::device_buffer output(3 * sizeof(int32_t), stream);
+
+  EXPECT_THROW(cudf::table_to_device_array(input_table,
+                                           output.data(),
+                                           cudf::data_type{cudf::type_id::STRING},
+                                           stream,
+                                           rmm::mr::get_current_device_resource()),
+               cudf::logic_error);
+}