From 268996ad101dc69414992aa0227eba4f93012c91 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Mon, 1 Apr 2024 18:59:48 -0400 Subject: [PATCH] Add `to_arrow_device` function to cudf interop using nanoarrow (#15047) Introduce new `to_arrow_device` and `to_arrow_schema` functions to utilize the `ArrowDeviceArray` structure for zero-copy passing of libcudf::table. Add nanoarrow as a vendored lib and a script to update it. Initial step towards addressing #14926 Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15047 --- cpp/CMakeLists.txt | 8 +- cpp/cmake/thirdparty/get_nanoarrow.cmake | 36 + cpp/include/cudf/interop.hpp | 96 ++- cpp/include/cudf/interop/detail/arrow.hpp | 48 ++ cpp/src/interop/to_arrow_device.cu | 727 ++++++++++++++++++++ cpp/tests/CMakeLists.txt | 7 +- cpp/tests/interop/nanoarrow_utils.hpp | 226 +++++++ cpp/tests/interop/to_arrow_device_test.cpp | 739 +++++++++++++++++++++ docs/cudf/source/conf.py | 1 + 9 files changed, 1882 insertions(+), 6 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_nanoarrow.cmake create mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/src/interop/to_arrow_device.cu create mode 100644 cpp/tests/interop/nanoarrow_utils.hpp create mode 100644 cpp/tests/interop/to_arrow_device_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 618d03f7078..f1d43e3c35f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -210,12 +210,14 @@ include(cmake/thirdparty/get_kvikio.cmake) include(cmake/thirdparty/get_fmt.cmake) # find spdlog include(cmake/thirdparty/get_spdlog.cmake) +# find nanoarrow +include(cmake/thirdparty/get_nanoarrow.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) 
include("${rapids-cmake-dir}/export/find_package_file.cmake") list(APPEND METADATA_KINDS BUILD INSTALL) - list(APPEND dependencies KvikIO ZLIB nvcomp) + list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow) if(TARGET cufile::cuFile_interface) list(APPEND dependencies cuFile) endif() @@ -358,6 +360,7 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/to_arrow_device.cu src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu @@ -735,6 +738,7 @@ target_include_directories( "$" "$" PRIVATE "$" + "$" INTERFACE "$" ) @@ -783,7 +787,7 @@ target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio - $ + $ nanoarrow ) # Add Conda library, and include paths if specified diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake new file mode 100644 index 00000000000..be938a89ccd --- /dev/null +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -0,0 +1,36 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds nanoarrow and sets any additional necessary environment variables. 
+function(find_and_configure_nanoarrow) + set(oneValueArgs VERSION FORK PINNED_TAG) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + rapids_cpm_find( + nanoarrow ${PKG_VERSION} + GLOBAL_TARGETS nanoarrow + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git + GIT_TAG ${PKG_PINNED_TAG} + # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin + # to an actual tag. + GIT_SHALLOW FALSE + OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" + ) + set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) +endfunction() + +find_and_configure_nanoarrow( + VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8 +) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 2ee6f19614d..871f48e3aac 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,11 +34,16 @@ #include #include #include +#include #include struct DLManagedTensor; +struct ArrowDeviceArray; + +struct ArrowSchema; + namespace cudf { /** * @addtogroup interop_dlpack @@ -162,6 +167,95 @@ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + +/** + * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter + * + */ +using unique_schema_t = std::unique_ptr; + +/** + * @brief typedef for a unique_ptr to an ArrowDeviceArray with a custom deleter + * + */ +using unique_device_array_t = std::unique_ptr; + +/** + * @brief Create ArrowSchema from cudf table and metadata + * + * Populates and returns an ArrowSchema C struct using a table and metadata. + * + * @note For decimals, since the precision is not stored for them in libcudf, + * decimals will be converted to an Arrow decimal128 which has the widest precision that cudf + * decimal type supports. For example, `numeric::decimal32` will be converted to Arrow decimal128 + * with the precision of 9 which is the maximum precision for 32-bit types. Similarly, + * `numeric::decimal128` will be converted to Arrow decimal128 with the precision of 38. + * + * @param input Table to create a schema from + * @param metadata Contains the hierarchy of names of columns and children + * @return ArrowSchema generated from `input` + */ +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata); + +/** + * @brief Create `ArrowDeviceArray` from cudf table and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. 
+ * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow + * such as in the representation of bools (Arrow uses a bitmap, cudf uses 1-byte per value). + * + * @param table Input table, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data, consumer must call release + */ +unique_device_array_t to_arrow_device( + cudf::table&& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `ArrowDeviceArray` from cudf column and metadata + * + * Populates the C struct ArrowDeviceArray without performing copies if possible. + * This maintains the data on the GPU device and gives ownership of the table + * and its buffers to the ArrowDeviceArray struct. + * + * After calling this function, the release callback on the returned ArrowDeviceArray + * must be called to clean up the memory. + * + * @note For decimals, since the precision is not stored for them in libcudf + * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. 
Similar, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. + * + * @note Copies will be performed in the cases where cudf differs from Arrow such as + * in the representation of bools (Arrow uses a bitmap, cudf uses 1 byte per value). + * + * @param col Input column, ownership of the data will be moved to the result + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray which will have ownership of the GPU data + */ +unique_device_array_t to_arrow_device( + cudf::column&& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp new file mode 100644 index 00000000000..8043ecf5422 --- /dev/null +++ b/cpp/include/cudf/interop/detail/arrow.hpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +// from Arrow C Device Data Interface +// https://arrow.apache.org/docs/format/CDeviceDataInterface.html +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Device type for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost +#define ARROW_DEVICE_CUDA_HOST 3 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 + +struct ArrowDeviceArray { + struct ArrowArray array; + int64_t device_id; + ArrowDeviceType device_type; + void* sync_event; + + // reserved bytes for future expansion + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu new file mode 100644 index 00000000000..e824412e71c --- /dev/null +++ b/cpp/src/interop/to_arrow_device.cu @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +namespace { +static constexpr int validity_buffer_idx = 0; +static constexpr int fixed_width_data_buffer_idx = 1; + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type"); + } +} + +struct dispatch_to_arrow_type { + template ())> + int operator()(column_view, column_metadata const&, ArrowSchema*) + { + CUDF_FAIL("Unsupported type for to_arrow_schema"); + } + + template ())> + int operator()(column_view input_view, column_metadata const&, ArrowSchema* out) + { + cudf::type_id id = input_view.type().id(); + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::TIMESTAMP_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::TIMESTAMP_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, 
NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::TIMESTAMP_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr); + case cudf::type_id::DURATION_SECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr); + case cudf::type_id::DURATION_MILLISECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MILLI, nullptr); + case cudf::type_id::DURATION_MICROSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MICRO, nullptr); + case cudf::type_id::DURATION_NANOSECONDS: + return ArrowSchemaSetTypeDateTime( + out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_NANO, nullptr); + default: return ArrowSchemaSetType(out, id_to_arrow_type(id)); + } + } +}; + +template +int decimals_to_arrow(column_view input, ArrowSchema* out) +{ + // Arrow doesn't support decimal32/decimal64 currently. decimal128 + // is the smallest that arrow supports besides float32/float64 so we + // upcast to decimal128. 
+ return ArrowSchemaSetTypeDecimal(out, + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -input.type().scale()); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow(input, out); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const&, + ArrowSchema* out) +{ + return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); +} + +// these forward declarations are needed due to the recursive calls to them +// inside their definitions and in struct_vew for handling children +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out); + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + CUDF_EXPECTS(metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children doesn't match\n"); + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(out, input.num_children())); + for (int i = 0; i < input.num_children(); ++i) { + auto child = out->children[i]; + auto col = input.child(i); + ArrowSchemaInit(child); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(child, metadata.children_meta[i].name.c_str())); + + child->flags = col.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_type{}, col, metadata.children_meta[i], child)); + } + + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST)); + auto child = input.child(cudf::lists_column_view::child_column_index); + ArrowSchemaInit(out->children[0]); + if (child.type().id() == cudf::type_id::EMPTY) { + return ArrowSchemaSetType(out->children[0], NANOARROW_TYPE_NA); + } + auto child_meta = + metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0]; + + out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str())); + out->children[0]->flags = child.has_nulls() ? ARROW_FLAG_NULLABLE : 0; + return cudf::type_dispatcher( + child.type(), detail::dispatch_to_arrow_type{}, child, child_meta, out->children[0]); +} + +template <> +int dispatch_to_arrow_type::operator()(column_view input, + column_metadata const& metadata, + ArrowSchema* out) +{ + cudf::dictionary_column_view dview{input}; + + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id()))); + NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out)); + ArrowSchemaInit(out->dictionary); + + auto dict_keys = dview.keys(); + return cudf::type_dispatcher( + dict_keys.type(), + detail::dispatch_to_arrow_type{}, + dict_keys, + metadata.children_meta.empty() ? 
column_metadata{"keys"} : metadata.children_meta[0], + out->dictionary); +} + +template +void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) +{ + auto* unique_buffer = reinterpret_cast*>(allocator->private_data); + delete unique_buffer; +} + +template +struct is_device_scalar : public std::false_type {}; + +template +struct is_device_scalar> : public std::true_type {}; + +template +struct is_device_uvector : public std::false_type {}; + +template +struct is_device_uvector> : public std::true_type {}; + +template +int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) +{ + ArrowBuffer* buf = ArrowArrayBuffer(out, i); + auto ptr = reinterpret_cast(device_buf->data()); + buf->size_bytes = [&] { + if constexpr (is_device_scalar::value) { + return sizeof(typename T::value_type); + } else if constexpr (is_device_uvector::value) { + return sizeof(typename T::value_type) * device_buf->size(); + } else { + return device_buf->size(); + } + }(); + // we make a new unique_ptr and move to it in case there was a custom deleter + NANOARROW_RETURN_NOT_OK( + ArrowBufferSetAllocator(buf, + ArrowBufferDeallocator(&device_buffer_finalize, + new std::unique_ptr(std::move(device_buf))))); + buf->data = ptr; + return NANOARROW_OK; +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column const& column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + +struct dispatch_to_arrow_device { + template ())> + int operator()(cudf::column&&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*, + ArrowArray*) + { + CUDF_FAIL("Unsupported type for to_arrow_device"); + } + + template ())> + int operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) + { + nanoarrow::UniqueArray tmp; + + const ArrowType storage_type = [&] { + switch 
(column.type().id()) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(column.type().id()); + } + }(); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +template +int decimals_to_arrow(cudf::column&& input, + int32_t precision, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); + + if constexpr (!std::is_same_v) { + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); + auto buf = + std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); + + auto count = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(stream, mr), + count, + count + input.size(), + [in = input.view().begin(), + out = buf->data(), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. 
+ out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); + } + + auto contents = input.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + if constexpr (std::is_same_v) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get())); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int32_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = int64_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + using DeviceType = __int128_t; + return decimals_to_arrow( + std::move(column), cudf::detail::max_precision(), stream, mr, out); +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + auto bitmask = bools_to_mask(column.view(), stream, mr); + auto contents = column.release(); + if (contents.null_mask) { + 
NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + + if (column.size() == 0) { + // the scalar zero here is necessary because the spec for string arrays states + // that the offsets buffer should contain "length + 1" signed integers. So in + // the case of a 0 length string array, there should be exactly 1 value, zero, + // in the offsets buffer. While some arrow implementations may accept a zero-sized + // offsets buffer, best practices would be to allocate the buffer with the single value. 
+ auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::strings_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(contents.data), 2, tmp.get())); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto& child = contents.children[i]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(child_ptr, NANOARROW_TYPE_NA)); + child_ptr->length = child->size(); + child_ptr->null_count = child->size(); + } else { + 
NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, child_ptr)); + } + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto offsets_contents = + contents.children[cudf::lists_column_view::offsets_column_index]->release(); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get())); + + auto& child = contents.children[cudf::lists_column_view::child_column_index]; + if (child->type().id() == cudf::type_id::EMPTY) { + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(tmp->children[0], NANOARROW_TYPE_NA)); + tmp->children[0]->length = 0; + tmp->children[0]->null_count = 0; + } else { + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, tmp->children[0])); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_device::operator()(cudf::column&& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + ArrowArray* out) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + auto contents = column.release(); + if (contents.null_mask) { + NANOARROW_RETURN_NOT_OK( + 
set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get())); + } + + auto indices_contents = + contents.children[cudf::dictionary_column_view::indices_column_index]->release(); + NANOARROW_RETURN_NOT_OK( + set_buffer(std::move(indices_contents.data), fixed_width_data_buffer_idx, tmp.get())); + + auto& keys = contents.children[cudf::dictionary_column_view::keys_column_index]; + NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher( + keys->type(), dispatch_to_arrow_device{}, std::move(*keys), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +struct ArrowDeviceArrayPrivateData { + ArrowArray parent; + cudaEvent_t sync_event; +}; + +void ArrowDeviceArrayRelease(ArrowArray* array) +{ + auto private_data = reinterpret_cast(array->private_data); + cudaEventDestroy(private_data->sync_event); + ArrowArrayRelease(&private_data->parent); + delete private_data; + array->release = nullptr; +} + +} // namespace +} // namespace detail + +unique_schema_t to_arrow_schema(cudf::table_view const& input, + cudf::host_span metadata) +{ + CUDF_EXPECTS((metadata.size() == static_cast(input.num_columns())), + "columns' metadata should be equal to the number of columns in table"); + + nanoarrow::UniqueSchema result; + ArrowSchemaInit(result.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(result.get(), input.num_columns())); + + for (int i = 0; i < input.num_columns(); ++i) { + auto child = result->children[i]; + auto col = input.column(i); + ArrowSchemaInit(child); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child, metadata[i].name.c_str())); + child->flags = col.has_nulls() ? 
ARROW_FLAG_NULLABLE : 0; + + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA)); + continue; + } + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_type{}, col, metadata[i], child)); + } + + unique_schema_t out(new ArrowSchema, [](ArrowSchema* schema) { + if (schema->release != nullptr) { ArrowSchemaRelease(schema); } + delete schema; + }); + result.move(out.get()); + return out; +} + +unique_device_array_t to_arrow_device(cudf::table&& table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + auto cols = table.release(); + for (size_t i = 0; i < cols.size(); ++i) { + auto child = tmp->children[i]; + auto col = cols[i].get(); + + if (col->type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(child, NANOARROW_TYPE_NA)); + child->length = col->size(); + child->null_count = col->size(); + continue; + } + + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col->type(), detail::dispatch_to_arrow_device{}, std::move(*col), stream, mr, child)); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = 
rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +unique_device_array_t to_arrow_device(cudf::column&& col, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + nanoarrow::UniqueArray tmp; + if (col.type().id() == cudf::type_id::EMPTY) { + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_NA)); + tmp->length = col.size(); + tmp->null_count = col.size(); + } + + NANOARROW_THROW_NOT_OK(cudf::type_dispatcher( + col.type(), detail::dispatch_to_arrow_device{}, std::move(col), stream, mr, tmp.get())); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + auto private_data = std::make_unique(); + cudaEventCreate(&private_data->sync_event); + + auto status = cudaEventRecord(private_data->sync_event, stream); + if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); } + + ArrowArrayMove(tmp.get(), &private_data->parent); + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + result->device_id = rmm::get_current_cuda_device().value(); + result->device_type = ARROW_DEVICE_CUDA; + result->sync_event = &private_data->sync_event; + result->array = private_data->parent; + result->array.private_data = private_data.release(); + result->array.release = &detail::ArrowDeviceArrayRelease; + return result; +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9dbf278c71d..053fcc0989a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -24,7 +24,7 @@ rapids_test_init() # properties and linking to build the test 
function(ConfigureTest CMAKE_TEST_NAME) set(options) - set(one_value GPUS PERCENT STREAM_MODE) + set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB) set(multi_value) cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) @@ -56,7 +56,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main nvtx3-cpp - $ + $ "${_CUDF_TEST_EXTRA_LIB}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -267,7 +267,8 @@ ConfigureTest( # ################################################################################################## # * interop tests ------------------------------------------------------------------------- ConfigureTest( - INTEROP_TEST interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/dlpack_test.cpp + INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/from_arrow_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow ) # ################################################################################################## diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp new file mode 100644 index 00000000000..e7ffa9e40f4 --- /dev/null +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// no-op allocator/deallocator to set into ArrowArray buffers that we don't +// want to own their buffers. +static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ + .reallocate = [](ArrowBufferAllocator*, uint8_t* ptr, int64_t, int64_t) -> uint8_t* { + return ptr; + }, + .free = [](ArrowBufferAllocator*, uint8_t*, int64_t) {}, + .private_data = nullptr, +}; + +// populate the ArrowArray by copying host data buffers for fixed width types other +// than boolean. +template +std::enable_if_t() and !std::is_same_v, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + arr->length = data.size(); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), data.data(), sizeof(T) * data.size())); + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct array"); +} + +// populate an ArrowArray with pointers to the raw device buffers of a cudf::column_view +// and use the no-op alloc so that the ArrowArray doesn't presume ownership of the data +template +std::enable_if_t() and !std::is_same_v, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + 
ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.data()); +} + +// populate an ArrowArray with boolean data by generating the appropriate +// bitmaps to copy the data. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + ArrowBitmap bool_data; + ArrowBitmapInit(&bool_data); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bool_data, data.size())); + std::for_each(data.begin(), data.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&bool_data, (elem) ? 1 : 0, 1)); + }); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(arr, 1, &bool_data.buffer)); + + if (!mask.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + std::for_each(mask.begin(), mask.end(), [&](const auto&& elem) { + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(arr), (elem) ? 1 : 0, 1)); + }); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct boolean array"); +} + +// populate an ArrowArray from a boolean cudf column. Since Arrow and cudf +// still represent boolean arrays differently, we have to use bools_to_mask +// and give the ArrowArray object ownership of the device data. 
+template +std::enable_if_t, void> populate_from_col(ArrowArray* arr, + cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + auto bitmask = cudf::bools_to_mask(view); + auto ptr = reinterpret_cast(bitmask.first->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(arr, 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = reinterpret_cast*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr(std::move(bitmask.first)))); + ArrowArrayBuffer(arr, 1)->data = ptr; +} + +// populate an ArrowArray by copying the string data and constructing the offsets +// buffer. +template +std::enable_if_t, void> get_nanoarrow_array( + ArrowArray* arr, std::vector const& data, std::vector const& mask = {}) +{ + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(arr)); + for (auto& str : data) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(arr, ArrowCharView(str.c_str()))); + } + + if (!mask.empty()) { + ArrowBitmapReset(ArrowArrayValidityBitmap(arr)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size())); + ArrowBitmapAppendInt8Unsafe( + ArrowArrayValidityBitmap(arr), reinterpret_cast(mask.data()), mask.size()); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size()); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct string array"); +} + +// populate an ArrowArray with the string data buffers of a cudf column_view +// using no-op allocator so the ArrowArray knows it doesn't have ownership +// of the device buffers. 
+template +std::enable_if_t, void> populate_from_col( + ArrowArray* arr, cudf::column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + cudf::strings_column_view sview{view}; + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(sview.offsets().data()); + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc); + ArrowArrayBuffer(arr, 2)->data = const_cast(view.data()); +} + +// populate a dictionary ArrowArray by delegating the copying of the indices +// and key arrays +template +void get_nanoarrow_dict_array(ArrowArray* arr, + std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + get_nanoarrow_array(arr->dictionary, keys); + get_nanoarrow_array(arr, ind, validity); +} + +// populate a list ArrowArray by copying the offsets and data buffers +template +void get_nanoarrow_list_array(ArrowArray* arr, + std::vector data, + std::vector offsets, + std::vector data_validity = {}, + std::vector list_validity = {}) +{ + get_nanoarrow_array(arr->children[0], data, data_validity); + + arr->length = offsets.size() - 1; + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(ArrowArrayBuffer(arr, 1), offsets.data(), sizeof(int32_t) * offsets.size())); + if (!list_validity.empty()) { + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), list_validity.size())); + ArrowBitmapAppendInt8Unsafe(ArrowArrayValidityBitmap(arr), + reinterpret_cast(list_validity.data()), + arr->length); + arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, arr->length); + } else { + arr->null_count = 0; + } + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK, + "failed to construct list array"); +} + +// populate an ArrowArray list array from 
device buffers using a no-op +// allocator so that the ArrowArray doesn't have ownership of the buffers +void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view) +{ + arr->length = view.size(); + arr->null_count = view.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc); + ArrowArrayValidityBitmap(arr)->buffer.data = + const_cast(reinterpret_cast(view.null_mask())); + + ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc); + ArrowArrayBuffer(arr, 1)->data = const_cast(view.offsets().data()); +} diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp new file mode 100644 index 00000000000..243aa4e81af --- /dev/null +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using vector_of_columns = std::vector>; + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + std::vector int64_data(length); + std::vector bool_data(length); + std::vector string_data(length); + std::vector validity(length); + std::vector bool_validity(length); + std::vector bool_data_validity; + cudf::size_type length_of_individual_list = 3; + cudf::size_type length_of_list = length_of_individual_list * length; + std::vector list_int64_data(length_of_list); + std::vector list_int64_data_validity(length_of_list); + std::vector list_offsets(length + 1); + + std::vector> columns; + + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release()); + columns.emplace_back( + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release()); + auto col4 = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()); + auto dict_col = cudf::dictionary::encode(col4); + columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::test::fixed_width_column_wrapper( + bool_data.begin(), bool_data.end(), bool_validity.begin()) + .release()); + auto list_child_column = cudf::test::fixed_width_column_wrapper( + list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back(cudf::make_lists_column(length, + 
list_offsets_column.release(), + list_child_column.release(), + list_nulls, + std::move(*list_mask))); + auto int_column = cudf::test::fixed_width_column_wrapper( + int64_data.begin(), int64_data.end(), validity.begin()) + .release(); + auto str_column = + cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + .release(); + vector_of_columns cols; + cols.push_back(move(int_column)); + cols.push_back(move(str_column)); + auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( + bool_data_validity.begin(), bool_data_validity.end())); + columns.emplace_back( + cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); + + nanoarrow::UniqueSchema schema; + ArrowSchemaInit(schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 6)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a")); + if (columns[0]->null_count() > 0) { + schema->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[1], "b")); + if (columns[1]->null_count() > 0) { + schema->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[2], "c")); + if (columns[2]->null_count() > 0) { + schema->children[2]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[2]->flags = 0; + } + + 
NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[3], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[3], "d")); + if (columns[3]->null_count() > 0) { + schema->children[3]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[3]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[4], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[4]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4]->children[0], "element")); + if (columns[4]->child(1).null_count() > 0) { + schema->children[4]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[4], "e")); + if (columns[4]->has_nulls()) { + schema->children[4]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[4]->flags = 0; + } + + ArrowSchemaInit(schema->children[5]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema->children[5], 2)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[0], "integral")); + if (columns[5]->child(0).has_nulls()) { + schema->children[5]->children[0]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[0]->flags = 0; + } + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(schema->children[5]->children[1], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5]->children[1], "string")); + if (columns[5]->child(1).has_nulls()) { + schema->children[5]->children[1]->flags |= ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->children[1]->flags = 0; + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[5], "f")); + if (columns[5]->has_nulls()) { + schema->children[5]->flags |= 
ARROW_FLAG_NULLABLE; + } else { + schema->children[5]->flags = 0; + } + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + + get_nanoarrow_array(arrow->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[1], string_data, validity); + cudf::dictionary_column_view view(dict_col->view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + get_nanoarrow_dict_array(arrow->children[2], + std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + validity); + get_nanoarrow_array(arrow->children[3], bool_data, bool_validity); + get_nanoarrow_list_array(arrow->children[4], + list_int64_data, + list_offsets, + list_int64_data_validity, + bool_data_validity); + + get_nanoarrow_array(arrow->children[5]->children[0], int64_data, validity); + get_nanoarrow_array(arrow->children[5]->children[1], string_data, validity); + arrow->children[5]->length = length; + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arrow->children[5]), length)); + std::for_each(bool_data_validity.begin(), bool_data_validity.end(), [&](auto&& elem) { + NANOARROW_THROW_NOT_OK( + ArrowBitmapAppend(ArrowArrayValidityBitmap(arrow->children[5]), (elem) ? 
1 : 0, 1)); + }); + arrow->children[5]->null_count = + ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arrow.get(), nullptr) == NANOARROW_OK, + "failed to build example Arrays"); + + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); +} + +struct BaseArrowFixture : public cudf::test::BaseFixture { + void compare_schemas(const ArrowSchema* expected, const ArrowSchema* actual) + { + EXPECT_STREQ(expected->format, actual->format); + EXPECT_STREQ(expected->name, actual->name); + EXPECT_STREQ(expected->metadata, actual->metadata); + EXPECT_EQ(expected->flags, actual->flags); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(expected->children[i]->name); + compare_schemas(expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_schemas(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } + + void compare_device_buffers(const size_t nbytes, + const int buffer_idx, + const ArrowArray* expected, + const ArrowArray* actual) + { + std::vector actual_bytes; + std::vector expected_bytes; + expected_bytes.resize(nbytes); + actual_bytes.resize(nbytes); + + // synchronous copies so we don't have to worry about async weirdness + cudaMemcpy( + expected_bytes.data(), expected->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + cudaMemcpy(actual_bytes.data(), actual->buffers[buffer_idx], nbytes, cudaMemcpyDeviceToHost); + + ASSERT_EQ(expected_bytes, actual_bytes); + } + + void compare_arrays(const ArrowSchema* schema, + const ArrowArray* expected, + const ArrowArray* actual) + { + ArrowSchemaView schema_view; + 
ArrowSchemaViewInit(&schema_view, schema, nullptr); + + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_buffers, actual->n_buffers); + EXPECT_EQ(expected->n_children, actual->n_children); + + if (expected->length > 0) { + EXPECT_EQ(expected->buffers[0], actual->buffers[0]); + if (schema_view.type == NANOARROW_TYPE_BOOL) { + const size_t nbytes = (expected->length + 7) >> 3; + compare_device_buffers(nbytes, 1, expected, actual); + } else if (schema_view.type == NANOARROW_TYPE_DECIMAL128) { + const size_t nbytes = (expected->length * sizeof(__int128_t)); + compare_device_buffers(nbytes, 1, expected, actual); + } else { + for (int i = 1; i < expected->n_buffers; ++i) { + EXPECT_EQ(expected->buffers[i], actual->buffers[i]); + } + } + } + + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + } else { + for (int i = 0; i < expected->n_children; ++i) { + SCOPED_TRACE(schema->children[i]->name); + compare_arrays(schema->children[i], expected->children[i], actual->children[i]); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(schema->dictionary, expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowDeviceTest : public BaseArrowFixture {}; + +template +struct ToArrowDeviceTestDurationsTest : public BaseArrowFixture {}; + +TYPED_TEST_SUITE(ToArrowDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowDeviceTest, EmptyTable) +{ + const auto [table, schema, arr] = get_nanoarrow_tables(0); + + auto struct_meta = cudf::column_metadata{"f"}; + struct_meta.children_meta = {{"integral"}, {"string"}}; + + cudf::dictionary_column_view dview{table->view().column(2)}; + + std::vector meta{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta}; + auto got_arrow_schema = 
cudf::to_arrow_schema(table->view(), meta); + + compare_schemas(schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto got_arrow_device = cudf::to_arrow_device(std::move(*table)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + + compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); + ArrowArrayRelease(&got_arrow_device->array); +} + +TEST_F(ToArrowDeviceTest, DateTimeTable) +{ + auto data = {1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, 
got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + BaseArrowFixture::compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto data_ptr = input.get_column(0).view().data(); + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, 
got_arrow_array->device_type); + + EXPECT_EQ(data.size(), got_arrow_array->array.length); + EXPECT_EQ(0, got_arrow_array->array.null_count); + EXPECT_EQ(0, got_arrow_array->array.offset); + EXPECT_EQ(1, got_arrow_array->array.n_children); + EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]); + + EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length); + EXPECT_EQ(0, got_arrow_array->array.children[0]->null_count); + EXPECT_EQ(0, got_arrow_array->array.children[0]->offset); + EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]); + EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]); + + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + + std::vector> cols; + cols.emplace_back(col.release()); + cudf::table input(std::move(cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(expected_schema->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->flags = 0; + + ArrowSchemaInitFromType(expected_schema->children[0]->children[0]->children[0], + NANOARROW_TYPE_INT64); + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element"); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + 
compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + auto top_list = expected_array->children[0]; + cudf::lists_column_view lview{input.get_column(0).view()}; + populate_list_from_col(top_list, lview); + cudf::lists_column_view nested_view{lview.child()}; + populate_list_from_col(top_list->children[0], nested_view); + populate_from_col(top_list->children[0]->children[0], nested_view.child()); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +TEST_F(ToArrowDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + 
cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + std::vector> table_cols; + table_cols.emplace_back(struct_col.release()); + cudf::table input(std::move(table_cols)); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeStruct(expected_schema->children[0], 5); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[0], "string"); + child->children[0]->flags = 0; + + ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[1], "integral"); + child->children[1]->flags = 0; + + ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL); + ArrowSchemaSetName(child->children[2], "bool"); + child->children[2]->flags = 0; + + ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3], "nested_list"); + 
child->children[3]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetName(child->children[3]->children[0], "element"); + child->children[3]->children[0]->flags = 0; + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64); + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + ArrowSchemaSetTypeStruct(child->children[4], 2); + ArrowSchemaSetName(child->children[4], "struct"); + + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING); + ArrowSchemaSetName(child->children[4]->children[0], "string2"); + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32); + ArrowSchemaSetName(child->children[4]->children[1], "integral2"); + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{metadata}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + + expected_array->length = input.num_rows(); + + auto array_a = expected_array->children[0]; + auto view_a = input.view().column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc); + ArrowArrayValidityBitmap(array_a)->buffer.data = + const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_a.null_mask())); + + populate_from_col<cudf::string_view>(array_a->children[0], view_a.child(0)); + populate_from_col<int32_t>(array_a->children[1], view_a.child(1)); + populate_from_col<bool>(array_a->children[2], view_a.child(2)); + populate_list_from_col(array_a->children[3], cudf::lists_column_view{view_a.child(3)}); + populate_list_from_col(array_a->children[3]->children[0], + 
cudf::lists_column_view{view_a.child(3).child(1)}); + populate_from_col<int64_t>(array_a->children[3]->children[0]->children[0], + view_a.child(3).child(1).child(1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + array_struct->length = view_struct.size(); + array_struct->null_count = view_struct.null_count(); + + ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc); + ArrowArrayValidityBitmap(array_struct)->buffer.data = + const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_struct.null_mask())); + + populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0)); + populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1)); + + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); +} + +template <typename T> +using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>; + +TEST_F(ToArrowDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = std::vector<int64_t>{-1, -1, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0}; + auto col = fp_wrapper<int64_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector<std::unique_ptr<cudf::column>> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<int64_t>(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + auto result_dev_data = std::make_unique<rmm::device_uvector<int64_t>>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int64_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast<uint8_t*>(result_dev_data->data()); + ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast<std::unique_ptr<rmm::device_uvector<int64_t>>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr<rmm::device_uvector<int64_t>>(std::move(result_dev_data)))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} + +TEST_F(ToArrowDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const expect_data = 
std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + auto col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + std::vector<std::unique_ptr<cudf::column>> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + ArrowSchemaSetTypeStruct(expected_schema.get(), 1); + ArrowSchemaInit(expected_schema->children[0]); + ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale); + ArrowSchemaSetName(expected_schema->children[0], "a"); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + ArrowSchemaRelease(got_arrow_schema.get()); + + nanoarrow::UniqueArray expected_array; + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr); + expected_array->length = input.num_rows(); + + populate_from_col<__int128_t>(expected_array->children[0], input.view().column(0)); + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr); + + auto got_arrow_array = cudf::to_arrow_device(std::move(input)); + EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + ArrowArrayRelease(&got_arrow_array->array); + } +} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 7afc8fe19bf..b891ff99d47 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -306,6 +306,7 @@ def clean_all_xml_files(path): intersphinx_mapping = { "cupy": ("https://docs.cupy.dev/en/stable/", None), "dlpack": ("https://dmlc.github.io/dlpack/latest/", None), + "nanoarrow": ("https://arrow.apache.org/nanoarrow/latest", None), 
"numpy": ("https://numpy.org/doc/stable", None), "pandas": ("https://pandas.pydata.org/docs/", None), "pyarrow": ("https://arrow.apache.org/docs/", None),