Add to_arrow_device function to cudf interop using nanoarrow (#15047)

Introduce new `to_arrow_device` and `to_arrow_schema` functions to utilize the `ArrowDeviceArray` structure for zero-copy passing of libcudf::table. Add nanoarrow as a vendored lib and a script to update it. Initial step towards addressing #14926 Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: #15047
rapidsai · Apr 1, 2024 · 268996a · 268996a
1 parent 09f8c8a
commit 268996a
Show file tree

Hide file tree

Showing 9 changed files with 1,882 additions and 6 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -210,12 +210,14 @@ include(cmake/thirdparty/get_kvikio.cmake)
 include(cmake/thirdparty/get_fmt.cmake)
 # find spdlog
 include(cmake/thirdparty/get_spdlog.cmake)
+# find nanoarrow
+include(cmake/thirdparty/get_nanoarrow.cmake)
 
 # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved
 if(NOT BUILD_SHARED_LIBS)
   include("${rapids-cmake-dir}/export/find_package_file.cmake")
   list(APPEND METADATA_KINDS BUILD INSTALL)
-  list(APPEND dependencies KvikIO ZLIB nvcomp)
+  list(APPEND dependencies KvikIO ZLIB nvcomp nanoarrow)
   if(TARGET cufile::cuFile_interface)
     list(APPEND dependencies cuFile)
   endif()
@@ -358,6 +360,7 @@ add_library(
   src/interop/dlpack.cpp
   src/interop/from_arrow.cu
   src/interop/to_arrow.cu
+  src/interop/to_arrow_device.cu
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
   src/io/avro/avro_gpu.cu
@@ -735,6 +738,7 @@ target_include_directories(
          "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>"
          "$<BUILD_INTERFACE:${CUDF_GENERATED_INCLUDE_DIR}/include>"
   PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
+          "$<BUILD_INTERFACE:${nanoarrow_SOURCE_DIR}/src>"
   INTERFACE "$<INSTALL_INTERFACE:include>"
 )
 
@@ -783,7 +787,7 @@ target_link_libraries(
   cudf
   PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm
   PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio
-          $<TARGET_NAME_IF_EXISTS:cuFile_interface>
+          $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
 )
 
 # Add Conda library, and include paths if specified

diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -0,0 +1,36 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# This function finds nanoarrow (static build, namespaced as `cudf`) and enables PIC on its target.
+function(find_and_configure_nanoarrow)
+  set(oneValueArgs VERSION FORK PINNED_TAG)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  rapids_cpm_find(
+    nanoarrow ${PKG_VERSION}
+    GLOBAL_TARGETS nanoarrow
+    CPM_ARGS
+    GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git
+    GIT_TAG ${PKG_PINNED_TAG}
+    # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin
+    # to an actual tag.
+    GIT_SHALLOW FALSE
+    OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
+  )
+  set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endfunction()
+
+find_and_configure_nanoarrow(
+  VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8
+)
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,11 +34,16 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
 struct DLManagedTensor;
 
+struct ArrowDeviceArray;
+
+struct ArrowSchema;
+
 namespace cudf {
 /**
  * @addtogroup interop_dlpack
@@ -162,6 +167,95 @@ std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
                                         column_metadata const& metadata = {},
                                         rmm::cuda_stream_view stream = cudf::get_default_stream(),
                                         arrow::MemoryPool* ar_mr = arrow::default_memory_pool());
+
+/**
+ * @brief Alias for a unique_ptr to an ArrowSchema with a custom deleter
+ *
+ */
+using unique_schema_t = std::unique_ptr<ArrowSchema, void (*)(ArrowSchema*)>;
+
+/**
+ * @brief Alias for a unique_ptr to an ArrowDeviceArray with a custom deleter
+ *
+ */
+using unique_device_array_t = std::unique_ptr<ArrowDeviceArray, void (*)(ArrowDeviceArray*)>;
+
+/**
+ * @brief Create ArrowSchema from cudf table and metadata
+ *
+ * Populates and returns an ArrowSchema C struct using a table and metadata.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf,
+ * decimals will be converted to an Arrow decimal128 with the widest precision that the cudf
+ * decimal type supports. For example, `numeric::decimal32` will be converted to Arrow decimal128
+ * with a precision of 9, which is the maximum precision for 32-bit types. Similarly,
+ * `numeric::decimal128` will be converted to Arrow decimal128 with a precision of 38.
+ *
+ * @param input Table to create a schema from
+ * @param metadata Contains the hierarchy of names of columns and children
+ * @return ArrowSchema generated from `input`
+ */
+unique_schema_t to_arrow_schema(cudf::table_view const& input,
+                                cudf::host_span<column_metadata const> metadata);
+
+/**
+ * @brief Create `ArrowDeviceArray` from a cudf table
+ *
+ * Populates the C struct ArrowDeviceArray without performing copies if possible.
+ * This maintains the data on the GPU device and gives ownership of the table
+ * and its buffers to the ArrowDeviceArray struct.
+ *
+ * After calling this function, the release callback on the returned ArrowDeviceArray
+ * must be called to clean up the memory.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf,
+ * they will be converted to an Arrow decimal128 with the widest precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 with a precision
+ * of 9, which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 with a precision of 38.
+ *
+ * @note Copies will be performed in the cases where cudf differs from Arrow
+ * such as in the representation of bools (Arrow uses a bitmap, cudf uses 1-byte per value).
+ *
+ * @param table Input table, ownership of the data will be moved to the result
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray which will have ownership of the GPU data, consumer must call release
+ */
+unique_device_array_t to_arrow_device(
+  cudf::table&& table,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create `ArrowDeviceArray` from a cudf column
+ *
+ * Populates the C struct ArrowDeviceArray without performing copies if possible.
+ * This maintains the data on the GPU device and gives ownership of the column
+ * and its buffers to the ArrowDeviceArray struct.
+ *
+ * After calling this function, the release callback on the returned ArrowDeviceArray
+ * must be called to clean up the memory.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf,
+ * they will be converted to an Arrow decimal128 with the widest precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 with a precision
+ * of 9, which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 with a precision of 38.
+ *
+ * @note Copies will be performed in the cases where cudf differs from Arrow such as
+ * in the representation of bools (Arrow uses a bitmap, cudf uses 1 byte per value).
+ *
+ * @param col Input column, ownership of the data will be moved to the result
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray which will have ownership of the GPU data
+ */
+unique_device_array_t to_arrow_device(
+  cudf::column&& col,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Create `cudf::table` from given arrow Table input
  *

diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanoarrow/nanoarrow.hpp>
+
+// Definitions from the Arrow C Device Data Interface
+// https://arrow.apache.org/docs/format/CDeviceDataInterface.html
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#define ARROW_C_DEVICE_DATA_INTERFACE
+
+// Device type for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+// CPU device, same as using ArrowArray directly
+#define ARROW_DEVICE_CPU 1
+// CUDA GPU Device
+#define ARROW_DEVICE_CUDA 2
+// Pinned CUDA CPU memory allocated by cudaMallocHost
+#define ARROW_DEVICE_CUDA_HOST 3
+// CUDA managed/unified memory allocated by cudaMallocManaged
+#define ARROW_DEVICE_CUDA_MANAGED 13
+
+struct ArrowDeviceArray {
+  struct ArrowArray array;
+  int64_t device_id;
+  ArrowDeviceType device_type;
+  void* sync_event;
+
+  // reserved bytes for future expansion
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE