Skip to content

Commit

Permalink
Avoid converting Decimal32/Decimal64 in to_arrow and from_arrow APIs (#17422)
Browse files Browse the repository at this point in the history

Now that the Arrow format includes `Decimal32` and `Decimal64` data types, CUDF no longer needs to convert them to decimal128 when importing/exporting values via the `to_arrow` and `from_arrow` APIs. Instead we can just treat them like any other fixed-width data type and use the buffers directly.

This doesn't fully address #17080, as it doesn't make any changes to the Parquet side of things.

This also incorporates the changes from #17405 which are needed for debug tests. That should get merged first, and then I can rebase this.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17422
  • Loading branch information
zeroshade authored Jan 29, 2025
1 parent aa80d45 commit d9b7a98
Show file tree
Hide file tree
Showing 23 changed files with 735 additions and 417 deletions.
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ requirements:
- numba-cuda >=0.2.0,<0.3.0a0
- numba >=0.59.1,<0.61.0a0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- pyarrow>=14.0.0,<20.0.0a0
- libcudf ={{ version }}
- pylibcudf ={{ version }}
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- pyarrow>=14.0.0,<20.0.0a0
- {{ pin_compatible('rmm', max_pin='x.x') }}
- fsspec >=0.6.0
{% if cuda_major == "11" %}
Expand Down
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -347,7 +347,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
16.1.0
19.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()
Expand Down
6 changes: 3 additions & 3 deletions cpp/cmake/thirdparty/get_nanoarrow.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -23,11 +23,11 @@ function(find_and_configure_nanoarrow)
# Currently we need to always build nanoarrow so we don't pickup a previous installed version
set(CPM_DOWNLOAD_nanoarrow ON)
rapids_cpm_find(
nanoarrow 0.6.0.dev
nanoarrow 0.7.0.dev
GLOBAL_TARGETS nanoarrow
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
GIT_TAG 4bf5a9322626e95e3717e43de7616c0a256179eb
GIT_SHALLOW FALSE
OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
)
Expand Down
18 changes: 18 additions & 0 deletions cpp/cmake/thirdparty/patches/nanoarrow_override.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

{
"packages" : {
"nanoarrow" : {
"version" : "0.7.0.dev",
"git_url" : "https://github.com/apache/arrow-nanoarrow.git",
"git_tag" : "4bf5a9322626e95e3717e43de7616c0a256179eb",
"git_shallow" : false,
"patches" : [
{
"file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff",
"issue" : "https://github.com/apache/arrow-nanoarrow/issues/537",
"fixed_in" : ""
}
]
}
}
}
6 changes: 5 additions & 1 deletion cpp/src/interop/arrow_utilities.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,6 +63,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
}
}
case NANOARROW_TYPE_DECIMAL32: return data_type{type_id::DECIMAL32, -arrow_view->decimal_scale};
case NANOARROW_TYPE_DECIMAL64: return data_type{type_id::DECIMAL64, -arrow_view->decimal_scale};
case NANOARROW_TYPE_DECIMAL128:
return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
Expand All @@ -84,6 +86,8 @@ ArrowType id_to_arrow_type(cudf::type_id id)
case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
case cudf::type_id::DECIMAL32: return NANOARROW_TYPE_DECIMAL32;
case cudf::type_id::DECIMAL64: return NANOARROW_TYPE_DECIMAL64;
case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128;
default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
}
Expand Down
18 changes: 1 addition & 17 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -70,21 +70,5 @@ ArrowType id_to_arrow_storage_type(cudf::type_id id);
*/
int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column);

/**
* @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility
*
* The template parameter should be the underlying type of the data (e.g. int32_t for
* 32-bit decimal and int64_t for 64-bit decimal).
*
* @param input column_view of the data
* @param stream cuda stream to perform the operations on
* @param mr memory resource to allocate the returned device_uvector with
* @return unique_ptr to a device_buffer containing the upcasted data
*/
template <typename DeviceType>
std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace detail
} // namespace cudf
9 changes: 3 additions & 6 deletions cpp/src/interop/from_arrow_device.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,9 +49,7 @@ namespace {
using dispatch_tuple_t = std::tuple<column_view, owned_columns_t>;

struct dispatch_from_arrow_device {
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
!std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
dispatch_tuple_t operator()(ArrowSchemaView*,
ArrowArray const*,
data_type,
Expand All @@ -62,8 +60,7 @@ struct dispatch_from_arrow_device {
CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
dispatch_tuple_t operator()(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
Expand Down
11 changes: 4 additions & 7 deletions cpp/src/interop/from_arrow_host.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -69,22 +69,19 @@ struct dispatch_copy_from_arrow_host {
return mask;
}

template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
!std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
std::unique_ptr<column> operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool)
{
CUDF_FAIL("Unsupported type in copy_from_arrow_host.");
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
std::unique_ptr<column> operator()(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask)
{
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
using DeviceType = device_storage_type_t<T>;

size_type const num_rows = input->length;
size_type const offset = input->offset;
Expand Down
102 changes: 8 additions & 94 deletions cpp/src/interop/to_arrow_device.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -92,13 +92,15 @@ int set_buffer(std::unique_ptr<T> device_buf, int64_t i, ArrowArray* out)
}

struct dispatch_to_arrow_device {
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
{
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
}

template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
// cover rep layout compatible and decimal types
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
int operator()(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
Expand Down Expand Up @@ -132,64 +134,6 @@ struct dispatch_to_arrow_device {
}
};

template <typename DeviceType>
int construct_decimals(cudf::column_view input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));

auto buf = detail::convert_decimals_to_decimal128<DeviceType>(input, stream, mr);
// Synchronize stream here to ensure the decimal128 buffer is ready.
stream.synchronize();
NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
using DeviceType = int32_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
using DeviceType = int64_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_contents(contents, tmp.get()));
ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -350,13 +294,14 @@ struct dispatch_to_arrow_device_view {
rmm::cuda_stream_view stream;
rmm::device_async_resource_ref mr;

template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
int operator()(ArrowArray*) const
{
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
}

template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
int operator()(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;
Expand Down Expand Up @@ -404,37 +349,6 @@ struct dispatch_to_arrow_device_view {
}
};

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal32>(ArrowArray* out) const
{
using DeviceType = int32_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal64>(ArrowArray* out) const
{
using DeviceType = int64_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal128>(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;

NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
NANOARROW_RETURN_NOT_OK(set_view_to_buffer(column, tmp.get()));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
{
Expand Down
33 changes: 3 additions & 30 deletions cpp/src/interop/to_arrow_host.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -115,8 +115,7 @@ struct dispatch_to_arrow_host {
CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error);
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
int operator()(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;
Expand All @@ -125,40 +124,14 @@ struct dispatch_to_arrow_host {
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));

NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
using DataType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
using DataType = device_storage_type_t<T>;
NANOARROW_RETURN_NOT_OK(
populate_data_buffer(device_span<DataType const>(column.data<DataType>(), column.size()),
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

// convert decimal types from libcudf to arrow where those types are not directly
// supported by Arrow. These types must be fit into 128 bits, the smallest
// decimal resolution supported by Arrow
template <typename T,
CUDF_ENABLE_IF(!is_rep_layout_compatible<T>() &&
(std::is_same_v<T, numeric::decimal32> ||
std::is_same_v<T, numeric::decimal64>))>
int operator()(ArrowArray* out) const
{
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal32>, int32_t, int64_t>;
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));

NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
auto buf = detail::convert_decimals_to_decimal128<DeviceType>(column, stream, mr);
// No need to synchronize stream here as populate_data_buffer uses the same stream to copy data
// to host.
NANOARROW_RETURN_NOT_OK(
populate_data_buffer(device_span<__int128_t const>(
reinterpret_cast<const __int128_t*>(buf->data()), column.size()),
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}
};

int get_column(cudf::column_view column,
Expand Down
Loading

0 comments on commit d9b7a98

Please sign in to comment.