Skip to content

Commit

Permalink
Avoid converting Decimal32/Decimal64 in to_arrow and from_arrow APIs (#17422)
Browse files Browse the repository at this point in the history

Now that the Arrow format includes `Decimal32` and `Decimal64` data types, CUDF no longer needs to convert them to decimal128 when importing/exporting values via the `to_arrow` and `from_arrow` APIs. Instead we can just treat them like any other fixed-width data type and use the buffers directly.

This doesn't fully address #17080, as it doesn't make any changes to the Parquet side of things.

This also incorporates the changes from #17405 which are needed for debug tests. That should get merged first, and then I can rebase this.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17422
  • Loading branch information
zeroshade authored Jan 29, 2025
1 parent aa80d45 commit d9b7a98
Show file tree
Hide file tree
Showing 23 changed files with 735 additions and 417 deletions.
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ requirements:
- numba-cuda >=0.2.0,<0.3.0a0
- numba >=0.59.1,<0.61.0a0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- pyarrow>=14.0.0,<20.0.0a0
- libcudf ={{ version }}
- pylibcudf ={{ version }}
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/pylibcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- pyarrow>=14.0.0,<20.0.0a0
- {{ pin_compatible('rmm', max_pin='x.x') }}
- fsspec >=0.6.0
{% if cuda_major == "11" %}
Expand Down
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_arrow.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -347,7 +347,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
set(CUDF_VERSION_Arrow
# This version must be kept in sync with the libarrow version pinned for builds in
# dependencies.yaml.
16.1.0
19.0.0
CACHE STRING "The version of Arrow to find (or build)"
)
endif()
Expand Down
6 changes: 3 additions & 3 deletions cpp/cmake/thirdparty/get_nanoarrow.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -23,11 +23,11 @@ function(find_and_configure_nanoarrow)
# Currently we need to always build nanoarrow so we don't pickup a previous installed version
set(CPM_DOWNLOAD_nanoarrow ON)
rapids_cpm_find(
nanoarrow 0.6.0.dev
nanoarrow 0.7.0.dev
GLOBAL_TARGETS nanoarrow
CPM_ARGS
GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
GIT_TAG 4bf5a9322626e95e3717e43de7616c0a256179eb
GIT_SHALLOW FALSE
OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
)
Expand Down
18 changes: 18 additions & 0 deletions cpp/cmake/thirdparty/patches/nanoarrow_override.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

{
"packages" : {
"nanoarrow" : {
"version" : "0.7.0.dev",
"git_url" : "https://github.com/apache/arrow-nanoarrow.git",
"git_tag" : "4bf5a9322626e95e3717e43de7616c0a256179eb",
"git_shallow" : false,
"patches" : [
{
"file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff",
"issue" : "https://github.com/apache/arrow-nanoarrow/issues/537",
"fixed_in" : ""
}
]
}
}
}
6 changes: 5 additions & 1 deletion cpp/src/interop/arrow_utilities.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,6 +63,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
}
}
case NANOARROW_TYPE_DECIMAL32: return data_type{type_id::DECIMAL32, -arrow_view->decimal_scale};
case NANOARROW_TYPE_DECIMAL64: return data_type{type_id::DECIMAL64, -arrow_view->decimal_scale};
case NANOARROW_TYPE_DECIMAL128:
return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
Expand All @@ -84,6 +86,8 @@ ArrowType id_to_arrow_type(cudf::type_id id)
case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
case cudf::type_id::DECIMAL32: return NANOARROW_TYPE_DECIMAL32;
case cudf::type_id::DECIMAL64: return NANOARROW_TYPE_DECIMAL64;
case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128;
default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
}
Expand Down
18 changes: 1 addition & 17 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -70,21 +70,5 @@ ArrowType id_to_arrow_storage_type(cudf::type_id id);
*/
int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column);

/**
* @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility
*
* The template parameter should be the underlying type of the data (e.g. int32_t for
* 32-bit decimal and int64_t for 64-bit decimal).
*
* @param input column_view of the data
* @param stream cuda stream to perform the operations on
* @param mr memory resource to allocate the returned device_uvector with
* @return unique_ptr to a device_buffer containing the upcasted data
*/
template <typename DeviceType>
std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

} // namespace detail
} // namespace cudf
9 changes: 3 additions & 6 deletions cpp/src/interop/from_arrow_device.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -49,9 +49,7 @@ namespace {
using dispatch_tuple_t = std::tuple<column_view, owned_columns_t>;

struct dispatch_from_arrow_device {
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
!std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
dispatch_tuple_t operator()(ArrowSchemaView*,
ArrowArray const*,
data_type,
Expand All @@ -62,8 +60,7 @@ struct dispatch_from_arrow_device {
CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
dispatch_tuple_t operator()(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
Expand Down
11 changes: 4 additions & 7 deletions cpp/src/interop/from_arrow_host.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -69,22 +69,19 @@ struct dispatch_copy_from_arrow_host {
return mask;
}

template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
!std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() && !is_fixed_point<T>())>
std::unique_ptr<column> operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool)
{
CUDF_FAIL("Unsupported type in copy_from_arrow_host.");
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
std::unique_ptr<column> operator()(ArrowSchemaView* schema,
ArrowArray const* input,
data_type type,
bool skip_mask)
{
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
using DeviceType = device_storage_type_t<T>;

size_type const num_rows = input->length;
size_type const offset = input->offset;
Expand Down
102 changes: 8 additions & 94 deletions cpp/src/interop/to_arrow_device.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -92,13 +92,15 @@ int set_buffer(std::unique_ptr<T> device_buf, int64_t i, ArrowArray* out)
}

struct dispatch_to_arrow_device {
template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
{
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
}

template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
// cover rep layout compatible and decimal types
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
int operator()(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
Expand Down Expand Up @@ -132,64 +134,6 @@ struct dispatch_to_arrow_device {
}
};

template <typename DeviceType>
int construct_decimals(cudf::column_view input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));

auto buf = detail::convert_decimals_to_decimal128<DeviceType>(input, stream, mr);
// Synchronize stream here to ensure the decimal128 buffer is ready.
stream.synchronize();
NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
using DeviceType = int32_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
using DeviceType = int64_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& column,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr,
ArrowArray* out)
{
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
auto contents = column.release();
NANOARROW_RETURN_NOT_OK(set_contents(contents, tmp.get()));
ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
rmm::cuda_stream_view stream,
Expand Down Expand Up @@ -350,13 +294,14 @@ struct dispatch_to_arrow_device_view {
rmm::cuda_stream_view stream;
rmm::device_async_resource_ref mr;

template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
template <typename T,
CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() and not is_fixed_point<T>())>
int operator()(ArrowArray*) const
{
CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
}

template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() or is_fixed_point<T>())>
int operator()(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;
Expand Down Expand Up @@ -404,37 +349,6 @@ struct dispatch_to_arrow_device_view {
}
};

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal32>(ArrowArray* out) const
{
using DeviceType = int32_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal64>(ArrowArray* out) const
{
using DeviceType = int64_t;
NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<numeric::decimal128>(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;

NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
NANOARROW_RETURN_NOT_OK(set_view_to_buffer(column, tmp.get()));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

template <>
int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
{
Expand Down
33 changes: 3 additions & 30 deletions cpp/src/interop/to_arrow_host.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -115,8 +115,7 @@ struct dispatch_to_arrow_host {
CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error);
}

template <typename T,
CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || is_fixed_point<T>())>
int operator()(ArrowArray* out) const
{
nanoarrow::UniqueArray tmp;
Expand All @@ -125,40 +124,14 @@ struct dispatch_to_arrow_host {
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));

NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
using DataType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
using DataType = device_storage_type_t<T>;
NANOARROW_RETURN_NOT_OK(
populate_data_buffer(device_span<DataType const>(column.data<DataType>(), column.size()),
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}

// convert decimal types from libcudf to arrow where those types are not directly
// supported by Arrow. These types must be fit into 128 bits, the smallest
// decimal resolution supported by Arrow
template <typename T,
CUDF_ENABLE_IF(!is_rep_layout_compatible<T>() &&
(std::is_same_v<T, numeric::decimal32> ||
std::is_same_v<T, numeric::decimal64>))>
int operator()(ArrowArray* out) const
{
using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal32>, int32_t, int64_t>;
nanoarrow::UniqueArray tmp;
NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));

NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
auto buf = detail::convert_decimals_to_decimal128<DeviceType>(column, stream, mr);
// No need to synchronize stream here as populate_data_buffer uses the same stream to copy data
// to host.
NANOARROW_RETURN_NOT_OK(
populate_data_buffer(device_span<__int128_t const>(
reinterpret_cast<const __int128_t*>(buf->data()), column.size()),
ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));

ArrowArrayMove(tmp.get(), out);
return NANOARROW_OK;
}
};

int get_column(cudf::column_view column,
Expand Down
Loading

0 comments on commit d9b7a98

Please sign in to comment.