From eafa570c24a2130292894dd91b68e57edfcbcc96 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 29 May 2024 14:46:54 -0400 Subject: [PATCH] Add `from_arrow_host` functions for cudf interop with nanoarrow (#15645) Following up from #15458 and continuing the work to address #14926 adding host memory version of `from_arrow_device` which will perform the copies from host memory to create cudf objects. Authors: - Matt Topol (https://github.com/zeroshade) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15645 --- cpp/CMakeLists.txt | 3 +- cpp/include/cudf/interop.hpp | 91 ++- cpp/src/interop/arrow_utilities.cpp | 90 +++ cpp/src/interop/arrow_utilities.hpp | 21 + cpp/src/interop/from_arrow_device.cu | 109 ++-- cpp/src/interop/from_arrow_host.cu | 492 +++++++++++++++ cpp/src/interop/to_arrow_device.cu | 1 - cpp/src/interop/to_arrow_schema.cpp | 2 +- cpp/src/interop/to_arrow_utilities.cpp | 44 -- cpp/src/interop/to_arrow_utilities.hpp | 34 -- cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/from_arrow_device_test.cpp | 12 +- cpp/tests/interop/from_arrow_host_test.cpp | 612 +++++++++++++++++++ cpp/tests/interop/nanoarrow_utils.hpp | 236 +++++++ cpp/tests/interop/to_arrow_device_test.cpp | 107 ++-- 15 files changed, 1631 insertions(+), 224 deletions(-) create mode 100644 cpp/src/interop/arrow_utilities.cpp create mode 100644 cpp/src/interop/from_arrow_host.cu delete mode 100644 cpp/src/interop/to_arrow_utilities.cpp delete mode 100644 cpp/src/interop/to_arrow_utilities.hpp create mode 100644 cpp/tests/interop/from_arrow_host_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f69f04f9c10..f637db66c2c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -360,11 +360,12 @@ add_library( src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu + src/interop/arrow_utilities.cpp src/interop/to_arrow.cu src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu + src/interop/from_arrow_host.cu src/interop/to_arrow_schema.cpp - src/interop/to_arrow_utilities.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index bb05a622f40..f3ff0009d5c 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -46,6 +46,8 @@ struct ArrowDeviceArray; struct ArrowSchema; +struct ArrowArray; + namespace cudf { /** * @addtogroup interop_dlpack @@ -348,6 +350,91 @@ std::unique_ptr from_arrow( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws cudf::data_type_error if the input array is not a struct array. + * + * The conversion will not call release on the input Array. 
+ * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowArray` pointer that needs to be converted to cudf::table + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::table` + * @return cudf table generated from given arrow data + */ +std::unique_ptr from_arrow(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * The conversion will not call release on the input Array. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowArray` pointer that needs to be converted to cudf::column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate `cudf::column` + * @return cudf column generated from given arrow data + */ +std::unique_ptr from_arrow_column(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Create `cudf::table` from given ArrowDeviceArray input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` + * + * @throws cudf::data_type_error if the input array is not a struct array, + * non-struct arrays should be passed to `from_arrow_host_column` instead. + * + * The conversion will not call release on the input Array. + * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr from_arrow_host( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create `cudf::column` from given ArrowDeviceArray input + * + * @throws std::invalid_argument if either schema or input are NULL + * + * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` + * + * @throws cudf::data_type_error if input arrow data type is not supported in cudf. + * + * The conversion will not call release on the input Array. 
+ * + * @param schema `ArrowSchema` pointer to describe the type of the data + * @param input `ArrowDeviceArray` pointer to object owning the Arrow data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf column generated from the given Arrow data + */ +std::unique_ptr from_arrow_host_column( + ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray * @@ -398,7 +485,7 @@ using unique_table_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error if the input array is not a struct array, non-struct @@ -446,7 +533,7 @@ using unique_column_view_t = * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not * accessed after this happens. * - * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` + * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST` * or `ARROW_DEVICE_CUDA_MANAGED` * * @throws cudf::data_type_error input arrow data type is not supported. diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp new file mode 100644 index 00000000000..05beecfbf9b --- /dev/null +++ b/cpp/src/interop/arrow_utilities.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
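As a minimal usage sketch of the host-memory entry points declared above (the wrapper function and the single int64 column here are illustrative only, not part of this change): a one-column struct array is built on the CPU with nanoarrow, wrapped in an `ArrowDeviceArray` carrying `ARROW_DEVICE_CPU`, and copied into device memory with `cudf::from_arrow_host`. The `from_arrow` / `from_arrow_column` overloads accept a plain `ArrowArray` and perform the same wrapping internally.

```cpp
// Illustrative sketch only: build a struct<a: int64> array in host memory with
// nanoarrow and convert it to a cudf::table via the new from_arrow_host API.
#include <cudf/interop.hpp>
#include <cudf/table/table.hpp>

#include <nanoarrow/nanoarrow.hpp>

std::unique_ptr<cudf::table> example_from_arrow_host()
{
  // schema describing a single-column "record batch": struct<a: int64>
  nanoarrow::UniqueSchema schema;
  ArrowSchemaInit(schema.get());
  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(schema.get(), 1));
  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[0], NANOARROW_TYPE_INT64));
  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema->children[0], "a"));

  // append a few values through the nanoarrow builder API
  nanoarrow::UniqueArray array;
  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema.get(), nullptr));
  NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(array.get()));
  for (int64_t v : {1, 2, 3}) {
    NANOARROW_THROW_NOT_OK(ArrowArrayAppendInt(array->children[0], v));
    NANOARROW_THROW_NOT_OK(ArrowArrayFinishElement(array.get()));
  }
  NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(array.get(), nullptr));

  // host memory is expressed as an ArrowDeviceArray with the CPU device type
  ArrowDeviceArray input{};
  input.array       = *array.get();  // from_arrow_host never calls release on the input
  input.device_id   = -1;
  input.device_type = ARROW_DEVICE_CPU;

  return cudf::from_arrow_host(schema.get(), &input);
}
```

Because the conversion copies into device memory and does not take ownership, the caller remains responsible for releasing the Arrow array (here the `nanoarrow::UniqueArray` does so when it goes out of scope).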
+ */ + +#include "arrow_utilities.hpp" + +#include +#include + +#include + +namespace cudf { +namespace detail { +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) +{ + switch (arrow_view->type) { + case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); + case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); + case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); + case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); + case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); + case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); + case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); + case NANOARROW_TYPE_TIMESTAMP: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); + default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DURATION: { + switch (arrow_view->time_unit) { + case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); + case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); + case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); + case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); + default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); + } + } + case NANOARROW_TYPE_DECIMAL128: + return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; + default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); + } +} + +ArrowType id_to_arrow_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; + case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; + case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; + case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; + case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; + case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; + case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; + case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; + case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; + case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; + case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; + case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); + } +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 9bbdaa2c363..defddb4dc42 100644 --- 
a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -16,6 +16,11 @@ #pragma once +#include + +#include +#include + namespace cudf { namespace detail { @@ -26,5 +31,21 @@ namespace detail { static constexpr int validity_buffer_idx = 0; static constexpr int fixed_width_data_buffer_idx = 1; +/** + * @brief Map ArrowType id to cudf column type id + * + * @param arrow_view SchemaView to pull the logical and storage types from + * @return Column type id + */ +data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view); + +/** + * @brief Map cudf column type id to ArrowType id + * + * @param id Column type id + * @return ArrowType id + */ +ArrowType id_to_arrow_type(cudf::type_id id); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index d4d31d1989b..002a8ec1f14 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -42,49 +42,6 @@ namespace cudf { namespace detail { -data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view) -{ - switch (arrow_view->type) { - case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY); - case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8); - case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); - case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); - case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); - case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); - case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); - case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); - case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); - case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); - case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); - case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); - case NANOARROW_TYPE_TIMESTAMP: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS); - default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DURATION: { - switch (arrow_view->time_unit) { - case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS); - case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS); - case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS); - case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS); - default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error); - } - } - case NANOARROW_TYPE_DECIMAL128: - return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale}; - default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error); - } -} namespace { @@ -379,11 +336,25 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, } // namespace -unique_table_view_t 
from_arrow_device(ArrowSchemaView* schema, +unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray memory must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); @@ -392,14 +363,14 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, std::vector columns; owned_columns_t owned_mem; - auto type = arrow_to_cudf_type(schema); + auto type = arrow_to_cudf_type(&view); CUDF_EXPECTS(type == data_type(type_id::STRUCT), "Must pass a struct to `from_arrow_device`", cudf::data_type_error); std::transform( input->array.children, input->array.children + input->array.n_children, - schema->schema->children, + view.schema->children, std::back_inserter(columns), [&owned_mem, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { ArrowSchemaView view; @@ -420,18 +391,32 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema, custom_view_deleter{std::move(owned_mem)}}; } -unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema, +unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || + input->device_type == ARROW_DEVICE_CUDA_HOST || + input->device_type == ARROW_DEVICE_CUDA_MANAGED, + "ArrowDeviceArray must be accessible to CUDA", + std::invalid_argument); + + rmm::cuda_set_device_raii dev( + rmm::cuda_device_id{static_cast(input->device_id)}); + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + if (input->sync_event != nullptr) { CUDF_CUDA_TRY( cudaStreamWaitEvent(stream.value(), *reinterpret_cast(input->sync_event))); } - auto type = arrow_to_cudf_type(schema); - auto [colview, owned] = get_column(schema, &input->array, type, false, stream, mr); + auto type = arrow_to_cudf_type(&view); + auto [colview, owned] = get_column(&view, &input->array, type, false, stream, mr); return unique_column_view_t{new column_view{colview}, custom_view_deleter{std::move(owned)}}; } @@ -443,20 +428,9 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray memory must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView 
view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device(&view, input, stream, mr); + return detail::from_arrow_device(schema, input, stream, mr); } unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, @@ -464,20 +438,9 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(schema != nullptr && input != nullptr, - "input ArrowSchema and ArrowDeviceArray must not be NULL"); - CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA || - input->device_type == ARROW_DEVICE_CUDA_HOST || - input->device_type == ARROW_DEVICE_CUDA_MANAGED, - "ArrowDeviceArray must be accessible to CUDA"); - CUDF_FUNC_RANGE(); - rmm::cuda_set_device_raii dev( - rmm::cuda_device_id{static_cast(input->device_id)}); - ArrowSchemaView view; - NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); - return detail::from_arrow_device_column(&view, input, stream, mr); + return detail::from_arrow_device_column(schema, input, stream, mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu new file mode 100644 index 00000000000..36bb35d9419 --- /dev/null +++ b/cpp/src/interop/from_arrow_host.cu @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +struct dispatch_copy_from_arrow_host { + rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; + + std::unique_ptr get_mask_buffer(ArrowArray const* array) + { + auto* bitmap = array->buffers[validity_buffer_idx]; + if (bitmap == nullptr) { return std::make_unique(0, stream, mr); } + + auto const bitmask_size = array->length + array->offset; + auto const allocation_size = + bitmask_allocation_size_bytes(static_cast(bitmask_size)); + auto mask = std::make_unique(allocation_size, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(), + reinterpret_cast(bitmap), + allocation_size, + cudaMemcpyDefault, + stream.value())); + return mask; + } + + template () && + !std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool) + { + CUDF_FAIL("Unsupported type in copy_from_arrow_host."); + } + + template () || std::is_same_v)> + std::unique_ptr operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) + { + using DeviceType = std::conditional_t, __int128_t, T>; + + size_type const num_rows = input->length; + size_type const offset = input->offset; + size_type const null_count = input->null_count; + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr); + auto mutable_column_view = col->mutable_view(); + CUDF_CUDA_TRY( + cudaMemcpyAsync(mutable_column_view.data(), + reinterpret_cast(data_buffer) + offset * sizeof(DeviceType), + sizeof(DeviceType) * num_rows, + cudaMemcpyDefault, + stream.value())); + + if (has_nulls) { + auto tmp_mask = get_mask_buffer(input); + + // if array is sliced, we have to copy the whole mask and then take copy + auto out_mask = + (offset == 0) + ? std::move(*tmp_mask) + : cudf::detail::copy_bitmask( + static_cast(tmp_mask->data()), offset, offset + num_rows, stream, mr); + + col->set_null_mask(std::move(out_mask), null_count); + } + + return col; + } +}; + +// forward declaration is needed because `type_dispatch` instantiates the +// dispatch_copy_from_arrow_host struct causing a recursive situation for struct, +// dictionary and list_view types. +// +// This function is simply a convenience wrapper around the dispatch functor with +// some extra handling to avoid having to reproduce it for all of the nested types. +// It also allows us to centralize the location where the recursive calls happen +// so that we only need to forward declare this one function, rather than multiple +// functions which handle the overloads for nested types (list, struct, etc.) 
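A toy, self-contained sketch of the arrangement described in that comment (all names here are illustrative, not cudf's): the recursive entry point is forward-declared, the functor's nested-type branch calls it, and the entry point itself is defined only after the functor, so the dispatcher can instantiate every overload without the definitions chasing each other.

```cpp
// Toy illustration of the forward-declaration pattern: one dispatch functor,
// one forward-declared wrapper that centralizes all recursive calls.
#include <memory>
#include <type_traits>

struct column {};      // stand-in for cudf::column
struct leaf_tag {};    // stand-in for a fixed-width type
struct nested_tag {};  // stand-in for list/struct/dictionary

// (1) forward-declare the wrapper that owns the recursion
std::unique_ptr<column> copy_any(bool nested);

// (2) a single templated functor, instantiated for every tag by the "dispatcher"
struct copy_dispatch {
  template <typename Tag>
  std::unique_ptr<column> operator()() const
  {
    if constexpr (std::is_same_v<Tag, nested_tag>) {
      // nested types recurse through the wrapper rather than through each other
      auto child = copy_any(/*nested=*/false);
      (void)child;
    }
    return std::make_unique<column>();
  }
};

// (3) the wrapper is defined last; it is the only place that re-enters the functor
std::unique_ptr<column> copy_any(bool nested)
{
  return nested ? copy_dispatch{}.operator()<nested_tag>()
                : copy_dispatch{}.operator()<leaf_tag>();
}
```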
+std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask) +{ + auto data_buffer = input->buffers[fixed_width_data_buffer_idx]; + const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset); + + auto data = rmm::device_buffer(buffer_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(), + reinterpret_cast(data_buffer), + buffer_length, + cudaMemcpyDefault, + stream.value())); + auto out_col = mask_to_bools(static_cast(data.data()), + input->offset, + input->offset + input->length, + stream, + mr); + + auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr; + if (has_nulls) { + auto out_mask = detail::copy_bitmask(static_cast(get_mask_buffer(input)->data()), + input->offset, + input->offset + input->length, + stream, + mr); + + out_col->set_null_mask(std::move(out_mask), input->null_count); + } + + return out_col; +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + if (input->length == 0) { return make_empty_column(type_id::STRING); } + + // offsets column should contain no nulls so we can put nullptr for the bitmask + // nulls are tracked in the parent string column itself, not in the offsets + void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + + // chars_column does not contain any nulls, they are tracked by the parent string column + // itself instead. So we pass nullptr for the validity bitmask. + size_type const char_data_length = + reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + void const* char_buffers[2] = {nullptr, input->buffers[2]}; + ArrowArray char_array = { + .length = char_data_length, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = char_buffers, + }; + + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + nanoarrow::UniqueSchema char_data_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8)); + + // leverage the dispatch overloads for int32 and char(int8) to generate the child + // offset and char data columns for us. + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); + auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + input->null_count, + std::move(*get_mask_buffer(input))); + + return input->offset == 0 + ? 
std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + ArrowSchemaView keys_schema_view; + NANOARROW_THROW_NOT_OK( + ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr)); + + auto const keys_type = arrow_to_cudf_type(&keys_schema_view); + auto keys_column = + get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); + + auto const dict_indices_type = [&schema]() -> data_type { + // cudf dictionary requires an unsigned type for the indices, + // since it is invalid for an arrow dictionary to contain negative + // indices, we can safely use the unsigned equivalent without having + // to modify the buffers. + switch (schema->storage_type) { + case NANOARROW_TYPE_INT8: + case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); + case NANOARROW_TYPE_INT16: + case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); + case NANOARROW_TYPE_INT32: + case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); + case NANOARROW_TYPE_INT64: + case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); + } + }(); + + auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr); + // child columns shouldn't have masks and we need the mask in the main column + auto column_contents = indices_column->release(); + indices_column = std::make_unique(dict_indices_type, + static_cast(input->length), + std::move(*(column_contents.data)), + rmm::device_buffer{}, + 0); + + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + std::move(*(column_contents.null_mask)), + input->null_count); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + std::vector> child_columns; + std::transform( + input->children, + input->children + input->n_children, + schema->schema->children, + std::back_inserter(child_columns), + [this, input](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + + auto out = get_column_copy(&view, child, type, false, stream, mr); + return input->offset == 0 && input->length == out->size() + ? 
std::move(out) + : std::make_unique( + cudf::detail::slice(out->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); + }); + + auto out_mask = std::move(*(get_mask_buffer(input))); + if (input->buffers[validity_buffer_idx] != nullptr) { + out_mask = detail::copy_bitmask(static_cast(out_mask.data()), + input->offset, + input->offset + input->length, + stream, + mr); + } + + return make_structs_column( + input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr); +} + +template <> +std::unique_ptr dispatch_copy_from_arrow_host::operator()( + ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask) +{ + const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]}; + ArrowArray offsets_array = { + .length = input->offset + input->length + 1, + .null_count = 0, + .offset = 0, + .n_buffers = 2, + .n_children = 0, + .buffers = offset_buffers, + }; + nanoarrow::UniqueSchema offset_schema; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32)); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr)); + auto offsets_column = + this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr)); + auto child_type = arrow_to_cudf_type(&view); + auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr); + + auto const num_rows = offsets_column->size() - 1; + auto out_col = make_lists_column(num_rows, + std::move(offsets_column), + std::move(child_column), + input->null_count, + std::move(*get_mask_buffer(input)), + stream, + mr); + + return num_rows == input->length + ? std::move(out_col) + : std::make_unique( + cudf::detail::slice(out_col->view(), + static_cast(input->offset), + static_cast(input->offset + input->length), + stream), + stream, + mr); +} + +std::unique_ptr get_column_copy(ArrowSchemaView* schema, + ArrowArray const* input, + data_type type, + bool skip_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type.id() != type_id::EMPTY + ? std::move(type_dispatcher( + type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask)) + : std::make_unique(data_type(type_id::EMPTY), + input->length, + rmm::device_buffer{}, + rmm::device_buffer{}, + input->length); +} + +} // namespace + +std::unique_ptr
from_arrow_host(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host`", + std::invalid_argument); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + + std::vector> columns; + + auto type = arrow_to_cudf_type(&view); + CUDF_EXPECTS(type == data_type(type_id::STRUCT), + "Must pass a struct to `from_arrow_host`", + cudf::data_type_error); + + std::transform(input->array.children, + input->array.children + input->array.n_children, + view.schema->children, + std::back_inserter(columns), + [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) { + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr)); + auto type = arrow_to_cudf_type(&view); + return get_column_copy(&view, child, type, false, stream, mr); + }); + + return std::make_unique
(std::move(columns)); +} + +std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(schema != nullptr && input != nullptr, + "input ArrowSchema and ArrowDeviceArray must not be NULL", + std::invalid_argument); + CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU, + "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`", + std::invalid_argument); + + ArrowSchemaView view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr)); + + auto type = arrow_to_cudf_type(&view); + return get_column_copy(&view, &input->array, type, false, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_host(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow_host(schema, input, stream, mr); +} + +std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, + ArrowDeviceArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow_host_column(schema, input, stream, mr); +} + +std::unique_ptr
from_arrow(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host(schema, &device_input, stream, mr); +} + +std::unique_ptr from_arrow_column(ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + ArrowDeviceArray const device_input = { + .array = *input, + .device_id = -1, + .device_type = ARROW_DEVICE_CPU, + }; + return detail::from_arrow_host_column(schema, &device_input, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index f2b1669df9b..ebfd6605977 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,7 +15,6 @@ */ #include "arrow_utilities.hpp" -#include "to_arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 6f943593dce..19915464236 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "to_arrow_utilities.hpp" +#include "arrow_utilities.hpp" #include #include diff --git a/cpp/src/interop/to_arrow_utilities.cpp b/cpp/src/interop/to_arrow_utilities.cpp deleted file mode 100644 index 04d17847273..00000000000 --- a/cpp/src/interop/to_arrow_utilities.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "to_arrow_utilities.hpp" - -#include - -namespace cudf { -namespace detail { - -ArrowType id_to_arrow_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL; - case cudf::type_id::INT8: return NANOARROW_TYPE_INT8; - case cudf::type_id::INT16: return NANOARROW_TYPE_INT16; - case cudf::type_id::INT32: return NANOARROW_TYPE_INT32; - case cudf::type_id::INT64: return NANOARROW_TYPE_INT64; - case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8; - case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16; - case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32; - case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64; - case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; - case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; - case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; - default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); - } -} - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/interop/to_arrow_utilities.hpp b/cpp/src/interop/to_arrow_utilities.hpp deleted file mode 100644 index 3c01c726a7b..00000000000 --- a/cpp/src/interop/to_arrow_utilities.hpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cudf { -namespace detail { - -/** - * @brief Map cudf column type id to ArrowType id - * - * @param id Column type id - * @return ArrowType id - */ -ArrowType id_to_arrow_type(cudf::type_id id); - -} // namespace detail -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 42b7f089d61..c6ab8aa021a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -269,6 +269,7 @@ ConfigureTest( interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp + interop/from_arrow_host_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 66bd4dd1bfb..d776ca57ef6 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -49,23 +49,23 @@ TYPED_TEST_SUITE(FromArrowDeviceTestDurationsTest, cudf::test::DurationTypes); TEST_F(FromArrowDeviceTest, FailConditions) { // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), std::invalid_argument); // can't pass null for device array ArrowSchema schema; - EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU ArrowDeviceArray arr; arr.device_type = ARROW_DEVICE_CPU; - EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), std::invalid_argument); // can't pass null for schema or device array - EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), std::invalid_argument); // can't pass null for device array - EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), std::invalid_argument); // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED // should fail with ARROW_DEVICE_CPU - EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), cudf::logic_error); + EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), std::invalid_argument); } TEST_F(FromArrowDeviceTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp new file mode 100644 index 00000000000..e6e52099a0c --- /dev/null +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -0,0 +1,612 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// create a cudf::table and equivalent arrow table with host memory +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length) +{ + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); + + auto int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + auto string_array = + get_nanoarrow_array(test_data.string_data, test_data.validity); + cudf::dictionary_column_view view(table->get_column(2).view()); + auto keys = cudf::test::to_host(view.keys()).first; + auto indices = cudf::test::to_host(view.indices()).first; + auto dict_array = get_nanoarrow_dict_array(std::vector(keys.begin(), keys.end()), + std::vector(indices.begin(), indices.end()), + test_data.validity); + auto boolarray = get_nanoarrow_array(test_data.bool_data, test_data.bool_validity); + auto list_array = get_nanoarrow_list_array(test_data.list_int64_data, + test_data.list_offsets, + test_data.list_int64_data_validity, + test_data.bool_data_validity); + + nanoarrow::UniqueArray arrow; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); + arrow->length = length; + + int64_array.move(arrow->children[0]); + string_array.move(arrow->children[1]); + dict_array.move(arrow->children[2]); + boolarray.move(arrow->children[3]); + list_array.move(arrow->children[4]); + + int64_array = get_nanoarrow_array(test_data.int64_data, test_data.validity); + string_array = get_nanoarrow_array(test_data.string_data, test_data.validity); + int64_array.move(arrow->children[5]->children[0]); + string_array.move(arrow->children[5]->children[1]); + + ArrowBitmap struct_validity; + ArrowBitmapInit(&struct_validity); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length)); + ArrowBitmapAppendInt8Unsafe( + &struct_validity, reinterpret_cast(test_data.bool_data_validity.data()), length); + arrow->children[5]->length = length; + ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity); + arrow->children[5]->null_count = + length - ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length); + + ArrowError error; + if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) != + NANOARROW_OK) { + std::cerr << ArrowErrorMessage(&error) << std::endl; + CUDF_FAIL("failed to build example arrays"); + } + + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); +} + +struct FromArrowHostDeviceTest : public cudf::test::BaseFixture {}; + +template +struct FromArrowHostDeviceTestDurationsTest : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(FromArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(FromArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto expected_cudf_table = tbl->view(); + ArrowDeviceArray input; + memcpy(&input.array, 
arr.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view()); +} + +TEST_F(FromArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::vector{1, 2, 3, 4, 5, 6}; + auto col = cudf::test::fixed_width_column_wrapper( + data.begin(), data.end()); + cudf::table_view expected_table_view({col}); + + // construct equivalent arrow schema with nanoarrow + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + // equivalent arrow record batch + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = 6; + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test that we get the same cudf table as we expect by converting the + // host arrow memory to a cudf table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // test that we get a cudf table with a single struct column that is equivalent + // if we use from_arrow_host_column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view expected_table_view({col}); + const ArrowTimeUnit time_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + + nanoarrow::UniqueArray input_array; + 
NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + auto arr = get_nanoarrow_array(data); + arr.move(input_array->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // converting arrow host memory to cudf table gives us the expected table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // converting to a cudf table with a single struct column gives us the expected + // result column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view expected_table_view({col}); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element")); + input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + // create the base arrow list array + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + // populate the bitmask we're going to use for the top level list + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray input_array; + EXPECT_EQ(NANOARROW_OK, ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table_view.num_rows(); + input_array->null_count = 0; + + ArrowArraySetValidityBitmap(input_array->children[0], &mask); + input_array->children[0]->length = expected_table_view.num_rows(); + input_array->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(input_array->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + 
offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + // move our base list to be the child of the one we just created + // so that we now have an equivalent value to what we created for cudf + list_arr.move(input_array->children[0]->children[0]); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // converting from arrow host memory to cudf gives us the expected table + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // converting to a single column cudf table gives us the expected struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view expected_table_view({struct_col->view()}); + + // Create name metadata + auto sub_metadata = cudf::column_metadata{"struct"}; + sub_metadata.children_meta = {{"string2"}, {"integral2"}}; + auto metadata = cudf::column_metadata{"a"}; + metadata.children_meta = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata}; + + // create the equivalent arrow schema using nanoarrow + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1)); + + ArrowSchemaInit(input_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + input_schema->children[0]->flags = 0; + + 
auto child = input_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + // first our underlying arrays + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 1}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + // create the struct array + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + + input_array->length = expected_table_view.num_rows(); + + auto array_a = input_array->children[0]; + auto view_a = expected_table_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + // populate the children of our struct by moving them from the original arrays + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = expected_table_view.num_rows(); + array_a->children[3]->null_count = 0; + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + + list_arr.move(array_a->children[3]->children[0]); 
+ + // set our struct bitmap validity mask + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test we get the expected cudf::table from the arrow host memory data + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view()); + + // test we get the expected cudf struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) +{ + // test dictionary arrays with different index types + // cudf asserts that the index type must be unsigned + auto array1 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array2 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto array3 = + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + // create equivalent cudf dictionary columns + auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = + cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + + vector_of_columns columns; + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind2_col)); + columns.emplace_back(cudf::make_dictionary_column(keys_col, ind3_col)); + + cudf::table expected_table(std::move(columns)); + + nanoarrow::UniqueSchema input_schema; + ArrowSchemaInit(input_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); + NANOARROW_THROW_NOT_OK( + 
ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); + + nanoarrow::UniqueArray input_array; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr)); + input_array->length = expected_table.num_rows(); + input_array->null_count = 0; + + array1.move(input_array->children[0]); + array2.move(input_array->children[1]); + array3.move(input_array->children[2]); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + ArrowDeviceArray input; + memcpy(&input.array, input_array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + // test we get the expected cudf table when we convert from Arrow host memory + auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view()); + + // test we get the expected cudf::column as a struct column + auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{ + std::vector(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); +} + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end) +{ + auto op = [&](ArrowArray* array) { + // slicing only needs to happen at the top level of an array + array->offset = start; + array->length = end - start; + if (array->null_count != 0) { + array->null_count = + array->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start); + } + }; + + if (arr->n_children == 0) { + op(arr); + return; + } + + // since we want to simulate a sliced table where the children are sliced, + // we slice each individual child of the record batch + arr->length = end - start; + for (int64_t i = 0; i < arr->n_children; ++i) { + op(arr->children[i]); + } +} + +struct FromArrowHostDeviceTestSlice + : public FromArrowHostDeviceTest, + public ::testing::WithParamInterface> {}; + +TEST_P(FromArrowHostDeviceTestSlice, SliceTest) +{ + auto [table, schema, array] = get_nanoarrow_host_tables(10000); + auto cudf_table_view = table->view(); + auto const [start, end] = GetParam(); + + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto expected_cudf_table = cudf::table{sliced_cudf_table}; + slice_host_nanoarrow(array.get(), start, end); + + ArrowDeviceArray input; + memcpy(&input.array, array.get(), sizeof(ArrowArray)); + input.device_id = -1; + input.device_type = ARROW_DEVICE_CPU; + + auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input); + if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) { + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = 
got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct); + } else { + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view()); + + auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input); + EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT}); + auto got_cudf_col_view = got_cudf_col->view(); + cudf::table_view from_struct{std::vector(got_cudf_col_view.child_begin(), + got_cudf_col_view.child_end())}; + CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct); + } +} + +INSTANTIATE_TEST_CASE_P(FromArrowHostDeviceTest, + FromArrowHostDeviceTestSlice, + ::testing::Values(std::make_tuple(0, 10000), + std::make_tuple(2912, 2915), + std::make_tuple(100, 3000), + std::make_tuple(0, 0), + std::make_tuple(0, 3000), + std::make_tuple(10000, 10000))); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index fb5d1060f6f..a79e6fdc49c 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -20,14 +20,61 @@ #include #include #include +#include #include #include #include #include #include +#include #include +struct generated_test_data { + generated_test_data(cudf::size_type length) + : int64_data(length), + bool_data(length), + string_data(length), + validity(length), + bool_validity(length), + list_int64_data(3 * length), + list_int64_data_validity(3 * length), + list_offsets(length + 1) + { + cudf::size_type length_of_individual_list = 3; + + std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); + std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); + auto validity_generator = []() { return rand() % 7 != 0; }; + std::generate( + list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); + std::generate( + list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { + return (n++) * length_of_individual_list; + }); + std::generate(bool_data.begin(), bool_data.end(), validity_generator); + std::generate( + string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); + std::generate(validity.begin(), validity.end(), validity_generator); + std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); + + std::transform(bool_validity.cbegin(), + bool_validity.cend(), + std::back_inserter(bool_data_validity), + [](auto val) { return static_cast(val); }); + } + + std::vector int64_data; + std::vector bool_data; + std::vector string_data; + std::vector validity; + std::vector bool_validity; + std::vector bool_data_validity; + std::vector list_int64_data; + std::vector list_int64_data_validity; + std::vector list_offsets; +}; + // no-op allocator/deallocator to set into ArrowArray buffers that we don't // want to own their buffers. 
static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){ @@ -135,7 +182,196 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview) populate_from_col(arr->dictionary, dview.keys()); } +using vector_of_columns = std::vector>; + std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_tables(cudf::size_type length = 10000); void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view); + +std::unique_ptr get_cudf_table(); + +template +struct nanoarrow_storage_type {}; + +#define DEFINE_NANOARROW_STORAGE(T, NanoType) \ + template <> \ + struct nanoarrow_storage_type { \ + static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ + } + +DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); +DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); +DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); +DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); + +#undef DEFINE_NANOARROW_STORAGE + +template +std::enable_if_t() and !std::is_same_v, nanoarrow::UniqueArray> +get_nanoarrow_array(std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), nanoarrow_storage_type::type)); + + if (!mask.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, mask.size())); + ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast(mask.data()), mask.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = + data.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK( + ArrowBufferAppend(&buf, reinterpret_cast(data.data()), sizeof(T) * data.size())); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); + + tmp->length = data.size(); + + return tmp; +} + +template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_BOOL)); + + auto to_arrow_bitmap = [](std::vector const& b) -> ArrowBitmap { + ArrowBitmap out; + ArrowBitmapInit(&out); + NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); + out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); + out.size_bits = b.size(); + + for (size_t i = 0; i < b.size(); ++i) { + ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); + } + + return out; + }; + + if (!mask.empty()) { + auto validity_bitmap = to_arrow_bitmap(mask); + ArrowArraySetValidityBitmap(tmp.get(), &validity_bitmap); + tmp->null_count = + mask.size() - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size()); + } + + auto raw_buffer = to_arrow_bitmap(data); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer)); + tmp->length = data.size(); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_array(std::initializer_list elements, + std::initializer_list validity = {}) +{ + std::vector mask(validity); + std::vector data(elements); + + return get_nanoarrow_array(data, mask); +} + 
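
Taken together, these helpers let a test stand up a small host-resident Arrow array in a couple of lines. The following is a hedged sketch of how such an array would flow into the new host-memory interop path; it is illustrative usage rather than part of the patch, and it assumes the gtest/cudf includes already present in the interop tests plus the `int64_t` instantiation of the helper above:

    // Sketch: build a nullable INT64 host array and convert it with from_arrow_host_column.
    auto host_arr = get_nanoarrow_array<int64_t>({1, 2, 3, 4}, {1, 0, 1, 1});  // element 1 is null

    nanoarrow::UniqueSchema schema;
    NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_INT64));

    // wrap the plain ArrowArray in an ArrowDeviceArray that advertises CPU-resident buffers
    ArrowDeviceArray device_input;
    memset(&device_input, 0, sizeof(ArrowDeviceArray));
    memcpy(&device_input.array, host_arr.get(), sizeof(ArrowArray));
    device_input.device_id   = -1;
    device_input.device_type = ARROW_DEVICE_CPU;

    // copies the host buffers to device memory and returns an owning cudf::column
    auto col = cudf::from_arrow_host_column(schema.get(), &device_input);
    EXPECT_EQ(col->type().id(), cudf::type_id::INT64);
    EXPECT_EQ(col->null_count(), 1);

The conversion does not call release on the input array, so `host_arr` retains ownership of the underlying host buffers and frees them when it goes out of scope.
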
+template +std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_array( + std::vector const& data, std::vector const& mask = {}) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); + NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); + + for (size_t i = 0; i < data.size(); ++i) { + if (!mask.empty() && mask[i] == 0) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(tmp.get(), 1)); + } else { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(tmp.get(), ArrowCharView(data[i].c_str()))); + } + } + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_dict_array(std::vector const& keys, + std::vector const& ind, + std::vector const& validity = {}) +{ + auto indices_array = get_nanoarrow_array(ind, validity); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateDictionary(indices_array.get())); + + auto keys_array = get_nanoarrow_array(keys); + keys_array.move(indices_array->dictionary); + + return indices_array; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector const& data, + std::vector const& offsets, + std::vector const& data_validity = {}, + std::vector const& list_validity = {}) +{ + auto data_array = get_nanoarrow_array(data, data_validity); + + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + data_array.move(tmp->children[0]); + + tmp->length = offsets.size() - 1; + if (!list_validity.empty()) { + ArrowBitmap bitmap; + ArrowBitmapInit(&bitmap); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, list_validity.size())); + ArrowBitmapAppendInt8Unsafe( + &bitmap, reinterpret_cast(list_validity.data()), list_validity.size()); + + ArrowArraySetValidityBitmap(tmp.get(), &bitmap); + tmp->null_count = + tmp->length - + ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size()); + } + + ArrowBuffer buf; + ArrowBufferInit(&buf); + NANOARROW_THROW_NOT_OK(ArrowBufferAppend( + &buf, reinterpret_cast(offsets.data()), sizeof(int32_t) * offsets.size())); + NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf)); + + return tmp; +} + +template +nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, + std::initializer_list offsets, + std::initializer_list data_validity = {}, + std::initializer_list list_validity = {}) +{ + std::vector data_vector(data); + std::vector offset(offsets); + std::vector data_mask(data_validity); + std::vector list_mask(list_validity); + return get_nanoarrow_list_array(data_vector, offset, data_mask, list_mask); +} + +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 626aeb53cdd..4c73cd637a4 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -38,80 +38,55 @@ #include -using vector_of_columns = std::vector>; - -std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> -get_nanoarrow_tables(cudf::size_type length) +std::tuple, nanoarrow::UniqueSchema, generated_test_data> +get_nanoarrow_cudf_table(cudf::size_type length) { - std::vector int64_data(length); - std::vector bool_data(length); - std::vector string_data(length); - std::vector validity(length); - std::vector bool_validity(length); - 
std::vector bool_data_validity; - cudf::size_type length_of_individual_list = 3; - cudf::size_type length_of_list = length_of_individual_list * length; - std::vector list_int64_data(length_of_list); - std::vector list_int64_data_validity(length_of_list); - std::vector list_offsets(length + 1); + generated_test_data test_data(length); std::vector> columns; - std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; }); - std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; }); - auto validity_generator = []() { return rand() % 7 != 0; }; - std::generate( - list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - std::generate( - list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { - return (n++) * length_of_individual_list; - }); - std::generate(bool_data.begin(), bool_data.end(), validity_generator); - std::generate( - string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; }); - std::generate(validity.begin(), validity.end(), validity_generator); - std::generate(bool_validity.begin(), bool_validity.end(), validity_generator); - - std::transform(bool_validity.cbegin(), - bool_validity.cend(), - std::back_inserter(bool_data_validity), - [](auto val) { return static_cast(val); }); - - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.int64_data.begin(), + test_data.int64_data.end(), + test_data.validity.begin()) + .release()); + columns.emplace_back(cudf::test::strings_column_wrapper(test_data.string_data.begin(), + test_data.string_data.end(), + test_data.validity.begin()) .release()); - columns.emplace_back( - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) - .release()); auto col4 = cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()); + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); - columns.emplace_back(cudf::test::fixed_width_column_wrapper( - bool_data.begin(), bool_data.end(), bool_validity.begin()) + columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), + test_data.bool_data.end(), + test_data.bool_validity.begin()) .release()); - auto list_child_column = cudf::test::fixed_width_column_wrapper( - list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin()); - auto list_offsets_column = - cudf::test::fixed_width_column_wrapper(list_offsets.begin(), list_offsets.end()); + auto list_child_column = + cudf::test::fixed_width_column_wrapper(test_data.list_int64_data.begin(), + test_data.list_int64_data.end(), + test_data.list_int64_data_validity.begin()); + auto list_offsets_column = cudf::test::fixed_width_column_wrapper( + test_data.list_offsets.begin(), test_data.list_offsets.end()); auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back(cudf::make_lists_column(length, list_offsets_column.release(), list_child_column.release(), list_nulls, std::move(*list_mask))); - auto int_column = 
cudf::test::fixed_width_column_wrapper( - int64_data.begin(), int64_data.end(), validity.begin()) - .release(); + auto int_column = + cudf::test::fixed_width_column_wrapper( + test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()) + .release(); auto str_column = - cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) + cudf::test::strings_column_wrapper( + test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; cols.push_back(move(int_column)); cols.push_back(move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( - bool_data_validity.begin(), bool_data_validity.end())); + test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask))); @@ -198,21 +173,30 @@ get_nanoarrow_tables(cudf::size_type length) schema->children[5]->flags = 0; } + return std::make_tuple( + std::make_unique(std::move(columns)), std::move(schema), std::move(test_data)); +} + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_tables(cudf::size_type length) +{ + auto [table, schema, test_data] = get_nanoarrow_cudf_table(length); + nanoarrow::UniqueArray arrow; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr)); arrow->length = length; - populate_from_col(arrow->children[0], columns[0]->view()); - populate_from_col(arrow->children[1], columns[1]->view()); - populate_dict_from_col(arrow->children[2], - cudf::dictionary_column_view(columns[2]->view())); + populate_from_col(arrow->children[0], table->get_column(0).view()); + populate_from_col(arrow->children[1], table->get_column(1).view()); + populate_dict_from_col( + arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); - populate_from_col(arrow->children[3], columns[3]->view()); - cudf::lists_column_view list_view{columns[4]->view()}; + populate_from_col(arrow->children[3], table->get_column(3).view()); + cudf::lists_column_view list_view{table->get_column(4).view()}; populate_list_from_col(arrow->children[4], list_view); populate_from_col(arrow->children[4]->children[0], list_view.child()); - cudf::structs_column_view struct_view{columns[5]->view()}; + cudf::structs_column_view struct_view{table->get_column(5).view()}; populate_from_col(arrow->children[5]->children[0], struct_view.child(0)); populate_from_col(arrow->children[5]->children[1], struct_view.child(1)); arrow->children[5]->length = struct_view.size(); @@ -231,8 +215,7 @@ get_nanoarrow_tables(cudf::size_type length) CUDF_FAIL("failed to build example arrays"); } - return std::make_tuple( - std::make_unique(std::move(columns)), std::move(schema), std::move(arrow)); + return std::make_tuple(std::move(table), std::move(schema), std::move(arrow)); } // populate an ArrowArray list array from device buffers using a no-op