Skip to content

Commit 27404bc

Browse files
authored
Implement HOST_UDF aggregation for groupby (#17592)
This implements the `HOST_UDF` aggregation, which allows executing a host-side user-defined function (UDF) through the libcudf aggregation framework. * A host-side function can be an arbitrary, independent function running on the host machine. It may or may not call device kernels, depending on its implementation. * Such a user-defined function must follow the interface provided by libcudf (`cudf::host_udf_base`). The interface provides the ability to fully interact with the libcudf aggregation framework. * Since it is implemented on the user application side, it has a very high degree of freedom to perform arbitrary operations to satisfy the user's needs. Partially contributes to #16633. --- Usage 1. Define a functor deriving from `cudf::host_udf_base` and implement the required virtual functions declared in that base struct. For example: ``` struct my_aggregation : cudf::host_udf_base { ... }; ``` 2. Create an instance of the libcudf `HOST_UDF` aggregation, constructed from an instance of the functor defined above. For example: ``` auto agg = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>( std::make_unique<my_aggregation>()); ``` 3. Perform the aggregation operation using the created instance. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Chong Gao (https://github.com/res-life) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) URL: #17592
1 parent 2837a45 commit 27404bc

File tree

15 files changed

+1175
-43
lines changed

15 files changed

+1175
-43
lines changed

cpp/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -446,14 +446,15 @@ add_library(
446446
src/groupby/sort/group_quantiles.cu
447447
src/groupby/sort/group_std.cu
448448
src/groupby/sort/group_sum.cu
449-
src/groupby/sort/scan.cpp
450449
src/groupby/sort/group_count_scan.cu
451450
src/groupby/sort/group_max_scan.cu
452451
src/groupby/sort/group_min_scan.cu
453452
src/groupby/sort/group_product_scan.cu
454453
src/groupby/sort/group_rank_scan.cu
455454
src/groupby/sort/group_replace_nulls.cu
456455
src/groupby/sort/group_sum_scan.cu
456+
src/groupby/sort/host_udf_aggregation.cpp
457+
src/groupby/sort/scan.cpp
457458
src/groupby/sort/sort_helper.cu
458459
src/hash/md5_hash.cu
459460
src/hash/murmurhash3_x86_32.cu

cpp/include/cudf/aggregation.hpp

+16-3
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,9 @@ class aggregation {
110110
COLLECT_SET, ///< collect values into a list without duplicate entries
111111
LEAD, ///< window function, accesses row at specified offset following current row
112112
LAG, ///< window function, accesses row at specified offset preceding current row
113-
PTX, ///< PTX UDF based reduction
114-
CUDA, ///< CUDA UDF based reduction
113+
PTX, ///< PTX based UDF aggregation
114+
CUDA, ///< CUDA based UDF aggregation
115+
HOST_UDF, ///< host based UDF aggregation
115116
MERGE_LISTS, ///< merge multiple lists values into one list
116117
MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries
117118
MERGE_M2, ///< merge partial values of M2 aggregation,
@@ -120,7 +121,7 @@ class aggregation {
120121
TDIGEST, ///< create a tdigest from a set of input values
121122
MERGE_TDIGEST, ///< create a tdigest by merging multiple tdigests together
122123
HISTOGRAM, ///< compute frequency of each element
123-
MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation,
124+
MERGE_HISTOGRAM ///< merge partial values of HISTOGRAM aggregation
124125
};
125126

126127
aggregation() = delete;
@@ -599,6 +600,18 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
599600
std::string const& user_defined_aggregator,
600601
data_type output_type);
601602

603+
// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
604+
struct host_udf_base;
605+
606+
/**
607+
* @brief Factory to create a HOST_UDF aggregation of type `Base` (`aggregation` by default).
608+
*
609+
* @param host_udf An instance of a class derived from `host_udf_base` to perform aggregation;
610+
* @return A HOST_UDF aggregation object, returned as a `std::unique_ptr<Base>`
611+
*/
612+
template <typename Base = aggregation>
613+
std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> host_udf);
614+
602615
/**
603616
* @brief Factory to create a MERGE_LISTS aggregation.
604617
*
+294
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,294 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cudf/aggregation.hpp>
20+
#include <cudf/types.hpp>
21+
#include <cudf/utilities/export.hpp>
22+
#include <cudf/utilities/span.hpp>
23+
#include <cudf/utilities/traits.hpp>
24+
25+
#include <rmm/cuda_stream_view.hpp>
26+
#include <rmm/resource_ref.hpp>
27+
28+
#include <optional>
29+
#include <unordered_map>
30+
#include <unordered_set>
31+
#include <variant>
32+
33+
/**
34+
* @file host_udf.hpp
35+
* @brief Declares the base class for host-side user-defined functions (`HOST_UDF`) and an example
36+
* of a subclass implementation.
37+
*/
38+
39+
namespace CUDF_EXPORT cudf {
40+
/**
41+
* @addtogroup aggregation_factories
42+
* @{
43+
*/
44+
45+
/**
46+
* @brief The interface for host-based UDF implementation.
47+
*
48+
* An implementation of host-based UDF needs to be derived from this base class, defining
49+
* its own version of the required functions. In particular:
50+
* - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
51+
* and `clone` functions.
52+
* - If necessary, the derived class can also override `do_hash` to compute hashing for its
53+
* instance, and `get_required_data` to selectively access the input data as well as
54+
* intermediate data provided by libcudf.
55+
*
56+
* Example of such implementation:
57+
* @code{.cpp}
58+
* struct my_udf_aggregation : cudf::host_udf_base {
59+
* my_udf_aggregation() = default;
60+
*
61+
* // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
62+
* // and the result from groupby `MAX` aggregation.
63+
* [[nodiscard]] data_attribute_set_t get_required_data() const override
64+
* {
65+
* return {groupby_data_attribute::GROUPED_VALUES,
66+
* groupby_data_attribute::GROUP_OFFSETS,
67+
* cudf::make_max_aggregation<cudf::groupby_aggregation>()};
68+
* }
69+
*
70+
* [[nodiscard]] output_t get_empty_output(
71+
* [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
72+
* [[maybe_unused]] rmm::cuda_stream_view stream,
73+
* [[maybe_unused]] rmm::device_async_resource_ref mr) const override
74+
* {
75+
* // This UDF aggregation always returns a column of type INT32.
76+
* return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
77+
* }
78+
*
79+
* [[nodiscard]] output_t operator()(input_map_t const& input,
80+
* rmm::cuda_stream_view stream,
81+
* rmm::device_async_resource_ref mr) const override
82+
* {
83+
* // Perform UDF computation using the input data and return the result.
84+
* }
85+
*
86+
* [[nodiscard]] bool is_equal(host_udf_base const& other) const override
87+
* {
88+
* // Check if the other object is also instance of this class.
89+
* return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
90+
* }
91+
*
92+
* [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
93+
* {
94+
* return std::make_unique<my_udf_aggregation>();
95+
* }
96+
* };
97+
* @endcode
98+
*/
99+
struct host_udf_base {
100+
host_udf_base() = default;
101+
virtual ~host_udf_base() = default;
102+
103+
/**
104+
* @brief Define the possible data needed for groupby aggregations.
105+
*
106+
* Note that only sort-based groupby aggregations are supported.
107+
*/
108+
enum class groupby_data_attribute : int32_t {
109+
INPUT_VALUES, ///< The input values column.
110+
GROUPED_VALUES, ///< The input values grouped according to the input `keys`, for which the
111+
///< values within each group maintain their original order.
112+
SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and
113+
///< sorted within each group.
114+
NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys).
115+
GROUP_OFFSETS, ///< The offsets separating groups.
116+
GROUP_LABELS ///< Group labels (which are the same as group indices).
117+
};
118+
119+
/**
120+
* @brief Describe possible data that may be needed in the derived class for its operations.
121+
*
122+
* Such data can be either intermediate data, such as sorted values or group labels, or the
123+
* results of other aggregations.
124+
*
125+
* Each derived host-based UDF class may need a different set of data. Evaluating and passing
126+
* down all of the possible data at once from libcudf would be inefficient. Instead, the
127+
* derived class can define the subset of data that it needs, and libcudf will evaluate
128+
* and pass down only the data requested in that set.
129+
*/
130+
struct data_attribute {
131+
/**
132+
* @brief Payload type: a `groupby_data_attribute`, or an aggregation whose result is requested.
133+
*/
134+
using value_type = std::variant<groupby_data_attribute, std::unique_ptr<aggregation>>;
135+
value_type value; ///< The actual data attribute. It is wrapped by this struct
136+
///< because a wrapper is needed to define the `hash` and `equal_to` functors.
137+
138+
data_attribute() = default; ///< Default constructor
139+
data_attribute(data_attribute&&) = default; ///< Move constructor
140+
141+
/**
142+
* @brief Construct a new data attribute from a groupby data attribute.
143+
* @param value_ A `groupby_data_attribute` value
144+
*/
145+
template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, groupby_data_attribute>)>
146+
data_attribute(T value_) : value{value_}
147+
{
148+
}
149+
150+
/**
151+
* @brief Construct a new data attribute from a request for the result of another aggregation.
152+
* @param value_ An aggregation request; it must be non-null and must be a groupby aggregation
153+
*/
154+
template <typename T,
155+
CUDF_ENABLE_IF(std::is_same_v<T, aggregation> ||
156+
std::is_same_v<T, groupby_aggregation>)>
157+
data_attribute(std::unique_ptr<T> value_) : value{std::move(value_)}
158+
{
159+
CUDF_EXPECTS(std::get<std::unique_ptr<aggregation>>(value) != nullptr,
160+
"Invalid aggregation request.");
161+
if constexpr (std::is_same_v<T, aggregation>) {
162+
CUDF_EXPECTS(
163+
dynamic_cast<groupby_aggregation*>(std::get<std::unique_ptr<T>>(value).get()) != nullptr,
164+
"Requesting results from other aggregations is only supported in groupby "
165+
"aggregations.");
166+
}
167+
}
168+
169+
/**
170+
* @brief Copy constructor (only declared here; its definition is not part of this header).
171+
* @param other The other data attribute to copy from
172+
*/
173+
data_attribute(data_attribute const& other);
174+
175+
/**
176+
* @brief Hash functor for `data_attribute`.
177+
*/
178+
struct hash {
179+
/**
180+
* @brief Compute the hash value of a data attribute.
181+
* @param attr The data attribute to hash
182+
* @return The hash value of the data attribute
183+
*/
184+
std::size_t operator()(data_attribute const& attr) const;
185+
}; // struct hash
186+
187+
/**
188+
* @brief Equality comparison functor for `data_attribute`.
189+
*/
190+
struct equal_to {
191+
/**
192+
* @brief Check if two data attributes are equal.
193+
* @param lhs The left-hand side data attribute
194+
* @param rhs The right-hand side data attribute
195+
* @return True if the two data attributes are equal
196+
*/
197+
bool operator()(data_attribute const& lhs, data_attribute const& rhs) const;
198+
}; // struct equal_to
199+
}; // struct data_attribute
200+
201+
/**
202+
* @brief Set of attributes for the input data that is needed for computing the aggregation.
203+
*/
204+
using data_attribute_set_t =
205+
std::unordered_set<data_attribute, data_attribute::hash, data_attribute::equal_to>;
206+
207+
/**
208+
* @brief Return a set of attributes for the data that is needed for computing the aggregation.
209+
*
210+
* The derived class should return the attributes corresponding to only the data that it needs, to
211+
* avoid unnecessary computation performed in libcudf. If this function is not overridden, an
212+
* empty set is returned. That means all the data attributes (except results from other
213+
* aggregations in groupby) will be needed.
214+
*
215+
* @return A set of `data_attribute`
216+
*/
217+
[[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; }
218+
219+
/**
220+
* @brief Hold all possible types of the data that is passed to the derived class for executing
221+
* the aggregation.
222+
*/
223+
using input_data_t = std::variant<column_view, size_type, device_span<size_type const>>;
224+
225+
/**
226+
* @brief Input to the aggregation, mapping from each data attribute to its actual data.
227+
*/
228+
using input_map_t = std::
229+
unordered_map<data_attribute, input_data_t, data_attribute::hash, data_attribute::equal_to>;
230+
231+
/**
232+
* @brief Output type of the aggregation.
233+
*
234+
* Currently only a single type is supported as the output of the aggregation, but it will hold
235+
* more types in the future when reduction is supported.
236+
*/
237+
using output_t = std::variant<std::unique_ptr<column>>;
238+
239+
/**
240+
* @brief Get the output when the input values column is empty.
241+
*
242+
* This is called in libcudf when the input values column is empty. In such situations libcudf
243+
* tries to generate the output directly without unnecessarily evaluating the intermediate data.
244+
*
245+
* @param output_dtype The expected output data type
246+
* @param stream The CUDA stream to use for any kernel launches
247+
* @param mr Device memory resource to use for any allocations
248+
* @return The output result of the aggregation when the input values column is empty
249+
*/
250+
[[nodiscard]] virtual output_t get_empty_output(std::optional<data_type> output_dtype,
251+
rmm::cuda_stream_view stream,
252+
rmm::device_async_resource_ref mr) const = 0;
253+
254+
/**
255+
* @brief Perform the main computation for the host-based UDF.
256+
*
257+
* @param input The input data needed for performing all computation
258+
* @param stream The CUDA stream to use for any kernel launches
259+
* @param mr Device memory resource to use for any allocations
260+
* @return The output result of the aggregation
261+
*/
262+
[[nodiscard]] virtual output_t operator()(input_map_t const& input,
263+
rmm::cuda_stream_view stream,
264+
rmm::device_async_resource_ref mr) const = 0;
265+
266+
/**
267+
* @brief Computes the hash value of the instance; the default hashes only the `HOST_UDF` kind.
268+
* @return The hash value of the instance
269+
*/
270+
[[nodiscard]] virtual std::size_t do_hash() const
271+
{
272+
return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
273+
}
274+
275+
/**
276+
* @brief Compares two instances of the derived class for equality.
277+
* @param other The other derived class's instance to compare with
278+
* @return True if the two instances are equal
279+
*/
280+
[[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
281+
282+
/**
283+
* @brief Clones the instance.
284+
*
285+
* A class derived from `host_udf_base` should not store too much data, so that its instances
286+
* remain lightweight for efficient cloning.
287+
*
288+
* @return A new instance cloned from this
289+
*/
290+
[[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
291+
};
292+
293+
/** @} */ // end of group
294+
} // namespace CUDF_EXPORT cudf

0 commit comments

Comments
 (0)