From 03c0f5a1ea332caac37f0d5b57b9838e80e691c1 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 16 Oct 2024 10:01:15 +0800 Subject: [PATCH 01/28] Add HLL++ evaluation function --- src/main/cpp/CMakeLists.txt | 2 + src/main/cpp/src/HLLPP.cu | 102 ++++++++++++++++++ src/main/cpp/src/HLLPP.hpp | 32 ++++++ src/main/cpp/src/HLLPPJni.cpp | 34 ++++++ .../com/nvidia/spark/rapids/jni/HLLPP.java | 45 ++++++++ .../nvidia/spark/rapids/jni/HLLPPTest.java | 37 +++++++ 6 files changed, 252 insertions(+) create mode 100644 src/main/cpp/src/HLLPP.cu create mode 100644 src/main/cpp/src/HLLPP.hpp create mode 100644 src/main/cpp/src/HLLPPJni.cpp create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index bfb9d55377..997e7bda15 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -196,6 +196,7 @@ add_library( src/HashJni.cpp src/HistogramJni.cpp src/HostTableJni.cpp + src/HLLPPJni.cpp src/JSONUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp @@ -204,6 +205,7 @@ add_library( src/SparkResourceAdaptorJni.cpp src/SubStringIndexJni.cpp src/ZOrderJni.cpp + src/HLLPP.cu src/bloom_filter.cu src/case_when.cu src/cast_decimal_to_string.cu diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu new file mode 100644 index 0000000000..439b9e1706 --- /dev/null +++ b/src/main/cpp/src/HLLPP.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "HLLPP.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace spark_rapids_jni { + +namespace { + +// The number of bits required by register value. Register value stores num of zeros. +// XXHash64 value is 64 bits, it's safe to use 6 bits to store a register value. +constexpr int REGISTER_VALUE_BITS = 6; + +// MASK binary 6 bits: 111111 +constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; + +// One long stores 10 register values +constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; + +__device__ inline int get_register_value(int64_t const long_10_registers, int reg_idx) +{ + int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); + int64_t v = (long_10_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); + return static_cast(v); +} + +struct estimate_fn { + cudf::device_span sketch_longs; + int const precision; + int64_t* const out; + + __device__ void operator()(cudf::size_type const idx) const + { + auto const num_regs = 1ull << precision; + double sum = 0; + int zeroes = 0; + + for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { + // each long contains 10 register values + int long_col_idx = reg_idx / REGISTERS_PER_LONG; + int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; + int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); + sum += double{1} / static_cast(1ull << reg); + zeroes += reg == 0; + } + + auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); + out[idx] = finalize(sum, zeroes); + } +}; + +} // end anonymous namespace + +std::unique_ptr estimate_from_hll_sketches(cudf::column_view const& input, + int precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(precision >= 4 && precision <= 18, "HLL++ requires precision in range: [4, 18]"); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return input.child(i).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + input.num_children()); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto result = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); + // evaluate from struct + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + estimate_fn{d_inputs, precision, result->mutable_view().data()}); + return result; +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPP.hpp b/src/main/cpp/src/HLLPP.hpp new file mode 100644 index 0000000000..69e0b237e5 --- /dev/null +++ b/src/main/cpp/src/HLLPP.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +std::unique_ptr estimate_from_hll_sketches( + cudf::column_view const& input, + int precision, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPPJni.cpp b/src/main/cpp/src/HLLPPJni.cpp new file mode 100644 index 0000000000..581af90a90 --- /dev/null +++ b/src/main/cpp/src/HLLPPJni.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "HLLPP.hpp" +#include "cudf_jni_apis.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPP_estimateDistinctValueFromSketches( + JNIEnv* env, jclass, jlong sketches, jint precision) +{ + JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const sketch_view = reinterpret_cast(sketches); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release()); + } + CATCH_STD(env, 0); +} +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java new file mode 100644 index 0000000000..1be2c80512 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.ColumnView; +import ai.rapids.cudf.NativeDepsLoader; + +/** + * HyperLogLogPlusPlus + */ +public class HLLPP { + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * Compute the approximate count distinct value from sketch values. + *

+ * The input sketch values must be given in the format `LIST`. + * + * @param input The sketch column which constains `LIST values. + * @param precision The num of bits for addressing. + * @return A INT64 column with each value indicates the approximate count distinct value. + */ + public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { + return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); + } + + private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java new file mode 100644 index 0000000000..c14b565313 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2024, NVIDIA CORPORATION. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.GroupByAggregation; +import ai.rapids.cudf.Table; + +import org.junit.jupiter.api.Test; + + +public class HLLPPTest { + + @Test + void testGroupByHLL() { + // A trivial test: + try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2) + .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9) + .build()){ + input.groupBy(0).aggregate(GroupByAggregation.HLLPP(0) + .onColumn(1)); + } + } +} From df8b223a6391dbd82c85bb2005b0d426a14ca304 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 4 Nov 2024 10:18:19 +0800 Subject: [PATCH 02/28] Update function comments --- src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java index 1be2c80512..9e51761f4a 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java @@ -31,9 +31,12 @@ public class HLLPP { /** * Compute the approximate count distinct value from sketch values. *

- * The input sketch values must be given in the format `LIST`. + * The input sketch values must be given in the format `Struct`, + * The num of children is: num_registers_per_sketch / 10 + 1, here 10 means a INT64 contains + * max 10 registers. Register value is 6 bits. The input is columnar data, e.g.: sketch 0 + * is composed of by all the data of the children at index 0. * - * @param input The sketch column which constains `LIST values. + * @param input The sketch column which constains Struct values. * @param precision The num of bits for addressing. * @return A INT64 column with each value indicates the approximate count distinct value. */ From 2daca3f536a847d25de7edc6555bd824d704df2f Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 19 Nov 2024 17:18:01 +0800 Subject: [PATCH 03/28] Fix --- src/main/cpp/src/HLLPP.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu index 439b9e1706..ca35e77861 100644 --- a/src/main/cpp/src/HLLPP.cu +++ b/src/main/cpp/src/HLLPP.cu @@ -84,7 +84,7 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4 && precision <= 18, "HLL++ requires precision in range: [4, 18]"); + CUDF_EXPECTS(precision >= 4 , "HyperLogLogPlusPlus requires precision is bigger than 4."); auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return input.child(i).begin(); }); auto input_cols = std::vector(input_iter, input_iter + input.num_children()); From 3afdfdef7ac93cda55267994f9865296e061c25c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 26 Nov 2024 15:43:44 +0800 Subject: [PATCH 04/28] Use exec_policy_nosync instead of exec_policy --- src/main/cpp/compile_commands.json | 1 + src/main/cpp/src/HLLPP.cu | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 src/main/cpp/compile_commands.json diff --git a/src/main/cpp/compile_commands.json b/src/main/cpp/compile_commands.json new file mode 120000 index 0000000000..921c8b97d1 --- /dev/null +++ b/src/main/cpp/compile_commands.json @@ -0,0 +1 @@ +/home/chongg/code/spark-rapids-jni/target/jni/cmake-build/compile_commands.json \ No newline at end of file diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu index ca35e77861..939d8fe2e0 100644 --- a/src/main/cpp/src/HLLPP.cu +++ b/src/main/cpp/src/HLLPP.cu @@ -92,7 +92,7 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const auto result = cudf::make_numeric_column( cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); // evaluate from struct - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), input.size(), estimate_fn{d_inputs, precision, result->mutable_view().data()}); From 956af394dba6d784efa36a1e5ccc943ed53eea2c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 26 Nov 2024 15:48:51 +0800 Subject: [PATCH 05/28] Format code; Remove a useless file Signed-off-by: Chong Gao --- src/main/cpp/compile_commands.json | 1 - src/main/cpp/src/HLLPP.cu | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 src/main/cpp/compile_commands.json diff --git a/src/main/cpp/compile_commands.json b/src/main/cpp/compile_commands.json deleted file mode 120000 index 921c8b97d1..0000000000 --- a/src/main/cpp/compile_commands.json +++ /dev/null @@ -1 +0,0 @@ -/home/chongg/code/spark-rapids-jni/target/jni/cmake-build/compile_commands.json \ No newline at end of file diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu index 939d8fe2e0..d2d9493cf7 100644 --- a/src/main/cpp/src/HLLPP.cu +++ b/src/main/cpp/src/HLLPP.cu @@ -84,7 +84,7 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4 , "HyperLogLogPlusPlus requires precision is bigger than 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4."); auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return input.child(i).begin(); }); auto input_cols = std::vector(input_iter, input_iter + input.num_children()); From 5bfb54426a4cb137f1cce70d843681167c5f929b Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Sun, 15 Dec 2024 16:31:42 +0800 Subject: [PATCH 06/28] Use UDF --- src/main/cpp/CMakeLists.txt | 4 +- src/main/cpp/src/HLLPPHostUDFJni.cpp | 66 ++ src/main/cpp/src/hllpp.cu | 969 ++++++++++++++++++ src/main/cpp/src/hllpp.hpp | 100 ++ src/main/cpp/src/hllpp_host_udf.cu | 183 ++++ src/main/cpp/src/hllpp_host_udf.hpp | 35 + .../nvidia/spark/rapids/jni/HLLPPHostUDF.java | 105 ++ 7 files changed, 1461 insertions(+), 1 deletion(-) create mode 100644 src/main/cpp/src/HLLPPHostUDFJni.cpp create mode 100644 src/main/cpp/src/hllpp.cu create mode 100644 src/main/cpp/src/hllpp.hpp create mode 100644 src/main/cpp/src/hllpp_host_udf.cu create mode 100644 src/main/cpp/src/hllpp_host_udf.hpp create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 8872303e73..b8b5f3a139 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -193,6 +193,7 @@ add_library( src/DateTimeRebaseJni.cpp src/DecimalUtilsJni.cpp src/GpuTimeZoneDBJni.cpp + src/HLLPPHostUDFJni.cpp src/HashJni.cpp src/HistogramJni.cpp src/HostTableJni.cpp @@ -205,7 +206,6 @@ add_library( src/SparkResourceAdaptorJni.cpp src/SubStringIndexJni.cpp src/ZOrderJni.cpp - src/HLLPP.cu src/bloom_filter.cu src/case_when.cu src/cast_decimal_to_string.cu @@ -219,6 +219,8 @@ add_library( src/from_json_to_structs.cu src/get_json_object.cu src/histogram.cu + src/hllpp_host_udf.cu + src/hllpp.cu src/json_utils.cu src/murmur_hash.cu src/parse_uri.cu diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HLLPPHostUDFJni.cpp new file mode 100644 index 0000000000..3132d088ac --- /dev/null +++ b/src/main/cpp/src/HLLPPHostUDFJni.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf_jni_apis.hpp" +#include "hllpp.hpp" +#include "hllpp_host_udf.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( + JNIEnv *env, jclass, jint agg_type, int precision) { + try { + cudf::jni::auto_set_device(env); + auto udf_ptr = [&] { + // The value of agg_type must be sync with + // `HLLPPHostUDF.java#AggregationType`. + switch (agg_type) { + case 0: + return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); + case 1: + return spark_rapids_jni::create_hllpp_reduction_merge_host_udf( + precision); + case 2: + return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); + default: + return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); + } + }(); + CUDF_EXPECTS(udf_ptr != nullptr, + "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); + + return reinterpret_cast(udf_ptr.release()); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches( + JNIEnv *env, jclass, jlong sketches, jint precision) { + JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const sketch_view = + reinterpret_cast(sketches); + return cudf::jni::ptr_as_jlong( + spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision) + .release()); + } + CATCH_STD(env, 0); +} + +} // extern "C" diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hllpp.cu new file mode 100644 index 0000000000..08f452ad76 --- /dev/null +++ b/src/main/cpp/src/hllpp.cu @@ -0,0 +1,969 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "hash.hpp" +#include "hllpp.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include // TODO #include once available +#include +#include +#include +#include +#include +#include + +namespace spark_rapids_jni { + +namespace { + +/** + * @brief Get register value from a long which contains 10 register values, + * each register value in long is 6 bits. + */ +__device__ inline int get_register_value(int64_t const ten_registers, + int reg_idx) { + int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); + int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); + return static_cast(v); +} + +/** + * @brief Computes HLLPP sketches(register values) from hash values and + * partially merge the sketches. + * + * `reduce_by_key` uses num_rows_input intermidate cache: + * https://github.com/NVIDIA/thrust/blob/2.1.0/thrust/system/detail/generic/reduce_by_key.inl#L112 + * + * // scan the values by flag + * thrust::detail::temporary_array + * scanned_values(exec, n); + * + * Each sketch contains multiple integers, by default 512 integers(precision is + * 9), num_rows_input * 512 is huge, so this function uses a differrent approach + * to use less intermidate cache. New approach uses 2 phase merges: partial + * merge and final merge + * + * This function splits input into multiple segments with each segment has + * num_hashs_per_thread items. The input is sorted by group labels, each segment + * contains one or more consecutive groups. Each thread handles one segment with + * num_hashs_per_thread items in it: + * - Scan all the items in the segment, update the max value. + * - Output max value into registers_output_cache for the previous group when + * meets a new group. + * - Output max value into registers_thread_cache when reach the last item in + * the segment. + * + * In this way, we can save memory usage, cache less intermidate sketches + * (num_hashs / num_hashs_per_thread) sketches. + * num_threads = div_round_up(num_hashs, num_hashs_per_thread). + * + * e.g.: num_registers_per_sketch = 512 and num_hashs_per_thread = 4; + * + * Input is hashs, compute and get pair: register index -> register value + * + * reg_index, reg_value, group_lable + * [ + * ---------- segment 0 begin -------------------------- + * (0, 1), g0 + * (0, 2), g0 + * // meets new group g1, save result for group g0 into registers_output_cache + * (1, 1), g1 + * // outputs result at segemnt end for this thread to registers_thread_cache + * (1, 9), g1 + * ---------- segment 1 begin -------------------------- + * (1, 1), g1 + * (1, 1), g1 + * (1, 5), g1 + * // outputs result at segemnt end for this thread to registers_thread_cache + * (1, 1), g1 + * ---------- segment 2 begin -------------------------- + * (1, 1), g1 + * (1, 1), g1 + * (1, 8), g1 + * // outputs result at segemnt end for this thread to registers_thread_cache + * // assumes meets new group when at the end, save to registers_output_cache + * (1, 1), g1 + * ] + * Output e.g.: + * + * group_lables_thread_cache: + * [ + * g1 + * g1 + * g1 + * ] + * Has num_threads rows. + * + * registers_thread_cache: + * [ + * 512 values: [0, 9, 0, ... ] // register values for group 1 + * 512 values: [0, 5, 0, ... ] // register values for group 1 + * 512 values: [0, 8, 0, ... ] // register values for group 1 + * ] + * Has num_threads rows, each row is corresponding to + * `group_lables_thread_cache` + * + * registers_output_cache: + * [ + * 512 values: [2, 0, 0, ... ] // register values for group 0 + * 512 values: [0, 8, 0, ... ] // register values for group 1 + * ] + * Has num_groups rows. + * + * The next kernel will merge the registers_output_cache and + * registers_thread_cache and get the final result. + */ +template +CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( + cudf::column_device_view hashs, + cudf::device_span group_lables, + int64_t const precision, // num of bits for register addressing, e.g.: 9 + int *const + registers_output_cache, // num is num_groups * num_registers_per_sketch + int *const + registers_thread_cache, // num is num_threads * num_registers_per_sketch + cudf::size_type *const + group_lables_thread_cache // save the group lables for each thread +) { + auto const tid = cudf::detail::grid_1d::global_thread_id(); + int64_t const num_hashs = hashs.size(); + if (tid * num_hashs_per_thread >= hashs.size()) { + return; + } + + // 2^precision = num_registers_per_sketch + int64_t num_registers_per_sketch = 1L << precision; + // e.g.: integer in binary: 1 0000 0000 + uint64_t const w_padding = 1ULL << (precision - 1); + // e.g.: 64 - 9 = 55 + int const idx_shift = 64 - precision; + + auto const hash_first = tid * num_hashs_per_thread; + auto const hash_end = + cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs); + + // init sketches for each thread + int *const sketch_ptr = + registers_thread_cache + tid * num_registers_per_sketch; + for (auto i = 0; i < num_registers_per_sketch; i++) { + sketch_ptr[i] = 0; + } + + cudf::size_type prev_group = group_lables[hash_first]; + for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) { + cudf::size_type curr_group = group_lables[hash_idx]; + + // cast to unsigned, then >> will shift without preserve the sign bit. + uint64_t const hash = + static_cast(hashs.element(hash_idx)); + auto const reg_idx = hash >> idx_shift; + int const reg_v = static_cast( + cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + + if (curr_group == prev_group) { + // still in the same group, update the max value + if (reg_v > sketch_ptr[reg_idx]) { + sketch_ptr[reg_idx] = reg_v; + } + } else { + // meets new group, save output for the previous group and reset + for (auto i = 0; i < num_registers_per_sketch; i++) { + registers_output_cache[prev_group * num_registers_per_sketch + i] = + sketch_ptr[i]; + sketch_ptr[i] = 0; + } + // save the result for current group + sketch_ptr[reg_idx] = reg_v; + } + + if (hash_idx == hash_end - 1) { + // meets the last hash in the segment + if (hash_idx == num_hashs - 1) { + // meets the last segment, special logic: assume meets new group + for (auto i = 0; i < num_registers_per_sketch; i++) { + registers_output_cache[curr_group * num_registers_per_sketch + i] = + sketch_ptr[i]; + } + } else { + // not the last segment, probe one item forward. + if (curr_group != group_lables[hash_idx + 1]) { + // meets a new group by checking the next item in the next segment + for (auto i = 0; i < num_registers_per_sketch; i++) { + registers_output_cache[curr_group * num_registers_per_sketch + i] = + sketch_ptr[i]; + } + } + } + } + + prev_group = curr_group; + } + + // save the group lable for this thread + group_lables_thread_cache[tid] = group_lables[hash_end - 1]; +} + +/* + * @brief Merge registers_thread_cache into registers_output_cache, both of them + * are produced in the above kernel. Merge sketches vertically. + * + * For each register index, starts a thread to merge registers in + * registers_thread_cache to registers_output_cache. num_threads = + * num_registers_per_sketch. + * + * Input e.g.: + * + * group_lables_thread_cache: + * [ + * g0 + * g0 + * g1 + * ... + * gN + * ] + * Has num_threads rows. + * + * registers_thread_cache: + * [ + * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0 + * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0 + * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1 + * ... + * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N + * ] + * Has num_threads rows, each row is corresponding to + * `group_lables_thread_cache` + * + * registers_output_cache: + * [ + * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0 + * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1 + * ... + * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N + * ] + * registers_output_cache has num_groups rows. + * + * For each thread, scan from the first register to the last register, find the + * max value in the same group, and then update to registers_output_cache + */ +template +CUDF_KERNEL void merge_sketches_vertically( + int64_t num_sketches, int64_t num_registers_per_sketch, + int *const registers_output_cache, int const *const registers_thread_cache, + cudf::size_type const *const group_lables_thread_cache) { + __shared__ int8_t shared_data[block_size]; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + int shared_idx = tid % block_size; + + // register idx is tid + shared_data[shared_idx] = static_cast(0); + int prev_group = group_lables_thread_cache[0]; + for (auto i = 0; i < num_sketches; i++) { + int curr_group = group_lables_thread_cache[i]; + int8_t curr_reg_v = static_cast( + registers_thread_cache[i * num_registers_per_sketch + tid]); + if (curr_group == prev_group) { + if (curr_reg_v > shared_data[shared_idx]) { + shared_data[shared_idx] = curr_reg_v; + } + } else { + // meets a new group, store the result for previous group + int64_t result_reg_idx = prev_group * num_registers_per_sketch + tid; + int result_curr_reg_v = registers_output_cache[result_reg_idx]; + if (shared_data[shared_idx] > result_curr_reg_v) { + registers_output_cache[result_reg_idx] = shared_data[shared_idx]; + } + + shared_data[shared_idx] = curr_reg_v; + } + prev_group = curr_group; + } + + // handles the last register in this thread + int64_t reg_idx = prev_group * num_registers_per_sketch + tid; + int curr_reg_v = registers_output_cache[reg_idx]; + if (shared_data[shared_idx] > curr_reg_v) { + registers_output_cache[reg_idx] = shared_data[shared_idx]; + } +} + +/** + * @brief Compact register values, compact 10 registers values + * (each register value is 6 bits) into a long. + * This is consistent with Spark. + * Output: long columns which will be composed into a struct column + * + * Number of threads is num_groups * num_long_cols. + * + * e.g., num_registers_per_sketch is 512(precision is 9): + * Input: + * registers_output_cache: + * [ + * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0 + * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1 + * ... + * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N + * ] + * Has num_groups rows. + * + * Output: + * 52 long columns + * + * e.g.: r0 to r9 integers are all: 00000000-00000000-00000000-00100001, tailing + * 6 bits: 100-001 Compact to one long is: + * 100001-100001-100001-100001-100001-100001-100001-100001-100001-100001 + */ +CUDF_KERNEL void +compact_kernel(int64_t const num_groups, int64_t const num_registers_per_sketch, + cudf::device_span sketches_output, + // num_groups * num_registers_per_sketch integers + cudf::device_span registers_output_cache) { + int64_t const tid = cudf::detail::grid_1d::global_thread_id(); + int64_t const num_long_cols = + num_registers_per_sketch / REGISTERS_PER_LONG + 1; + if (tid >= num_groups * num_long_cols) { + return; + } + + int64_t const group_idx = tid / num_long_cols; + int64_t const long_idx = tid % num_long_cols; + + int64_t const reg_begin_idx = + group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG; + int64_t num_regs = REGISTERS_PER_LONG; + if (long_idx == num_long_cols - 1) { + num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; + } + + int64_t ten_registers = 0; + for (auto i = 0; i < num_regs; i++) { + int64_t reg_v = registers_output_cache[reg_begin_idx + i]; + int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i); + ten_registers |= tmp; + } + + sketches_output[long_idx][group_idx] = ten_registers; +} + +std::unique_ptr +group_hllpp(cudf::column_view const &input, int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + int64_t num_registers_per_sketch = 1 << precision; + constexpr int64_t block_size = 256; + constexpr int num_hashs_per_thread = 256; // handles 256 items per thread + int64_t num_threads_partial_kernel = + cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); + + auto sketches_output = rmm::device_uvector( + num_groups * num_registers_per_sketch, stream, mr); + + { // add this block to release `registers_thread_cache` and + // `group_lables_thread_cache` + auto registers_thread_cache = rmm::device_uvector( + num_threads_partial_kernel * num_registers_per_sketch, stream, mr); + auto group_lables_thread_cache = + rmm::device_uvector(num_threads_partial_kernel, stream, mr); + + { // add this block to release `hash_col` + // 1. compute all the hashs + auto input_table_view = cudf::table_view{{input}}; + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + + // 2. execute partial group by + int64_t num_blocks_p1 = cudf::util::div_rounding_up_safe( + num_threads_partial_kernel, block_size); + partial_group_sketches_from_hashs_kernel + <<>>( + *d_hashs, group_lables, precision, sketches_output.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); + } + // 3. merge the intermidate result + auto num_merge_threads = num_registers_per_sketch; + auto num_merge_blocks = + cudf::util::div_rounding_up_safe(num_merge_threads, block_size); + merge_sketches_vertically + <<>>( + num_threads_partial_kernel, // num_sketches + num_registers_per_sketch, sketches_output.begin(), + registers_thread_cache.begin(), group_lables_thread_cache.begin()); + } + + // 4. create output columns + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, + cudf::mask_state::ALL_VALID, stream, mr); + }); + auto children = std::vector>( + results_iter, results_iter + num_long_cols); + auto d_results = [&] { + auto host_results_pointer_iter = thrust::make_transform_iterator( + children.begin(), [](auto const &results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, + stream, mr); + }(); + auto result = cudf::make_structs_column(num_groups, std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask + stream); + + // 5. compact sketches + auto num_phase3_threads = num_groups * num_long_cols; + auto num_phase3_blocks = + cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + compact_kernel<<>>( + num_groups, num_registers_per_sketch, d_results, sketches_output); + + return result; +} + +/** + * @brief Partial groups sketches in long columns, similar to + * `partial_group_sketches_from_hashs_kernel` It split longs into segments with + * each has `num_longs_per_threads` elements e.g.: num_registers_per_sketch = + * 512. Each sketch uses 52 (512 / 10 + 1) longs. + * + * Input: + * col_0 col_1 col_51 + * sketch_0: long, long, ..., long + * sketch_1: long, long, ..., long + * sketch_2: long, long, ..., long + * + * num_threads = 52 * div_round_up(num_sketches_input, num_longs_per_threads) + * Each thread scans and merge num_longs_per_threads longs, + * and output the max register value when meets a new group. + * For the last long in a thread, outputs the result into + * `registers_thread_cache`. + * + * By split inputs into segments like `partial_group_sketches_from_hashs_kernel` + * and do partial merge, it will use less memory. Then the kernel + * merge_sketches_vertically can be used to merge the intermidate results: + * registers_output_cache, registers_thread_cache + */ +template +CUDF_KERNEL void partial_group_long_sketches_kernel( + cudf::device_span sketches_input, + int64_t const num_sketches_input, int64_t const num_threads_per_col, + int64_t const num_registers_per_sketch, int64_t const num_groups, + cudf::device_span group_lables, + // num_groups * num_registers_per_sketch integers + int *const registers_output_cache, + // num_threads * num_registers_per_sketch integers + int *const registers_thread_cache, + // num_threads integers + cudf::size_type *const group_lables_thread_cache) { + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const num_long_cols = sketches_input.size(); + if (tid >= num_threads_per_col * num_long_cols) { + return; + } + + auto const long_idx = tid / num_threads_per_col; + auto const thread_idx_in_cols = tid % num_threads_per_col; + int64_t const *const longs_ptr = sketches_input[long_idx]; + + int *const registers_thread_ptr = + registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch; + + auto const sketch_first = thread_idx_in_cols * num_longs_per_threads; + auto const sketch_end = + cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input); + + int num_regs = REGISTERS_PER_LONG; + if (long_idx == num_long_cols - 1) { + num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; + } + + for (auto i = 0; i < num_regs; i++) { + cudf::size_type prev_group = group_lables[sketch_first]; + int max_reg_v = 0; + int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; + for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; + sketch_idx++) { + cudf::size_type curr_group = group_lables[sketch_idx]; + int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); + if (curr_group == prev_group) { + // still in the same group, update the max value + if (curr_reg_v > max_reg_v) { + max_reg_v = curr_reg_v; + } + } else { + // meets new group, save output for the previous group + int64_t output_idx_prev = + num_registers_per_sketch * prev_group + reg_idx_in_sketch; + registers_output_cache[output_idx_prev] = max_reg_v; + + // reset + max_reg_v = curr_reg_v; + } + + if (sketch_idx == sketch_end - 1) { + // last item in the segment + int64_t output_idx_curr = + num_registers_per_sketch * curr_group + reg_idx_in_sketch; + if (sketch_idx == num_sketches_input - 1) { + // last segment + registers_output_cache[output_idx_curr] = max_reg_v; + max_reg_v = curr_reg_v; + } else { + if (curr_group != group_lables[sketch_idx + 1]) { + // look the first item in the next segment + registers_output_cache[output_idx_curr] = max_reg_v; + max_reg_v = curr_reg_v; + } + } + } + + prev_group = curr_group; + } + + // For each thread, output current max value + registers_thread_ptr[reg_idx_in_sketch] = max_reg_v; + } + + if (long_idx == 0) { + group_lables_thread_cache[thread_idx_in_cols] = + group_lables[sketch_end - 1]; + } +} + +/** + * @brief Merge for struct column. Each long contains 10 + * register values. Merge all rows in the same group. + */ +std::unique_ptr group_merge_hllpp( + cudf::column_view const &hll_input, // struct column + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + int64_t num_registers_per_sketch = 1 << precision; + int64_t const num_sketches = hll_input.size(); + int64_t const num_long_cols = + num_registers_per_sketch / REGISTERS_PER_LONG + 1; + constexpr int64_t num_longs_per_threads = 256; + constexpr int64_t block_size = 256; + + int64_t num_threads_per_col_phase1 = + cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); + int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols; + int64_t num_blocks = + cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); + auto registers_output_cache = rmm::device_uvector( + num_registers_per_sketch * num_groups, stream, mr); + { + auto registers_thread_cache = rmm::device_uvector( + num_registers_per_sketch * num_threads_phase1, stream, mr); + auto group_lables_thread_cache = + rmm::device_uvector(num_threads_per_col_phase1, stream, mr); + + cudf::structs_column_view scv(hll_input); + auto const input_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return scv.get_sliced_child(i, stream).begin(); + }); + auto input_cols = + std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = + cudf::detail::make_device_uvector_async(input_cols, stream, mr); + // 1st kernel: partially group + partial_group_long_sketches_kernel + <<>>( + d_inputs, num_sketches, num_threads_per_col_phase1, + num_registers_per_sketch, num_groups, group_lables, + registers_output_cache.begin(), registers_thread_cache.begin(), + group_lables_thread_cache.begin()); + auto const num_phase2_threads = num_registers_per_sketch; + auto const num_phase2_blocks = + cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); + // 2nd kernel: vertical merge + merge_sketches_vertically + <<>>( + num_threads_per_col_phase1, // num_sketches + num_registers_per_sketch, registers_output_cache.begin(), + registers_thread_cache.begin(), group_lables_thread_cache.begin()); + } + + // create output columns + auto const results_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, + cudf::mask_state::ALL_VALID, stream, mr); + }); + auto results = std::vector>( + results_iter, results_iter + num_long_cols); + auto d_sketches_output = [&] { + auto host_results_pointer_iter = thrust::make_transform_iterator( + results.begin(), [](auto const &results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, + stream, mr); + }(); + + // 3rd kernel: compact + auto num_phase3_threads = num_groups * num_long_cols; + auto num_phase3_blocks = + cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + compact_kernel<<>>( + num_groups, num_registers_per_sketch, d_sketches_output, + registers_output_cache); + + return make_structs_column(num_groups, std::move(results), 0, + rmm::device_buffer{}); +} + +/** + * Launch only 1 block, uses max 1M(2^18 *sizeof(int)) shared memory. + * For each hash, get a pair: (register index, register value). + * Use shared memory to speedup the fetch max atomic operation. + */ +template +CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, + cudf::device_span output, + int precision) { + __shared__ int32_t shared_data[block_size]; + + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const num_hashs = hashs.size(); + uint64_t const num_registers_per_sketch = 1L << precision; + int const idx_shift = 64 - precision; + uint64_t const w_padding = 1ULL << (precision - 1); + + // init tmp data + for (int i = tid; i < num_registers_per_sketch; i += block_size) { + shared_data[i] = 0; + } + __syncthreads(); + + // update max reg value for the reg index + for (int i = tid; i < num_hashs; i += block_size) { + uint64_t const hash = static_cast(hashs.element(i)); + // use unsigned int to avoid insert 1 for the highest bit when do right + // shift + uint64_t const reg_idx = hash >> idx_shift; + // get the leading zeros + int const reg_v = static_cast( + cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + cuda::atomic_ref register_ref( + shared_data[reg_idx]); + register_ref.fetch_max(reg_v, cuda::memory_order_relaxed); + } + __syncthreads(); + + // compact from register values (int array) to long array + // each long holds 10 integers, note reg value < 64 which means the bits from + // 7 to highest are all 0. + if (tid * REGISTERS_PER_LONG < num_registers_per_sketch) { + int start = tid * REGISTERS_PER_LONG; + int end = (tid + 1) * REGISTERS_PER_LONG; + if (end > num_registers_per_sketch) { + end = num_registers_per_sketch; + } + + int64_t ret = 0; + for (int i = 0; i < end - start; i++) { + int shift = i * REGISTER_VALUE_BITS; + int64_t reg = shared_data[start + i]; + ret |= (reg << shift); + } + + output[tid][0] = ret; + } +} + +std::unique_ptr reduce_hllpp(cudf::column_view const &input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + int64_t num_registers_per_sketch = 1L << precision; + // 1. compute all the hashs + auto input_table_view = cudf::table_view{{input}}; + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + + // 2. generate long columns, the size of each long column is 1 + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, 1 /**num_groups*/, + cudf::mask_state::ALL_VALID, stream, mr); + }); + auto children = std::vector>( + results_iter, results_iter + num_long_cols); + auto d_results = [&] { + auto host_results_pointer_iter = thrust::make_transform_iterator( + children.begin(), [](auto const &results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, + stream, mr); + }(); + + // 2. reduce and generate compacted long values + constexpr int64_t block_size = 256; + // max shared memory is 2^18 * 4 = 1M + auto const shared_mem_size = num_registers_per_sketch * sizeof(int32_t); + reduce_hllpp_kernel + <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results, + precision); + + // 3. create struct scalar + auto host_results_view_iter = thrust::make_transform_iterator( + children.begin(), + [](auto const &results_column) { return results_column->view(); }); + auto views = std::vector( + host_results_view_iter, host_results_view_iter + num_long_cols); + auto table_view = cudf::table_view{views}; + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, + mr); +} + +CUDF_KERNEL void reduce_merge_hll_kernel_vertically( + cudf::device_span sketch_longs, + cudf::size_type num_sketches, int num_registers_per_sketch, + int *const output) { + auto const tid = cudf::detail::grid_1d::global_thread_id(); + if (tid >= num_registers_per_sketch) { + return; + } + auto long_idx = tid / REGISTERS_PER_LONG; + auto reg_idx_in_long = tid % REGISTERS_PER_LONG; + int max = 0; + for (auto row_idx = 0; row_idx < num_sketches; row_idx++) { + int reg_v = + get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long); + if (reg_v > max) { + max = reg_v; + } + } + output[tid] = max; +} + +std::unique_ptr +reduce_merge_hllpp(cudf::column_view const &input, int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + // create device input + int64_t num_registers_per_sketch = 1 << precision; + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + cudf::structs_column_view scv(input); + auto const input_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return scv.get_sliced_child(i, stream).begin(); + }); + auto input_cols = + std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = + cudf::detail::make_device_uvector_async(input_cols, stream, mr); + + // create one row output + auto const results_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, 1 /** num_rows */, + cudf::mask_state::ALL_VALID, stream, mr); + }); + auto children = std::vector>( + results_iter, results_iter + num_long_cols); + auto d_results = [&] { + auto host_results_pointer_iter = thrust::make_transform_iterator( + children.begin(), [](auto const &results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = std::vector( + host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, + stream, mr); + }(); + + // execute merge kernel + auto num_threads = num_registers_per_sketch; + constexpr int64_t block_size = 256; + auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); + auto output_cache = + rmm::device_uvector(num_registers_per_sketch, stream, mr); + reduce_merge_hll_kernel_vertically<<>>( + d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); + + // compact to longs + auto const num_compact_threads = num_long_cols; + auto const num_compact_blocks = + cudf::util::div_rounding_up_safe(num_compact_threads, block_size); + compact_kernel<<>>( + 1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache); + + // create scalar + auto host_results_view_iter = thrust::make_transform_iterator( + children.begin(), + [](auto const &results_column) { return results_column->view(); }); + auto views = std::vector( + host_results_view_iter, host_results_view_iter + num_long_cols); + auto table_view = cudf::table_view{views}; + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, + mr); +} + +struct estimate_fn { + cudf::device_span sketch_longs; + int const precision; + int64_t *const out; + + __device__ void operator()(cudf::size_type const idx) const { + auto const num_regs = 1ull << precision; + double sum = 0; + int zeroes = 0; + + for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { + // each long contains 10 register values + int long_col_idx = reg_idx / REGISTERS_PER_LONG; + int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; + int reg = + get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); + sum += double{1} / static_cast(1ull << reg); + zeroes += reg == 0; + } + + auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); + out[idx] = finalize(sum, zeroes); + } +}; + +} // end anonymous namespace + +std::unique_ptr group_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, + mr); +} + +std::unique_ptr group_merge_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS( + input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + for (auto i = 0; i < input.num_children(); i++) { + CUDF_EXPECTS( + input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + } + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; + CUDF_EXPECTS(input.num_children() == expected_num_longs, + "The num of long columns in input is incorrect."); + return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, + stream, mr); +} + +std::unique_ptr reduce_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const precision, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + return reduce_hllpp(input, adjust_precision, stream, mr); +} + +std::unique_ptr reduce_merge_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const precision, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS( + input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + for (auto i = 0; i < input.num_children(); i++) { + CUDF_EXPECTS( + input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + } + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; + CUDF_EXPECTS(input.num_children() == expected_num_longs, + "The num of long columns in input is incorrect."); + return reduce_merge_hllpp(input, adjust_precision, stream, mr); +} + +std::unique_ptr +estimate_from_hll_sketches(cudf::column_view const &input, int precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(precision >= 4, + "HyperLogLogPlusPlus requires precision is bigger than 4."); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return input.child(i).begin(); }); + auto input_cols = std::vector( + input_iter, input_iter + input.num_children()); + auto d_inputs = + cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto result = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + input.size(), + cudf::mask_state::ALL_VALID, stream); + // evaluate from struct + thrust::for_each_n( + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), + input.size(), + estimate_fn{d_inputs, precision, result->mutable_view().data()}); + return result; +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hllpp.hpp new file mode 100644 index 0000000000..4dda342a4f --- /dev/null +++ b/src/main/cpp/src/hllpp.hpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +namespace spark_rapids_jni { + +/** + * The number of bits that is required for a HLLPP register value. + * + * This number is determined by the maximum number of leading binary zeros a + * hashcode can produce. This is equal to the number of bits the hashcode + * returns. The current implementation uses a 64-bit hashcode, this means 6-bits + * are (at most) needed to store the number of leading zeros. + */ +constexpr int REGISTER_VALUE_BITS = 6; + +// MASK binary 6 bits: 111-111 +constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; + +// This value is 10, one long stores 10 register values +constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; + +// XXHash seed, consistent with Spark +constexpr int64_t SEED = 42L; + +// max precision, if require a precision bigger than 18, then use 18. +constexpr int MAX_PRECISION = 18; + +/** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge the sketches in the same group. Output is + * a struct column with multiple long columns which is consistent with Spark. + */ +std::unique_ptr group_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. + * Input is a struct column with multiple long columns which is consistent with + * Spark. + */ +std::unique_ptr group_merge_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge all the sketches into one sketch, output + * is a struct scalar with multiple long values. + */ +std::unique_ptr reduce_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const precision, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +/** + * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one + * sketch. Input is a struct column with multiple long columns which is + * consistent with Spark. Output is a struct scalar with multiple long values. + */ +std::unique_ptr reduce_merge_hyper_log_log_plus_plus( + cudf::column_view const &input, int64_t const precision, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +/** + * Estimate count distinct values for the input which contains + * Input is a struct column with multiple long columns which is consistent with + * Spark. Output is a long column with all values are not null. Spark returns 0 + * for null values when doing APPROX_COUNT_DISTINCT. + */ +std::unique_ptr estimate_from_hll_sketches( + cudf::column_view const &input, int precision, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hllpp_host_udf.cu new file mode 100644 index 0000000000..c9ad271876 --- /dev/null +++ b/src/main/cpp/src/hllpp_host_udf.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hllpp.hpp" +#include "hllpp_host_udf.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace spark_rapids_jni { + +namespace { + +template struct hllpp_udf : cudf::host_udf_base { + static_assert(std::is_same_v || + std::is_same_v); + + hllpp_udf(int precision_, bool is_merge_) + : precision(precision_), is_merge(is_merge_) {} + + [[nodiscard]] input_data_attributes get_required_data() const override { + if constexpr (std::is_same_v) { + return {reduction_data_attribute::INPUT_VALUES}; + } else { + return {groupby_data_attribute::GROUPED_VALUES, + groupby_data_attribute::GROUP_OFFSETS, + groupby_data_attribute::GROUP_LABELS}; + } + } + + [[nodiscard]] output_type + operator()(host_udf_input const &udf_input, rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override { + if constexpr (std::is_same_v) { + // reduce + auto const &input_values = std::get( + udf_input.at(reduction_data_attribute::INPUT_VALUES)); + if (input_values.size() == 0) { + return get_empty_output(std::nullopt, stream, mr); + } + if (is_merge) { + // reduce intermidate result, input_values are struct of long columns + return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus( + input_values, precision, stream, mr); + } else { + return spark_rapids_jni::reduce_hyper_log_log_plus_plus( + input_values, precision, stream, mr); + } + } else { + // groupby + auto const &group_values = std::get( + udf_input.at(groupby_data_attribute::GROUPED_VALUES)); + if (group_values.size() == 0) { + return get_empty_output(std::nullopt, stream, mr); + } + auto const group_offsets = + std::get>( + udf_input.at(groupby_data_attribute::GROUP_OFFSETS)); + int num_groups = group_offsets.size() - 1; + auto const group_lables = + std::get>( + udf_input.at(groupby_data_attribute::GROUP_LABELS)); + if (is_merge) { + // group by intermidate result, group_values are struct of long columns + return spark_rapids_jni::group_merge_hyper_log_log_plus_plus( + group_values, num_groups, group_lables, precision, stream, mr); + } else { + return spark_rapids_jni::group_hyper_log_log_plus_plus( + group_values, num_groups, group_lables, precision, stream, mr); + } + } + } + + /** + * @brief create an empty struct scalar + */ + [[nodiscard]] output_type + get_empty_output([[maybe_unused]] std::optional output_dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override { + int num_registers = 1 << precision; + int num_long_cols = num_registers / REGISTERS_PER_LONG + 1; + auto const results_iter = + cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); + }); + auto children = std::vector>( + results_iter, results_iter + num_long_cols); + + if constexpr (std::is_same_v) { + // reduce + auto host_results_view_iter = thrust::make_transform_iterator( + children.begin(), + [](auto const &results_column) { return results_column->view(); }); + auto views = std::vector( + host_results_view_iter, host_results_view_iter + num_long_cols); + auto table_view = cudf::table_view{views}; + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, + stream, mr); + } else { + // groupby + return cudf::make_structs_column(0, std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask + stream); + } + } + + [[nodiscard]] bool is_equal(host_udf_base const &other) const override { + auto o = dynamic_cast(&other); + return o != nullptr && o->precision == this->precision; + } + + [[nodiscard]] std::size_t do_hash() const override { + return 31 * (31 * std::hash{}({"hllpp_udf"}) + precision) + + is_merge; + } + + [[nodiscard]] std::unique_ptr clone() const override { + return std::make_unique(precision, is_merge); + } + + int precision; + bool is_merge; +}; + +} // namespace + +std::unique_ptr +create_hllpp_reduction_host_udf(int precision) { + return std::make_unique>( + precision, /*is_merge*/ false); +} + +std::unique_ptr +create_hllpp_reduction_merge_host_udf(int precision) { + return std::make_unique>( + precision, /*is_merge*/ true); +} + +std::unique_ptr +create_hllpp_groupby_host_udf(int precision) { + return std::make_unique>( + precision, /*is_merge*/ false); +} + +std::unique_ptr +create_hllpp_groupby_merge_host_udf(int precision) { + return std::make_unique>( + precision, /*is_merge*/ true); +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hllpp_host_udf.hpp new file mode 100644 index 0000000000..fc4bb8b21b --- /dev/null +++ b/src/main/cpp/src/hllpp_host_udf.hpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace spark_rapids_jni { + +std::unique_ptr +create_hllpp_reduction_host_udf(int precision); + +std::unique_ptr +create_hllpp_reduction_merge_host_udf(int precision); + +std::unique_ptr +create_hllpp_groupby_host_udf(int precision); + +std::unique_ptr +create_hllpp_groupby_merge_host_udf(int precision); + +} // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java new file mode 100644 index 0000000000..9018474c27 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.ColumnView; +import ai.rapids.cudf.NativeDepsLoader; + +/** + * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils + */ +public class HLLPPHostUDF { + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * HyperLogLogPlusPlus(HLLPP) aggregation types + */ + public enum AggregationType { + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge all the sketches into one sketch, output + * is a struct scalar with multiple long values. + */ + Reduction(0), + + /** + * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one + * sketch. Input is a struct column with multiple long columns which is + * consistent with Spark. Output is a struct scalar with multiple long values. + */ + Reduction_MERGE(1), + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge the sketches in the same group. Output is + * a struct column with multiple long columns which is consistent with Spark. + */ + GroupBy(2), + + /** + * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. + * Input is a struct column with multiple long columns which is consistent with + * Spark. + */ + GroupByMerge(3); + + final int nativeId; + + AggregationType(int nativeId) { + this.nativeId = nativeId; + } + } + + /** + * Create a HyperLogLogPlusPlus(HLLPP) host UDF + */ + public static long createHLLPPHostUDF(AggregationType type, int precision) { + return createHLLPPHostUDF(type.nativeId, precision); + } + + /** + * Compute the approximate count distinct value from sketch values. + * + * The input is sketch values, must be given in the format: + * `Struct`, + * The value of num_registers_per_sketch = 2^precision + * The children num of this Struct is: num_registers_per_sketch / 10 + 1, + * Here 10 means a INT64 contains 10 register values, + * each register value is 6 bits. + * Register value is the number of leading zero bits in xxhash64 hash code. + * xxhash64 hash code is 64 bits, Register value is 6 bits, + * 6 bits is enough to hold the max value 64. + * + * @param input The sketch column which constains Struct + * values. + * @param precision The num of bits for HLLPP register addressing. + * @return A INT64 column with each value indicates the approximate count + * distinct value. + */ + public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { + return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); + } + + private static native long createHLLPPHostUDF(int type, int precision); + + private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); + +} From f8c6a02eecc3357fd6e3a784218ee38e1761b86a Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 17 Dec 2024 21:04:54 +0800 Subject: [PATCH 07/28] Use UDF --- src/main/cpp/src/HLLPP.cu | 102 --- src/main/cpp/src/HLLPP.hpp | 32 - src/main/cpp/src/HLLPPHostUDFJni.cpp | 37 +- src/main/cpp/src/HLLPPJni.cpp | 34 - src/main/cpp/src/hllpp.cu | 723 +++++++++--------- src/main/cpp/src/hllpp.hpp | 42 +- src/main/cpp/src/hllpp_host_udf.cu | 140 ++-- src/main/cpp/src/hllpp_host_udf.hpp | 14 +- .../com/nvidia/spark/rapids/jni/HLLPP.java | 48 -- .../nvidia/spark/rapids/jni/HLLPPTest.java | 37 - 10 files changed, 453 insertions(+), 756 deletions(-) delete mode 100644 src/main/cpp/src/HLLPP.cu delete mode 100644 src/main/cpp/src/HLLPP.hpp delete mode 100644 src/main/cpp/src/HLLPPJni.cpp delete mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java delete mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu deleted file mode 100644 index d2d9493cf7..0000000000 --- a/src/main/cpp/src/HLLPP.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "HLLPP.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace spark_rapids_jni { - -namespace { - -// The number of bits required by register value. Register value stores num of zeros. -// XXHash64 value is 64 bits, it's safe to use 6 bits to store a register value. -constexpr int REGISTER_VALUE_BITS = 6; - -// MASK binary 6 bits: 111111 -constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; - -// One long stores 10 register values -constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; - -__device__ inline int get_register_value(int64_t const long_10_registers, int reg_idx) -{ - int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); - int64_t v = (long_10_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); - return static_cast(v); -} - -struct estimate_fn { - cudf::device_span sketch_longs; - int const precision; - int64_t* const out; - - __device__ void operator()(cudf::size_type const idx) const - { - auto const num_regs = 1ull << precision; - double sum = 0; - int zeroes = 0; - - for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { - // each long contains 10 register values - int long_col_idx = reg_idx / REGISTERS_PER_LONG; - int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; - int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); - sum += double{1} / static_cast(1ull << reg); - zeroes += reg == 0; - } - - auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); - out[idx] = finalize(sum, zeroes); - } -}; - -} // end anonymous namespace - -std::unique_ptr estimate_from_hll_sketches(cudf::column_view const& input, - int precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4."); - auto const input_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](int i) { return input.child(i).begin(); }); - auto input_cols = std::vector(input_iter, input_iter + input.num_children()); - auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); - auto result = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); - // evaluate from struct - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - input.size(), - estimate_fn{d_inputs, precision, result->mutable_view().data()}); - return result; -} - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPP.hpp b/src/main/cpp/src/HLLPP.hpp deleted file mode 100644 index 69e0b237e5..0000000000 --- a/src/main/cpp/src/HLLPP.hpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -namespace spark_rapids_jni { - -std::unique_ptr estimate_from_hll_sketches( - cudf::column_view const& input, - int precision, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HLLPPHostUDFJni.cpp index 3132d088ac..a80a78c6b8 100644 --- a/src/main/cpp/src/HLLPPHostUDFJni.cpp +++ b/src/main/cpp/src/HLLPPHostUDFJni.cpp @@ -20,28 +20,22 @@ extern "C" { -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( - JNIEnv *env, jclass, jint agg_type, int precision) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( + JNIEnv* env, jclass, jint agg_type, int precision) +{ try { cudf::jni::auto_set_device(env); auto udf_ptr = [&] { // The value of agg_type must be sync with // `HLLPPHostUDF.java#AggregationType`. switch (agg_type) { - case 0: - return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); - case 1: - return spark_rapids_jni::create_hllpp_reduction_merge_host_udf( - precision); - case 2: - return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); - default: - return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); + case 0: return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); + case 1: return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision); + case 2: return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); + default: return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); } }(); - CUDF_EXPECTS(udf_ptr != nullptr, - "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); return reinterpret_cast(udf_ptr.release()); } @@ -49,18 +43,19 @@ Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches( - JNIEnv *env, jclass, jlong sketches, jint precision) { +Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches(JNIEnv* env, + jclass, + jlong sketches, + jint precision) +{ JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); try { cudf::jni::auto_set_device(env); - auto const sketch_view = - reinterpret_cast(sketches); + auto const sketch_view = reinterpret_cast(sketches); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision) - .release()); + spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release()); } CATCH_STD(env, 0); } -} // extern "C" +} // extern "C" diff --git a/src/main/cpp/src/HLLPPJni.cpp b/src/main/cpp/src/HLLPPJni.cpp deleted file mode 100644 index 581af90a90..0000000000 --- a/src/main/cpp/src/HLLPPJni.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "HLLPP.hpp" -#include "cudf_jni_apis.hpp" - -extern "C" { - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPP_estimateDistinctValueFromSketches( - JNIEnv* env, jclass, jlong sketches, jint precision) -{ - JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); - try { - cudf::jni::auto_set_device(env); - auto const sketch_view = reinterpret_cast(sketches); - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release()); - } - CATCH_STD(env, 0); -} -} diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hllpp.cu index 08f452ad76..8d39c66865 100644 --- a/src/main/cpp/src/hllpp.cu +++ b/src/main/cpp/src/hllpp.cu @@ -37,7 +37,7 @@ #include #include -#include // TODO #include once available +#include // TODO #include once available #include #include #include @@ -53,10 +53,10 @@ namespace { * @brief Get register value from a long which contains 10 register values, * each register value in long is 6 bits. */ -__device__ inline int get_register_value(int64_t const ten_registers, - int reg_idx) { +__device__ inline int get_register_value(int64_t const ten_registers, int reg_idx) +{ int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); - int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); + int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); return static_cast(v); } @@ -148,21 +148,17 @@ __device__ inline int get_register_value(int64_t const ten_registers, */ template CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( - cudf::column_device_view hashs, - cudf::device_span group_lables, - int64_t const precision, // num of bits for register addressing, e.g.: 9 - int *const - registers_output_cache, // num is num_groups * num_registers_per_sketch - int *const - registers_thread_cache, // num is num_threads * num_registers_per_sketch - cudf::size_type *const - group_lables_thread_cache // save the group lables for each thread -) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); + cudf::column_device_view hashs, + cudf::device_span group_lables, + int64_t const precision, // num of bits for register addressing, e.g.: 9 + int* const registers_output_cache, // num is num_groups * num_registers_per_sketch + int* const registers_thread_cache, // num is num_threads * num_registers_per_sketch + cudf::size_type* const group_lables_thread_cache // save the group lables for each thread +) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); int64_t const num_hashs = hashs.size(); - if (tid * num_hashs_per_thread >= hashs.size()) { - return; - } + if (tid * num_hashs_per_thread >= hashs.size()) { return; } // 2^precision = num_registers_per_sketch int64_t num_registers_per_sketch = 1L << precision; @@ -172,12 +168,10 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( int const idx_shift = 64 - precision; auto const hash_first = tid * num_hashs_per_thread; - auto const hash_end = - cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs); + auto const hash_end = cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs); // init sketches for each thread - int *const sketch_ptr = - registers_thread_cache + tid * num_registers_per_sketch; + int* const sketch_ptr = registers_thread_cache + tid * num_registers_per_sketch; for (auto i = 0; i < num_registers_per_sketch; i++) { sketch_ptr[i] = 0; } @@ -187,23 +181,19 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( cudf::size_type curr_group = group_lables[hash_idx]; // cast to unsigned, then >> will shift without preserve the sign bit. - uint64_t const hash = - static_cast(hashs.element(hash_idx)); - auto const reg_idx = hash >> idx_shift; - int const reg_v = static_cast( - cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + uint64_t const hash = static_cast(hashs.element(hash_idx)); + auto const reg_idx = hash >> idx_shift; + int const reg_v = + static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); if (curr_group == prev_group) { // still in the same group, update the max value - if (reg_v > sketch_ptr[reg_idx]) { - sketch_ptr[reg_idx] = reg_v; - } + if (reg_v > sketch_ptr[reg_idx]) { sketch_ptr[reg_idx] = reg_v; } } else { // meets new group, save output for the previous group and reset for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[prev_group * num_registers_per_sketch + i] = - sketch_ptr[i]; - sketch_ptr[i] = 0; + registers_output_cache[prev_group * num_registers_per_sketch + i] = sketch_ptr[i]; + sketch_ptr[i] = 0; } // save the result for current group sketch_ptr[reg_idx] = reg_v; @@ -214,16 +204,14 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( if (hash_idx == num_hashs - 1) { // meets the last segment, special logic: assume meets new group for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[curr_group * num_registers_per_sketch + i] = - sketch_ptr[i]; + registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i]; } } else { // not the last segment, probe one item forward. if (curr_group != group_lables[hash_idx + 1]) { // meets a new group by checking the next item in the next segment for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[curr_group * num_registers_per_sketch + i] = - sketch_ptr[i]; + registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i]; } } } @@ -280,29 +268,29 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( * max value in the same group, and then update to registers_output_cache */ template -CUDF_KERNEL void merge_sketches_vertically( - int64_t num_sketches, int64_t num_registers_per_sketch, - int *const registers_output_cache, int const *const registers_thread_cache, - cudf::size_type const *const group_lables_thread_cache) { +CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches, + int64_t num_registers_per_sketch, + int* const registers_output_cache, + int const* const registers_thread_cache, + cudf::size_type const* const group_lables_thread_cache) +{ __shared__ int8_t shared_data[block_size]; auto const tid = cudf::detail::grid_1d::global_thread_id(); int shared_idx = tid % block_size; // register idx is tid shared_data[shared_idx] = static_cast(0); - int prev_group = group_lables_thread_cache[0]; + int prev_group = group_lables_thread_cache[0]; for (auto i = 0; i < num_sketches; i++) { int curr_group = group_lables_thread_cache[i]; - int8_t curr_reg_v = static_cast( - registers_thread_cache[i * num_registers_per_sketch + tid]); + int8_t curr_reg_v = + static_cast(registers_thread_cache[i * num_registers_per_sketch + tid]); if (curr_group == prev_group) { - if (curr_reg_v > shared_data[shared_idx]) { - shared_data[shared_idx] = curr_reg_v; - } + if (curr_reg_v > shared_data[shared_idx]) { shared_data[shared_idx] = curr_reg_v; } } else { // meets a new group, store the result for previous group int64_t result_reg_idx = prev_group * num_registers_per_sketch + tid; - int result_curr_reg_v = registers_output_cache[result_reg_idx]; + int result_curr_reg_v = registers_output_cache[result_reg_idx]; if (shared_data[shared_idx] > result_curr_reg_v) { registers_output_cache[result_reg_idx] = shared_data[shared_idx]; } @@ -314,7 +302,7 @@ CUDF_KERNEL void merge_sketches_vertically( // handles the last register in this thread int64_t reg_idx = prev_group * num_registers_per_sketch + tid; - int curr_reg_v = registers_output_cache[reg_idx]; + int curr_reg_v = registers_output_cache[reg_idx]; if (shared_data[shared_idx] > curr_reg_v) { registers_output_cache[reg_idx] = shared_data[shared_idx]; } @@ -346,116 +334,114 @@ CUDF_KERNEL void merge_sketches_vertically( * 6 bits: 100-001 Compact to one long is: * 100001-100001-100001-100001-100001-100001-100001-100001-100001-100001 */ -CUDF_KERNEL void -compact_kernel(int64_t const num_groups, int64_t const num_registers_per_sketch, - cudf::device_span sketches_output, - // num_groups * num_registers_per_sketch integers - cudf::device_span registers_output_cache) { - int64_t const tid = cudf::detail::grid_1d::global_thread_id(); - int64_t const num_long_cols = - num_registers_per_sketch / REGISTERS_PER_LONG + 1; - if (tid >= num_groups * num_long_cols) { - return; - } +CUDF_KERNEL void compact_kernel(int64_t const num_groups, + int64_t const num_registers_per_sketch, + cudf::device_span sketches_output, + // num_groups * num_registers_per_sketch integers + cudf::device_span registers_output_cache) +{ + int64_t const tid = cudf::detail::grid_1d::global_thread_id(); + int64_t const num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + if (tid >= num_groups * num_long_cols) { return; } int64_t const group_idx = tid / num_long_cols; - int64_t const long_idx = tid % num_long_cols; + int64_t const long_idx = tid % num_long_cols; int64_t const reg_begin_idx = - group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG; + group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG; int64_t num_regs = REGISTERS_PER_LONG; - if (long_idx == num_long_cols - 1) { - num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; - } + if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; } int64_t ten_registers = 0; for (auto i = 0; i < num_regs; i++) { int64_t reg_v = registers_output_cache[reg_begin_idx + i]; - int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i); + int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i); ten_registers |= tmp; } sketches_output[long_idx][group_idx] = ten_registers; } -std::unique_ptr -group_hllpp(cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { - int64_t num_registers_per_sketch = 1 << precision; - constexpr int64_t block_size = 256; - constexpr int num_hashs_per_thread = 256; // handles 256 items per thread +std::unique_ptr group_hllpp(cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + int64_t num_registers_per_sketch = 1 << precision; + constexpr int64_t block_size = 256; + constexpr int num_hashs_per_thread = 256; // handles 256 items per thread int64_t num_threads_partial_kernel = - cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); + cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); - auto sketches_output = rmm::device_uvector( - num_groups * num_registers_per_sketch, stream, mr); + auto sketches_output = + rmm::device_uvector(num_groups * num_registers_per_sketch, stream, mr); - { // add this block to release `registers_thread_cache` and + { // add this block to release `registers_thread_cache` and // `group_lables_thread_cache` auto registers_thread_cache = rmm::device_uvector( - num_threads_partial_kernel * num_registers_per_sketch, stream, mr); + num_threads_partial_kernel * num_registers_per_sketch, stream, mr); auto group_lables_thread_cache = - rmm::device_uvector(num_threads_partial_kernel, stream, mr); + rmm::device_uvector(num_threads_partial_kernel, stream, mr); - { // add this block to release `hash_col` + { // add this block to release `hash_col` // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; - auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. execute partial group by - int64_t num_blocks_p1 = cudf::util::div_rounding_up_safe( - num_threads_partial_kernel, block_size); + int64_t num_blocks_p1 = + cudf::util::div_rounding_up_safe(num_threads_partial_kernel, block_size); partial_group_sketches_from_hashs_kernel - <<>>( - *d_hashs, group_lables, precision, sketches_output.begin(), - registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + <<>>(*d_hashs, + group_lables, + precision, + sketches_output.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // 3. merge the intermidate result auto num_merge_threads = num_registers_per_sketch; - auto num_merge_blocks = - cudf::util::div_rounding_up_safe(num_merge_threads, block_size); + auto num_merge_blocks = cudf::util::div_rounding_up_safe(num_merge_threads, block_size); merge_sketches_vertically - <<>>( - num_threads_partial_kernel, // num_sketches - num_registers_per_sketch, sketches_output.begin(), - registers_thread_cache.begin(), group_lables_thread_cache.begin()); + <<>>( + num_threads_partial_kernel, // num_sketches + num_registers_per_sketch, + sketches_output.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // 4. create output columns - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); - auto result = cudf::make_structs_column(num_groups, std::move(children), - 0, // null count - rmm::device_buffer{}, // null mask + auto result = cudf::make_structs_column(num_groups, + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask stream); // 5. compact sketches auto num_phase3_threads = num_groups * num_long_cols; - auto num_phase3_blocks = - cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + auto num_phase3_blocks = cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); compact_kernel<<>>( - num_groups, num_registers_per_sketch, d_results, sketches_output); + num_groups, num_registers_per_sketch, d_results, sketches_output); return result; } @@ -485,55 +471,49 @@ group_hllpp(cudf::column_view const &input, int64_t const num_groups, */ template CUDF_KERNEL void partial_group_long_sketches_kernel( - cudf::device_span sketches_input, - int64_t const num_sketches_input, int64_t const num_threads_per_col, - int64_t const num_registers_per_sketch, int64_t const num_groups, - cudf::device_span group_lables, - // num_groups * num_registers_per_sketch integers - int *const registers_output_cache, - // num_threads * num_registers_per_sketch integers - int *const registers_thread_cache, - // num_threads integers - cudf::size_type *const group_lables_thread_cache) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); + cudf::device_span sketches_input, + int64_t const num_sketches_input, + int64_t const num_threads_per_col, + int64_t const num_registers_per_sketch, + int64_t const num_groups, + cudf::device_span group_lables, + // num_groups * num_registers_per_sketch integers + int* const registers_output_cache, + // num_threads * num_registers_per_sketch integers + int* const registers_thread_cache, + // num_threads integers + cudf::size_type* const group_lables_thread_cache) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const num_long_cols = sketches_input.size(); - if (tid >= num_threads_per_col * num_long_cols) { - return; - } + if (tid >= num_threads_per_col * num_long_cols) { return; } - auto const long_idx = tid / num_threads_per_col; - auto const thread_idx_in_cols = tid % num_threads_per_col; - int64_t const *const longs_ptr = sketches_input[long_idx]; + auto const long_idx = tid / num_threads_per_col; + auto const thread_idx_in_cols = tid % num_threads_per_col; + int64_t const* const longs_ptr = sketches_input[long_idx]; - int *const registers_thread_ptr = - registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch; + int* const registers_thread_ptr = + registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch; auto const sketch_first = thread_idx_in_cols * num_longs_per_threads; - auto const sketch_end = - cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input); + auto const sketch_end = cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input); int num_regs = REGISTERS_PER_LONG; - if (long_idx == num_long_cols - 1) { - num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; - } + if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; } for (auto i = 0; i < num_regs; i++) { cudf::size_type prev_group = group_lables[sketch_first]; - int max_reg_v = 0; - int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; - for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; - sketch_idx++) { + int max_reg_v = 0; + int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; + for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; sketch_idx++) { cudf::size_type curr_group = group_lables[sketch_idx]; - int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); + int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); if (curr_group == prev_group) { // still in the same group, update the max value - if (curr_reg_v > max_reg_v) { - max_reg_v = curr_reg_v; - } + if (curr_reg_v > max_reg_v) { max_reg_v = curr_reg_v; } } else { // meets new group, save output for the previous group - int64_t output_idx_prev = - num_registers_per_sketch * prev_group + reg_idx_in_sketch; + int64_t output_idx_prev = num_registers_per_sketch * prev_group + reg_idx_in_sketch; registers_output_cache[output_idx_prev] = max_reg_v; // reset @@ -542,17 +522,16 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( if (sketch_idx == sketch_end - 1) { // last item in the segment - int64_t output_idx_curr = - num_registers_per_sketch * curr_group + reg_idx_in_sketch; + int64_t output_idx_curr = num_registers_per_sketch * curr_group + reg_idx_in_sketch; if (sketch_idx == num_sketches_input - 1) { // last segment registers_output_cache[output_idx_curr] = max_reg_v; - max_reg_v = curr_reg_v; + max_reg_v = curr_reg_v; } else { if (curr_group != group_lables[sketch_idx + 1]) { // look the first item in the next segment registers_output_cache[output_idx_curr] = max_reg_v; - max_reg_v = curr_reg_v; + max_reg_v = curr_reg_v; } } } @@ -565,8 +544,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( } if (long_idx == 0) { - group_lables_thread_cache[thread_idx_in_cols] = - group_lables[sketch_end - 1]; + group_lables_thread_cache[thread_idx_in_cols] = group_lables[sketch_end - 1]; } } @@ -575,88 +553,83 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( * register values. Merge all rows in the same group. */ std::unique_ptr group_merge_hllpp( - cudf::column_view const &hll_input, // struct column - int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { - int64_t num_registers_per_sketch = 1 << precision; - int64_t const num_sketches = hll_input.size(); - int64_t const num_long_cols = - num_registers_per_sketch / REGISTERS_PER_LONG + 1; + cudf::column_view const& hll_input, // struct column + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + int64_t num_registers_per_sketch = 1 << precision; + int64_t const num_sketches = hll_input.size(); + int64_t const num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; constexpr int64_t num_longs_per_threads = 256; - constexpr int64_t block_size = 256; + constexpr int64_t block_size = 256; int64_t num_threads_per_col_phase1 = - cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); + cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols; - int64_t num_blocks = - cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); - auto registers_output_cache = rmm::device_uvector( - num_registers_per_sketch * num_groups, stream, mr); + int64_t num_blocks = cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); + auto registers_output_cache = + rmm::device_uvector(num_registers_per_sketch * num_groups, stream, mr); { - auto registers_thread_cache = rmm::device_uvector( - num_registers_per_sketch * num_threads_phase1, stream, mr); + auto registers_thread_cache = + rmm::device_uvector(num_registers_per_sketch * num_threads_phase1, stream, mr); auto group_lables_thread_cache = - rmm::device_uvector(num_threads_per_col_phase1, stream, mr); + rmm::device_uvector(num_threads_per_col_phase1, stream, mr); cudf::structs_column_view scv(hll_input); - auto const input_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return scv.get_sliced_child(i, stream).begin(); - }); - auto input_cols = - std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = - cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); // 1st kernel: partially group partial_group_long_sketches_kernel - <<>>( - d_inputs, num_sketches, num_threads_per_col_phase1, - num_registers_per_sketch, num_groups, group_lables, - registers_output_cache.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + <<>>(d_inputs, + num_sketches, + num_threads_per_col_phase1, + num_registers_per_sketch, + num_groups, + group_lables, + registers_output_cache.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); auto const num_phase2_threads = num_registers_per_sketch; - auto const num_phase2_blocks = - cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); + auto const num_phase2_blocks = cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); // 2nd kernel: vertical merge merge_sketches_vertically - <<>>( - num_threads_per_col_phase1, // num_sketches - num_registers_per_sketch, registers_output_cache.begin(), - registers_thread_cache.begin(), group_lables_thread_cache.begin()); + <<>>( + num_threads_per_col_phase1, // num_sketches + num_registers_per_sketch, + registers_output_cache.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // create output columns - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto results = std::vector>( - results_iter, results_iter + num_long_cols); + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + }); + auto results = + std::vector>(results_iter, results_iter + num_long_cols); auto d_sketches_output = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - results.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + results.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // 3rd kernel: compact auto num_phase3_threads = num_groups * num_long_cols; - auto num_phase3_blocks = - cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + auto num_phase3_blocks = cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); compact_kernel<<>>( - num_groups, num_registers_per_sketch, d_sketches_output, - registers_output_cache); + num_groups, num_registers_per_sketch, d_sketches_output, registers_output_cache); - return make_structs_column(num_groups, std::move(results), 0, - rmm::device_buffer{}); + return make_structs_column(num_groups, std::move(results), 0, rmm::device_buffer{}); } /** @@ -666,15 +639,16 @@ std::unique_ptr group_merge_hllpp( */ template CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, - cudf::device_span output, - int precision) { + cudf::device_span output, + int precision) +{ __shared__ int32_t shared_data[block_size]; - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const num_hashs = hashs.size(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const num_hashs = hashs.size(); uint64_t const num_registers_per_sketch = 1L << precision; - int const idx_shift = 64 - precision; - uint64_t const w_padding = 1ULL << (precision - 1); + int const idx_shift = 64 - precision; + uint64_t const w_padding = 1ULL << (precision - 1); // init tmp data for (int i = tid; i < num_registers_per_sketch; i += block_size) { @@ -689,10 +663,9 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, // shift uint64_t const reg_idx = hash >> idx_shift; // get the leading zeros - int const reg_v = static_cast( - cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); - cuda::atomic_ref register_ref( - shared_data[reg_idx]); + int const reg_v = + static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + cuda::atomic_ref register_ref(shared_data[reg_idx]); register_ref.fetch_max(reg_v, cuda::memory_order_relaxed); } __syncthreads(); @@ -702,14 +675,12 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, // 7 to highest are all 0. if (tid * REGISTERS_PER_LONG < num_registers_per_sketch) { int start = tid * REGISTERS_PER_LONG; - int end = (tid + 1) * REGISTERS_PER_LONG; - if (end > num_registers_per_sketch) { - end = num_registers_per_sketch; - } + int end = (tid + 1) * REGISTERS_PER_LONG; + if (end > num_registers_per_sketch) { end = num_registers_per_sketch; } int64_t ret = 0; for (int i = 0; i < end - start; i++) { - int shift = i * REGISTER_VALUE_BITS; + int shift = i * REGISTER_VALUE_BITS; int64_t reg = shared_data[start + i]; ret |= (reg << shift); } @@ -718,35 +689,36 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, } } -std::unique_ptr reduce_hllpp(cudf::column_view const &input, +std::unique_ptr reduce_hllpp(cudf::column_view const& input, int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + rmm::device_async_resource_ref mr) +{ int64_t num_registers_per_sketch = 1L << precision; // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; - auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. generate long columns, the size of each long column is 1 - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, 1 /**num_groups*/, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + 1 /**num_groups*/, + cudf::mask_state::ALL_VALID, + stream, + mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // 2. reduce and generate compacted long values @@ -754,216 +726,203 @@ std::unique_ptr reduce_hllpp(cudf::column_view const &input, // max shared memory is 2^18 * 4 = 1M auto const shared_mem_size = num_registers_per_sketch * sizeof(int32_t); reduce_hllpp_kernel - <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results, - precision); + <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results, precision); // 3. create struct scalar auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), - [](auto const &results_column) { return results_column->view(); }); - auto views = std::vector( - host_results_view_iter, host_results_view_iter + num_long_cols); + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = + std::vector(host_results_view_iter, host_results_view_iter + num_long_cols); auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, stream, - mr); + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); } -CUDF_KERNEL void reduce_merge_hll_kernel_vertically( - cudf::device_span sketch_longs, - cudf::size_type num_sketches, int num_registers_per_sketch, - int *const output) { +CUDF_KERNEL void reduce_merge_hll_kernel_vertically(cudf::device_span sketch_longs, + cudf::size_type num_sketches, + int num_registers_per_sketch, + int* const output) +{ auto const tid = cudf::detail::grid_1d::global_thread_id(); - if (tid >= num_registers_per_sketch) { - return; - } - auto long_idx = tid / REGISTERS_PER_LONG; + if (tid >= num_registers_per_sketch) { return; } + auto long_idx = tid / REGISTERS_PER_LONG; auto reg_idx_in_long = tid % REGISTERS_PER_LONG; - int max = 0; + int max = 0; for (auto row_idx = 0; row_idx < num_sketches; row_idx++) { - int reg_v = - get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long); - if (reg_v > max) { - max = reg_v; - } + int reg_v = get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long); + if (reg_v > max) { max = reg_v; } } output[tid] = max; } -std::unique_ptr -reduce_merge_hllpp(cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { +std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ // create device input int64_t num_registers_per_sketch = 1 << precision; - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; cudf::structs_column_view scv(input); - auto const input_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return scv.get_sliced_child(i, stream).begin(); - }); - auto input_cols = - std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = - cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); // create one row output - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, 1 /** num_rows */, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + 1 /** num_rows */, + cudf::mask_state::ALL_VALID, + stream, + mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // execute merge kernel - auto num_threads = num_registers_per_sketch; + auto num_threads = num_registers_per_sketch; constexpr int64_t block_size = 256; - auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); - auto output_cache = - rmm::device_uvector(num_registers_per_sketch, stream, mr); - reduce_merge_hll_kernel_vertically<<>>( - d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); + auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); + auto output_cache = rmm::device_uvector(num_registers_per_sketch, stream, mr); + reduce_merge_hll_kernel_vertically<<>>( + d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); // compact to longs auto const num_compact_threads = num_long_cols; - auto const num_compact_blocks = - cudf::util::div_rounding_up_safe(num_compact_threads, block_size); + auto const num_compact_blocks = cudf::util::div_rounding_up_safe(num_compact_threads, block_size); compact_kernel<<>>( - 1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache); + 1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache); // create scalar auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), - [](auto const &results_column) { return results_column->view(); }); - auto views = std::vector( - host_results_view_iter, host_results_view_iter + num_long_cols); + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = + std::vector(host_results_view_iter, host_results_view_iter + num_long_cols); auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, stream, - mr); + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); } struct estimate_fn { - cudf::device_span sketch_longs; + cudf::device_span sketch_longs; int const precision; - int64_t *const out; + int64_t* const out; - __device__ void operator()(cudf::size_type const idx) const { + __device__ void operator()(cudf::size_type const idx) const + { auto const num_regs = 1ull << precision; - double sum = 0; - int zeroes = 0; + double sum = 0; + int zeroes = 0; for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { // each long contains 10 register values - int long_col_idx = reg_idx / REGISTERS_PER_LONG; + int long_col_idx = reg_idx / REGISTERS_PER_LONG; int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; - int reg = - get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); + int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); sum += double{1} / static_cast(1ull << reg); zeroes += reg == 0; } auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); - out[idx] = finalize(sum, zeroes); + out[idx] = finalize(sum, zeroes); } }; -} // end anonymous namespace +} // end anonymous namespace std::unique_ptr group_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; - return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, - mr); + return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); } std::unique_ptr group_merge_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); - CUDF_EXPECTS( - input.type().id() == cudf::type_id::STRUCT, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); for (auto i = 0; i < input.num_children(); i++) { - CUDF_EXPECTS( - input.child(i).type().id() == cudf::type_id::INT64, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); } - auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; CUDF_EXPECTS(input.num_children() == expected_num_longs, "The num of long columns in input is incorrect."); - return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, - stream, mr); + return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); } -std::unique_ptr reduce_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { +std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; return reduce_hllpp(input, adjust_precision, stream, mr); } std::unique_ptr reduce_merge_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); - CUDF_EXPECTS( - input.type().id() == cudf::type_id::STRUCT, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); for (auto i = 0; i < input.num_children(); i++) { - CUDF_EXPECTS( - input.child(i).type().id() == cudf::type_id::INT64, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); } - auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; CUDF_EXPECTS(input.num_children() == expected_num_longs, "The num of long columns in input is incorrect."); return reduce_merge_hllpp(input, adjust_precision, stream, mr); } -std::unique_ptr -estimate_from_hll_sketches(cudf::column_view const &input, int precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, - "HyperLogLogPlusPlus requires precision is bigger than 4."); +std::unique_ptr estimate_from_hll_sketches(cudf::column_view const& input, + int precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4."); auto const input_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](int i) { return input.child(i).begin(); }); - auto input_cols = std::vector( - input_iter, input_iter + input.num_children()); - auto d_inputs = - cudf::detail::make_device_uvector_async(input_cols, stream, mr); - auto result = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, - input.size(), - cudf::mask_state::ALL_VALID, stream); + 0, [&](int i) { return input.child(i).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + input.num_children()); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto result = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); // evaluate from struct - thrust::for_each_n( - rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - input.size(), - estimate_fn{d_inputs, precision, result->mutable_view().data()}); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + input.size(), + estimate_fn{d_inputs, precision, result->mutable_view().data()}); return result; } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hllpp.hpp index 4dda342a4f..d93e1debdf 100644 --- a/src/main/cpp/src/hllpp.hpp +++ b/src/main/cpp/src/hllpp.hpp @@ -52,10 +52,12 @@ constexpr int MAX_PRECISION = 18; * a struct column with multiple long columns which is consistent with Spark. */ std::unique_ptr group_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. @@ -63,19 +65,22 @@ std::unique_ptr group_hyper_log_log_plus_plus( * Spark. */ std::unique_ptr group_merge_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) * sketches from hash codes, and merge all the sketches into one sketch, output * is a struct scalar with multiple long values. */ -std::unique_ptr reduce_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one @@ -83,8 +88,10 @@ std::unique_ptr reduce_hyper_log_log_plus_plus( * consistent with Spark. Output is a struct scalar with multiple long values. */ std::unique_ptr reduce_merge_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * Estimate count distinct values for the input which contains @@ -93,8 +100,9 @@ std::unique_ptr reduce_merge_hyper_log_log_plus_plus( * for null values when doing APPROX_COUNT_DISTINCT. */ std::unique_ptr estimate_from_hll_sketches( - cudf::column_view const &input, int precision, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + cudf::column_view const& input, + int precision, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hllpp_host_udf.cu index c9ad271876..370b906b65 100644 --- a/src/main/cpp/src/hllpp_host_udf.cu +++ b/src/main/cpp/src/hllpp_host_udf.cu @@ -40,14 +40,15 @@ namespace spark_rapids_jni { namespace { -template struct hllpp_udf : cudf::host_udf_base { +template +struct hllpp_udf : cudf::host_udf_base { static_assert(std::is_same_v || std::is_same_v); - hllpp_udf(int precision_, bool is_merge_) - : precision(precision_), is_merge(is_merge_) {} + hllpp_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} - [[nodiscard]] input_data_attributes get_required_data() const override { + [[nodiscard]] input_data_attributes get_required_data() const override + { if constexpr (std::is_same_v) { return {reduction_data_attribute::INPUT_VALUES}; } else { @@ -57,45 +58,40 @@ template struct hllpp_udf : cudf::host_udf_base { } } - [[nodiscard]] output_type - operator()(host_udf_input const &udf_input, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override { + [[nodiscard]] output_type operator()(host_udf_input const& udf_input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { if constexpr (std::is_same_v) { // reduce - auto const &input_values = std::get( - udf_input.at(reduction_data_attribute::INPUT_VALUES)); - if (input_values.size() == 0) { - return get_empty_output(std::nullopt, stream, mr); - } + auto const& input_values = + std::get(udf_input.at(reduction_data_attribute::INPUT_VALUES)); + if (input_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } if (is_merge) { // reduce intermidate result, input_values are struct of long columns return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus( - input_values, precision, stream, mr); + input_values, precision, stream, mr); } else { return spark_rapids_jni::reduce_hyper_log_log_plus_plus( - input_values, precision, stream, mr); + input_values, precision, stream, mr); } } else { // groupby - auto const &group_values = std::get( - udf_input.at(groupby_data_attribute::GROUPED_VALUES)); - if (group_values.size() == 0) { - return get_empty_output(std::nullopt, stream, mr); - } - auto const group_offsets = - std::get>( - udf_input.at(groupby_data_attribute::GROUP_OFFSETS)); - int num_groups = group_offsets.size() - 1; - auto const group_lables = - std::get>( - udf_input.at(groupby_data_attribute::GROUP_LABELS)); + auto const& group_values = + std::get(udf_input.at(groupby_data_attribute::GROUPED_VALUES)); + if (group_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } + auto const group_offsets = std::get>( + udf_input.at(groupby_data_attribute::GROUP_OFFSETS)); + int num_groups = group_offsets.size() - 1; + auto const group_lables = std::get>( + udf_input.at(groupby_data_attribute::GROUP_LABELS)); if (is_merge) { // group by intermidate result, group_values are struct of long columns return spark_rapids_jni::group_merge_hyper_log_log_plus_plus( - group_values, num_groups, group_lables, precision, stream, mr); + group_values, num_groups, group_lables, precision, stream, mr); } else { return spark_rapids_jni::group_hyper_log_log_plus_plus( - group_values, num_groups, group_lables, precision, stream, mr); + group_values, num_groups, group_lables, precision, stream, mr); } } } @@ -103,50 +99,50 @@ template struct hllpp_udf : cudf::host_udf_base { /** * @brief create an empty struct scalar */ - [[nodiscard]] output_type - get_empty_output([[maybe_unused]] std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override { - int num_registers = 1 << precision; - int num_long_cols = num_registers / REGISTERS_PER_LONG + 1; - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + [[nodiscard]] output_type get_empty_output( + [[maybe_unused]] std::optional output_dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + int num_registers = 1 << precision; + int num_long_cols = num_registers / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); if constexpr (std::is_same_v) { // reduce auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), - [](auto const &results_column) { return results_column->view(); }); - auto views = std::vector( - host_results_view_iter, host_results_view_iter + num_long_cols); + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = std::vector(host_results_view_iter, + host_results_view_iter + num_long_cols); auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, - stream, mr); + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); } else { // groupby - return cudf::make_structs_column(0, std::move(children), - 0, // null count - rmm::device_buffer{}, // null mask + return cudf::make_structs_column(0, + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask stream); } } - [[nodiscard]] bool is_equal(host_udf_base const &other) const override { - auto o = dynamic_cast(&other); + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + auto o = dynamic_cast(&other); return o != nullptr && o->precision == this->precision; } - [[nodiscard]] std::size_t do_hash() const override { - return 31 * (31 * std::hash{}({"hllpp_udf"}) + precision) + - is_merge; + [[nodiscard]] std::size_t do_hash() const override + { + return 31 * (31 * std::hash{}({"hllpp_udf"}) + precision) + is_merge; } - [[nodiscard]] std::unique_ptr clone() const override { + [[nodiscard]] std::unique_ptr clone() const override + { return std::make_unique(precision, is_merge); } @@ -154,30 +150,26 @@ template struct hllpp_udf : cudf::host_udf_base { bool is_merge; }; -} // namespace +} // namespace -std::unique_ptr -create_hllpp_reduction_host_udf(int precision) { - return std::make_unique>( - precision, /*is_merge*/ false); +std::unique_ptr create_hllpp_reduction_host_udf(int precision) +{ + return std::make_unique>(precision, /*is_merge*/ false); } -std::unique_ptr -create_hllpp_reduction_merge_host_udf(int precision) { - return std::make_unique>( - precision, /*is_merge*/ true); +std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision) +{ + return std::make_unique>(precision, /*is_merge*/ true); } -std::unique_ptr -create_hllpp_groupby_host_udf(int precision) { - return std::make_unique>( - precision, /*is_merge*/ false); +std::unique_ptr create_hllpp_groupby_host_udf(int precision) +{ + return std::make_unique>(precision, /*is_merge*/ false); } -std::unique_ptr -create_hllpp_groupby_merge_host_udf(int precision) { - return std::make_unique>( - precision, /*is_merge*/ true); +std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision) +{ + return std::make_unique>(precision, /*is_merge*/ true); } -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hllpp_host_udf.hpp index fc4bb8b21b..e89cdf4e5f 100644 --- a/src/main/cpp/src/hllpp_host_udf.hpp +++ b/src/main/cpp/src/hllpp_host_udf.hpp @@ -20,16 +20,12 @@ namespace spark_rapids_jni { -std::unique_ptr -create_hllpp_reduction_host_udf(int precision); +std::unique_ptr create_hllpp_reduction_host_udf(int precision); -std::unique_ptr -create_hllpp_reduction_merge_host_udf(int precision); +std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision); -std::unique_ptr -create_hllpp_groupby_host_udf(int precision); +std::unique_ptr create_hllpp_groupby_host_udf(int precision); -std::unique_ptr -create_hllpp_groupby_merge_host_udf(int precision); +std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision); -} // namespace spark_rapids_jni +} // namespace spark_rapids_jni diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java deleted file mode 100644 index 9e51761f4a..0000000000 --- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids.jni; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.ColumnView; -import ai.rapids.cudf.NativeDepsLoader; - -/** - * HyperLogLogPlusPlus - */ -public class HLLPP { - static { - NativeDepsLoader.loadNativeDeps(); - } - - /** - * Compute the approximate count distinct value from sketch values. - *

- * The input sketch values must be given in the format `Struct`, - * The num of children is: num_registers_per_sketch / 10 + 1, here 10 means a INT64 contains - * max 10 registers. Register value is 6 bits. The input is columnar data, e.g.: sketch 0 - * is composed of by all the data of the children at index 0. - * - * @param input The sketch column which constains Struct values. - * @param precision The num of bits for addressing. - * @return A INT64 column with each value indicates the approximate count distinct value. - */ - public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { - return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); - } - - private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); -} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java deleted file mode 100644 index c14b565313..0000000000 --- a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java +++ /dev/null @@ -1,37 +0,0 @@ -/* -* Copyright (c) 2024, NVIDIA CORPORATION. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -package com.nvidia.spark.rapids.jni; - -import ai.rapids.cudf.GroupByAggregation; -import ai.rapids.cudf.Table; - -import org.junit.jupiter.api.Test; - - -public class HLLPPTest { - - @Test - void testGroupByHLL() { - // A trivial test: - try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2) - .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9) - .build()){ - input.groupBy(0).aggregate(GroupByAggregation.HLLPP(0) - .onColumn(1)); - } - } -} From 208d67e5e71696ad8f619aa82c78838acc7292c5 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 17 Dec 2024 21:07:24 +0800 Subject: [PATCH 08/28] Use UDF --- src/main/cpp/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index b8b5f3a139..44863aa220 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -197,7 +197,6 @@ add_library( src/HashJni.cpp src/HistogramJni.cpp src/HostTableJni.cpp - src/HLLPPJni.cpp src/JSONUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp From e29d5a12b97af8460d2a5bcf1fbeb67917e8a1ee Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 18 Dec 2024 17:17:15 +0800 Subject: [PATCH 09/28] Address comments --- src/main/cpp/CMakeLists.txt | 8 +-- ....cpp => HyperLogLogPlusPlusHostUDFJni.cpp} | 19 +++---- .../{hllpp.cu => hyper_log_log_plus_plus.cu} | 49 +++++++++++-------- ...{hllpp.hpp => hyper_log_log_plus_plus.hpp} | 26 +++++----- ...cu => hyper_log_log_plus_plus_host_udf.cu} | 8 +-- ...p => hyper_log_log_plus_plus_host_udf.hpp} | 0 ...F.java => HyperLogLogPlusPlusHostUDF.java} | 2 +- 7 files changed, 61 insertions(+), 51 deletions(-) rename src/main/cpp/src/{HLLPPHostUDFJni.cpp => HyperLogLogPlusPlusHostUDFJni.cpp} (79%) rename src/main/cpp/src/{hllpp.cu => hyper_log_log_plus_plus.cu} (96%) rename src/main/cpp/src/{hllpp.hpp => hyper_log_log_plus_plus.hpp} (80%) rename src/main/cpp/src/{hllpp_host_udf.cu => hyper_log_log_plus_plus_host_udf.cu} (97%) rename src/main/cpp/src/{hllpp_host_udf.hpp => hyper_log_log_plus_plus_host_udf.hpp} (100%) rename src/main/java/com/nvidia/spark/rapids/jni/{HLLPPHostUDF.java => HyperLogLogPlusPlusHostUDF.java} (98%) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 44863aa220..70c9cd2a59 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -193,10 +193,10 @@ add_library( src/DateTimeRebaseJni.cpp src/DecimalUtilsJni.cpp src/GpuTimeZoneDBJni.cpp - src/HLLPPHostUDFJni.cpp src/HashJni.cpp src/HistogramJni.cpp src/HostTableJni.cpp + src/HyperLogLogPlusPlusHostUDFJni.cpp src/JSONUtilsJni.cpp src/NativeParquetJni.cpp src/ParseURIJni.cpp @@ -218,8 +218,9 @@ add_library( src/from_json_to_structs.cu src/get_json_object.cu src/histogram.cu - src/hllpp_host_udf.cu - src/hllpp.cu + src/hive_hash.cu + src/hyper_log_log_plus_plus.cu + src/hyper_log_log_plus_plus_host_udf.cu src/json_utils.cu src/murmur_hash.cu src/parse_uri.cu @@ -229,7 +230,6 @@ add_library( src/timezones.cu src/utilities.cu src/xxhash64.cu - src/hive_hash.cu src/zorder.cu ) diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp similarity index 79% rename from src/main/cpp/src/HLLPPHostUDFJni.cpp rename to src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index a80a78c6b8..adf5da52f7 100644 --- a/src/main/cpp/src/HLLPPHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -15,19 +15,22 @@ */ #include "cudf_jni_apis.hpp" -#include "hllpp.hpp" -#include "hllpp_host_udf.hpp" +#include "hyper_log_log_plus_plus.hpp" +#include "hyper_log_log_plus_plus_host_udf.hpp" extern "C" { -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( - JNIEnv* env, jclass, jint agg_type, int precision) +JNIEXPORT jlong JNICALL +Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_createHLLPPHostUDF(JNIEnv* env, + jclass, + jint agg_type, + int precision) { try { cudf::jni::auto_set_device(env); auto udf_ptr = [&] { // The value of agg_type must be sync with - // `HLLPPHostUDF.java#AggregationType`. + // `HyperLogLogPlusPlusHostUDF.java#AggregationType`. switch (agg_type) { case 0: return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); case 1: return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision); @@ -43,10 +46,8 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLP } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches(JNIEnv* env, - jclass, - jlong sketches, - jint precision) +Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_estimateDistinctValueFromSketches( + JNIEnv* env, jclass, jlong sketches, jint precision) { JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); try { diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu similarity index 96% rename from src/main/cpp/src/hllpp.cu rename to src/main/cpp/src/hyper_log_log_plus_plus.cu index 8d39c66865..4ff7850558 100644 --- a/src/main/cpp/src/hllpp.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -55,8 +55,9 @@ namespace { */ __device__ inline int get_register_value(int64_t const ten_registers, int reg_idx) { - int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); - int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); + auto const shift_bits = REGISTER_VALUE_BITS * reg_idx; + auto const shift_mask = MASK << shift_bits; + auto const v = (ten_registers & shift_mask) >> shift_bit; return static_cast(v); } @@ -418,7 +419,7 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::UNALLOCATED, stream, mr); }); auto children = std::vector>(results_iter, results_iter + num_long_cols); @@ -609,7 +610,7 @@ std::unique_ptr group_merge_hllpp( // create output columns auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::UNALLOCATED, stream, mr); }); auto results = std::vector>(results_iter, results_iter + num_long_cols); @@ -705,7 +706,7 @@ std::unique_ptr reduce_hllpp(cudf::column_view const& input, auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, 1 /**num_groups*/, - cudf::mask_state::ALL_VALID, + cudf::mask_state::UNALLOCATED, stream, mr); }); @@ -773,7 +774,7 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, 1 /** num_rows */, - cudf::mask_state::ALL_VALID, + cudf::mask_state::UNALLOCATED, stream, mr); }); @@ -814,13 +815,13 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, } struct estimate_fn { - cudf::device_span sketch_longs; - int const precision; - int64_t* const out; + cudf::device_span sketches; + int64_t* out; + int precision; __device__ void operator()(cudf::size_type const idx) const { - auto const num_regs = 1ull << precision; + auto const num_regs = 1 << precision; double sum = 0; int zeroes = 0; @@ -828,7 +829,7 @@ struct estimate_fn { // each long contains 10 register values int long_col_idx = reg_idx / REGISTERS_PER_LONG; int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; - int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); + int reg = get_register_value(sketches[long_col_idx][idx], reg_idx_in_long); sum += double{1} / static_cast(1ull << reg); zeroes += reg == 0; } @@ -848,7 +849,7 @@ std::unique_ptr group_hyper_log_log_plus_plus( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); } @@ -861,7 +862,7 @@ std::unique_ptr group_merge_hyper_log_log_plus_plus( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); for (auto i = 0; i < input.num_children(); i++) { @@ -880,7 +881,7 @@ std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view c rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; return reduce_hllpp(input, adjust_precision, stream, mr); } @@ -891,7 +892,7 @@ std::unique_ptr reduce_merge_hyper_log_log_plus_plus( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); for (auto i = 0; i < input.num_children(); i++) { @@ -910,13 +911,21 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4."); + CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); + CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + for (auto i = 0; i < input.num_children(); i++) { + CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + } auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return input.child(i).begin(); }); - auto input_cols = std::vector(input_iter, input_iter + input.num_children()); - auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); - auto result = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); + auto const h_input_ptrs = + std::vector(input_iter, input_iter + input.num_children()); + auto d_inputs = cudf::detail::make_device_uvector_async( + h_input_ptrs, stream, cudf::get_current_device_resource_ref()); + auto result = cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); // evaluate from struct thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp similarity index 80% rename from src/main/cpp/src/hllpp.hpp rename to src/main/cpp/src/hyper_log_log_plus_plus.hpp index d93e1debdf..33df3b37a4 100644 --- a/src/main/cpp/src/hllpp.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp @@ -18,9 +18,9 @@ #include #include #include +#include #include -#include namespace spark_rapids_jni { @@ -56,8 +56,8 @@ std::unique_ptr group_hyper_log_log_plus_plus( int64_t const num_groups, cudf::device_span group_lables, int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. @@ -69,18 +69,19 @@ std::unique_ptr group_merge_hyper_log_log_plus_plus( int64_t const num_groups, cudf::device_span group_lables, int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) * sketches from hash codes, and merge all the sketches into one sketch, output * is a struct scalar with multiple long values. */ -std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view const& input, - int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr reduce_hyper_log_log_plus_plus( + cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one @@ -90,8 +91,8 @@ std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view c std::unique_ptr reduce_merge_hyper_log_log_plus_plus( cudf::column_view const& input, int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * Estimate count distinct values for the input which contains @@ -103,6 +104,5 @@ std::unique_ptr estimate_from_hll_sketches( cudf::column_view const& input, int precision, rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu similarity index 97% rename from src/main/cpp/src/hllpp_host_udf.cu rename to src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index 370b906b65..a112117c35 100644 --- a/src/main/cpp/src/hllpp_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -13,9 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include "hllpp.hpp" -#include "hllpp_host_udf.hpp" +#include "hyper_log_log_plus_plus.hpp" +#include "hyper_log_log_plus_plus_host_udf.hpp" #include #include @@ -126,7 +125,8 @@ struct hllpp_udf : cudf::host_udf_base { std::move(children), 0, // null count rmm::device_buffer{}, // null mask - stream); + stream, + mr); } } diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp similarity index 100% rename from src/main/cpp/src/hllpp_host_udf.hpp rename to src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java similarity index 98% rename from src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java rename to src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 9018474c27..6d09be3de6 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -23,7 +23,7 @@ /** * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils */ -public class HLLPPHostUDF { +public class HyperLogLogPlusPlusHostUDF { static { NativeDepsLoader.loadNativeDeps(); } From 3e225129f21fbff240afa3e61857fe94a1469a4b Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 19 Dec 2024 19:21:32 +0800 Subject: [PATCH 10/28] Fix compile error --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 4ff7850558..0576988322 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -14,7 +14,7 @@ * limitations under the License. */ #include "hash.hpp" -#include "hllpp.hpp" +#include "hyper_log_log_plus_plus.hpp" #include #include @@ -57,7 +57,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id { auto const shift_bits = REGISTER_VALUE_BITS * reg_idx; auto const shift_mask = MASK << shift_bits; - auto const v = (ten_registers & shift_mask) >> shift_bit; + auto const v = (ten_registers & shift_mask) >> shift_bits; return static_cast(v); } @@ -930,7 +930,7 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), input.size(), - estimate_fn{d_inputs, precision, result->mutable_view().data()}); + estimate_fn{d_inputs, result->mutable_view().data(), precision}); return result; } From aa7ca68f003a3722a98d526ae08db690b59e2dff Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 20 Dec 2024 19:40:26 +0800 Subject: [PATCH 11/28] Handle null inputs: must ignore the null input values --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 40 +++++++++++++-------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 0576988322..974f533987 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -91,6 +92,8 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id * (num_hashs / num_hashs_per_thread) sketches. * num_threads = div_round_up(num_hashs, num_hashs_per_thread). * + * Note: Must exclude null hash values from computing HLLPP sketches. + * * e.g.: num_registers_per_sketch = 512 and num_hashs_per_thread = 4; * * Input is hashs, compute and get pair: register index -> register value @@ -181,11 +184,15 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) { cudf::size_type curr_group = group_lables[hash_idx]; - // cast to unsigned, then >> will shift without preserve the sign bit. - uint64_t const hash = static_cast(hashs.element(hash_idx)); - auto const reg_idx = hash >> idx_shift; - int const reg_v = - static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + int reg_idx = 0; // init value for null hash + int reg_v = 0; // init value for null hash + if (!hashs.is_null(hash_idx)) { + // cast to unsigned, then >> will shift without preserve the sign bit. + uint64_t const hash = static_cast(hashs.element(hash_idx)); + reg_idx = hash >> idx_shift; + // get the leading zeros + reg_v = static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + } if (curr_group == prev_group) { // still in the same group, update the max value @@ -390,7 +397,8 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + hash_col->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. execute partial group by int64_t num_blocks_p1 = @@ -659,13 +667,16 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, // update max reg value for the reg index for (int i = tid; i < num_hashs; i += block_size) { - uint64_t const hash = static_cast(hashs.element(i)); - // use unsigned int to avoid insert 1 for the highest bit when do right - // shift - uint64_t const reg_idx = hash >> idx_shift; - // get the leading zeros - int const reg_v = - static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + int reg_idx = 0; // init value for null hash + int reg_v = 0; // init value for null hash + if (!hashs.is_null(i)) { + // cast to unsigned, then >> will shift without preserve the sign bit. + uint64_t const hash = static_cast(hashs.element(i)); + reg_idx = hash >> idx_shift; + // get the leading zeros + reg_v = static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + } + cuda::atomic_ref register_ref(shared_data[reg_idx]); register_ref.fetch_max(reg_v, cuda::memory_order_relaxed); } @@ -699,7 +710,8 @@ std::unique_ptr reduce_hllpp(cudf::column_view const& input, // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + hash_col->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. generate long columns, the size of each long column is 1 auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; From f0970c05b631a02fdd9c4ab1ba04bfb08bedfb73 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 23 Dec 2024 12:04:39 +0800 Subject: [PATCH 12/28] Rename refactor: Correct spelling errors --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 72 +++++++++---------- src/main/cpp/src/hyper_log_log_plus_plus.hpp | 4 +- .../src/hyper_log_log_plus_plus_host_udf.cu | 6 +- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 974f533987..f83240f498 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -98,7 +98,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id * * Input is hashs, compute and get pair: register index -> register value * - * reg_index, reg_value, group_lable + * reg_index, reg_value, group_label * [ * ---------- segment 0 begin -------------------------- * (0, 1), g0 @@ -123,7 +123,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id * ] * Output e.g.: * - * group_lables_thread_cache: + * group_labels_thread_cache: * [ * g1 * g1 @@ -138,7 +138,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id * 512 values: [0, 8, 0, ... ] // register values for group 1 * ] * Has num_threads rows, each row is corresponding to - * `group_lables_thread_cache` + * `group_labels_thread_cache` * * registers_output_cache: * [ @@ -153,11 +153,11 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id template CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( cudf::column_device_view hashs, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, // num of bits for register addressing, e.g.: 9 int* const registers_output_cache, // num is num_groups * num_registers_per_sketch int* const registers_thread_cache, // num is num_threads * num_registers_per_sketch - cudf::size_type* const group_lables_thread_cache // save the group lables for each thread + cudf::size_type* const group_labels_thread_cache // save the group labels for each thread ) { auto const tid = cudf::detail::grid_1d::global_thread_id(); @@ -180,9 +180,9 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( sketch_ptr[i] = 0; } - cudf::size_type prev_group = group_lables[hash_first]; + cudf::size_type prev_group = group_labels[hash_first]; for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) { - cudf::size_type curr_group = group_lables[hash_idx]; + cudf::size_type curr_group = group_labels[hash_idx]; int reg_idx = 0; // init value for null hash int reg_v = 0; // init value for null hash @@ -216,7 +216,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( } } else { // not the last segment, probe one item forward. - if (curr_group != group_lables[hash_idx + 1]) { + if (curr_group != group_labels[hash_idx + 1]) { // meets a new group by checking the next item in the next segment for (auto i = 0; i < num_registers_per_sketch; i++) { registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i]; @@ -228,8 +228,8 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( prev_group = curr_group; } - // save the group lable for this thread - group_lables_thread_cache[tid] = group_lables[hash_end - 1]; + // save the group label for this thread + group_labels_thread_cache[tid] = group_labels[hash_end - 1]; } /* @@ -242,7 +242,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( * * Input e.g.: * - * group_lables_thread_cache: + * group_labels_thread_cache: * [ * g0 * g0 @@ -261,7 +261,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N * ] * Has num_threads rows, each row is corresponding to - * `group_lables_thread_cache` + * `group_labels_thread_cache` * * registers_output_cache: * [ @@ -280,7 +280,7 @@ CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches, int64_t num_registers_per_sketch, int* const registers_output_cache, int const* const registers_thread_cache, - cudf::size_type const* const group_lables_thread_cache) + cudf::size_type const* const group_labels_thread_cache) { __shared__ int8_t shared_data[block_size]; auto const tid = cudf::detail::grid_1d::global_thread_id(); @@ -288,9 +288,9 @@ CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches, // register idx is tid shared_data[shared_idx] = static_cast(0); - int prev_group = group_lables_thread_cache[0]; + int prev_group = group_labels_thread_cache[0]; for (auto i = 0; i < num_sketches; i++) { - int curr_group = group_lables_thread_cache[i]; + int curr_group = group_labels_thread_cache[i]; int8_t curr_reg_v = static_cast(registers_thread_cache[i * num_registers_per_sketch + tid]); if (curr_group == prev_group) { @@ -372,7 +372,7 @@ CUDF_KERNEL void compact_kernel(int64_t const num_groups, std::unique_ptr group_hllpp(cudf::column_view const& input, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -387,10 +387,10 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, rmm::device_uvector(num_groups * num_registers_per_sketch, stream, mr); { // add this block to release `registers_thread_cache` and - // `group_lables_thread_cache` + // `group_labels_thread_cache` auto registers_thread_cache = rmm::device_uvector( num_threads_partial_kernel * num_registers_per_sketch, stream, mr); - auto group_lables_thread_cache = + auto group_labels_thread_cache = rmm::device_uvector(num_threads_partial_kernel, stream, mr); { // add this block to release `hash_col` @@ -405,11 +405,11 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, cudf::util::div_rounding_up_safe(num_threads_partial_kernel, block_size); partial_group_sketches_from_hashs_kernel <<>>(*d_hashs, - group_lables, + group_labels, precision, sketches_output.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + group_labels_thread_cache.begin()); } // 3. merge the intermidate result auto num_merge_threads = num_registers_per_sketch; @@ -420,7 +420,7 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, num_registers_per_sketch, sketches_output.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + group_labels_thread_cache.begin()); } // 4. create output columns @@ -485,13 +485,13 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( int64_t const num_threads_per_col, int64_t const num_registers_per_sketch, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, // num_groups * num_registers_per_sketch integers int* const registers_output_cache, // num_threads * num_registers_per_sketch integers int* const registers_thread_cache, // num_threads integers - cudf::size_type* const group_lables_thread_cache) + cudf::size_type* const group_labels_thread_cache) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const num_long_cols = sketches_input.size(); @@ -511,11 +511,11 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; } for (auto i = 0; i < num_regs; i++) { - cudf::size_type prev_group = group_lables[sketch_first]; + cudf::size_type prev_group = group_labels[sketch_first]; int max_reg_v = 0; int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; sketch_idx++) { - cudf::size_type curr_group = group_lables[sketch_idx]; + cudf::size_type curr_group = group_labels[sketch_idx]; int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); if (curr_group == prev_group) { // still in the same group, update the max value @@ -537,7 +537,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( registers_output_cache[output_idx_curr] = max_reg_v; max_reg_v = curr_reg_v; } else { - if (curr_group != group_lables[sketch_idx + 1]) { + if (curr_group != group_labels[sketch_idx + 1]) { // look the first item in the next segment registers_output_cache[output_idx_curr] = max_reg_v; max_reg_v = curr_reg_v; @@ -553,7 +553,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( } if (long_idx == 0) { - group_lables_thread_cache[thread_idx_in_cols] = group_lables[sketch_end - 1]; + group_labels_thread_cache[thread_idx_in_cols] = group_labels[sketch_end - 1]; } } @@ -564,7 +564,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( std::unique_ptr group_merge_hllpp( cudf::column_view const& hll_input, // struct column int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -584,7 +584,7 @@ std::unique_ptr group_merge_hllpp( { auto registers_thread_cache = rmm::device_uvector(num_registers_per_sketch * num_threads_phase1, stream, mr); - auto group_lables_thread_cache = + auto group_labels_thread_cache = rmm::device_uvector(num_threads_per_col_phase1, stream, mr); cudf::structs_column_view scv(hll_input); @@ -599,10 +599,10 @@ std::unique_ptr group_merge_hllpp( num_threads_per_col_phase1, num_registers_per_sketch, num_groups, - group_lables, + group_labels, registers_output_cache.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + group_labels_thread_cache.begin()); auto const num_phase2_threads = num_registers_per_sketch; auto const num_phase2_blocks = cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); // 2nd kernel: vertical merge @@ -612,7 +612,7 @@ std::unique_ptr group_merge_hllpp( num_registers_per_sketch, registers_output_cache.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + group_labels_thread_cache.begin()); } // create output columns @@ -856,20 +856,20 @@ struct estimate_fn { std::unique_ptr group_hyper_log_log_plus_plus( cudf::column_view const& input, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; - return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); + return group_hllpp(input, num_groups, group_labels, adjust_precision, stream, mr); } std::unique_ptr group_merge_hyper_log_log_plus_plus( cudf::column_view const& input, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -885,7 +885,7 @@ std::unique_ptr group_merge_hyper_log_log_plus_plus( auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; CUDF_EXPECTS(input.num_children() == expected_num_longs, "The num of long columns in input is incorrect."); - return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); + return group_merge_hllpp(input, num_groups, group_labels, adjust_precision, stream, mr); } std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view const& input, diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp index 33df3b37a4..0489e67b1f 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp @@ -54,7 +54,7 @@ constexpr int MAX_PRECISION = 18; std::unique_ptr group_hyper_log_log_plus_plus( cudf::column_view const& input, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -67,7 +67,7 @@ std::unique_ptr group_hyper_log_log_plus_plus( std::unique_ptr group_merge_hyper_log_log_plus_plus( cudf::column_view const& input, int64_t const num_groups, - cudf::device_span group_lables, + cudf::device_span group_labels, int64_t const precision, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index a112117c35..ac99018f0d 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -82,15 +82,15 @@ struct hllpp_udf : cudf::host_udf_base { auto const group_offsets = std::get>( udf_input.at(groupby_data_attribute::GROUP_OFFSETS)); int num_groups = group_offsets.size() - 1; - auto const group_lables = std::get>( + auto const group_labels = std::get>( udf_input.at(groupby_data_attribute::GROUP_LABELS)); if (is_merge) { // group by intermidate result, group_values are struct of long columns return spark_rapids_jni::group_merge_hyper_log_log_plus_plus( - group_values, num_groups, group_lables, precision, stream, mr); + group_values, num_groups, group_labels, precision, stream, mr); } else { return spark_rapids_jni::group_hyper_log_log_plus_plus( - group_values, num_groups, group_lables, precision, stream, mr); + group_values, num_groups, group_labels, precision, stream, mr); } } } From 2e74412ef2ae886a983b3a100e584148898eae43 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 13 Jan 2025 14:58:09 +0800 Subject: [PATCH 13/28] Update copyright for new year 2025 --- src/main/cpp/CMakeLists.txt | 2 +- src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 2 +- src/main/cpp/src/hyper_log_log_plus_plus.cu | 2 +- src/main/cpp/src/hyper_log_log_plus_plus.hpp | 2 +- src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu | 2 +- src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp | 2 +- .../com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index bb6f9d0ac6..3d3875b9be 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index adf5da52f7..2d92d5d96a 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index f83240f498..18934938b9 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp index 0489e67b1f..a7735944f3 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index ac99018f0d..de5aafcdae 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp index e89cdf4e5f..12ce65b386 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 6d09be3de6..9e6c812a0c 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 7fe4a390ab3e747ac97621cfff25a979f623fb05 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 13 Jan 2025 15:24:59 +0800 Subject: [PATCH 14/28] Use get_current_device_resource_ref --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 42 ++++++++++++--------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 18934938b9..98a7c7f114 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -383,15 +383,17 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, int64_t num_threads_partial_kernel = cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); - auto sketches_output = - rmm::device_uvector(num_groups * num_registers_per_sketch, stream, mr); + auto sketches_output = rmm::device_uvector( + num_groups * num_registers_per_sketch, stream, cudf::get_current_device_resource_ref()); { // add this block to release `registers_thread_cache` and // `group_labels_thread_cache` - auto registers_thread_cache = rmm::device_uvector( - num_threads_partial_kernel * num_registers_per_sketch, stream, mr); - auto group_labels_thread_cache = - rmm::device_uvector(num_threads_partial_kernel, stream, mr); + auto registers_thread_cache = + rmm::device_uvector(num_threads_partial_kernel * num_registers_per_sketch, + stream, + cudf::get_current_device_resource_ref()); + auto group_labels_thread_cache = rmm::device_uvector( + num_threads_partial_kernel, stream, cudf::get_current_device_resource_ref()); { // add this block to release `hash_col` // 1. compute all the hashs @@ -577,21 +579,24 @@ std::unique_ptr group_merge_hllpp( int64_t num_threads_per_col_phase1 = cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); - int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols; - int64_t num_blocks = cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); - auto registers_output_cache = - rmm::device_uvector(num_registers_per_sketch * num_groups, stream, mr); + int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols; + int64_t num_blocks = cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); + auto registers_output_cache = rmm::device_uvector( + num_registers_per_sketch * num_groups, stream, cudf::get_current_device_resource_ref()); { auto registers_thread_cache = - rmm::device_uvector(num_registers_per_sketch * num_threads_phase1, stream, mr); - auto group_labels_thread_cache = - rmm::device_uvector(num_threads_per_col_phase1, stream, mr); + rmm::device_uvector(num_registers_per_sketch * num_threads_phase1, + stream, + cudf::get_current_device_resource_ref()); + auto group_labels_thread_cache = rmm::device_uvector( + num_threads_per_col_phase1, stream, cudf::get_current_device_resource_ref()); cudf::structs_column_view scv(hll_input); auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); auto input_cols = std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto d_inputs = cudf::detail::make_device_uvector_async( + input_cols, stream, cudf::get_current_device_resource_ref()); // 1st kernel: partially group partial_group_long_sketches_kernel <<>>(d_inputs, @@ -731,7 +736,8 @@ std::unique_ptr reduce_hllpp(cudf::column_view const& input, }); auto host_results_pointers = std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + return cudf::detail::make_device_uvector_async( + host_results_pointers, stream, cudf::get_current_device_resource_ref()); }(); // 2. reduce and generate compacted long values @@ -780,7 +786,8 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); auto input_cols = std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto d_inputs = cudf::detail::make_device_uvector_async( + input_cols, stream, cudf::get_current_device_resource_ref()); // create one row output auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { @@ -806,7 +813,8 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, auto num_threads = num_registers_per_sketch; constexpr int64_t block_size = 256; auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); - auto output_cache = rmm::device_uvector(num_registers_per_sketch, stream, mr); + auto output_cache = rmm::device_uvector( + num_registers_per_sketch, stream, cudf::get_current_device_resource_ref()); reduce_merge_hll_kernel_vertically<<>>( d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); From b7058a711509be7d4aa5d8cb474b3ae5bb62268c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 13 Jan 2025 15:40:53 +0800 Subject: [PATCH 15/28] Fix comments --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 3 ++- src/main/cpp/src/hyper_log_log_plus_plus.cu | 22 +++++++++++++++++++ src/main/cpp/src/hyper_log_log_plus_plus.hpp | 22 ------------------- .../src/hyper_log_log_plus_plus_host_udf.cu | 5 +---- .../jni/HyperLogLogPlusPlusHostUDF.java | 2 +- 5 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index 2d92d5d96a..d752376a55 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -35,7 +35,8 @@ Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_createHLLPPHostUDF(J case 0: return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); case 1: return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision); case 2: return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); - default: return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); + case 3: return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); + default: CUDF_FAIL("Invalid aggregation type."); } }(); CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 98a7c7f114..1b9cca7a20 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -50,6 +50,28 @@ namespace spark_rapids_jni { namespace { +/** + * The number of bits that is required for a HLLPP register value. + * + * This number is determined by the maximum number of leading binary zeros a + * hashcode can produce. This is equal to the number of bits the hashcode + * returns. The current implementation uses a 64-bit hashcode, this means 6-bits + * are (at most) needed to store the number of leading zeros. + */ +constexpr int REGISTER_VALUE_BITS = 6; + +// MASK binary 6 bits: 111-111 +constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; + +// This value is 10, one long stores 10 register values +constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; + +// XXHash seed, consistent with Spark +constexpr int64_t SEED = 42L; + +// max precision, if require a precision bigger than 18, then use 18. +constexpr int MAX_PRECISION = 18; + /** * @brief Get register value from a long which contains 10 register values, * each register value in long is 6 bits. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp index a7735944f3..2d08ab2819 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp @@ -24,28 +24,6 @@ namespace spark_rapids_jni { -/** - * The number of bits that is required for a HLLPP register value. - * - * This number is determined by the maximum number of leading binary zeros a - * hashcode can produce. This is equal to the number of bits the hashcode - * returns. The current implementation uses a 64-bit hashcode, this means 6-bits - * are (at most) needed to store the number of leading zeros. - */ -constexpr int REGISTER_VALUE_BITS = 6; - -// MASK binary 6 bits: 111-111 -constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; - -// This value is 10, one long stores 10 register values -constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; - -// XXHash seed, consistent with Spark -constexpr int64_t SEED = 42L; - -// max precision, if require a precision bigger than 18, then use 18. -constexpr int MAX_PRECISION = 18; - /** * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) * sketches from hash codes, and merge the sketches in the same group. Output is diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index de5aafcdae..905a9433b7 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -40,10 +40,7 @@ namespace spark_rapids_jni { namespace { template -struct hllpp_udf : cudf::host_udf_base { - static_assert(std::is_same_v || - std::is_same_v); - +struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { hllpp_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} [[nodiscard]] input_data_attributes get_required_data() const override diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 9e6c812a0c..4e1388d2dc 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -45,7 +45,7 @@ public enum AggregationType { * sketch. Input is a struct column with multiple long columns which is * consistent with Spark. Output is a struct scalar with multiple long values. */ - Reduction_MERGE(1), + ReductionMERGE(1), /** * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) From 78cc20713841852245449055fa48b88db9e0ab76 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 13 Jan 2025 15:45:40 +0800 Subject: [PATCH 16/28] Fix comments --- .../src/hyper_log_log_plus_plus_host_udf.cu | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index 905a9433b7..753991d1d4 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -93,7 +93,7 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { } /** - * @brief create an empty struct scalar + * @brief Create an empty column when the input is empty. */ [[nodiscard]] output_type get_empty_output( [[maybe_unused]] std::optional output_dtype, @@ -106,25 +106,12 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { 0, [&](int i) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); }); auto children = std::vector>(results_iter, results_iter + num_long_cols); - - if constexpr (std::is_same_v) { - // reduce - auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), [](auto const& results_column) { return results_column->view(); }); - auto views = std::vector(host_results_view_iter, - host_results_view_iter + num_long_cols); - auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, stream, mr); - } else { - // groupby - return cudf::make_structs_column(0, - std::move(children), - 0, // null count - rmm::device_buffer{}, // null mask - stream, - mr); - } + return cudf::make_structs_column(0, + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask + stream, + mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override From d04502013f14816f75ad2805dc26e8114dea0704 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 15 Jan 2025 21:06:30 +0800 Subject: [PATCH 17/28] Close host UDF instance using JNI --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 10 +++++ .../jni/HyperLogLogPlusPlusHostUDF.java | 41 ++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index d752376a55..448b40151f 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -60,4 +60,14 @@ Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_estimateDistinctValu CATCH_STD(env, 0); } +JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_close( + JNIEnv* env, jclass class_object, jlong ptr) +try { + cudf::jni::auto_set_device(env); + auto to_del = reinterpret_cast(ptr); + delete to_del; +} +CATCH_STD(env, ); +} + } // extern "C" diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 4e1388d2dc..22653c92cf 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -16,18 +16,40 @@ package com.nvidia.spark.rapids.jni; +import java.util.Objects; + import ai.rapids.cudf.ColumnVector; import ai.rapids.cudf.ColumnView; +import ai.rapids.cudf.HostUDFWrapper; import ai.rapids.cudf.NativeDepsLoader; /** * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils */ -public class HyperLogLogPlusPlusHostUDF { +public class HyperLogLogPlusPlusHostUDF extends HostUDFWrapper { static { NativeDepsLoader.loadNativeDeps(); } + public HyperLogLogPlusPlusHostUDF(AggregationType type, int precision) { + super(createHLLPPHostUDF(type, precision)); + this.type = type; + this.precision = precision; + } + + @Override + public int hashCode() { + return Objects.hash(this.getClass().getName(), type, precision); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; + return type == other.type && precision == other.precision; + } + /** * HyperLogLogPlusPlus(HLLPP) aggregation types */ @@ -71,7 +93,7 @@ public enum AggregationType { /** * Create a HyperLogLogPlusPlus(HLLPP) host UDF */ - public static long createHLLPPHostUDF(AggregationType type, int precision) { + private static long createHLLPPHostUDF(AggregationType type, int precision) { return createHLLPPHostUDF(type.nativeId, precision); } @@ -102,4 +124,19 @@ public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, i private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); + private AggregationType type; + private int precision; + + /** + * TODO: move this to cuDF HostUDFWrapper + */ + @Override + public void close() throws Exception { + close(udfNativeHandle); + } + + /** + * TODO: move this to cuDF HostUDFWrapper + */ + static native void close(long ptr); } From a7cef890630cee3bcb8d7d337b45807c1c76c92c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 15 Jan 2025 21:06:59 +0800 Subject: [PATCH 18/28] Address comments --- .../src/hyper_log_log_plus_plus_host_udf.cu | 90 +++++++++---------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index 753991d1d4..f1c5c6243f 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -39,56 +39,46 @@ namespace spark_rapids_jni { namespace { -template struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { hllpp_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} - [[nodiscard]] input_data_attributes get_required_data() const override + /** + * Perform the main groupby computation for HLLPP UDF + */ + [[nodiscard]] output_type operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override { - if constexpr (std::is_same_v) { - return {reduction_data_attribute::INPUT_VALUES}; + // groupby + auto const& group_values = get_grouped_values(); + if (group_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } + int num_groups = get_num_groups(); + auto const group_labels = get_group_labels(); + if (is_merge) { + // group by intermidate result, group_values are struct of long columns + return spark_rapids_jni::group_merge_hyper_log_log_plus_plus( + group_values, num_groups, group_labels, precision, stream, mr); } else { - return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + return spark_rapids_jni::group_hyper_log_log_plus_plus( + group_values, num_groups, group_labels, precision, stream, mr); } } - [[nodiscard]] output_type operator()(host_udf_input const& udf_input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + /** + * Perform the main reduce computation for HLLPP UDF + */ + std::unique_ptr operator()( + column_view const& input, + data_type, /** output_dtype is useless */ + std::optional>, /** init is useless */ + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override { - if constexpr (std::is_same_v) { - // reduce - auto const& input_values = - std::get(udf_input.at(reduction_data_attribute::INPUT_VALUES)); - if (input_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } - if (is_merge) { - // reduce intermidate result, input_values are struct of long columns - return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus( - input_values, precision, stream, mr); - } else { - return spark_rapids_jni::reduce_hyper_log_log_plus_plus( - input_values, precision, stream, mr); - } + if (input.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } + if (is_merge) { + // reduce intermidate result, input are struct of long columns + return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(input, precision, stream, mr); } else { - // groupby - auto const& group_values = - std::get(udf_input.at(groupby_data_attribute::GROUPED_VALUES)); - if (group_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } - auto const group_offsets = std::get>( - udf_input.at(groupby_data_attribute::GROUP_OFFSETS)); - int num_groups = group_offsets.size() - 1; - auto const group_labels = std::get>( - udf_input.at(groupby_data_attribute::GROUP_LABELS)); - if (is_merge) { - // group by intermidate result, group_values are struct of long columns - return spark_rapids_jni::group_merge_hyper_log_log_plus_plus( - group_values, num_groups, group_labels, precision, stream, mr); - } else { - return spark_rapids_jni::group_hyper_log_log_plus_plus( - group_values, num_groups, group_labels, precision, stream, mr); - } + return spark_rapids_jni::reduce_hyper_log_log_plus_plus(input, precision, stream, mr); } } @@ -107,17 +97,17 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { auto children = std::vector>(results_iter, results_iter + num_long_cols); return cudf::make_structs_column(0, - std::move(children), - 0, // null count - rmm::device_buffer{}, // null mask - stream, - mr); + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask + stream, + mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override { auto o = dynamic_cast(&other); - return o != nullptr && o->precision == this->precision; + return o != nullptr && o->precision == this->precision && o->is_merge == this->is_merge; } [[nodiscard]] std::size_t do_hash() const override @@ -138,22 +128,22 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { std::unique_ptr create_hllpp_reduction_host_udf(int precision) { - return std::make_unique>(precision, /*is_merge*/ false); + return std::make_unique(precision, /*is_merge*/ false); } std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision) { - return std::make_unique>(precision, /*is_merge*/ true); + return std::make_unique(precision, /*is_merge*/ true); } std::unique_ptr create_hllpp_groupby_host_udf(int precision) { - return std::make_unique>(precision, /*is_merge*/ false); + return std::make_unique(precision, /*is_merge*/ false); } std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision) { - return std::make_unique>(precision, /*is_merge*/ true); + return std::make_unique(precision, /*is_merge*/ true); } } // namespace spark_rapids_jni From 278d8d9c2ee0b13f0159bc794fe24a9f0ee2b20d Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 15 Jan 2025 21:42:34 +0800 Subject: [PATCH 19/28] Fix compile error --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 15 +-- src/main/cpp/src/hyper_log_log_plus_plus.cu | 23 +---- .../cpp/src/hyper_log_log_plus_plus_const.hpp | 43 ++++++++ .../src/hyper_log_log_plus_plus_host_udf.cu | 98 ++++++++++++------- 4 files changed, 114 insertions(+), 65 deletions(-) create mode 100644 src/main/cpp/src/hyper_log_log_plus_plus_const.hpp diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index 448b40151f..b99bfef3b9 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -18,6 +18,8 @@ #include "hyper_log_log_plus_plus.hpp" #include "hyper_log_log_plus_plus_host_udf.hpp" +#include + extern "C" { JNIEXPORT jlong JNICALL @@ -62,12 +64,13 @@ Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_estimateDistinctValu JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_close( JNIEnv* env, jclass class_object, jlong ptr) -try { - cudf::jni::auto_set_device(env); - auto to_del = reinterpret_cast(ptr); - delete to_del; -} -CATCH_STD(env, ); +{ + try { + cudf::jni::auto_set_device(env); + auto to_del = reinterpret_cast(ptr); + delete to_del; + } + CATCH_STD(env, ); } } // extern "C" diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 1b9cca7a20..d1eb8bf400 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -15,6 +15,7 @@ */ #include "hash.hpp" #include "hyper_log_log_plus_plus.hpp" +#include "hyper_log_log_plus_plus_const.hpp" #include #include @@ -50,28 +51,6 @@ namespace spark_rapids_jni { namespace { -/** - * The number of bits that is required for a HLLPP register value. - * - * This number is determined by the maximum number of leading binary zeros a - * hashcode can produce. This is equal to the number of bits the hashcode - * returns. The current implementation uses a 64-bit hashcode, this means 6-bits - * are (at most) needed to store the number of leading zeros. - */ -constexpr int REGISTER_VALUE_BITS = 6; - -// MASK binary 6 bits: 111-111 -constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; - -// This value is 10, one long stores 10 register values -constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; - -// XXHash seed, consistent with Spark -constexpr int64_t SEED = 42L; - -// max precision, if require a precision bigger than 18, then use 18. -constexpr int MAX_PRECISION = 18; - /** * @brief Get register value from a long which contains 10 register values, * each register value in long is 6 bits. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp new file mode 100644 index 0000000000..717a3f36bb --- /dev/null +++ b/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace spark_rapids_jni { + +/** + * The number of bits that is required for a HLLPP register value. + * + * This number is determined by the maximum number of leading binary zeros a + * hashcode can produce. This is equal to the number of bits the hashcode + * returns. The current implementation uses a 64-bit hashcode, this means 6-bits + * are (at most) needed to store the number of leading zeros. + */ +constexpr int REGISTER_VALUE_BITS = 6; + +// MASK binary 6 bits: 111-111 +constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; + +// This value is 10, one long stores 10 register values +constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; + +// XXHash seed, consistent with Spark +constexpr int64_t SEED = 42L; + +// max precision, if require a precision bigger than 18, then use 18. +constexpr int MAX_PRECISION = 18; + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index f1c5c6243f..c127da5abe 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -14,9 +14,11 @@ * limitations under the License. */ #include "hyper_log_log_plus_plus.hpp" +#include "hyper_log_log_plus_plus_const.hpp" #include "hyper_log_log_plus_plus_host_udf.hpp" #include +#include #include #include #include @@ -39,18 +41,18 @@ namespace spark_rapids_jni { namespace { -struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { - hllpp_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} +struct hllpp_agg_udf : cudf::groupby_host_udf { + hllpp_agg_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} /** * Perform the main groupby computation for HLLPP UDF */ - [[nodiscard]] output_type operator()(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { // groupby auto const& group_values = get_grouped_values(); - if (group_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } + if (group_values.size() == 0) { return get_empty_output(stream, mr); } int num_groups = get_num_groups(); auto const group_labels = get_group_labels(); if (is_merge) { @@ -63,32 +65,11 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { } } - /** - * Perform the main reduce computation for HLLPP UDF - */ - std::unique_ptr operator()( - column_view const& input, - data_type, /** output_dtype is useless */ - std::optional>, /** init is useless */ - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - if (input.size() == 0) { return get_empty_output(std::nullopt, stream, mr); } - if (is_merge) { - // reduce intermidate result, input are struct of long columns - return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(input, precision, stream, mr); - } else { - return spark_rapids_jni::reduce_hyper_log_log_plus_plus(input, precision, stream, mr); - } - } - /** * @brief Create an empty column when the input is empty. */ - [[nodiscard]] output_type get_empty_output( - [[maybe_unused]] std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { int num_registers = 1 << precision; int num_long_cols = num_registers / REGISTERS_PER_LONG + 1; @@ -104,20 +85,63 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { mr); } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override + [[nodiscard]] bool is_equal(cudf::host_udf_base const& other) const override + { + auto o = dynamic_cast(&other); + return o != nullptr && o->precision == this->precision && o->is_merge == this->is_merge; + } + + [[nodiscard]] std::size_t do_hash() const override + { + return 31 * (31 * std::hash{}({"hllpp_agg_udf"}) + precision) + is_merge; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(precision, is_merge); + } + + int precision; + bool is_merge; +}; + +struct hllpp_reduct_udf : cudf::reduce_host_udf { + hllpp_reduct_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} + + /** + * Perform the main reduce computation for HLLPP UDF + */ + std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type, /** output_dtype is useless */ + std::optional>, /** init is useless */ + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + CUDF_EXPECTS(input.size() > 0, + "Hyper Log Log Plus Plus reduction requires input is not empty!"); + if (is_merge) { + // reduce intermidate result, input are struct of long columns + return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(input, precision, stream, mr); + } else { + return spark_rapids_jni::reduce_hyper_log_log_plus_plus(input, precision, stream, mr); + } + } + + [[nodiscard]] bool is_equal(cudf::host_udf_base const& other) const override { - auto o = dynamic_cast(&other); + auto o = dynamic_cast(&other); return o != nullptr && o->precision == this->precision && o->is_merge == this->is_merge; } [[nodiscard]] std::size_t do_hash() const override { - return 31 * (31 * std::hash{}({"hllpp_udf"}) + precision) + is_merge; + return 31 * (31 * std::hash{}({"hllpp_reduct_udf"}) + precision) + is_merge; } - [[nodiscard]] std::unique_ptr clone() const override + [[nodiscard]] std::unique_ptr clone() const override { - return std::make_unique(precision, is_merge); + return std::make_unique(precision, is_merge); } int precision; @@ -128,22 +152,22 @@ struct hllpp_udf : cudf::groupby_host_udf, cudf::reduce_host_udf { std::unique_ptr create_hllpp_reduction_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ false); + return std::make_unique(precision, /*is_merge*/ false); } std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ true); + return std::make_unique(precision, /*is_merge*/ true); } std::unique_ptr create_hllpp_groupby_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ false); + return std::make_unique(precision, /*is_merge*/ false); } std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ true); + return std::make_unique(precision, /*is_merge*/ true); } } // namespace spark_rapids_jni From 86832ae895cd70f0a47828b30fbb4b091d315263 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 16 Jan 2025 13:51:22 +0800 Subject: [PATCH 20/28] Address comments --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 3 - src/main/cpp/src/hyper_log_log_plus_plus.cu | 40 +-- src/main/cpp/src/hyper_log_log_plus_plus.hpp | 10 +- .../cpp/src/hyper_log_log_plus_plus_const.hpp | 18 +- .../src/hyper_log_log_plus_plus_host_udf.cu | 60 +++-- .../src/hyper_log_log_plus_plus_host_udf.hpp | 2 +- .../jni/HyperLogLogPlusPlusHostUDF.java | 248 +++++++++--------- 7 files changed, 190 insertions(+), 191 deletions(-) diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index b99bfef3b9..5d10733657 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -18,8 +18,6 @@ #include "hyper_log_log_plus_plus.hpp" #include "hyper_log_log_plus_plus_host_udf.hpp" -#include - extern "C" { JNIEXPORT jlong JNICALL @@ -66,7 +64,6 @@ JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostU JNIEnv* env, jclass class_object, jlong ptr) { try { - cudf::jni::auto_set_device(env); auto to_del = reinterpret_cast(ptr); delete to_del; } diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index d1eb8bf400..627647ba9c 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -17,23 +17,16 @@ #include "hyper_log_log_plus_plus.hpp" #include "hyper_log_log_plus_plus_const.hpp" -#include #include #include -#include #include #include #include #include -#include #include -#include -#include -#include +#include #include -#include -#include #include #include @@ -43,14 +36,28 @@ #include #include #include -#include -#include -#include namespace spark_rapids_jni { namespace { +/** + * @brief The seed used for the XXHash64 hash function. + * It's consistent with Spark + */ +constexpr int64_t SEED = 42L; + +/** + * @brief 6 binary MASK bits: 111-111 + */ +constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; + +/** + * @brief The maximum precision that can be used for the HLLPP algorithm. + * If input precision is bigger than 18, then use 18. + */ +constexpr int MAX_PRECISION = 18; + /** * @brief Get register value from a long which contains 10 register values, * each register value in long is 6 bits. @@ -67,17 +74,18 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id * @brief Computes HLLPP sketches(register values) from hash values and * partially merge the sketches. * + * Tried to use `reduce_by_key`, but it uses too much of memory, so give up using `reduce_by_key`. + * More details: * `reduce_by_key` uses num_rows_input intermidate cache: * https://github.com/NVIDIA/thrust/blob/2.1.0/thrust/system/detail/generic/reduce_by_key.inl#L112 - * * // scan the values by flag * thrust::detail::temporary_array * scanned_values(exec, n); - * * Each sketch contains multiple integers, by default 512 integers(precision is - * 9), num_rows_input * 512 is huge, so this function uses a differrent approach - * to use less intermidate cache. New approach uses 2 phase merges: partial - * merge and final merge + * 9), num_rows_input * 512 is huge. + * + * This function uses a differrent approach to use less intermidate cache. + * It uses 2 phase merges: partial merge and final merge * * This function splits input into multiple segments with each segment has * num_hashs_per_thread items. The input is sorted by group labels, each segment diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp index 2d08ab2819..c2f567d6f6 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp @@ -25,7 +25,7 @@ namespace spark_rapids_jni { /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * @brief Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) * sketches from hash codes, and merge the sketches in the same group. Output is * a struct column with multiple long columns which is consistent with Spark. */ @@ -38,7 +38,7 @@ std::unique_ptr group_hyper_log_log_plus_plus( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. + * @brief Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. * Input is a struct column with multiple long columns which is consistent with * Spark. */ @@ -51,7 +51,7 @@ std::unique_ptr group_merge_hyper_log_log_plus_plus( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * @brief Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) * sketches from hash codes, and merge all the sketches into one sketch, output * is a struct scalar with multiple long values. */ @@ -62,7 +62,7 @@ std::unique_ptr reduce_hyper_log_log_plus_plus( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one + * @brief Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one * sketch. Input is a struct column with multiple long columns which is * consistent with Spark. Output is a struct scalar with multiple long values. */ @@ -73,7 +73,7 @@ std::unique_ptr reduce_merge_hyper_log_log_plus_plus( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * Estimate count distinct values for the input which contains + * @brief Estimate count distinct values from HyperLogLogPlusPlus(HLLPP) sketches. * Input is a struct column with multiple long columns which is consistent with * Spark. Output is a long column with all values are not null. Spark returns 0 * for null values when doing APPROX_COUNT_DISTINCT. diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp index 717a3f36bb..b26c38320d 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus_const.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace spark_rapids_jni { /** - * The number of bits that is required for a HLLPP register value. + * @brief The number of bits that is required for a HLLPP register value. * * This number is determined by the maximum number of leading binary zeros a * hashcode can produce. This is equal to the number of bits the hashcode @@ -28,16 +28,10 @@ namespace spark_rapids_jni { */ constexpr int REGISTER_VALUE_BITS = 6; -// MASK binary 6 bits: 111-111 -constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; - -// This value is 10, one long stores 10 register values +/** + * @brief The number of registers that can be stored in a single long. + * It's 64 / 6 = 10. + */ constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; -// XXHash seed, consistent with Spark -constexpr int64_t SEED = 42L; - -// max precision, if require a precision bigger than 18, then use 18. -constexpr int MAX_PRECISION = 18; - } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index c127da5abe..2c5d16af1e 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -17,32 +17,15 @@ #include "hyper_log_log_plus_plus_const.hpp" #include "hyper_log_log_plus_plus_host_udf.hpp" -#include -#include -#include #include -#include #include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include namespace spark_rapids_jni { namespace { -struct hllpp_agg_udf : cudf::groupby_host_udf { - hllpp_agg_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} +struct hllpp_groupby_udf : cudf::groupby_host_udf { + hllpp_groupby_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} /** * Perform the main groupby computation for HLLPP UDF @@ -50,8 +33,7 @@ struct hllpp_agg_udf : cudf::groupby_host_udf { [[nodiscard]] std::unique_ptr operator()( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - // groupby - auto const& group_values = get_grouped_values(); + auto const group_values = get_grouped_values(); if (group_values.size() == 0) { return get_empty_output(stream, mr); } int num_groups = get_num_groups(); auto const group_labels = get_group_labels(); @@ -87,20 +69,21 @@ struct hllpp_agg_udf : cudf::groupby_host_udf { [[nodiscard]] bool is_equal(cudf::host_udf_base const& other) const override { - auto o = dynamic_cast(&other); + auto o = dynamic_cast(&other); return o != nullptr && o->precision == this->precision && o->is_merge == this->is_merge; } [[nodiscard]] std::size_t do_hash() const override { - return 31 * (31 * std::hash{}({"hllpp_agg_udf"}) + precision) + is_merge; + return 31 * (31 * std::hash{}({"hllpp_groupby_udf"}) + precision) + is_merge; } [[nodiscard]] std::unique_ptr clone() const override { - return std::make_unique(precision, is_merge); + return std::make_unique(precision, is_merge); } + private: int precision; bool is_merge; }; @@ -108,6 +91,27 @@ struct hllpp_agg_udf : cudf::groupby_host_udf { struct hllpp_reduct_udf : cudf::reduce_host_udf { hllpp_reduct_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {} + /** + * @brief Create an empty scalar when the input is empty. + */ + std::unique_ptr get_empty_scalar(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + int num_registers = 1 << precision; + int num_long_cols = num_registers / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); + auto host_results_view_iter = thrust::make_transform_iterator( + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = std::vector(host_results_view_iter, + host_results_view_iter + num_long_cols); + auto table_view = cudf::table_view{views}; + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); + } + /** * Perform the main reduce computation for HLLPP UDF */ @@ -118,8 +122,7 @@ struct hllpp_reduct_udf : cudf::reduce_host_udf { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - CUDF_EXPECTS(input.size() > 0, - "Hyper Log Log Plus Plus reduction requires input is not empty!"); + if (input.size() == 0) { return get_empty_scalar(stream, mr); } if (is_merge) { // reduce intermidate result, input are struct of long columns return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(input, precision, stream, mr); @@ -144,6 +147,7 @@ struct hllpp_reduct_udf : cudf::reduce_host_udf { return std::make_unique(precision, is_merge); } + private: int precision; bool is_merge; }; @@ -162,12 +166,12 @@ std::unique_ptr create_hllpp_reduction_merge_host_udf(int p std::unique_ptr create_hllpp_groupby_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ false); + return std::make_unique(precision, /*is_merge*/ false); } std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ true); + return std::make_unique(precision, /*is_merge*/ true); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp index 12ce65b386..35e06e132c 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace spark_rapids_jni { diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 22653c92cf..5d6426a5f0 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -14,129 +14,125 @@ * limitations under the License. */ -package com.nvidia.spark.rapids.jni; - -import java.util.Objects; - -import ai.rapids.cudf.ColumnVector; -import ai.rapids.cudf.ColumnView; -import ai.rapids.cudf.HostUDFWrapper; -import ai.rapids.cudf.NativeDepsLoader; - -/** - * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils - */ -public class HyperLogLogPlusPlusHostUDF extends HostUDFWrapper { - static { - NativeDepsLoader.loadNativeDeps(); - } - - public HyperLogLogPlusPlusHostUDF(AggregationType type, int precision) { - super(createHLLPPHostUDF(type, precision)); - this.type = type; - this.precision = precision; - } - - @Override - public int hashCode() { - return Objects.hash(this.getClass().getName(), type, precision); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; - return type == other.type && precision == other.precision; - } - - /** - * HyperLogLogPlusPlus(HLLPP) aggregation types - */ - public enum AggregationType { - - /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) - * sketches from hash codes, and merge all the sketches into one sketch, output - * is a struct scalar with multiple long values. - */ - Reduction(0), - - /** - * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one - * sketch. Input is a struct column with multiple long columns which is - * consistent with Spark. Output is a struct scalar with multiple long values. - */ - ReductionMERGE(1), - - /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) - * sketches from hash codes, and merge the sketches in the same group. Output is - * a struct column with multiple long columns which is consistent with Spark. - */ - GroupBy(2), - - /** - * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. - * Input is a struct column with multiple long columns which is consistent with - * Spark. - */ - GroupByMerge(3); - - final int nativeId; - - AggregationType(int nativeId) { - this.nativeId = nativeId; - } - } - - /** - * Create a HyperLogLogPlusPlus(HLLPP) host UDF - */ - private static long createHLLPPHostUDF(AggregationType type, int precision) { - return createHLLPPHostUDF(type.nativeId, precision); - } - - /** - * Compute the approximate count distinct value from sketch values. - * - * The input is sketch values, must be given in the format: - * `Struct`, - * The value of num_registers_per_sketch = 2^precision - * The children num of this Struct is: num_registers_per_sketch / 10 + 1, - * Here 10 means a INT64 contains 10 register values, - * each register value is 6 bits. - * Register value is the number of leading zero bits in xxhash64 hash code. - * xxhash64 hash code is 64 bits, Register value is 6 bits, - * 6 bits is enough to hold the max value 64. - * - * @param input The sketch column which constains Struct - * values. - * @param precision The num of bits for HLLPP register addressing. - * @return A INT64 column with each value indicates the approximate count - * distinct value. - */ - public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { - return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); - } - - private static native long createHLLPPHostUDF(int type, int precision); - - private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); - - private AggregationType type; - private int precision; - - /** - * TODO: move this to cuDF HostUDFWrapper - */ - @Override - public void close() throws Exception { - close(udfNativeHandle); - } - - /** - * TODO: move this to cuDF HostUDFWrapper - */ - static native void close(long ptr); -} + package com.nvidia.spark.rapids.jni; + + import java.util.Objects; + + import ai.rapids.cudf.ColumnVector; + import ai.rapids.cudf.ColumnView; + import ai.rapids.cudf.HostUDFWrapper; + import ai.rapids.cudf.NativeDepsLoader; + + /** + * HyperLogLogPlusPlus(HLLPP) utility for aggregation, reduction and estimation. One HLLPP sketch is + * composed of several register values. Register value is the number of leading zero bits in + * xxhash64 hash code. xxhash64 hash code is 64 bits, so 6 bits is enough to store the zero number. + * Spark compacts one HLLPP sketch(6 bits register values) into multiple longs, each long stores 10 + * register values. So The sketch values must be a struct column with multiple long columns in it. + * The children num of this Struct is: num_registers_per_sketch / 10 + 1. The value of + * num_registers_per_sketch = pow(2, precision). + */ + public class HyperLogLogPlusPlusHostUDF extends HostUDFWrapper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + public HyperLogLogPlusPlusHostUDF(AggregationType type, int precision) { + super(createHLLPPHostUDF(type, precision)); + this.type = type; + this.precision = precision; + } + + @Override + public int hashCode() { + return Objects.hash(this.getClass().getName(), type, precision); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; + return type == other.type && precision == other.precision; + } + + /** + * HyperLogLogPlusPlus(HLLPP) aggregation/reduction types + */ + public enum AggregationType { + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge all the sketches into one sketch, output + * is a struct scalar with multiple long values. + */ + Reduction(0), + + /** + * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one + * sketch. Input is a struct column with multiple long columns which is + * consistent with Spark. Output is a struct scalar with multiple long values. + */ + ReductionMERGE(1), + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge the sketches in the same group. Output is + * a struct column with multiple long columns which is consistent with Spark. + */ + GroupBy(2), + + /** + * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. + * Input is a struct column with multiple long columns which is consistent with + * Spark. + */ + GroupByMerge(3); + + final int nativeId; + + AggregationType(int nativeId) { + this.nativeId = nativeId; + } + } + + /** + * Create a HyperLogLogPlusPlus(HLLPP) host UDF + */ + private static long createHLLPPHostUDF(AggregationType type, int precision) { + return createHLLPPHostUDF(type.nativeId, precision); + } + + /** + * Compute the approximate count distinct values from sketch values. + * The input is sketch values must be a struct column with multiple long columns in it. + * + * @param input The sketch column which is a struct column with multiple long columns in it. + * @param precision The num of bits for HLLPP register addressing. + * @return A INT64 column with each value indicates the approximate count + * distinct value. + */ + public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { + return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); + } + + private static native long createHLLPPHostUDF(int type, int precision); + + private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); + + private AggregationType type; + private int precision; + + /** + * TODO: move this to cuDF HostUDFWrapper + */ + @Override + public void close() throws Exception { + close(udfNativeHandle); + } + + /** + * TODO: move this to cuDF HostUDFWrapper + */ + static native void close(long ptr); + } From b331db9a5e0e28f8c77cb5c238d78dd101bd96ce Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 16 Jan 2025 14:21:48 +0800 Subject: [PATCH 21/28] Address comments --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 2 +- .../com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 627647ba9c..3e0d8b15cb 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -656,7 +656,7 @@ std::unique_ptr group_merge_hllpp( } /** - * Launch only 1 block, uses max 1M(2^18 *sizeof(int)) shared memory. + * @brief Launch only 1 block, uses max 1M(2^18 *sizeof(int)) shared memory. * For each hash, get a pair: (register index, register value). * Use shared memory to speedup the fetch max atomic operation. */ diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 5d6426a5f0..1f0606ce8a 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -73,7 +73,7 @@ public enum AggregationType { * sketch. Input is a struct column with multiple long columns which is * consistent with Spark. Output is a struct scalar with multiple long values. */ - ReductionMERGE(1), + ReductionMerge(1), /** * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) From 7b929b4078b2b04e56d2a12e4b1eaf061f7167c0 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 16 Jan 2025 14:28:47 +0800 Subject: [PATCH 22/28] Address comments --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 2 +- .../cpp/src/hyper_log_log_plus_plus_host_udf.cu | 16 ++++++++-------- .../cpp/src/hyper_log_log_plus_plus_host_udf.hpp | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index 5d10733657..180836036c 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -41,7 +41,7 @@ Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_createHLLPPHostUDF(J }(); CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); - return reinterpret_cast(udf_ptr.release()); + return reinterpret_cast(udf_ptr); } CATCH_STD(env, 0); } diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu index 2c5d16af1e..339cc2c434 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu @@ -154,24 +154,24 @@ struct hllpp_reduct_udf : cudf::reduce_host_udf { } // namespace -std::unique_ptr create_hllpp_reduction_host_udf(int precision) +cudf::host_udf_base* create_hllpp_reduction_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ false); + return new hllpp_reduct_udf(precision, /*is_merge*/ false); } -std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision) +cudf::host_udf_base* create_hllpp_reduction_merge_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ true); + return new hllpp_reduct_udf(precision, /*is_merge*/ true); } -std::unique_ptr create_hllpp_groupby_host_udf(int precision) +cudf::host_udf_base* create_hllpp_groupby_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ false); + return new hllpp_groupby_udf(precision, /*is_merge*/ false); } -std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision) +cudf::host_udf_base* create_hllpp_groupby_merge_host_udf(int precision) { - return std::make_unique(precision, /*is_merge*/ true); + return new hllpp_groupby_udf(precision, /*is_merge*/ true); } } // namespace spark_rapids_jni diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp index 35e06e132c..08bd24ffcf 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp +++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp @@ -20,12 +20,12 @@ namespace spark_rapids_jni { -std::unique_ptr create_hllpp_reduction_host_udf(int precision); +cudf::host_udf_base* create_hllpp_reduction_host_udf(int precision); -std::unique_ptr create_hllpp_reduction_merge_host_udf(int precision); +cudf::host_udf_base* create_hllpp_reduction_merge_host_udf(int precision); -std::unique_ptr create_hllpp_groupby_host_udf(int precision); +cudf::host_udf_base* create_hllpp_groupby_host_udf(int precision); -std::unique_ptr create_hllpp_groupby_merge_host_udf(int precision); +cudf::host_udf_base* create_hllpp_groupby_merge_host_udf(int precision); } // namespace spark_rapids_jni From 6fcf7e0d622db73f9ee018ea0d7759a3d09b733c Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 16 Jan 2025 15:44:49 +0800 Subject: [PATCH 23/28] Address comments --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 72 ++++++++++----------- 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index 3e0d8b15cb..b0d662d6e2 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -442,15 +442,15 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, }); auto children = std::vector>(results_iter, results_iter + num_long_cols); - auto d_results = [&] { - auto host_results_pointer_iter = - thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = - std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); - }(); + + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + auto d_results = cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + auto result = cudf::make_structs_column(num_groups, std::move(children), 0, // null count @@ -636,15 +636,15 @@ std::unique_ptr group_merge_hllpp( }); auto results = std::vector>(results_iter, results_iter + num_long_cols); - auto d_sketches_output = [&] { - auto host_results_pointer_iter = - thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = - std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); - }(); + + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + auto d_sketches_output = + cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); // 3rd kernel: compact auto num_phase3_threads = num_groups * num_long_cols; @@ -738,16 +738,15 @@ std::unique_ptr reduce_hllpp(cudf::column_view const& input, }); auto children = std::vector>(results_iter, results_iter + num_long_cols); - auto d_results = [&] { - auto host_results_pointer_iter = - thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = - std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async( - host_results_pointers, stream, cudf::get_current_device_resource_ref()); - }(); + + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + auto d_results = cudf::detail::make_device_uvector_async( + host_results_pointers, stream, cudf::get_current_device_resource_ref()); // 2. reduce and generate compacted long values constexpr int64_t block_size = 256; @@ -808,15 +807,14 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, }); auto children = std::vector>(results_iter, results_iter + num_long_cols); - auto d_results = [&] { - auto host_results_pointer_iter = - thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = - std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); - }(); + + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + auto d_results = cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); // execute merge kernel auto num_threads = num_registers_per_sketch; From 024da64e5e249ff99e35b184151ab409b13c6dbe Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 17 Jan 2025 07:39:05 +0800 Subject: [PATCH 24/28] Change make device uvector from async to sync --- src/main/cpp/src/hyper_log_log_plus_plus.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu index b0d662d6e2..9f98286561 100644 --- a/src/main/cpp/src/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu @@ -449,7 +449,7 @@ std::unique_ptr group_hllpp(cudf::column_view const& input, }); auto host_results_pointers = std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - auto d_results = cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + auto d_results = cudf::detail::make_device_uvector_sync(host_results_pointers, stream, mr); auto result = cudf::make_structs_column(num_groups, std::move(children), @@ -604,7 +604,7 @@ std::unique_ptr group_merge_hllpp( auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); auto input_cols = std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = cudf::detail::make_device_uvector_async( + auto d_inputs = cudf::detail::make_device_uvector_sync( input_cols, stream, cudf::get_current_device_resource_ref()); // 1st kernel: partially group partial_group_long_sketches_kernel @@ -644,7 +644,7 @@ std::unique_ptr group_merge_hllpp( auto host_results_pointers = std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); auto d_sketches_output = - cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + cudf::detail::make_device_uvector_sync(host_results_pointers, stream, mr); // 3rd kernel: compact auto num_phase3_threads = num_groups * num_long_cols; @@ -745,7 +745,7 @@ std::unique_ptr reduce_hllpp(cudf::column_view const& input, }); auto host_results_pointers = std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - auto d_results = cudf::detail::make_device_uvector_async( + auto d_results = cudf::detail::make_device_uvector_sync( host_results_pointers, stream, cudf::get_current_device_resource_ref()); // 2. reduce and generate compacted long values @@ -794,7 +794,7 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, auto const input_iter = cudf::detail::make_counting_transform_iterator( 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); auto input_cols = std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = cudf::detail::make_device_uvector_async( + auto d_inputs = cudf::detail::make_device_uvector_sync( input_cols, stream, cudf::get_current_device_resource_ref()); // create one row output @@ -814,7 +814,7 @@ std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, }); auto host_results_pointers = std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); - auto d_results = cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); + auto d_results = cudf::detail::make_device_uvector_sync(host_results_pointers, stream, mr); // execute merge kernel auto num_threads = num_registers_per_sketch; @@ -949,7 +949,7 @@ std::unique_ptr estimate_from_hll_sketches(cudf::column_view const 0, [&](int i) { return input.child(i).begin(); }); auto const h_input_ptrs = std::vector(input_iter, input_iter + input.num_children()); - auto d_inputs = cudf::detail::make_device_uvector_async( + auto d_inputs = cudf::detail::make_device_uvector_sync( h_input_ptrs, stream, cudf::get_current_device_resource_ref()); auto result = cudf::make_numeric_column( cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr); From c9f4bfbd8e1ecdffd0c0d0a4ea50aa4e15eb72d4 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Sat, 18 Jan 2025 21:26:51 +0800 Subject: [PATCH 25/28] Add test case --- src/main/cpp/tests/CMakeLists.txt | 2 + src/main/cpp/tests/hyper_log_log_plus_plus.cu | 277 ++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100644 src/main/cpp/tests/hyper_log_log_plus_plus.cu diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index c774d30618..d6caf9a138 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -82,3 +82,5 @@ ConfigureTest(PARSE_URI ConfigureTest(SUBSTRING_INDEX substring_index.cpp) +ConfigureTest(HLLPP + hyper_log_log_plus_plus.cu) diff --git a/src/main/cpp/tests/hyper_log_log_plus_plus.cu b/src/main/cpp/tests/hyper_log_log_plus_plus.cu new file mode 100644 index 0000000000..bdd4a3fee2 --- /dev/null +++ b/src/main/cpp/tests/hyper_log_log_plus_plus.cu @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int64_col = cudf::test::fixed_width_column_wrapper; + +struct HyperLogLogPlusPlusUDFTest : cudf::test::BaseFixture {}; + +namespace { + +/** + * @brief Concatenate struct scalars into a struct column. + * @param scalar_col_ptrs Pointers to the columns in scalars, this span appends the column pointes + * of scalar one by one, the size of the vector is num_scalars * num_longs_in_scalar. + * @param num_scalars Number of struct scalars + * @param num_longs_in_scalar Number of long columns in each struct scalar + */ +CUDF_KERNEL void concat_struct_scalars_to_struct_column_kernel( + cudf::device_span scalar_col_ptrs, + int num_scalars, + int num_longs_in_scalar, + cudf::device_span output) +{ + for (auto col = 0; col < num_longs_in_scalar; ++col) { + for (auto scalar_idx = 0; scalar_idx < num_scalars; ++scalar_idx) { + auto flattened_col_idx = scalar_idx * num_longs_in_scalar + col; + output[col][scalar_idx] = scalar_col_ptrs[flattened_col_idx][0]; + } + } +} + +/** + * @brief Flatten columns in scalars into a vector. + */ +std::vector get_column_ptrs_from_struct_scalars( + std::vector>& scalars, int num_longs_in_scalar) +{ + std::vector col_ptrs(num_longs_in_scalar * scalars.size()); + int idx = 0; + for (auto const& s : scalars) { + auto const struct_scalar_ptr = dynamic_cast(s.get()); + auto const table_view = struct_scalar_ptr->view(); + for (auto const& col_view : table_view) { + col_ptrs[idx++] = col_view.data(); + } + } + return col_ptrs; +} + +/** + * @brief Make a struct column from multiple scalars with checks: each scalar is a struct(long) + */ +std::unique_ptr make_struct_column_from_scalars( + std::vector>& scalars, + int num_longs_in_scalar, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // asserts + for (auto const& s : scalars) { + EXPECT_EQ(s->type().id(), cudf::type_id::STRUCT); + auto const struct_scalar_ptr = dynamic_cast(s.get()); + auto const table_view = struct_scalar_ptr->view(); + EXPECT_EQ(num_longs_in_scalar, table_view.num_columns()); + for (auto const& col_view : table_view) { + EXPECT_EQ(col_view.type().id(), cudf::type_id::INT64); + } + } + + // get column pointers from struct scalars + auto col_ptrs = get_column_ptrs_from_struct_scalars(scalars, num_longs_in_scalar); + auto d_col_ptrs = cudf::detail::make_device_uvector_sync(col_ptrs, stream, mr); + + // create output columns + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + scalars.size(), // num_rows + cudf::mask_state::UNALLOCATED, + stream, + mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_longs_in_scalar); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + auto d_output = cudf::detail::make_device_uvector_sync(host_results_pointers, stream, mr); + + // concatenate struct scalars into a struct column + concat_struct_scalars_to_struct_column_kernel<<<1, 1, 0, stream.value()>>>( + d_col_ptrs, scalars.size(), num_longs_in_scalar, d_output); + + // create struct column + return cudf::make_structs_column(scalars.size(), // num_rows + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask + stream); +} + +/** + * @brief Make a struct column from a single scalar with checks: each scalar is a struct(long) + */ +std::unique_ptr make_struct_column_from_scalar(std::unique_ptr& scalar, + int num_longs_in_scalar, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + std::vector> scalars; + scalars.push_back(std::move(scalar)); + return make_struct_column_from_scalars(scalars, num_longs_in_scalar, stream, mr); +} + +} // namespace + +TEST_F(HyperLogLogPlusPlusUDFTest, Reduction) +{ + // 1. Create data + auto const vals1 = doubles_col{1.0, 2.0, 3.0, 4.0, 5.0}; + auto const vals2 = doubles_col{6.0, 7.0, 8.0, 9.0, 10.0}; + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // 2. Execute reduce + // There are pow(2, 9) = 512 registers + // Each register stores the num of leading zeros of hash, the max num is 64, + // 6 bits are enough to store the number, so each register is 6 bits. + // All the registers are compacted into 512 / (64 / 6) + 1 = 52 longs + constexpr int precision = 9; + constexpr int num_longs_in_sketch = 512 / 10 + 1; + auto const reduce_agg = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_reduction_host_udf(precision))); + std::vector> reduced_scalars; + for (size_t i = 0; i < 64; i++) { + if (i % 2 == 0) { + auto reduced = + cudf::reduce(vals1, *reduce_agg, cudf::data_type{cudf::type_id::STRUCT}, stream, mr); + EXPECT_TRUE(reduced->is_valid()); + reduced_scalars.push_back(std::move(reduced)); + } else { + auto reduced = + cudf::reduce(vals2, *reduce_agg, cudf::data_type{cudf::type_id::STRUCT}, stream, mr); + EXPECT_TRUE(reduced->is_valid()); + reduced_scalars.push_back(std::move(reduced)); + } + } + + // 3. Merge all the sketches into one sketch + auto const input_for_merge = + make_struct_column_from_scalars(reduced_scalars, num_longs_in_sketch, stream, mr); + auto const merge_agg = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision))); + + auto reduce_merged = + cudf::reduce(*input_for_merge, *merge_agg, cudf::data_type{cudf::type_id::STRUCT}, stream, mr); + EXPECT_TRUE(reduce_merged->is_valid()); + + // 4. Estimate count distinct values from the merged sketch + auto const input_for_estimate = + make_struct_column_from_scalar(reduce_merged, num_longs_in_sketch, stream, mr); + auto const result = + spark_rapids_jni::estimate_from_hll_sketches(*input_for_estimate, precision, stream, mr); + + // 5. check count distinct value + auto const expected = int64_col{10}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(HyperLogLogPlusPlusUDFTest, Groupby) +{ + constexpr int precision = 9; + + // 1. Create data + auto const keys = int64_col{1, 2, 3, 1, 2, 3, 1, 2, 3}; + // Each key in (1, 2, 3) maps to three values: (1.0, 2.0, 3.0) + auto const vals1 = doubles_col{1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0}; + // Each key in (1, 2, 3) maps to three values: (4.0, 5.0, 6.0) + auto const vals2 = doubles_col{4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0}; + // Each key in (1, 2, 3) maps to three values: (7.0, 8.0, 9.0) + auto const vals3 = doubles_col{7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0}; + + // 2. Execute groupby + auto agg1 = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_groupby_host_udf(precision))); + auto agg2 = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_groupby_host_udf(precision))); + auto agg3 = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_groupby_host_udf(precision))); + std::vector agg_requests; + agg_requests.emplace_back(); + agg_requests[0].values = vals1; + agg_requests[0].aggregations.push_back(std::move(agg1)); + agg_requests.emplace_back(); + agg_requests[1].values = vals2; + agg_requests[1].aggregations.push_back(std::move(agg2)); + agg_requests.emplace_back(); + agg_requests[2].values = vals3; + agg_requests[2].aggregations.push_back(std::move(agg3)); + cudf::groupby::groupby gb_obj( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + auto const grp_result = gb_obj.aggregate(agg_requests, cudf::test::get_default_stream()); + // each grouped sketches has 3 rows for keys: 1, 2, 3 + auto const& grouped_sketches_for_vals1 = grp_result.second[0].results[0]->view(); + auto const& grouped_sketches_for_vals2 = grp_result.second[1].results[0]->view(); + auto const& grouped_sketches_for_vals3 = grp_result.second[1].results[0]->view(); + + // 3. Execute merge sketches + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + // each result is 3 rows, concat to 9 rows. + auto const sketches = cudf::concatenate( + std::vector{ + grouped_sketches_for_vals1, grouped_sketches_for_vals2, grouped_sketches_for_vals3}, + stream, + mr); + auto merge_agg = + cudf::make_host_udf_aggregation(std::unique_ptr( + spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision))); + std::vector merge_requests; + merge_requests.emplace_back(); + merge_requests[0].values = sketches->view(); + merge_requests[0].aggregations.push_back(std::move(merge_agg)); + cudf::groupby::groupby gb_obj2( + cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); + auto const grp_result2 = gb_obj2.aggregate(merge_requests, cudf::test::get_default_stream()); + auto const& merged = grp_result2.second[0].results[0]; + + // 4. Estimate + auto const result = spark_rapids_jni::estimate_from_hll_sketches(*merged, precision, stream, mr); + + // 5. Check result + // each key in (1, 2, 3) has 9 distinct values: (1.0, 2.0, ..., 9.0) + // Note: 9 != 6, estimation result is an approximate value, not 100 accurate + auto const expected = int64_col{6, 6, 6}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} From b678c17102dd3d4ab113819fe93db404177ac484 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Sun, 19 Jan 2025 14:02:55 +0800 Subject: [PATCH 26/28] Format and Copyright --- src/main/cpp/tests/CMakeLists.txt | 3 ++- src/main/cpp/tests/hyper_log_log_plus_plus.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index d6caf9a138..89f456b99d 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -84,3 +84,4 @@ ConfigureTest(SUBSTRING_INDEX ConfigureTest(HLLPP hyper_log_log_plus_plus.cu) + diff --git a/src/main/cpp/tests/hyper_log_log_plus_plus.cu b/src/main/cpp/tests/hyper_log_log_plus_plus.cu index bdd4a3fee2..0e3877405a 100644 --- a/src/main/cpp/tests/hyper_log_log_plus_plus.cu +++ b/src/main/cpp/tests/hyper_log_log_plus_plus.cu @@ -127,7 +127,7 @@ std::unique_ptr make_struct_column_from_scalars( d_col_ptrs, scalars.size(), num_longs_in_scalar, d_output); // create struct column - return cudf::make_structs_column(scalars.size(), // num_rows + return cudf::make_structs_column(scalars.size(), // num_rows std::move(children), 0, // null count rmm::device_buffer{}, // null mask From cbc3d125cbb4c707f3265f3c2002352bb58c6678 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 22 Jan 2025 10:45:40 +0800 Subject: [PATCH 27/28] Update according to cuDF UDF instance management change --- .../cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp | 10 - .../jni/HyperLogLogPlusPlusHostUDF.java | 236 +++++++++--------- 2 files changed, 114 insertions(+), 132 deletions(-) diff --git a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp index 180836036c..1e1efc05fe 100644 --- a/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp +++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp @@ -60,14 +60,4 @@ Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_estimateDistinctValu CATCH_STD(env, 0); } -JNIEXPORT void JNICALL Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_close( - JNIEnv* env, jclass class_object, jlong ptr) -{ - try { - auto to_del = reinterpret_cast(ptr); - delete to_del; - } - CATCH_STD(env, ); -} - } // extern "C" diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 1f0606ce8a..5b8456005c 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -14,125 +14,117 @@ * limitations under the License. */ - package com.nvidia.spark.rapids.jni; - - import java.util.Objects; - - import ai.rapids.cudf.ColumnVector; - import ai.rapids.cudf.ColumnView; - import ai.rapids.cudf.HostUDFWrapper; - import ai.rapids.cudf.NativeDepsLoader; - - /** - * HyperLogLogPlusPlus(HLLPP) utility for aggregation, reduction and estimation. One HLLPP sketch is - * composed of several register values. Register value is the number of leading zero bits in - * xxhash64 hash code. xxhash64 hash code is 64 bits, so 6 bits is enough to store the zero number. - * Spark compacts one HLLPP sketch(6 bits register values) into multiple longs, each long stores 10 - * register values. So The sketch values must be a struct column with multiple long columns in it. - * The children num of this Struct is: num_registers_per_sketch / 10 + 1. The value of - * num_registers_per_sketch = pow(2, precision). - */ - public class HyperLogLogPlusPlusHostUDF extends HostUDFWrapper { - static { - NativeDepsLoader.loadNativeDeps(); - } - - public HyperLogLogPlusPlusHostUDF(AggregationType type, int precision) { - super(createHLLPPHostUDF(type, precision)); - this.type = type; - this.precision = precision; - } - - @Override - public int hashCode() { - return Objects.hash(this.getClass().getName(), type, precision); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; - return type == other.type && precision == other.precision; - } - - /** - * HyperLogLogPlusPlus(HLLPP) aggregation/reduction types - */ - public enum AggregationType { - - /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) - * sketches from hash codes, and merge all the sketches into one sketch, output - * is a struct scalar with multiple long values. - */ - Reduction(0), - - /** - * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one - * sketch. Input is a struct column with multiple long columns which is - * consistent with Spark. Output is a struct scalar with multiple long values. - */ - ReductionMerge(1), - - /** - * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) - * sketches from hash codes, and merge the sketches in the same group. Output is - * a struct column with multiple long columns which is consistent with Spark. - */ - GroupBy(2), - - /** - * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. - * Input is a struct column with multiple long columns which is consistent with - * Spark. - */ - GroupByMerge(3); - - final int nativeId; - - AggregationType(int nativeId) { - this.nativeId = nativeId; - } - } - - /** - * Create a HyperLogLogPlusPlus(HLLPP) host UDF - */ - private static long createHLLPPHostUDF(AggregationType type, int precision) { - return createHLLPPHostUDF(type.nativeId, precision); - } - - /** - * Compute the approximate count distinct values from sketch values. - * The input is sketch values must be a struct column with multiple long columns in it. - * - * @param input The sketch column which is a struct column with multiple long columns in it. - * @param precision The num of bits for HLLPP register addressing. - * @return A INT64 column with each value indicates the approximate count - * distinct value. - */ - public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { - return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); - } - - private static native long createHLLPPHostUDF(int type, int precision); - - private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); - - private AggregationType type; - private int precision; - - /** - * TODO: move this to cuDF HostUDFWrapper - */ - @Override - public void close() throws Exception { - close(udfNativeHandle); - } - - /** - * TODO: move this to cuDF HostUDFWrapper - */ - static native void close(long ptr); - } +package com.nvidia.spark.rapids.jni; + +import java.util.Objects; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.ColumnView; +import ai.rapids.cudf.HostUDFWrapper; +import ai.rapids.cudf.NativeDepsLoader; + +/** + * HyperLogLogPlusPlus(HLLPP) utility for aggregation, reduction and estimation. One HLLPP sketch is + * composed of several register values. Register value is the number of leading zero bits in + * xxhash64 hash code. xxhash64 hash code is 64 bits, so 6 bits is enough to store the zero number. + * Spark compacts one HLLPP sketch(6 bits register values) into multiple longs, each long stores 10 + * register values. So The sketch values must be a struct column with multiple long columns in it. + * The children num of this Struct is: num_registers_per_sketch / 10 + 1. The value of + * num_registers_per_sketch = pow(2, precision). + */ +public class HyperLogLogPlusPlusHostUDF extends HostUDFWrapper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + public HyperLogLogPlusPlusHostUDF(AggregationType type, int precision) { + this.type = type; + this.precision = precision; + } + + @Override + public long createUDFInstance() { + return createHLLPPHostUDF(type, precision); + } + + @Override + public int computeHashCode() { + return Objects.hash(this.getClass().getName(), type, precision); + } + + @Override + public boolean isEquals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; + return type == other.type && precision == other.precision; + } + + /** + * HyperLogLogPlusPlus(HLLPP) aggregation/reduction types + */ + public enum AggregationType { + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge all the sketches into one sketch, output + * is a struct scalar with multiple long values. + */ + Reduction(0), + + /** + * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one + * sketch. Input is a struct column with multiple long columns which is + * consistent with Spark. Output is a struct scalar with multiple long values. + */ + ReductionMerge(1), + + /** + * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP) + * sketches from hash codes, and merge the sketches in the same group. Output is + * a struct column with multiple long columns which is consistent with Spark. + */ + GroupBy(2), + + /** + * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group. + * Input is a struct column with multiple long columns which is consistent with + * Spark. + */ + GroupByMerge(3); + + final int nativeId; + + AggregationType(int nativeId) { + this.nativeId = nativeId; + } + } + + /** + * Create a HyperLogLogPlusPlus(HLLPP) host UDF + */ + private static long createHLLPPHostUDF(AggregationType type, int precision) { + return createHLLPPHostUDF(type.nativeId, precision); + } + + /** + * Compute the approximate count distinct values from sketch values. + * The input is sketch values must be a struct column with multiple long columns in it. + * + * @param input The sketch column which is a struct column with multiple long columns in it. + * @param precision The num of bits for HLLPP register addressing. + * @return A INT64 column with each value indicates the approximate count + * distinct value. + */ + public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { + return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); + } + + private static native long createHLLPPHostUDF(int type, int precision); + + private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); + + private AggregationType type; + private int precision; +} + \ No newline at end of file From 78871a49eba10663b490b23487829939da5f9660 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Fri, 24 Jan 2025 13:15:38 +0800 Subject: [PATCH 28/28] Minor change: fix compile error --- .../nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java index 5b8456005c..1ed95a0147 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java @@ -53,7 +53,7 @@ public int computeHashCode() { } @Override - public boolean isEquals(Object o) { + public boolean isEqual(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; HyperLogLogPlusPlusHostUDF other = (HyperLogLogPlusPlusHostUDF) o; @@ -127,4 +127,3 @@ public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, i private AggregationType type; private int precision; } - \ No newline at end of file