NVIDIA · res-life · Mar 13, 2025 · Oct 16, 2024 · Nov 4, 2024 · Nov 19, 2024
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
@@ -196,6 +196,7 @@ add_library(
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/HostTableJni.cpp
+  src/HLLPPJni.cpp
   src/JSONUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
@@ -204,6 +205,7 @@ add_library(
   src/SparkResourceAdaptorJni.cpp
   src/SubStringIndexJni.cpp
   src/ZOrderJni.cpp
+  src/HLLPP.cu
   src/bloom_filter.cu
   src/case_when.cu
   src/cast_decimal_to_string.cu

diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "HLLPP.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/detail/hyperloglog/finalizer.cuh>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace spark_rapids_jni {
+
+namespace {
+
+// The number of bits required by register value. Register value stores num of zeros.
+// XXHash64 value is 64 bits, it's safe to use 6 bits to store a register value.
+constexpr int REGISTER_VALUE_BITS = 6;
+
+// MASK binary 6 bits: 111111
+constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L;
+
+// One long stores 10 register values
+constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS;
+
+__device__ inline int get_register_value(int64_t const long_10_registers, int reg_idx)
+{
+  int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx);
+  int64_t v          = (long_10_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx);
+  return static_cast<int>(v);
+}
+
+struct estimate_fn {
+  cudf::device_span<int64_t const*> sketch_longs;
+  int const precision;
+  int64_t* const out;
+
+  __device__ void operator()(cudf::size_type const idx) const
+  {
+    auto const num_regs = 1ull << precision;
+    double sum          = 0;
+    int zeroes          = 0;
+
+    for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) {
+      // each long contains 10 register values
+      int long_col_idx    = reg_idx / REGISTERS_PER_LONG;
+      int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG;
+      int reg             = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long);
+      sum += double{1} / static_cast<double>(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision);
+    out[idx]            = finalize(sum, zeroes);
+  }
+};
+
+}  // end anonymous namespace
+
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const& input,
+                                                         int precision,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4.");
+  auto const input_iter = cudf::detail::make_counting_transform_iterator(
+    0, [&](int i) { return input.child(i).begin<int64_t>(); });
+  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
-  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
-  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+  auto const h_input_ptrs = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto const d_input_ptrs   = cudf::detail::make_device_uvector_async(input_cols, stream, cudf::get_current_device_resource_ref());
-  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
-  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+  auto const h_input_ptrs = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto const d_input_ptrs   = cudf::detail::make_device_uvector_async(input_cols, stream, cudf::get_current_device_resource_ref());
+  auto result     = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream);
+  // evaluate from struct<long, ..., long>
+  thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                     thrust::make_counting_iterator(0),
+                     input.size(),
+                     estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
+  return result;
+}
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/HLLPP.hpp b/src/main/cpp/src/HLLPP.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
-#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
-#include <rmm/resource_ref.hpp>
-#include <rmm/resource_ref.hpp>
+
+namespace spark_rapids_jni {
+
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(
+  cudf::column_view const& input,
+  int precision,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/HLLPPJni.cpp b/src/main/cpp/src/HLLPPJni.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HLLPP.hpp"
+#include "cudf_jni_apis.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPP_estimateDistinctValueFromSketches(
+  JNIEnv* env, jclass, jlong sketches, jint precision)
+{
+  JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const sketch_view = reinterpret_cast<cudf::column_view const*>(sketches);
+    return cudf::jni::ptr_as_jlong(
+      spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release());
+  }
+  CATCH_STD(env, 0);
+}
+}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.NativeDepsLoader;
+
+/**
+ * HyperLogLogPlusPlus
+ */
+public class HLLPP {
-public class HLLPP {
+public class AggregationUtils {
-public class HLLPP {
+public class AggregationUtils {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Compute the approximate count distinct value from sketch values.
+   * <p>
+   * The input sketch values must be given in the format `Struct<INT64, INT64, ...>`,
+   * The num of children is: num_registers_per_sketch / 10 + 1, here 10 means a INT64 contains
+   * max 10 registers. Register value is 6 bits. The input is columnar data, e.g.: sketch 0
+   * is composed of by all the data of the children at index 0.
+   *
+   * @param input         The sketch column which constains Struct<INT64, INT64, ...> values.
+   * @param precision     The num of bits for addressing.
+   * @return A INT64 column with each value indicates the approximate count distinct value.
+   */
+  public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) {
+    return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision));
+  }
+
+  private static native long estimateDistinctValueFromSketches(long inputHandle, int precision);
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java
@@ -0,0 +1,37 @@
+/*
+* Copyright (c)  2024, NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.GroupByAggregation;
+import ai.rapids.cudf.Table;
+
+import org.junit.jupiter.api.Test;
+
+
+public class HLLPPTest {
+
+  @Test
+  void testGroupByHLL() {
+    // A trivial test:
+    try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2)
+             .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9)
+             .build()){
+        input.groupBy(0).aggregate(GroupByAggregation.HLLPP(0)
+               .onColumn(1));
+    }
+  }
+}