From e385fda9173f99b18f2d5c0cb87eb4b3e9c9e2f4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 6 Sep 2023 14:25:32 -0700
Subject: [PATCH 01/93] Add `COUNT_FREQUENCY` and `MERGE_FREQUENCY`
 aggregations

---
 cpp/include/cudf/aggregation.hpp | 70 ++++++++++++++++----------------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index d319041f8b1..12c6dc1cad7 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -83,40 +83,42 @@ class aggregation {
    * @brief Possible aggregation operations
    */
   enum Kind {
-    SUM,             ///< sum reduction
-    PRODUCT,         ///< product reduction
-    MIN,             ///< min reduction
-    MAX,             ///< max reduction
-    COUNT_VALID,     ///< count number of valid elements
-    COUNT_ALL,       ///< count number of elements
-    ANY,             ///< any reduction
-    ALL,             ///< all reduction
-    SUM_OF_SQUARES,  ///< sum of squares reduction
-    MEAN,            ///< arithmetic mean reduction
-    M2,              ///< sum of squares of differences from the mean
-    VARIANCE,        ///< variance
-    STD,             ///< standard deviation
-    MEDIAN,          ///< median reduction
-    QUANTILE,        ///< compute specified quantile(s)
-    ARGMAX,          ///< Index of max element
-    ARGMIN,          ///< Index of min element
-    NUNIQUE,         ///< count number of unique elements
-    NTH_ELEMENT,     ///< get the nth element
-    ROW_NUMBER,      ///< get row-number of current index (relative to rolling window)
-    RANK,            ///< get rank of current index
-    COLLECT_LIST,    ///< collect values into a list
-    COLLECT_SET,     ///< collect values into a list without duplicate entries
-    LEAD,            ///< window function, accesses row at specified offset following current row
-    LAG,             ///< window function, accesses row at specified offset preceding current row
-    PTX,             ///< PTX  UDF based reduction
-    CUDA,            ///< CUDA UDF based reduction
-    MERGE_LISTS,     ///< merge multiple lists values into one list
-    MERGE_SETS,      ///< merge multiple lists values into one list then drop duplicate entries
-    MERGE_M2,        ///< merge partial values of M2 aggregation,
-    COVARIANCE,      ///< covariance between two sets of elements
-    CORRELATION,     ///< correlation between two sets of elements
-    TDIGEST,         ///< create a tdigest from a set of input values
-    MERGE_TDIGEST    ///< create a tdigest by merging multiple tdigests together
+    SUM,              ///< sum reduction
+    PRODUCT,          ///< product reduction
+    MIN,              ///< min reduction
+    MAX,              ///< max reduction
+    COUNT_VALID,      ///< count number of valid elements
+    COUNT_ALL,        ///< count number of elements
+    COUNT_FREQUENCY,  ///< count frequency of each element
+    ANY,              ///< any reduction
+    ALL,              ///< all reduction
+    SUM_OF_SQUARES,   ///< sum of squares reduction
+    MEAN,             ///< arithmetic mean reduction
+    M2,               ///< sum of squares of differences from the mean
+    VARIANCE,         ///< variance
+    STD,              ///< standard deviation
+    MEDIAN,           ///< median reduction
+    QUANTILE,         ///< compute specified quantile(s)
+    ARGMAX,           ///< Index of max element
+    ARGMIN,           ///< Index of min element
+    NUNIQUE,          ///< count number of unique elements
+    NTH_ELEMENT,      ///< get the nth element
+    ROW_NUMBER,       ///< get row-number of current index (relative to rolling window)
+    RANK,             ///< get rank of current index
+    COLLECT_LIST,     ///< collect values into a list
+    COLLECT_SET,      ///< collect values into a list without duplicate entries
+    LEAD,             ///< window function, accesses row at specified offset following current row
+    LAG,              ///< window function, accesses row at specified offset preceding current row
+    PTX,              ///< PTX  UDF based reduction
+    CUDA,             ///< CUDA UDF based reduction
+    MERGE_LISTS,      ///< merge multiple lists values into one list
+    MERGE_SETS,       ///< merge multiple lists values into one list then drop duplicate entries
+    MERGE_M2,         ///< merge partial values of M2 aggregation,
+    MERGE_FREQUENCY,  ///< merge partial values of COUNT_FREQUENCY aggregation,
+    COVARIANCE,       ///< covariance between two sets of elements
+    CORRELATION,      ///< correlation between two sets of elements
+    TDIGEST,          ///< create a tdigest from a set of input values
+    MERGE_TDIGEST     ///< create a tdigest by merging multiple tdigests together
   };
 
   aggregation() = delete;

From e3df8d465cdcda5f4ba59e5c233999797fe6f916 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 6 Sep 2023 16:57:04 -0700
Subject: [PATCH 02/93] Change the new aggregations to `HISTOGRAM` and
 `MERGE_HISTOGRAM`

---
 cpp/include/cudf/aggregation.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 12c6dc1cad7..8645247f298 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -89,7 +89,7 @@ class aggregation {
     MAX,              ///< max reduction
     COUNT_VALID,      ///< count number of valid elements
     COUNT_ALL,        ///< count number of elements
-    COUNT_FREQUENCY,  ///< count frequency of each element
+    HISTOGRAM,        ///< compute frequency of each element
     ANY,              ///< any reduction
     ALL,              ///< all reduction
     SUM_OF_SQUARES,   ///< sum of squares reduction
@@ -114,7 +114,7 @@ class aggregation {
     MERGE_LISTS,      ///< merge multiple lists values into one list
     MERGE_SETS,       ///< merge multiple lists values into one list then drop duplicate entries
     MERGE_M2,         ///< merge partial values of M2 aggregation,
-    MERGE_FREQUENCY,  ///< merge partial values of COUNT_FREQUENCY aggregation,
+    MERGE_HISTOGRAM,  ///< merge partial values of HISTOGRAM aggregation,
     COVARIANCE,       ///< covariance between two sets of elements
     CORRELATION,      ///< correlation between two sets of elements
     TDIGEST,          ///< create a tdigest from a set of input values

From 7bc7f91566892c565002d709b67feace105f768e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 6 Sep 2023 21:03:09 -0700
Subject: [PATCH 03/93] Update copyright year

---
 cpp/include/cudf/aggregation.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 8645247f298..359c53dff60 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 0fd200085d331e2a9412ddbca47bb6b163cd827a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 6 Sep 2023 22:02:24 -0700
Subject: [PATCH 04/93] Implement interface for the new aggregations

---
 .../cudf/detail/aggregation/aggregation.hpp   | 44 +++++++++++++++++++
 cpp/src/aggregation/aggregation.cpp           | 42 ++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 4d3984cab93..345977384f3 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -45,6 +45,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class max_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class count_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class histogram_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class any_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -89,6 +91,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class merge_sets_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_m2_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(
+    data_type col_type, class merge_histogram_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class covariance_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -108,6 +112,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class min_aggregation const& agg);
   virtual void visit(class max_aggregation const& agg);
   virtual void visit(class count_aggregation const& agg);
+  virtual void visit(class histogram_aggregation const& agg);
   virtual void visit(class any_aggregation const& agg);
   virtual void visit(class all_aggregation const& agg);
   virtual void visit(class sum_of_squares_aggregation const& agg);
@@ -130,6 +135,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class merge_lists_aggregation const& agg);
   virtual void visit(class merge_sets_aggregation const& agg);
   virtual void visit(class merge_m2_aggregation const& agg);
+  virtual void visit(class merge_histogram_aggregation const& agg);
   virtual void visit(class covariance_aggregation const& agg);
   virtual void visit(class correlation_aggregation const& agg);
   virtual void visit(class tdigest_aggregation const& agg);
@@ -251,6 +257,25 @@ class count_aggregation final : public rolling_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying a HISTOGRAM aggregation (groupby and reduction)
+ */
+class histogram_aggregation final : public groupby_aggregation, public reduce_aggregation {
+ public:
+  histogram_aggregation() : aggregation(HISTOGRAM) {}
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<histogram_aggregation>(*this);
+  }
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived class for specifying an any aggregation
  */
@@ -972,6 +997,25 @@ class merge_m2_aggregation final : public groupby_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying a MERGE_HISTOGRAM aggregation (groupby and reduction)
+ */
+class merge_histogram_aggregation final : public groupby_aggregation, public reduce_aggregation {
+ public:
+  explicit merge_histogram_aggregation() : aggregation{MERGE_HISTOGRAM} {}
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<merge_histogram_aggregation>(*this);
+  }
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived aggregation class for specifying COVARIANCE aggregation
  */
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 2e6a643484e..b3f2a774a60 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -64,6 +64,12 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }
 
+std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
+  data_type col_type, histogram_aggregation const& agg)
+{
+  return visit(col_type, static_cast<aggregation const&>(agg));
+}
+
 std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   data_type col_type, any_aggregation const& agg)
 {
@@ -196,6 +202,12 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }
 
+std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
+  data_type col_type, merge_histogram_aggregation const& agg)
+{
+  return visit(col_type, static_cast<aggregation const&>(agg));
+}
+
 std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   data_type col_type, covariance_aggregation const& agg)
 {
@@ -246,6 +258,10 @@ void aggregation_finalizer::visit(count_aggregation const& agg)
 {
   visit(static_cast<aggregation const&>(agg));
 }
+void aggregation_finalizer::visit(histogram_aggregation const& agg)
+{
+  visit(static_cast<aggregation const&>(agg));
+}
 
 void aggregation_finalizer::visit(any_aggregation const& agg)
 {
@@ -357,6 +373,11 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg)
   visit(static_cast<aggregation const&>(agg));
 }
 
+void aggregation_finalizer::visit(merge_histogram_aggregation const& agg)
+{
+  visit(static_cast<aggregation const&>(agg));
+}
+
 void aggregation_finalizer::visit(covariance_aggregation const& agg)
 {
   visit(static_cast<aggregation const&>(agg));
@@ -460,6 +481,16 @@ template std::unique_ptr<groupby_aggregation> make_count_aggregation<groupby_agg
 template std::unique_ptr<groupby_scan_aggregation> make_count_aggregation<groupby_scan_aggregation>(
   null_policy null_handling);
 
+/// Factory to create a HISTOGRAM aggregation
+template <typename Base>
+std::unique_ptr<Base> make_histogram_aggregation()
+{
+  return std::make_unique<detail::histogram_aggregation>();
+}
+template std::unique_ptr<aggregation> make_histogram_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation> make_histogram_aggregation<groupby_aggregation>();
+template std::unique_ptr<reduce_aggregation> make_histogram_aggregation<reduce_aggregation>();
+
 /// Factory to create a ANY aggregation
 template <typename Base>
 std::unique_ptr<Base> make_any_aggregation()
@@ -764,6 +795,17 @@ std::unique_ptr<Base> make_merge_m2_aggregation()
 template std::unique_ptr<aggregation> make_merge_m2_aggregation<aggregation>();
 template std::unique_ptr<groupby_aggregation> make_merge_m2_aggregation<groupby_aggregation>();
 
+/// Factory to create a MERGE_HISTOGRAM aggregation
+template <typename Base>
+std::unique_ptr<Base> make_merge_histogram_aggregation()
+{
+  return std::make_unique<detail::merge_histogram_aggregation>();
+}
+template std::unique_ptr<aggregation> make_merge_histogram_aggregation<aggregation>();
+template std::unique_ptr<groupby_aggregation>
+make_merge_histogram_aggregation<groupby_aggregation>();
+template std::unique_ptr<reduce_aggregation> make_merge_histogram_aggregation<reduce_aggregation>();
+
 /// Factory to create a COVARIANCE aggregation
 template <typename Base>
 std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods, size_type ddof)

From 1b04436990377028c8f78bbd25742bc02808d757 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 7 Sep 2023 10:33:12 -0700
Subject: [PATCH 05/93] Add new files

---
 cpp/CMakeLists.txt                      |  2 ++
 cpp/src/groupby/sort/group_histogram.cu | 19 ++++++++++++++
 cpp/src/reductions/histogram.cu         | 34 +++++++++++++++++++++++++
 cpp/src/reductions/histogram.cuh        | 23 +++++++++++++++++
 4 files changed, 78 insertions(+)
 create mode 100644 cpp/src/groupby/sort/group_histogram.cu
 create mode 100644 cpp/src/reductions/histogram.cu
 create mode 100644 cpp/src/reductions/histogram.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 516865e5782..a8e45b70572 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -323,6 +323,7 @@ add_library(
   src/groupby/sort/group_collect.cu
   src/groupby/sort/group_correlation.cu
   src/groupby/sort/group_count.cu
+  src/groupby/sort/group_histogram.cu
   src/groupby/sort/group_m2.cu
   src/groupby/sort/group_max.cu
   src/groupby/sort/group_min.cu
@@ -469,6 +470,7 @@ add_library(
   src/reductions/all.cu
   src/reductions/any.cu
   src/reductions/collect_ops.cu
+  src/reductions/histogram.cu
   src/reductions/max.cu
   src/reductions/mean.cu
   src/reductions/min.cu
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
new file mode 100644
index 00000000000..9eb09738ac4
--- /dev/null
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+namespace cudf::groupby::detail {
+}  // namespace cudf::groupby::detail
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
new file mode 100644
index 00000000000..5bfed5965f3
--- /dev/null
+++ b/cpp/src/reductions/histogram.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <reductions/histogram.cuh>
+
+#include <cudf/column/column.hpp>
+
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+namespace cudf::reduction::detail {
+
+std::unique_ptr<cudf::column> histogram(column_view const& col,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::mr::device_memory_resource* mr)
+{
+  return nullptr;
+}
+
+}  // namespace cudf::reduction::detail
diff --git a/cpp/src/reductions/histogram.cuh b/cpp/src/reductions/histogram.cuh
new file mode 100644
index 00000000000..5951b91a964
--- /dev/null
+++ b/cpp/src/reductions/histogram.cuh
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+
+namespace cudf::reduction::detail {
+
+}  // namespace cudf::reduction::detail

From 1977d696a523c6cd4eb26a24b50a1aca0ad83099 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 11 Sep 2023 09:17:31 -0700
Subject: [PATCH 06/93] Add skeleton APIs

---
 .../reduction/detail/reduction_functions.hpp  | 22 +++++
 cpp/src/groupby/sort/aggregate.cpp            | 24 +++++
 cpp/src/groupby/sort/group_histogram.cu       | 98 +++++++++++++++++++
 cpp/src/groupby/sort/group_reductions.hpp     | 36 +++++++
 cpp/src/reductions/histogram.cu               | 14 ++-
 cpp/src/reductions/reductions.cpp             |  3 +
 6 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 014a6ba70eb..34c1720aba8 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -131,6 +131,28 @@ std::unique_ptr<scalar> all(column_view const& col,
                             rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Computes the frequency of each element in the input column (HISTOGRAM reduction)
+ *
+ * If all elements in input column are null, output scalar is null.
+ */
+std::unique_ptr<scalar> histogram(column_view const& col,
+                                  data_type const output_dtype,
+                                  std::optional<std::reference_wrapper<scalar const>> init,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Merges partial results of the HISTOGRAM aggregation (MERGE_HISTOGRAM reduction)
+ *
+ * If all elements in input column are null, output scalar is null.
+ */
+std::unique_ptr<scalar> merge_histogram(column_view const& col,
+                                        data_type const output_dtype,
+                                        std::optional<std::reference_wrapper<scalar const>> init,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Computes product of elements in input column
  *
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 3f977dc81d7..f59f2ab0271 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -89,6 +89,18 @@ void aggregate_result_functor::operator()<aggregation::COUNT_ALL>(aggregation co
     detail::group_count_all(helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
 }
 
+template <>
+void aggregate_result_functor::operator()<aggregation::HISTOGRAM>(aggregation const& agg)
+{
+  if (cache.has_result(values, agg)) return;
+
+  cache.add_result(
+    values,
+    agg,
+    detail::group_histogram(
+      get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr));
+}
+
 template <>
 void aggregate_result_functor::operator()<aggregation::SUM>(aggregation const& agg)
 {
@@ -534,6 +546,18 @@ void aggregate_result_functor::operator()<aggregation::MERGE_M2>(aggregation con
       get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
 }
 
+template <>
+void aggregate_result_functor::operator()<aggregation::MERGE_HISTOGRAM>(aggregation const& agg)
+{
+  if (cache.has_result(values, agg)) { return; }  // result already cached for this column
+  // group_merge_histogram takes per-row group labels (sized like values), not group offsets
+  cache.add_result(
+    values,
+    agg,
+    detail::group_merge_histogram(
+      get_grouped_values(), helper.group_labels(stream), helper.num_groups(stream), stream, mr));
+}
+
 /**
  * @brief Creates column views with only valid elements in both input column views
  *
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 9eb09738ac4..5123a9fb500 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -14,6 +14,104 @@
  * limitations under the License.
  */
 
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/adjacent_difference.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/reduce.h>
 
 namespace cudf::groupby::detail {
+std::unique_ptr<column> group_histogram(column_view const& values,
+                                        cudf::device_span<size_type const> group_labels,
+                                        size_type num_groups,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
+  CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
+               "Size of values column should be same as that of group labels");
+  // Output holds one size_type entry per group and is created without a null mask.
+  auto result = make_numeric_column(
+    data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
+
+  if (num_groups == 0) { return result; }
+
+  if (values.nullable()) {
+    auto values_view = column_device_view::create(values, stream);
+
+    // make_validity_iterator yields bool, and adding bools saturates at true (1+1=1),
+    // so cast every validity flag to size_type first to obtain a real per-group count
+    auto bitmask_iterator =
+      thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view),
+                                      [] __device__(auto b) { return static_cast<size_type>(b); });
+
+    thrust::reduce_by_key(rmm::exec_policy(stream),
+                          group_labels.begin(),
+                          group_labels.end(),
+                          bitmask_iterator,
+                          thrust::make_discard_iterator(),
+                          result->mutable_view().begin<size_type>());
+  } else {  // not nullable: every row contributes 1 to its group's total
+    thrust::reduce_by_key(rmm::exec_policy(stream),
+                          group_labels.begin(),
+                          group_labels.end(),
+                          thrust::make_constant_iterator(1),
+                          thrust::make_discard_iterator(),
+                          result->mutable_view().begin<size_type>());
+  }
+
+  return result;
+}
+
+std::unique_ptr<column> group_merge_histogram(column_view const& values,
+                                              cudf::device_span<size_type const> group_labels,
+                                              size_type num_groups,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
+  CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
+               "Size of values column should be same as that of group labels");
+  // Output holds one size_type entry per group and is created without a null mask.
+  auto result = make_numeric_column(
+    data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
+
+  if (num_groups == 0) { return result; }
+
+  if (values.nullable()) {
+    auto values_view = column_device_view::create(values, stream);
+
+    // make_validity_iterator yields bool, and adding bools saturates at true (1+1=1),
+    // so cast every validity flag to size_type first to obtain a real per-group count
+    auto bitmask_iterator =
+      thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view),
+                                      [] __device__(auto b) { return static_cast<size_type>(b); });
+
+    thrust::reduce_by_key(rmm::exec_policy(stream),
+                          group_labels.begin(),
+                          group_labels.end(),
+                          bitmask_iterator,
+                          thrust::make_discard_iterator(),
+                          result->mutable_view().begin<size_type>());
+  } else {  // not nullable: every row contributes 1 to its group's total
+    thrust::reduce_by_key(rmm::exec_policy(stream),
+                          group_labels.begin(),
+                          group_labels.end(),
+                          thrust::make_constant_iterator(1),
+                          thrust::make_discard_iterator(),
+                          result->mutable_view().begin<size_type>());
+  }
+
+  return result;
+}
+
 }  // namespace cudf::groupby::detail
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index fc24b679db5..8acf046324b 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -216,6 +216,23 @@ std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);
+/**
+ * @brief Internal API for the HISTOGRAM groupby aggregation.
+ *
+ * @note Skeleton: the current definition counts the valid values in each group.
+ * TODO: replace with the real histogram contract and a pseudo-code example.
+ *
+ * @param values Grouped values to get valid count of
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param num_groups Number of groups ( unique values in @p group_labels )
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<column> group_histogram(column_view const& values,
+                                        cudf::device_span<size_type const> group_labels,
+                                        size_type num_groups,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Internal API to calculate sum of squares of differences from means.
@@ -441,6 +458,25 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
                                        size_type num_groups,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Internal API for the MERGE_HISTOGRAM groupby aggregation.
+ *
+ * @note Skeleton: the current definition counts the valid values in each group.
+ * TODO: replace with the real merge contract and a pseudo-code example.
+ *
+ * @param values Grouped values to get valid count of
+ * @param group_labels ID of group that the corresponding value belongs to
+ * @param num_groups Number of groups ( unique values in @p group_labels )
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<column> group_merge_histogram(column_view const& values,
+                                              cudf::device_span<size_type const> group_labels,
+                                              size_type num_groups,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Internal API to find covariance of child columns of a non-nullable struct column.
  *
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 5bfed5965f3..24e9624cc31 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -18,15 +18,21 @@
 
 #include <cudf/column/column.hpp>
 
-
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
 namespace cudf::reduction::detail {
 
-std::unique_ptr<cudf::column> histogram(column_view const& col,
-                                          rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+std::unique_ptr<cudf::column> histogram(column_view const& input,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return nullptr;
+}
+
+std::unique_ptr<cudf::column> merge_histogram(column_view const& input,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
 {
   return nullptr;
 }
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 2fef8aa8785..d6793d85ea6 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -59,6 +59,9 @@ struct reduce_dispatch_functor {
       case aggregation::MAX: return max(col, output_dtype, init, stream, mr);
       case aggregation::ANY: return any(col, output_dtype, init, stream, mr);
       case aggregation::ALL: return all(col, output_dtype, init, stream, mr);
+      case aggregation::HISTOGRAM: return histogram(col, output_dtype, init, stream, mr);
+      case aggregation::MERGE_HISTOGRAM:
+        return merge_histogram(col, output_dtype, init, stream, mr);
       case aggregation::SUM_OF_SQUARES: return sum_of_squares(col, output_dtype, stream, mr);
       case aggregation::MEAN: return mean(col, output_dtype, stream, mr);
       case aggregation::VARIANCE: {

From 6fa93fcdff5e9d1fb1bb39beef9dbb47b8aff4aa Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 11 Sep 2023 12:00:52 -0700
Subject: [PATCH 07/93] Extract hash_reduce_by_row

---
 cpp/CMakeLists.txt                            |   1 +
 .../reduction/detail/reduction_functions.hpp  |   3 -
 cpp/src/reductions/hash_reduce_by_row.cu      |  86 +++++++++++++
 cpp/src/reductions/hash_reduce_by_row.cuh     | 116 ++++++++++++++++++
 cpp/src/reductions/histogram.cu               |  14 +++
 cpp/src/reductions/reductions.cpp             |   5 +-
 cpp/src/stream_compaction/distinct.cu         |  20 +--
 cpp/src/stream_compaction/distinct_reduce.cu  |  59 +++------
 cpp/src/stream_compaction/distinct_reduce.cuh |   2 +-
 9 files changed, 247 insertions(+), 59 deletions(-)
 create mode 100644 cpp/src/reductions/hash_reduce_by_row.cu
 create mode 100644 cpp/src/reductions/hash_reduce_by_row.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a8e45b70572..a8c107e740f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -471,6 +471,7 @@ add_library(
   src/reductions/any.cu
   src/reductions/collect_ops.cu
   src/reductions/histogram.cu
+  src/reductions/hash_reduce_by_row.cu
   src/reductions/max.cu
   src/reductions/mean.cu
   src/reductions/min.cu
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 34c1720aba8..804b79593da 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -138,7 +138,6 @@ std::unique_ptr<scalar> all(column_view const& col,
  */
 std::unique_ptr<scalar> histogram(column_view const& col,
                                   data_type const output_dtype,
-                                  std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
@@ -148,8 +147,6 @@ std::unique_ptr<scalar> histogram(column_view const& col,
  * If all elements in input column are null, output scalar is null.
  */
 std::unique_ptr<scalar> merge_histogram(column_view const& col,
-                                        data_type const output_dtype,
-                                        std::optional<std::reference_wrapper<scalar const>> init,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);
 
diff --git a/cpp/src/reductions/hash_reduce_by_row.cu b/cpp/src/reductions/hash_reduce_by_row.cu
new file mode 100644
index 00000000000..b93a35d058a
--- /dev/null
+++ b/cpp/src/reductions/hash_reduce_by_row.cu
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hash_reduce_by_row.cuh"
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/uninitialized_fill.h>
+
+namespace cudf::detail {
+
+#if 0
+rmm::device_uvector<size_type> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
+               "This function should not be called with KEEP_ANY");
+
+  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
+
+  thrust::uninitialized_fill(rmm::exec_policy(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             reduction_init_value(keep));
+
+  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+
+  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const reduce_by_row = [&](auto const value_comp) {
+    if (has_nested_columns) {
+      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        reduce_by_row_fn{
+          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
+    } else {
+      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        reduce_by_row_fn{
+          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
+    }
+  };
+
+  if (nans_equal == nan_equality::ALL_EQUAL) {
+    using nan_equal_comparator =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
+    reduce_by_row(nan_equal_comparator{});
+  } else {
+    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
+    reduce_by_row(nan_unequal_comparator{});
+  }
+
+  return reduction_results;
+}
+#endif
+
+}  // namespace cudf::detail
diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
new file mode 100644
index 00000000000..b69846c807d
--- /dev/null
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stream_compaction/stream_compaction_common.cuh>
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <memory>
+
+namespace cudf::detail {
+
+/**
+ * @brief Perform a reduction on groups of rows that compare equal.
+ *
+ * This is essentially a reduce-by-key operation in which the keys are non-contiguous rows that
+ * compare equal. A hash table is used to find groups of equal rows.
+ *
+ * Depending on the `keep` parameter, the reduction operation for each row group is:
+ * - If `keep == KEEP_FIRST`: min of row indices in the group.
+ * - If `keep == KEEP_LAST`: max of row indices in the group.
+ * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
+ *
+ * At the beginning of the operation, the entire output array is filled with a value given by
+ * the `reduction_init_value()` function. Then, the reduction result for each row group is written
+ * into the output array at the index of an unspecified row in the group.
+ *
+ * @param map The auxiliary map to perform reduction
+ * @param preprocessed_input The preprocessed input table used for row hashing and comparisons
+ * @param num_rows The number of all input rows
+ * @param has_nulls Indicates whether the input rows have any nulls at any nested levels
+ * @param has_nested_columns Indicates whether the input table has any nested columns
+ * @param keep The parameter to determine what type of reduction to perform
+ * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN elements should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return A device_uvector containing the reduction results
+ */
+rmm::device_uvector<size_type> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Base functor for a reduce-by-key operation whose keys are rows that compare equal.
+ *
+ * TODO: We need to switch to use `static_reduction_map` when it is ready
+ * (https://github.com/NVIDIA/cuCollections/pull/98).
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+struct reduce_by_row_fn {
+  MapView const d_map;
+  KeyHasher const d_hasher;
+  KeyEqual const d_equal;
+  OutputType* const d_output;
+
+  reduce_by_row_fn(MapView const& d_map,
+                   KeyHasher const& d_hasher,
+                   KeyEqual const& d_equal,
+                   OutputType* const d_output)
+    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
+  {
+  }
+
+ protected:
+  __device__ OutputType* get_output_ptr(size_type const idx) const
+  {
+    auto const iter = d_map.find(idx, d_hasher, d_equal);
+
+    if (iter != d_map.end()) {
+      // Only one index out of each set of duplicate rows can be inserted into the map.
+      // As such, looking up any index of those duplicate rows always returns the same value.
+      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
+
+      // All duplicate rows will have concurrent access to this same output slot.
+      return &d_output[inserted_idx];
+    } else {
+      // All input `idx` values have been inserted into the map before.
+      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
+      // `d_equal(idx, idx) == false`.
+      // Such situations arise when comparing nulls or NaNs that are treated as always unequal.
+      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
+      // output slot.
+      return &d_output[idx];
+    }
+  }
+};
+
+}  // namespace cudf::detail
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 24e9624cc31..053ad62180b 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -21,12 +21,20 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <optional>
+
 namespace cudf::reduction::detail {
 
 std::unique_ptr<cudf::column> histogram(column_view const& input,
+                                        data_type const output_dtype,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(cudf::is_integral(output_dtype),
+               "The output type of histogram aggregation must be an integral type.");
+
+
+
   return nullptr;
 }
 
@@ -34,6 +42,12 @@ std::unique_ptr<cudf::column> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(
+    input.type().id() == type_id::STRUCT && input.num_children() == 2,
+    "The input of merge_histogram aggregation must be a struct column having two children.");
+  CUDF_EXPECTS(cudf::is_integral(input.child(1).type()),
+               "The second child of the input column must be an integer type.");
+
   return nullptr;
 }
 
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d6793d85ea6..8d19413190b 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -59,9 +59,8 @@ struct reduce_dispatch_functor {
       case aggregation::MAX: return max(col, output_dtype, init, stream, mr);
       case aggregation::ANY: return any(col, output_dtype, init, stream, mr);
       case aggregation::ALL: return all(col, output_dtype, init, stream, mr);
-      case aggregation::HISTOGRAM: return histogram(col, output_dtype, init, stream, mr);
-      case aggregation::MERGE_HISTOGRAM:
-        return merge_histogram(col, output_dtype, init, stream, mr);
+      case aggregation::HISTOGRAM: return histogram(col, output_dtype, stream, mr);
+      case aggregation::MERGE_HISTOGRAM: return merge_histogram(col, stream, mr);
       case aggregation::SUM_OF_SQUARES: return sum_of_squares(col, output_dtype, stream, mr);
       case aggregation::MEAN: return mean(col, output_dtype, stream, mr);
       case aggregation::VARIANCE: {
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index cc60b2a12ea..8b0710372a6 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -96,16 +96,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = hash_reduce_by_row(map,
-                                                    std::move(preprocessed_input),
-                                                    input.num_rows(),
-                                                    has_nulls,
-                                                    has_nested_columns,
-                                                    keep,
-                                                    nulls_equal,
-                                                    nans_equal,
-                                                    stream,
-                                                    rmm::mr::get_current_device_resource());
+  auto const reduction_results = distinct_reduce(map,
+                                                 std::move(preprocessed_input),
+                                                 input.num_rows(),
+                                                 has_nulls,
+                                                 has_nested_columns,
+                                                 keep,
+                                                 nulls_equal,
+                                                 nans_equal,
+                                                 stream,
+                                                 rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 020e6a495bc..7562a174ebb 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -16,6 +16,9 @@
 
 #include "distinct_reduce.cuh"
 
+#include <reductions/hash_reduce_by_row.cuh>
+
+
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/uninitialized_fill.h>
@@ -24,31 +27,26 @@ namespace cudf::detail {
 
 namespace {
 /**
- * @brief A functor to perform reduce-by-key with keys are rows that compared equal.
+ * @brief Functor applying the reduction selected by `keep` to each group of equal rows.
  *
- * TODO: We need to switch to use `static_reduction_map` when it is ready
- * (https://github.com/NVIDIA/cuCollections/pull/98).
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct reduce_by_row_fn {
-  MapView const d_map;
-  KeyHasher const d_hasher;
-  KeyEqual const d_equal;
+struct distinct_reduce_fn : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_type> {
   duplicate_keep_option const keep;
-  size_type* const d_output;
-
-  reduce_by_row_fn(MapView const& d_map,
-                   KeyHasher const& d_hasher,
-                   KeyEqual const& d_equal,
-                   duplicate_keep_option const keep,
-                   size_type* const d_output)
-    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output}
+
+  distinct_reduce_fn(MapView const& d_map,
+                     KeyHasher const& d_hasher,
+                     KeyEqual const& d_equal,
+                     duplicate_keep_option const keep,
+                     size_type* const d_output)
+    : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_type>(d_map, d_hasher, d_equal, d_output),
+      keep{keep}
   {
   }
 
   __device__ void operator()(size_type const idx) const
   {
-    auto const out_ptr = get_output_ptr(idx);
+    auto const out_ptr = this->get_output_ptr(idx);
 
     if (keep == duplicate_keep_option::KEEP_FIRST) {
       // Store the smallest index of all rows that are equal.
@@ -61,34 +59,11 @@ struct reduce_by_row_fn {
       atomicAdd(out_ptr, size_type{1});
     }
   }
-
- private:
-  __device__ size_type* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
-
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
-  }
 };
 
 }  // namespace
 
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<size_type> distinct_reduce(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,
@@ -122,7 +97,7 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
         rmm::exec_policy(stream),
         thrust::make_counting_iterator(0),
         thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
+        distinct_reduce_fn{
           map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
     } else {
       auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
@@ -130,7 +105,7 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
         rmm::exec_policy(stream),
         thrust::make_counting_iterator(0),
         thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
+        distinct_reduce_fn{
           map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
     }
   };
diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_reduce.cuh
index 8ec1fa18205..74fba8196f4 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cuh
+++ b/cpp/src/stream_compaction/distinct_reduce.cuh
@@ -72,7 +72,7 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results
  */
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<size_type> distinct_reduce(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,

From d11dd7f195e5cc71c3ad8b2d6ba780f039e3a48c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 11 Sep 2023 22:28:39 -0700
Subject: [PATCH 08/93] Adopt `hash_reduce_by_row` in `distinct_reduce`

---
 cpp/CMakeLists.txt                           |  1 -
 cpp/src/reductions/hash_reduce_by_row.cu     | 86 --------------------
 cpp/src/reductions/hash_reduce_by_row.cuh    | 55 +++++++++++++
 cpp/src/stream_compaction/distinct_reduce.cu | 78 ++++++++----------
 4 files changed, 89 insertions(+), 131 deletions(-)
 delete mode 100644 cpp/src/reductions/hash_reduce_by_row.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a8c107e740f..a8e45b70572 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -471,7 +471,6 @@ add_library(
   src/reductions/any.cu
   src/reductions/collect_ops.cu
   src/reductions/histogram.cu
-  src/reductions/hash_reduce_by_row.cu
   src/reductions/max.cu
   src/reductions/mean.cu
   src/reductions/min.cu
diff --git a/cpp/src/reductions/hash_reduce_by_row.cu b/cpp/src/reductions/hash_reduce_by_row.cu
deleted file mode 100644
index b93a35d058a..00000000000
--- a/cpp/src/reductions/hash_reduce_by_row.cu
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "hash_reduce_by_row.cuh"
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/uninitialized_fill.h>
-
-namespace cudf::detail {
-
-#if 0
-rmm::device_uvector<size_type> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
-
-  thrust::uninitialized_fill(rmm::exec_policy(stream),
-                             reduction_results.begin(),
-                             reduction_results.end(),
-                             reduction_init_value(keep));
-
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
-}
-#endif
-
-}  // namespace cudf::detail
diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
index b69846c807d..c64f65f30b7 100644
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -113,4 +113,59 @@ struct reduce_by_row_fn {
   }
 };
 
+template <typename ReduceFuncBuilder, typename OutputType>
+rmm::device_uvector<OutputType> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  ReduceFuncBuilder func_builder,
+  OutputType init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
+  // Seed every slot with `init`; the functors built below receive this buffer as their output.
+  thrust::uninitialized_fill(
+    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
+
+  auto const map_dview  = map.get_device_view();
+  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+
+  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+  // The runtime `has_nested_columns` flag selects the compile-time `equal_to<...>` variant.
+  auto const reduce_by_row = [&](auto const value_comp) {
+    if (has_nested_columns) {
+      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    } else {
+      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    }
+  };
+  // Choose the value comparator according to `nans_equal`, then run the reduction with it.
+  if (nans_equal == nan_equality::ALL_EQUAL) {
+    using nan_equal_comparator =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
+    reduce_by_row(nan_equal_comparator{});
+  } else {
+    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
+    reduce_by_row(nan_unequal_comparator{});
+  }
+
+  return reduction_results;
+}
+
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 7562a174ebb..8b51ccc4026 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -18,7 +18,6 @@
 
 #include <reductions/hash_reduce_by_row.cuh>
 
-
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/uninitialized_fill.h>
@@ -61,6 +60,19 @@ struct distinct_reduce_fn : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_
   }
 };
 
+template <duplicate_keep_option keep>  // `keep` is fixed per builder type at compile time
+struct reduce_func_builder {  // factory producing `distinct_reduce_fn` instances
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  static auto build(MapView const& d_map,
+                    KeyHasher const& d_hasher,
+                    KeyEqual const& d_equal,
+                    size_type* const d_output)
+  {
+    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
+      d_map, d_hasher, d_equal, keep, d_output};  // `keep` is forwarded as a runtime member
+  }
+};
+
 }  // namespace
 
 rmm::device_uvector<size_type> distinct_reduce(
@@ -75,51 +87,29 @@ rmm::device_uvector<size_type> distinct_reduce(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
-
-  thrust::uninitialized_fill(rmm::exec_policy(stream),
-                             reduction_results.begin(),
-                             reduction_results.end(),
-                             reduction_init_value(keep));
-
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        distinct_reduce_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        distinct_reduce_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    }
+  auto const hash_reduce = [&](auto const& func_builder) {
+    return hash_reduce_by_row(map,
+                              preprocessed_input,
+                              num_rows,
+                              has_nulls,
+                              has_nested_columns,
+                              nulls_equal,
+                              nans_equal,
+                              func_builder,
+                              reduction_init_value(keep),
+                              stream,
+                              mr);
   };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
+  switch (keep) {
+    case duplicate_keep_option::KEEP_FIRST:
+      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_FIRST>{});
+    case duplicate_keep_option::KEEP_LAST:
+      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_LAST>{});
+    case duplicate_keep_option::KEEP_NONE:
+      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_NONE>{});
+    default:  //  KEEP_ANY
+      CUDF_FAIL("This function should not be called with KEEP_ANY");
   }
-
-  return reduction_results;
 }
 
 }  // namespace cudf::detail

From e58f3e33224e4dcd59707c100e0d976fee2fce9e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 09:55:22 -0700
Subject: [PATCH 09/93] Rename struct and simplify code

---
 cpp/src/reductions/hash_reduce_by_row.cuh    | 10 ++--
 cpp/src/stream_compaction/distinct_reduce.cu | 53 +++++++++-----------
 2 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
index c64f65f30b7..2566cee6c7f 100644
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -75,16 +75,16 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
  * (https://github.com/NVIDIA/cuCollections/pull/98).
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
-struct reduce_by_row_fn {
+struct reduce_by_row_fn_base {
   MapView const d_map;
   KeyHasher const d_hasher;
   KeyEqual const d_equal;
   OutputType* const d_output;
 
-  reduce_by_row_fn(MapView const& d_map,
-                   KeyHasher const& d_hasher,
-                   KeyEqual const& d_equal,
-                   OutputType* const d_output)
+  reduce_by_row_fn_base(MapView const& d_map,
+                        KeyHasher const& d_hasher,
+                        KeyEqual const& d_equal,
+                        OutputType* const d_output)
     : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
   {
   }
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 8b51ccc4026..0b621f87fbf 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -30,7 +30,7 @@ namespace {
  *
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct distinct_reduce_fn : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_type> {
+struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
   duplicate_keep_option const keep;
 
   distinct_reduce_fn(MapView const& d_map,
@@ -38,7 +38,8 @@ struct distinct_reduce_fn : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_
                      KeyEqual const& d_equal,
                      duplicate_keep_option const keep,
                      size_type* const d_output)
-    : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_type>(d_map, d_hasher, d_equal, d_output),
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>(
+        d_map, d_hasher, d_equal, d_output),
       keep{keep}
   {
   }
@@ -60,13 +61,14 @@ struct distinct_reduce_fn : reduce_by_row_fn<MapView, KeyHasher, KeyEqual, size_
   }
 };
 
-template <duplicate_keep_option keep>
 struct reduce_func_builder {
+  duplicate_keep_option keep;
+
   template <typename MapView, typename KeyHasher, typename KeyEqual>
-  static auto build(MapView const& d_map,
-                    KeyHasher const& d_hasher,
-                    KeyEqual const& d_equal,
-                    size_type* const d_output)
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
   {
     return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
       d_map, d_hasher, d_equal, keep, d_output};
@@ -87,29 +89,20 @@ rmm::device_uvector<size_type> distinct_reduce(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto const hash_reduce = [&](auto const& func_builder) {
-    return hash_reduce_by_row(map,
-                              preprocessed_input,
-                              num_rows,
-                              has_nulls,
-                              has_nested_columns,
-                              nulls_equal,
-                              nans_equal,
-                              func_builder,
-                              reduction_init_value(keep),
-                              stream,
-                              mr);
-  };
-  switch (keep) {
-    case duplicate_keep_option::KEEP_FIRST:
-      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_FIRST>{});
-    case duplicate_keep_option::KEEP_LAST:
-      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_LAST>{});
-    case duplicate_keep_option::KEEP_NONE:
-      return hash_reduce(reduce_func_builder<duplicate_keep_option::KEEP_NONE>{});
-    default:  //  KEEP_ANY
-      CUDF_FAIL("This function should not be called with KEEP_ANY");
-  }
+  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
+               "This function should not be called with KEEP_ANY");
+
+  return hash_reduce_by_row(map,
+                            preprocessed_input,
+                            num_rows,
+                            has_nulls,
+                            has_nested_columns,
+                            nulls_equal,
+                            nans_equal,
+                            reduce_func_builder{keep},
+                            reduction_init_value(keep),
+                            stream,
+                            mr);
 }
 
 }  // namespace cudf::detail

From 3cf194824e64952bac314762a35d0746ce5c4e68 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 10:04:52 -0700
Subject: [PATCH 10/93] Refactor `hash_reduce_by_row`

---
 cpp/src/reductions/hash_reduce_by_row.cuh     | 171 ++++++++++++++++++
 cpp/src/stream_compaction/distinct.cu         |  20 +-
 cpp/src/stream_compaction/distinct_reduce.cu  | 114 ++++--------
 cpp/src/stream_compaction/distinct_reduce.cuh |   2 +-
 4 files changed, 218 insertions(+), 89 deletions(-)
 create mode 100644 cpp/src/reductions/hash_reduce_by_row.cuh

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
new file mode 100644
index 00000000000..2566cee6c7f
--- /dev/null
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stream_compaction/stream_compaction_common.cuh>
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <memory>
+
+namespace cudf::detail {
+
+/**
+ * @brief Perform a reduction on groups of rows that are compared equal.
+ *
+ * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
+ * equal. A hash table is used to find groups of equal rows.
+ *
+ * Depending on the `keep` parameter, the reduction operation for each row group is:
+ * - If `keep == KEEP_FIRST`: min of row indices in the group.
+ * - If `keep == KEEP_LAST`: max of row indices in the group.
+ * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
+ *
+ * At the beginning of the operation, the entire output array is filled with a value given by
+ * the `reduction_init_value()` function. Then, the reduction result for each row group is written
+ * into the output array at the index of an unspecified row in the group.
+ *
+ * @param map The auxiliary map to perform reduction
+ * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
+ *        comparisons
+ * @param num_rows The number of all input rows
+ * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
+ * @param has_nested_columns Indicates whether the input table has any nested columns
+ * @param keep The parameter to determine what type of reduction to perform
+ * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return A device_uvector containing the reduction results
+ */
+rmm::device_uvector<size_type> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief A functor to perform reduce-by-key with keys are rows that compared equal.
+ *
+ * TODO: We need to switch to use `static_reduction_map` when it is ready
+ * (https://github.com/NVIDIA/cuCollections/pull/98).
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+struct reduce_by_row_fn_base {
+  MapView const d_map;
+  KeyHasher const d_hasher;
+  KeyEqual const d_equal;
+  OutputType* const d_output;
+
+  reduce_by_row_fn_base(MapView const& d_map,
+                        KeyHasher const& d_hasher,
+                        KeyEqual const& d_equal,
+                        OutputType* const d_output)
+    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
+  {
+  }
+
+ protected:
+  __device__ OutputType* get_output_ptr(size_type const idx) const
+  {
+    auto const iter = d_map.find(idx, d_hasher, d_equal);
+
+    if (iter != d_map.end()) {
+      // Only one index value of the duplicate rows could be inserted into the map.
+      // As such, looking up for all indices of duplicate rows always returns the same value.
+      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
+
+      // All duplicate rows will have concurrent access to this same output slot.
+      return &d_output[inserted_idx];
+    } else {
+      // All input `idx` values have been inserted into the map before.
+      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
+      // `d_equal(idx, idx) == false`.
+      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
+      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
+      // output slot.
+      return &d_output[idx];
+    }
+  }
+};
+
+template <typename ReduceFuncBuilder, typename OutputType>
+rmm::device_uvector<size_type> hash_reduce_by_row(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  ReduceFuncBuilder func_builder,
+  OutputType init,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
+
+  thrust::uninitialized_fill(
+    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
+
+  auto const map_dview  = map.get_device_view();
+  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+
+  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const reduce_by_row = [&](auto const value_comp) {
+    if (has_nested_columns) {
+      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    } else {
+      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
+      thrust::for_each(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
+    }
+  };
+
+  if (nans_equal == nan_equality::ALL_EQUAL) {
+    using nan_equal_comparator =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
+    reduce_by_row(nan_equal_comparator{});
+  } else {
+    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
+    reduce_by_row(nan_unequal_comparator{});
+  }
+
+  return reduction_results;
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index cc60b2a12ea..8b0710372a6 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -96,16 +96,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = hash_reduce_by_row(map,
-                                                    std::move(preprocessed_input),
-                                                    input.num_rows(),
-                                                    has_nulls,
-                                                    has_nested_columns,
-                                                    keep,
-                                                    nulls_equal,
-                                                    nans_equal,
-                                                    stream,
-                                                    rmm::mr::get_current_device_resource());
+  auto const reduction_results = distinct_reduce(map,
+                                                 std::move(preprocessed_input),
+                                                 input.num_rows(),
+                                                 has_nulls,
+                                                 has_nested_columns,
+                                                 keep,
+                                                 nulls_equal,
+                                                 nans_equal,
+                                                 stream,
+                                                 rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 020e6a495bc..0b621f87fbf 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -16,6 +16,8 @@
 
 #include "distinct_reduce.cuh"
 
+#include <reductions/hash_reduce_by_row.cuh>
+
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/uninitialized_fill.h>
@@ -24,31 +26,27 @@ namespace cudf::detail {
 
 namespace {
 /**
- * @brief A functor to perform reduce-by-key with keys are rows that compared equal.
+ * @brief
  *
- * TODO: We need to switch to use `static_reduction_map` when it is ready
- * (https://github.com/NVIDIA/cuCollections/pull/98).
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct reduce_by_row_fn {
-  MapView const d_map;
-  KeyHasher const d_hasher;
-  KeyEqual const d_equal;
+struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
   duplicate_keep_option const keep;
-  size_type* const d_output;
 
-  reduce_by_row_fn(MapView const& d_map,
-                   KeyHasher const& d_hasher,
-                   KeyEqual const& d_equal,
-                   duplicate_keep_option const keep,
-                   size_type* const d_output)
-    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, keep{keep}, d_output{d_output}
+  distinct_reduce_fn(MapView const& d_map,
+                     KeyHasher const& d_hasher,
+                     KeyEqual const& d_equal,
+                     duplicate_keep_option const keep,
+                     size_type* const d_output)
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>(
+        d_map, d_hasher, d_equal, d_output),
+      keep{keep}
   {
   }
 
   __device__ void operator()(size_type const idx) const
   {
-    auto const out_ptr = get_output_ptr(idx);
+    auto const out_ptr = this->get_output_ptr(idx);
 
     if (keep == duplicate_keep_option::KEEP_FIRST) {
       // Store the smallest index of all rows that are equal.
@@ -61,34 +59,25 @@ struct reduce_by_row_fn {
       atomicAdd(out_ptr, size_type{1});
     }
   }
+};
 
- private:
-  __device__ size_type* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
+struct reduce_func_builder {
+  duplicate_keep_option keep;
 
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
+  {
+    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
+      d_map, d_hasher, d_equal, keep, d_output};
   }
 };
 
 }  // namespace
 
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<size_type> distinct_reduce(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,
@@ -103,48 +92,17 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
   CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
                "This function should not be called with KEEP_ANY");
 
-  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
-
-  thrust::uninitialized_fill(rmm::exec_policy(stream),
-                             reduction_results.begin(),
-                             reduction_results.end(),
-                             reduction_init_value(keep));
-
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        reduce_by_row_fn{
-          map.get_device_view(), key_hasher, key_equal, keep, reduction_results.begin()});
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
+  return hash_reduce_by_row(map,
+                            preprocessed_input,
+                            num_rows,
+                            has_nulls,
+                            has_nested_columns,
+                            nulls_equal,
+                            nans_equal,
+                            reduce_func_builder{keep},
+                            reduction_init_value(keep),
+                            stream,
+                            mr);
 }
 
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_reduce.cuh
index 8ec1fa18205..74fba8196f4 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cuh
+++ b/cpp/src/stream_compaction/distinct_reduce.cuh
@@ -72,7 +72,7 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results
  */
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<size_type> distinct_reduce(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,

From 84886467639b5303572f2e85402b3db075e36cbd Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 10:54:46 -0700
Subject: [PATCH 11/93] Rewrite `hash_reduce_by_row.cuh`

---
 cpp/src/reductions/hash_reduce_by_row.cuh | 91 ++++++++++-------------
 1 file changed, 40 insertions(+), 51 deletions(-)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
index 2566cee6c7f..d30e96bc9d2 100644
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -16,8 +16,6 @@
 
 #include <stream_compaction/stream_compaction_common.cuh>
 
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
 
@@ -25,57 +23,22 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <memory>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/uninitialized_fill.h>
 
 namespace cudf::detail {
 
 /**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * Depending on the `keep` parameter, the reduction operation for each row group is:
- * - If `keep == KEEP_FIRST`: min of row indices in the group.
- * - If `keep == KEEP_LAST`: max of row indices in the group.
- * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `reduction_init_value()` function. Then, the reduction result for each row group is written
- * into the output array at the index of an unspecified row in the group.
- *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-rmm::device_uvector<size_type> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
-
-/**
- * @brief A functor to perform reduce-by-key with keys are rows that compared equal.
+ * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are
+ * rows that compared equal.
  *
  * TODO: We need to switch to use `static_reduction_map` when it is ready
  * (https://github.com/NVIDIA/cuCollections/pull/98).
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
 struct reduce_by_row_fn_base {
+ protected:
   MapView const d_map;
   KeyHasher const d_hasher;
   KeyEqual const d_equal;
@@ -89,13 +52,18 @@ struct reduce_by_row_fn_base {
   {
   }
 
- protected:
+  /**
+   * @brief Return a pointer to the output array at the given index.
+   *
+   * @param idx The access index
+   * @return A pointer to the given index in the output array
+   */
   __device__ OutputType* get_output_ptr(size_type const idx) const
   {
     auto const iter = d_map.find(idx, d_hasher, d_equal);
 
     if (iter != d_map.end()) {
-      // Only one index value of the duplicate rows could be inserted into the map.
+      // Only one (undetermined) index value of the duplicate rows could be inserted into the map.
       // As such, looking up for all indices of duplicate rows always returns the same value.
       auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
 
@@ -113,6 +81,29 @@ struct reduce_by_row_fn_base {
   }
 };
 
+/**
+ * @brief Perform a reduction on groups of rows that are compared equal.
+ *
+ * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
+ * equal. A hash table is used to find groups of equal rows.
+ *
+ * At the beginning of the operation, the entire output array is filled with a value given by
+ * the `init` parameter. Then, the reduction result for each row group is written into the output
+ * array at the index of an unspecified row in the group.
+ *
+ * @param map The auxiliary map to perform reduction
+ * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
+ *        comparisons
+ * @param num_rows The number of all input rows
+ * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
+ * @param has_nested_columns Indicates whether the input table has any nested columns
+ * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ *        considered equal.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return A device_uvector containing the reduction results
+ */
 template <typename ReduceFuncBuilder, typename OutputType>
 rmm::device_uvector<size_type> hash_reduce_by_row(
   hash_map_type const& map,
@@ -127,16 +118,14 @@ rmm::device_uvector<size_type> hash_reduce_by_row(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
-
-  thrust::uninitialized_fill(
-    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
-
   auto const map_dview  = map.get_device_view();
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
   auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
+  thrust::uninitialized_fill(
+    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
 
   auto const reduce_by_row = [&](auto const value_comp) {
     if (has_nested_columns) {

From 1994684b77c43fc81dd33e4b564a87b7c3b84a9c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 10:55:57 -0700
Subject: [PATCH 12/93] Rename and rewrite `distinct_reduce.hpp`

---
 .../{distinct_reduce.cuh => distinct_reduce.hpp}       | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
 rename cpp/src/stream_compaction/{distinct_reduce.cuh => distinct_reduce.hpp} (93%)

diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_reduce.hpp
similarity index 93%
rename from cpp/src/stream_compaction/distinct_reduce.cuh
rename to cpp/src/stream_compaction/distinct_reduce.hpp
index 74fba8196f4..236b6c860c3 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cuh
+++ b/cpp/src/stream_compaction/distinct_reduce.hpp
@@ -14,18 +14,14 @@
  * limitations under the License.
  */
 
-#include "stream_compaction_common.cuh"
+#include "stream_compaction_common.hpp"
 
-#include <cudf/column/column_device_view.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <memory>
 
 namespace cudf::detail {
 
@@ -56,6 +52,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * - If `keep == KEEP_LAST`: max of row indices in the group.
  * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
  *
+ * Note that this function is not needed when `keep == KEEP_NONE`.
+ *
  * At the beginning of the operation, the entire output array is filled with a value given by
  * the `reduction_init_value()` function. Then, the reduction result for each row group is written
  * into the output array at the index of an unspecified row in the group.
@@ -68,6 +66,8 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param has_nested_columns Indicates whether the input table has any nested columns
  * @param keep The parameter to determine what type of reduction to perform
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ *        considered equal.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results

From 5dcbac9dec96e4525369029bb23774020f9b5c1e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:00:16 -0700
Subject: [PATCH 13/93] Rewrite `distinct.cu`

---
 cpp/src/stream_compaction/distinct.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 8b0710372a6..b551df96765 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,7 +14,8 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.cuh"
+#include "distinct_reduce.hpp"
+#include "stream_compaction_common.cuh"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/gather.hpp>

From 6236fcc9551685b8da497c0b86eda08769615f61 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:10:14 -0700
Subject: [PATCH 14/93] Rewrite `distinct_reduce.cu`

---
 cpp/src/stream_compaction/distinct_reduce.cu | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 0b621f87fbf..24926cdbd4a 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -14,20 +14,15 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.cuh"
+#include "distinct_reduce.hpp"
 
 #include <reductions/hash_reduce_by_row.cuh>
 
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/uninitialized_fill.h>
-
 namespace cudf::detail {
 
 namespace {
 /**
- * @brief
- *
+ * @brief The functor to find the first/last/none duplicate row for rows that compared equal.
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
 struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
@@ -61,6 +56,10 @@ struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual,
   }
 };
 
+/**
+ * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
+ * value of the `duplicate_keep_option` member variable.
+ */
 struct reduce_func_builder {
   duplicate_keep_option keep;
 

From 42a778f18dc35a843b869080283b67fc5ec4da1f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:10:23 -0700
Subject: [PATCH 15/93] Rewrite `hash_reduce_by_row.cuh`

---
 cpp/src/reductions/hash_reduce_by_row.cuh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
index d30e96bc9d2..1cff009b17b 100644
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -91,6 +91,9 @@ struct reduce_by_row_fn_base {
  * the `init` parameter. Then, the reduction result for each row group is written into the output
  * array at the index of an unspecified row in the group.
  *
+ * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a
+ *         reduction functor derived from `reduce_by_row_fn_base`
+ * @tparam OutputType Type of the reduction results
  * @param map The auxiliary map to perform reduction
  * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
  *        comparisons
@@ -100,6 +103,7 @@ struct reduce_by_row_fn_base {
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
  * @param nans_equal Flag to specify whether NaN values in floating point column should be
  *        considered equal.
+ * @param init The initial value for reduction of each row group
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results

From 584ff8dc600a6c6d13f5f5adadae719c0aa7eb2f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:22:06 -0700
Subject: [PATCH 16/93] Minor changes

---
 cpp/src/stream_compaction/distinct_reduce.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 24926cdbd4a..a451643794d 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -33,8 +33,8 @@ struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual,
                      KeyEqual const& d_equal,
                      duplicate_keep_option const keep,
                      size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>(
-        d_map, d_hasher, d_equal, d_output),
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{
+        d_map, d_hasher, d_equal, d_output},
       keep{keep}
   {
   }
@@ -61,7 +61,7 @@ struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual,
  * value of the `duplicate_keep_option` member variable.
  */
 struct reduce_func_builder {
-  duplicate_keep_option keep;
+  duplicate_keep_option const keep;
 
   template <typename MapView, typename KeyHasher, typename KeyEqual>
   auto build(MapView const& d_map,

From 4a3d60d62598c6180431b99f3ab03c3787fd445f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:28:51 -0700
Subject: [PATCH 17/93] Fix style

---
 cpp/src/stream_compaction/distinct_reduce.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index a451643794d..8cfb7b93515 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -33,8 +33,10 @@ struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual,
                      KeyEqual const& d_equal,
                      duplicate_keep_option const keep,
                      size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{
-        d_map, d_hasher, d_equal, d_output},
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
+                                                                     d_hasher,
+                                                                     d_equal,
+                                                                     d_output},
       keep{keep}
   {
   }

From 34cb488c27880f40a579686ebd18d447f734f240 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 11:32:09 -0700
Subject: [PATCH 18/93] Fix comment

---
 cpp/src/stream_compaction/distinct_reduce.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
index 8cfb7b93515..64d29ae2ff0 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -22,7 +22,7 @@ namespace cudf::detail {
 
 namespace {
 /**
- * @brief The functor to find the first/last/none duplicate row for rows that compared equal.
+ * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
 struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {

From e73c07f8690e0b331fcda717436524a9fff99793 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 12:41:51 -0700
Subject: [PATCH 19/93] Move file

---
 .../reductions => include/cudf/detail}/hash_reduce_by_row.cuh     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cpp/{src/reductions => include/cudf/detail}/hash_reduce_by_row.cuh (100%)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
similarity index 100%
rename from cpp/src/reductions/hash_reduce_by_row.cuh
rename to cpp/include/cudf/detail/hash_reduce_by_row.cuh

From 40e8730d806f7e965e59fb216a795771968055c3 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 12:45:16 -0700
Subject: [PATCH 20/93] Merge `distinct_reduce.*` into `distinct.cu`

---
 cpp/CMakeLists.txt                            |   1 -
 cpp/src/stream_compaction/distinct.cu         | 101 ++++++++++++++--
 cpp/src/stream_compaction/distinct_reduce.cu  | 109 ------------------
 cpp/src/stream_compaction/distinct_reduce.hpp |  87 --------------
 4 files changed, 90 insertions(+), 208 deletions(-)
 delete mode 100644 cpp/src/stream_compaction/distinct_reduce.cu
 delete mode 100644 cpp/src/stream_compaction/distinct_reduce.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a8e45b70572..e65ca2895c4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -532,7 +532,6 @@ add_library(
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
-  src/stream_compaction/distinct_reduce.cu
   src/stream_compaction/drop_nans.cu
   src/stream_compaction/drop_nulls.cu
   src/stream_compaction/stable_distinct.cu
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index b551df96765..8a7f6daa193 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.hpp"
 #include "stream_compaction_common.cuh"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/gather.hpp>
+#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
@@ -39,6 +39,80 @@
 namespace cudf {
 namespace detail {
 
+namespace {
+/**
+ * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
+ *
+ * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
+ * @return The initial reduction value.
+ */
+auto constexpr reduction_init_value(duplicate_keep_option keep)
+{
+  switch (keep) {
+    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
+    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
+    case duplicate_keep_option::KEEP_NONE: return size_type{0};
+    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
+  }
+}
+
+/**
+ * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual>
+struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
+  duplicate_keep_option const keep;
+
+  distinct_reduce_fn(MapView const& d_map,
+                     KeyHasher const& d_hasher,
+                     KeyEqual const& d_equal,
+                     duplicate_keep_option const keep,
+                     size_type* const d_output)
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
+                                                                     d_hasher,
+                                                                     d_equal,
+                                                                     d_output},
+      keep{keep}
+  {
+  }
+
+  __device__ void operator()(size_type const idx) const
+  {
+    auto const out_ptr = this->get_output_ptr(idx);
+
+    if (keep == duplicate_keep_option::KEEP_FIRST) {
+      // Store the smallest index of all rows that are equal.
+      atomicMin(out_ptr, idx);
+    } else if (keep == duplicate_keep_option::KEEP_LAST) {
+      // Store the greatest index of all rows that are equal.
+      atomicMax(out_ptr, idx);
+    } else {
+      // Count the number of rows in each group of rows that are compared equal.
+      atomicAdd(out_ptr, size_type{1});
+    }
+  }
+};
+
+/**
+ * @brief The builder to construct an instance of `distinct_reduce_fn` functor based on the given
+ * value of the `duplicate_keep_option` member variable.
+ */
+struct reduce_func_builder {
+  duplicate_keep_option const keep;
+
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
+  {
+    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
+      d_map, d_hasher, d_equal, keep, d_output};
+  }
+};
+
+}  // namespace
+
 rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
                                                     duplicate_keep_option keep,
                                                     null_equality nulls_equal,
@@ -97,16 +171,21 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = distinct_reduce(map,
-                                                 std::move(preprocessed_input),
-                                                 input.num_rows(),
-                                                 has_nulls,
-                                                 has_nested_columns,
-                                                 keep,
-                                                 nulls_equal,
-                                                 nans_equal,
-                                                 stream,
-                                                 rmm::mr::get_current_device_resource());
+  // Depending on the `keep` parameter, the reduction operation for each row group is:
+  // - If `keep == KEEP_FIRST`: min of row indices in the group.
+  // - If `keep == KEEP_LAST`: max of row indices in the group.
+  // - If `keep == KEEP_NONE`: count of equivalent rows (group size).
+  auto const reduction_results = hash_reduce_by_row(map,
+                                                    std::move(preprocessed_input),
+                                                    input.num_rows(),
+                                                    has_nulls,
+                                                    has_nested_columns,
+                                                    nulls_equal,
+                                                    nans_equal,
+                                                    reduce_func_builder{keep},
+                                                    reduction_init_value(keep),
+                                                    stream,
+                                                    mr);
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
deleted file mode 100644
index 64d29ae2ff0..00000000000
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distinct_reduce.hpp"
-
-#include <reductions/hash_reduce_by_row.cuh>
-
-namespace cudf::detail {
-
-namespace {
-/**
- * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
-  duplicate_keep_option const keep;
-
-  distinct_reduce_fn(MapView const& d_map,
-                     KeyHasher const& d_hasher,
-                     KeyEqual const& d_equal,
-                     duplicate_keep_option const keep,
-                     size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
-                                                                     d_hasher,
-                                                                     d_equal,
-                                                                     d_output},
-      keep{keep}
-  {
-  }
-
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = this->get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
-    }
-  }
-};
-
-/**
- * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
- * value of the `duplicate_keep_option` member variable.
- */
-struct reduce_func_builder {
-  duplicate_keep_option const keep;
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             size_type* const d_output)
-  {
-    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
-      d_map, d_hasher, d_equal, keep, d_output};
-  }
-};
-
-}  // namespace
-
-rmm::device_uvector<size_type> distinct_reduce(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  return hash_reduce_by_row(map,
-                            preprocessed_input,
-                            num_rows,
-                            has_nulls,
-                            has_nested_columns,
-                            nulls_equal,
-                            nans_equal,
-                            reduce_func_builder{keep},
-                            reduction_init_value(keep),
-                            stream,
-                            mr);
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.hpp b/cpp/src/stream_compaction/distinct_reduce.hpp
deleted file mode 100644
index 236b6c860c3..00000000000
--- a/cpp/src/stream_compaction/distinct_reduce.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "stream_compaction_common.hpp"
-
-#include <cudf/stream_compaction.hpp>
-#include <cudf/table/experimental/row_operators.cuh>
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace cudf::detail {
-
-/**
- * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
- *
- * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
- * @return The initial reduction value.
- */
-auto constexpr reduction_init_value(duplicate_keep_option keep)
-{
-  switch (keep) {
-    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
-    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
-    case duplicate_keep_option::KEEP_NONE: return size_type{0};
-    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
-  }
-}
-
-/**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * Depending on the `keep` parameter, the reduction operation for each row group is:
- * - If `keep == KEEP_FIRST`: min of row indices in the group.
- * - If `keep == KEEP_LAST`: max of row indices in the group.
- * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
- *
- * Note that this function is not needed when `keep == KEEP_NONE`.
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `reduction_init_value()` function. Then, the reduction result for each row group is written
- * into the output array at the index of an unspecified row in the group.
- *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-rmm::device_uvector<size_type> distinct_reduce(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
-
-}  // namespace cudf::detail

From 95e4463262aa72b250df41a33367f9f66237a825 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 12:41:51 -0700
Subject: [PATCH 21/93] Move file

---
 .../reductions => include/cudf/detail}/hash_reduce_by_row.cuh     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cpp/{src/reductions => include/cudf/detail}/hash_reduce_by_row.cuh (100%)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
similarity index 100%
rename from cpp/src/reductions/hash_reduce_by_row.cuh
rename to cpp/include/cudf/detail/hash_reduce_by_row.cuh

From 723ae4c720c3fc4a5f950c230657abeac60644c5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 12:45:16 -0700
Subject: [PATCH 22/93] Merge `distinct_reduce.*` into `distinct.cu`

---
 cpp/CMakeLists.txt                            |   1 -
 cpp/src/stream_compaction/distinct.cu         | 101 ++++++++++++++--
 cpp/src/stream_compaction/distinct_reduce.cu  | 109 ------------------
 cpp/src/stream_compaction/distinct_reduce.hpp |  87 --------------
 4 files changed, 90 insertions(+), 208 deletions(-)
 delete mode 100644 cpp/src/stream_compaction/distinct_reduce.cu
 delete mode 100644 cpp/src/stream_compaction/distinct_reduce.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 516865e5782..5703318592f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -530,7 +530,6 @@ add_library(
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
-  src/stream_compaction/distinct_reduce.cu
   src/stream_compaction/drop_nans.cu
   src/stream_compaction/drop_nulls.cu
   src/stream_compaction/stable_distinct.cu
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index b551df96765..8a7f6daa193 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.hpp"
 #include "stream_compaction_common.cuh"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/gather.hpp>
+#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
@@ -39,6 +39,80 @@
 namespace cudf {
 namespace detail {
 
+namespace {
+/**
+ * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
+ *
+ * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
+ * @return The initial reduction value.
+ */
+auto constexpr reduction_init_value(duplicate_keep_option keep)
+{
+  switch (keep) {
+    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
+    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
+    case duplicate_keep_option::KEEP_NONE: return size_type{0};
+    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
+  }
+}
+
+/**
+ * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual>
+struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
+  duplicate_keep_option const keep;
+
+  distinct_reduce_fn(MapView const& d_map,
+                     KeyHasher const& d_hasher,
+                     KeyEqual const& d_equal,
+                     duplicate_keep_option const keep,
+                     size_type* const d_output)
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
+                                                                     d_hasher,
+                                                                     d_equal,
+                                                                     d_output},
+      keep{keep}
+  {
+  }
+
+  __device__ void operator()(size_type const idx) const
+  {
+    auto const out_ptr = this->get_output_ptr(idx);
+
+    if (keep == duplicate_keep_option::KEEP_FIRST) {
+      // Store the smallest index of all rows that are equal.
+      atomicMin(out_ptr, idx);
+    } else if (keep == duplicate_keep_option::KEEP_LAST) {
+      // Store the greatest index of all rows that are equal.
+      atomicMax(out_ptr, idx);
+    } else {
+      // Count the number of rows in each group of rows that are compared equal.
+      atomicAdd(out_ptr, size_type{1});
+    }
+  }
+};
+
+/**
+ * @brief The builder to construct an instance of `distinct_reduce_fn` functor based on the given
+ * value of the `duplicate_keep_option` member variable.
+ */
+struct reduce_func_builder {
+  duplicate_keep_option const keep;
+
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
+  {
+    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
+      d_map, d_hasher, d_equal, keep, d_output};
+  }
+};
+
+}  // namespace
+
 rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
                                                     duplicate_keep_option keep,
                                                     null_equality nulls_equal,
@@ -97,16 +171,21 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = distinct_reduce(map,
-                                                 std::move(preprocessed_input),
-                                                 input.num_rows(),
-                                                 has_nulls,
-                                                 has_nested_columns,
-                                                 keep,
-                                                 nulls_equal,
-                                                 nans_equal,
-                                                 stream,
-                                                 rmm::mr::get_current_device_resource());
+  // Depending on the `keep` parameter, the reduction operation for each row group is:
+  // - If `keep == KEEP_FIRST`: min of row indices in the group.
+  // - If `keep == KEEP_LAST`: max of row indices in the group.
+  // - If `keep == KEEP_NONE`: count of equivalent rows (group size).
+  auto const reduction_results = hash_reduce_by_row(map,
+                                                    std::move(preprocessed_input),
+                                                    input.num_rows(),
+                                                    has_nulls,
+                                                    has_nested_columns,
+                                                    nulls_equal,
+                                                    nans_equal,
+                                                    reduce_func_builder{keep},
+                                                    reduction_init_value(keep),
+                                                    stream,
+                                                    mr);
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
deleted file mode 100644
index 64d29ae2ff0..00000000000
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distinct_reduce.hpp"
-
-#include <reductions/hash_reduce_by_row.cuh>
-
-namespace cudf::detail {
-
-namespace {
-/**
- * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
-  duplicate_keep_option const keep;
-
-  distinct_reduce_fn(MapView const& d_map,
-                     KeyHasher const& d_hasher,
-                     KeyEqual const& d_equal,
-                     duplicate_keep_option const keep,
-                     size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
-                                                                     d_hasher,
-                                                                     d_equal,
-                                                                     d_output},
-      keep{keep}
-  {
-  }
-
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = this->get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
-    }
-  }
-};
-
-/**
- * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
- * value of the `duplicate_keep_option` member variable.
- */
-struct reduce_func_builder {
-  duplicate_keep_option const keep;
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             size_type* const d_output)
-  {
-    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
-      d_map, d_hasher, d_equal, keep, d_output};
-  }
-};
-
-}  // namespace
-
-rmm::device_uvector<size_type> distinct_reduce(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  return hash_reduce_by_row(map,
-                            preprocessed_input,
-                            num_rows,
-                            has_nulls,
-                            has_nested_columns,
-                            nulls_equal,
-                            nans_equal,
-                            reduce_func_builder{keep},
-                            reduction_init_value(keep),
-                            stream,
-                            mr);
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.hpp b/cpp/src/stream_compaction/distinct_reduce.hpp
deleted file mode 100644
index 236b6c860c3..00000000000
--- a/cpp/src/stream_compaction/distinct_reduce.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "stream_compaction_common.hpp"
-
-#include <cudf/stream_compaction.hpp>
-#include <cudf/table/experimental/row_operators.cuh>
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace cudf::detail {
-
-/**
- * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
- *
- * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
- * @return The initial reduction value.
- */
-auto constexpr reduction_init_value(duplicate_keep_option keep)
-{
-  switch (keep) {
-    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
-    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
-    case duplicate_keep_option::KEEP_NONE: return size_type{0};
-    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
-  }
-}
-
-/**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * Depending on the `keep` parameter, the reduction operation for each row group is:
- * - If `keep == KEEP_FIRST`: min of row indices in the group.
- * - If `keep == KEEP_LAST`: max of row indices in the group.
- * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
- *
- * Note that this function is not needed when `keep == KEEP_NONE`.
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `reduction_init_value()` function. Then, the reduction result for each row group is written
- * into the output array at the index of an unspecified row in the group.
- *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-rmm::device_uvector<size_type> distinct_reduce(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
-
-}  // namespace cudf::detail

From 8fb7a9e7124a3bfcac780c108b6cc7e629c47219 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 13:21:57 -0700
Subject: [PATCH 23/93] Revert "Merge `distinct_reduce.*` into `distinct.cu`"

This reverts commit 723ae4c720c3fc4a5f950c230657abeac60644c5.
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/src/stream_compaction/distinct.cu         | 101 ++--------------
 cpp/src/stream_compaction/distinct_reduce.cu  | 109 ++++++++++++++++++
 cpp/src/stream_compaction/distinct_reduce.hpp |  87 ++++++++++++++
 4 files changed, 208 insertions(+), 90 deletions(-)
 create mode 100644 cpp/src/stream_compaction/distinct_reduce.cu
 create mode 100644 cpp/src/stream_compaction/distinct_reduce.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5703318592f..516865e5782 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -530,6 +530,7 @@ add_library(
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
+  src/stream_compaction/distinct_reduce.cu
   src/stream_compaction/drop_nans.cu
   src/stream_compaction/drop_nulls.cu
   src/stream_compaction/stable_distinct.cu
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 8a7f6daa193..b551df96765 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
+#include "distinct_reduce.hpp"
 #include "stream_compaction_common.cuh"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/gather.hpp>
-#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
@@ -39,80 +39,6 @@
 namespace cudf {
 namespace detail {
 
-namespace {
-/**
- * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
- *
- * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
- * @return The initial reduction value.
- */
-auto constexpr reduction_init_value(duplicate_keep_option keep)
-{
-  switch (keep) {
-    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
-    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
-    case duplicate_keep_option::KEEP_NONE: return size_type{0};
-    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
-  }
-}
-
-/**
- * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
-  duplicate_keep_option const keep;
-
-  distinct_reduce_fn(MapView const& d_map,
-                     KeyHasher const& d_hasher,
-                     KeyEqual const& d_equal,
-                     duplicate_keep_option const keep,
-                     size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
-                                                                     d_hasher,
-                                                                     d_equal,
-                                                                     d_output},
-      keep{keep}
-  {
-  }
-
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = this->get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
-    }
-  }
-};
-
-/**
- * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
- * value of the `duplicate_keep_option` member variable.
- */
-struct reduce_func_builder {
-  duplicate_keep_option const keep;
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             size_type* const d_output)
-  {
-    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
-      d_map, d_hasher, d_equal, keep, d_output};
-  }
-};
-
-}  // namespace
-
 rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
                                                     duplicate_keep_option keep,
                                                     null_equality nulls_equal,
@@ -171,21 +97,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  // Depending on the `keep` parameter, the reduction operation for each row group is:
-  // - If `keep == KEEP_FIRST`: min of row indices in the group.
-  // - If `keep == KEEP_LAST`: max of row indices in the group.
-  // - If `keep == KEEP_NONE`: count of equivalent rows (group size).
-  auto const reduction_results = hash_reduce_by_row(map,
-                                                    std::move(preprocessed_input),
-                                                    input.num_rows(),
-                                                    has_nulls,
-                                                    has_nested_columns,
-                                                    nulls_equal,
-                                                    nans_equal,
-                                                    reduce_func_builder{keep},
-                                                    reduction_init_value(keep),
-                                                    stream,
-                                                    mr);
+  auto const reduction_results = distinct_reduce(map,
+                                                 std::move(preprocessed_input),
+                                                 input.num_rows(),
+                                                 has_nulls,
+                                                 has_nested_columns,
+                                                 keep,
+                                                 nulls_equal,
+                                                 nans_equal,
+                                                 stream,
+                                                 rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_reduce.cu
new file mode 100644
index 00000000000..64d29ae2ff0
--- /dev/null
+++ b/cpp/src/stream_compaction/distinct_reduce.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distinct_reduce.hpp"
+
+#include <reductions/hash_reduce_by_row.cuh>
+
+namespace cudf::detail {
+
+namespace {
+/**
+ * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual>
+struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
+  duplicate_keep_option const keep;
+
+  distinct_reduce_fn(MapView const& d_map,
+                     KeyHasher const& d_hasher,
+                     KeyEqual const& d_equal,
+                     duplicate_keep_option const keep,
+                     size_type* const d_output)
+    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
+                                                                     d_hasher,
+                                                                     d_equal,
+                                                                     d_output},
+      keep{keep}
+  {
+  }
+
+  __device__ void operator()(size_type const idx) const
+  {
+    auto const out_ptr = this->get_output_ptr(idx);
+
+    if (keep == duplicate_keep_option::KEEP_FIRST) {
+      // Store the smallest index of all rows that are equal.
+      atomicMin(out_ptr, idx);
+    } else if (keep == duplicate_keep_option::KEEP_LAST) {
+      // Store the greatest index of all rows that are equal.
+      atomicMax(out_ptr, idx);
+    } else {
+      // Count the number of rows in each group of rows that are compared equal.
+      atomicAdd(out_ptr, size_type{1});
+    }
+  }
+};
+
+/**
+ * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
+ * value of the `duplicate_keep_option` member variable.
+ */
+struct reduce_func_builder {
+  duplicate_keep_option const keep;
+
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             size_type* const d_output)
+  {
+    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
+      d_map, d_hasher, d_equal, keep, d_output};
+  }
+};
+
+}  // namespace
+
+rmm::device_uvector<size_type> distinct_reduce(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
+               "This function should not be called with KEEP_ANY");
+
+  return hash_reduce_by_row(map,
+                            preprocessed_input,
+                            num_rows,
+                            has_nulls,
+                            has_nested_columns,
+                            nulls_equal,
+                            nans_equal,
+                            reduce_func_builder{keep},
+                            reduction_init_value(keep),
+                            stream,
+                            mr);
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_reduce.hpp b/cpp/src/stream_compaction/distinct_reduce.hpp
new file mode 100644
index 00000000000..236b6c860c3
--- /dev/null
+++ b/cpp/src/stream_compaction/distinct_reduce.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "stream_compaction_common.hpp"
+
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace cudf::detail {
+
+/**
+ * @brief Return the reduction identity used to initialize results of `hash_reduce_by_row`.
+ *
+ * @param keep A value of `duplicate_keep_option` type, must not be `KEEP_ANY`.
+ * @return The initial reduction value.
+ */
+auto constexpr reduction_init_value(duplicate_keep_option keep)
+{
+  switch (keep) {
+    case duplicate_keep_option::KEEP_FIRST: return std::numeric_limits<size_type>::max();
+    case duplicate_keep_option::KEEP_LAST: return std::numeric_limits<size_type>::min();
+    case duplicate_keep_option::KEEP_NONE: return size_type{0};
+    default: CUDF_UNREACHABLE("This function should not be called with KEEP_ANY");
+  }
+}
+
+/**
+ * @brief Perform a reduction on groups of rows that are compared equal.
+ *
+ * This is essentially a reduce-by-key operation whose keys are non-contiguous rows that compare
+ * equal. A hash table is used to find groups of equal rows.
+ *
+ * Depending on the `keep` parameter, the reduction operation for each row group is:
+ * - If `keep == KEEP_FIRST`: min of row indices in the group.
+ * - If `keep == KEEP_LAST`: max of row indices in the group.
+ * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
+ *
+ * Note that this function is not needed (and must not be called) when `keep == KEEP_ANY`.
+ *
+ * At the beginning of the operation, the entire output array is filled with a value given by
+ * the `reduction_init_value()` function. Then, the reduction result for each row group is written
+ * into the output array at the index of an unspecified row in the group.
+ *
+ * @param map The auxiliary map to perform reduction
+ * @param preprocessed_input The preprocessed input rows used for computing row hashes and row
+ *        comparisons
+ * @param num_rows The number of all input rows
+ * @param has_nulls Indicates whether the input rows have any nulls at any nested level
+ * @param has_nested_columns Indicates whether the input table has any nested columns
+ * @param keep The parameter to determine what type of reduction to perform
+ * @param nulls_equal Flag to specify whether null elements should be considered as equal
+ * @param nans_equal Flag to specify whether NaN values in floating point column should be
+ *        considered equal.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned vector
+ * @return A device_uvector containing the reduction results
+ */
+rmm::device_uvector<size_type> distinct_reduce(
+  hash_map_type const& map,
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+  size_type num_rows,
+  cudf::nullate::DYNAMIC has_nulls,
+  bool has_nested_columns,
+  duplicate_keep_option keep,
+  null_equality nulls_equal,
+  nan_equality nans_equal,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+
+}  // namespace cudf::detail

From 65427c8211f5e4f63b2f3174f3fad284cf17f258 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 13:31:16 -0700
Subject: [PATCH 24/93] Rename function

---
 cpp/CMakeLists.txt                            |  2 +-
 cpp/src/stream_compaction/distinct.cu         | 22 ++++++++---------
 ...distinct_reduce.cu => distinct_helpers.cu} | 24 +++++++++----------
 ...stinct_reduce.hpp => distinct_helpers.hpp} |  2 +-
 4 files changed, 25 insertions(+), 25 deletions(-)
 rename cpp/src/stream_compaction/{distinct_reduce.cu => distinct_helpers.cu} (82%)
 rename cpp/src/stream_compaction/{distinct_reduce.hpp => distinct_helpers.hpp} (98%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 516865e5782..ca6444bd2f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -530,7 +530,7 @@ add_library(
   src/stream_compaction/apply_boolean_mask.cu
   src/stream_compaction/distinct.cu
   src/stream_compaction/distinct_count.cu
-  src/stream_compaction/distinct_reduce.cu
+  src/stream_compaction/distinct_helpers.cu
   src/stream_compaction/drop_nans.cu
   src/stream_compaction/drop_nulls.cu
   src/stream_compaction/stable_distinct.cu
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index b551df96765..de2cd6da0dd 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.hpp"
+#include "distinct_helpers.hpp"
 #include "stream_compaction_common.cuh"
 
 #include <cudf/column/column_view.hpp>
@@ -97,16 +97,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = distinct_reduce(map,
-                                                 std::move(preprocessed_input),
-                                                 input.num_rows(),
-                                                 has_nulls,
-                                                 has_nested_columns,
-                                                 keep,
-                                                 nulls_equal,
-                                                 nans_equal,
-                                                 stream,
-                                                 rmm::mr::get_current_device_resource());
+  auto const reduction_results = indices_reduce_by_row(map,
+                                                       std::move(preprocessed_input),
+                                                       input.num_rows(),
+                                                       has_nulls,
+                                                       has_nested_columns,
+                                                       keep,
+                                                       nulls_equal,
+                                                       nans_equal,
+                                                       stream,
+                                                       rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_reduce.cu b/cpp/src/stream_compaction/distinct_helpers.cu
similarity index 82%
rename from cpp/src/stream_compaction/distinct_reduce.cu
rename to cpp/src/stream_compaction/distinct_helpers.cu
index 64d29ae2ff0..5d31e87943a 100644
--- a/cpp/src/stream_compaction/distinct_reduce.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "distinct_reduce.hpp"
+#include "distinct_helpers.hpp"
 
-#include <reductions/hash_reduce_by_row.cuh>
+#include <cudf/detail/hash_reduce_by_row.cuh>
 
 namespace cudf::detail {
 
@@ -25,14 +25,14 @@ namespace {
  * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
+struct reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
   duplicate_keep_option const keep;
 
-  distinct_reduce_fn(MapView const& d_map,
-                     KeyHasher const& d_hasher,
-                     KeyEqual const& d_equal,
-                     duplicate_keep_option const keep,
-                     size_type* const d_output)
+  reduce_fn(MapView const& d_map,
+            KeyHasher const& d_hasher,
+            KeyEqual const& d_equal,
+            duplicate_keep_option const keep,
+            size_type* const d_output)
     : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
                                                                      d_hasher,
                                                                      d_equal,
@@ -59,7 +59,7 @@ struct distinct_reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual,
 };
 
 /**
- * @brief The builder to construct an instance of `distinct_reduce_fn` functor base on the given
+ * @brief The builder to construct an instance of `reduce_fn` functor based on the given
  * value of the `duplicate_keep_option` member variable.
  */
 struct reduce_func_builder {
@@ -71,14 +71,14 @@ struct reduce_func_builder {
              KeyEqual const& d_equal,
              size_type* const d_output)
   {
-    return distinct_reduce_fn<MapView, KeyHasher, KeyEqual>{
-      d_map, d_hasher, d_equal, keep, d_output};
+    return reduce_fn<MapView, KeyHasher, KeyEqual>{d_map, d_hasher, d_equal, keep, d_output};
   }
 };
 
 }  // namespace
 
-rmm::device_uvector<size_type> distinct_reduce(
+// This function is split from `distinct.cu` to improve compile time.
+rmm::device_uvector<size_type> indices_reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,
diff --git a/cpp/src/stream_compaction/distinct_reduce.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
similarity index 98%
rename from cpp/src/stream_compaction/distinct_reduce.hpp
rename to cpp/src/stream_compaction/distinct_helpers.hpp
index 236b6c860c3..9ae29783ca4 100644
--- a/cpp/src/stream_compaction/distinct_reduce.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -72,7 +72,7 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results
  */
-rmm::device_uvector<size_type> distinct_reduce(
+rmm::device_uvector<size_type> indices_reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,

From 0c0c7ac8eb66d2e4192ef8499cbff1ef0b385014 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 15:56:14 -0700
Subject: [PATCH 25/93] Fix output type

---
 cpp/include/cudf/detail/hash_reduce_by_row.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
index 1cff009b17b..35654b90bc0 100644
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -109,7 +109,7 @@ struct reduce_by_row_fn_base {
  * @return A device_uvector containing the reduction results
  */
 template <typename ReduceFuncBuilder, typename OutputType>
-rmm::device_uvector<size_type> hash_reduce_by_row(
+rmm::device_uvector<OutputType> hash_reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,

From 01cc1c2bf82924c0f239ea90a6e360602ee34a60 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 19:28:35 -0700
Subject: [PATCH 26/93] Move file

---
 .../cudf/detail => src/reductions}/hash_reduce_by_row.cuh       | 0
 cpp/src/stream_compaction/distinct_helpers.cu                   | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/{include/cudf/detail => src/reductions}/hash_reduce_by_row.cuh (100%)

diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
similarity index 100%
rename from cpp/include/cudf/detail/hash_reduce_by_row.cuh
rename to cpp/src/reductions/hash_reduce_by_row.cuh
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index 5d31e87943a..cb0dc4b1c50 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -16,7 +16,7 @@
 
 #include "distinct_helpers.hpp"
 
-#include <cudf/detail/hash_reduce_by_row.cuh>
+#include <reductions/hash_reduce_by_row.cuh>
 
 namespace cudf::detail {
 

From f5a6a1a66841b82b1da0ff75a21d0faa98440847 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 19:30:15 -0700
Subject: [PATCH 27/93] Rename function

---
 cpp/src/stream_compaction/distinct.cu         | 20 +++++++++----------
 cpp/src/stream_compaction/distinct_helpers.cu |  2 +-
 .../stream_compaction/distinct_helpers.hpp    |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index de2cd6da0dd..e031727c21a 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -97,16 +97,16 @@ rmm::device_uvector<size_type> get_distinct_indices(table_view const& input,
   }
 
   // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = indices_reduce_by_row(map,
-                                                       std::move(preprocessed_input),
-                                                       input.num_rows(),
-                                                       has_nulls,
-                                                       has_nested_columns,
-                                                       keep,
-                                                       nulls_equal,
-                                                       nans_equal,
-                                                       stream,
-                                                       rmm::mr::get_current_device_resource());
+  auto const reduction_results = reduce_by_row(map,
+                                               std::move(preprocessed_input),
+                                               input.num_rows(),
+                                               has_nulls,
+                                               has_nested_columns,
+                                               keep,
+                                               nulls_equal,
+                                               nans_equal,
+                                               stream,
+                                               rmm::mr::get_current_device_resource());
 
   // Extract the desired output indices from reduction results.
   auto const map_end = [&] {
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index cb0dc4b1c50..a9df0bc98b8 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -78,7 +78,7 @@ struct reduce_func_builder {
 }  // namespace
 
 // This function is split from `distinct.cu` to improve compile time.
-rmm::device_uvector<size_type> indices_reduce_by_row(
+rmm::device_uvector<size_type> reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index 9ae29783ca4..b667d0b04f0 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -72,7 +72,7 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the reduction results
  */
-rmm::device_uvector<size_type> indices_reduce_by_row(
+rmm::device_uvector<size_type> reduce_by_row(
   hash_map_type const& map,
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
   size_type num_rows,

From 924a2d68bc3c5a180bafe4a461c05821106bc0f4 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 21:10:14 -0700
Subject: [PATCH 28/93] Implement histogram reduction

---
 cpp/src/reductions/histogram.cu | 180 ++++++++++++++++++++++++++++++--
 1 file changed, 171 insertions(+), 9 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 053ad62180b..262d1c94e89 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -14,28 +14,190 @@
  * limitations under the License.
  */
 
-#include <reductions/histogram.cuh>
+#include <reductions/hash_reduce_by_row.cuh>
+#include <stream_compaction/stream_compaction_common.cuh>
 
-#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
+#include <thrust/iterator/discard_iterator.h>
 
-#include <optional>
+#include <cuda/atomic>
 
 namespace cudf::reduction::detail {
 
-std::unique_ptr<cudf::column> histogram(column_view const& input,
+namespace {
+
+/**
+ * @brief The functor to compute the occurences of each unique rows in the input table.
+ */
+template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType> {
+  reduce_fn(MapView const& d_map,
+            KeyHasher const& d_hasher,
+            KeyEqual const& d_equal,
+            OutputType* const d_output)
+    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType>{
+        d_map, d_hasher, d_equal, d_output}
+  {
+  }
+
+  // Count the number of rows in each group of rows that are compared equal.
+  __device__ void operator()(size_type const idx) const
+  {
+    cuda::atomic_ref<OutputType, cuda::thread_scope_device> count(*this->get_output_ptr(idx));
+    count.fetch_add(OutputType{1}, cuda::std::memory_order_relaxed);
+  }
+};
+
+/**
+ * @brief The builder to construct an instance of `reduce_fn` functor.
+ */
+struct reduce_func_builder {
+  template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+  auto build(MapView const& d_map,
+             KeyHasher const& d_hasher,
+             KeyEqual const& d_equal,
+             OutputType* const d_output)
+  {
+    return reduce_fn<MapView, KeyHasher, KeyEqual, OutputType>{d_map, d_hasher, d_equal, d_output};
+  }
+};
+
+template <typename T>
+struct is_none_zero {
+  T const* data;
+  __device__ bool operator()(size_type const idx) const { return data[idx] != T{0}; }
+};
+
+struct histogram_dispatcher {
+  template <typename OutputType>
+  static bool constexpr is_supported()
+  {
+    // Currently only int64_t is requested by Spark-Rapids.
+    // More data type can be supported by enabling it below.
+    return std::is_same_v<OutputType, int64_t>;
+  }
+
+  template <typename OutputType, typename... Args>
+  std::enable_if_t<!is_supported<OutputType>(), void> operator()(Args&&...)
+  {
+    CUDF_FAIL("Unsupported output type in histogram aggregation.");
+  }
+
+  template <typename OutputType, CUDF_ENABLE_IF(is_supported<OutputType>())>
+  void operator()(
+    cudf::detail::hash_map_type const& map,
+    std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+    size_type num_rows,
+    cudf::nullate::DYNAMIC has_nulls,
+    bool has_nested_columns,
+    mutable_column_view const& output,
+    rmm::cuda_stream_view stream) const
+  {
+    auto const reduction_results =
+      cudf::detail::hash_reduce_by_row(map,
+                                       preprocessed_input,
+                                       num_rows,
+                                       has_nulls,
+                                       has_nested_columns,
+                                       null_equality::EQUAL,
+                                       nan_equality::ALL_EQUAL,
+                                       reduce_func_builder{},
+                                       OutputType{0},
+                                       stream,
+                                       rmm::mr::get_current_device_resource());
+
+    // Reduction results are either group sizes of equal rows, or `0`.
+    // Thus, we only needs to extract the non-zero group sizes.
+    thrust::copy_if(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(num_rows),
+                    output.begin<OutputType>(),
+                    is_none_zero<OutputType>{reduction_results.begin()});
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<cudf::column> histogram(table_view const& input,
                                         data_type const output_dtype,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(cudf::is_integral(output_dtype),
-               "The output type of histogram aggregation must be an integral type.");
+  CUDF_EXPECTS(cudf::is_integral(output_dtype) &&
+                 (cudf::size_of(output_dtype) == 4 || cudf::size_of(output_dtype) == 8),
+               "The output type of histogram aggregation must be an 32/64bit integral type.");
 
+  auto map = cudf::detail::hash_map_type{
+    compute_hash_table_size(input.num_rows()),
+    cuco::empty_key{cudf::detail::COMPACTION_EMPTY_KEY_SENTINEL},
+    cuco::empty_value{cudf::detail::COMPACTION_EMPTY_VALUE_SENTINEL},
+    cudf::detail::hash_table_allocator_type{default_allocator<char>{}, stream},
+    stream.value()};
 
+  auto const preprocessed_input =
+    cudf::experimental::row::hash::preprocessed_table::create(input, stream);
+  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
+  auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
-  return nullptr;
+  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const key_hasher =
+    cudf::detail::experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
+    size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); });
+
+  using nan_equal_comparator =
+    cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
+  auto const value_comp = nan_equal_comparator{};
+  if (has_nested_columns) {
+    auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
+    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+  } else {
+    auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
+    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+  }
+
+  // Gather the indices of distinct rows.
+  auto distinct_indices = cudf::make_numeric_column(data_type{type_to_id<size_type>()},
+                                                    static_cast<size_type>(map.get_size()),
+                                                    mask_state::UNALLOCATED,
+                                                    stream,
+                                                    mr);
+  map.retrieve_all(distinct_indices->mutable_view().begin<size_type>(),
+                   thrust::make_discard_iterator(),
+                   stream.value());
+
+  // Count the number of occurences of each unique row.
+  auto unique_counts = make_numeric_column(
+    output_dtype, static_cast<size_type>(map.get_size()), mask_state::UNALLOCATED, stream, mr);
+  type_dispatcher(output_dtype,
+                  histogram_dispatcher{},
+                  map,
+                  std::move(preprocessed_input),
+                  input.num_rows(),
+                  has_nulls,
+                  has_nested_columns,
+                  unique_counts->mutable_view(),
+                  stream);
+
+  std::vector<std::unique_ptr<column>> output_children;
+  output_children.emplace_back(std::move(distinct_indices));
+  output_children.emplace_back(std::move(unique_counts));
+
+  return make_structs_column(
+    static_cast<size_type>(map.get_size()), std::move(output_children), 0, {}, stream, mr);
+}
+
+std::unique_ptr<cudf::column> histogram(column_view const& input,
+                                        data_type const output_dtype,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return histogram(table_view{{input}}, output_dtype, stream, mr);
 }
 
 std::unique_ptr<cudf::column> merge_histogram(column_view const& input,

From a1b516e119c8abacc12321c144ca682c05df5b74 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 12 Sep 2023 22:22:47 -0700
Subject: [PATCH 29/93] Support partial count

---
 cpp/src/reductions/histogram.cu | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 262d1c94e89..213d5ae0cd1 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -34,12 +34,18 @@ namespace {
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
 struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType> {
+  OutputType const* d_partial_output;
+
   reduce_fn(MapView const& d_map,
             KeyHasher const& d_hasher,
             KeyEqual const& d_equal,
-            OutputType* const d_output)
-    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType>{
-        d_map, d_hasher, d_equal, d_output}
+            OutputType* const d_output,
+            OutputType const* const d_partial_output = nullptr)
+    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType>{d_map,
+                                                                                    d_hasher,
+                                                                                    d_equal,
+                                                                                    d_output},
+      d_partial_output{d_partial_output}
   {
   }
 
@@ -47,7 +53,11 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
   __device__ void operator()(size_type const idx) const
   {
     cuda::atomic_ref<OutputType, cuda::thread_scope_device> count(*this->get_output_ptr(idx));
-    count.fetch_add(OutputType{1}, cuda::std::memory_order_relaxed);
+    if (d_partial_output) {
+      count.fetch_add(d_partial_output[idx], cuda::std::memory_order_relaxed);
+    } else {
+      count.fetch_add(OutputType{1}, cuda::std::memory_order_relaxed);
+    }
   }
 };
 

From e196ab4d5d6466594c4262037b6e6a3b42f442dc Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 11:09:13 -0700
Subject: [PATCH 30/93] Return list scalar of structs

---
 cpp/src/reductions/histogram.cu | 67 +++++++++++++++++----------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 213d5ae0cd1..6a6522413b8 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -18,7 +18,9 @@
 #include <stream_compaction/stream_compaction_common.cuh>
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
@@ -131,7 +133,7 @@ struct histogram_dispatcher {
 
 }  // namespace
 
-std::unique_ptr<cudf::column> histogram(table_view const& input,
+std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                                         data_type const output_dtype,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
@@ -141,16 +143,17 @@ std::unique_ptr<cudf::column> histogram(table_view const& input,
                "The output type of histogram aggregation must be an 32/64bit integral type.");
 
   auto map = cudf::detail::hash_map_type{
-    compute_hash_table_size(input.num_rows()),
+    compute_hash_table_size(input.size()),
     cuco::empty_key{cudf::detail::COMPACTION_EMPTY_KEY_SENTINEL},
     cuco::empty_value{cudf::detail::COMPACTION_EMPTY_VALUE_SENTINEL},
     cudf::detail::hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value()};
 
+  auto const input_tview = table_view{{input}};
   auto const preprocessed_input =
-    cudf::experimental::row::hash::preprocessed_table::create(input, stream);
-  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
-  auto const has_nested_columns = cudf::detail::has_nested_columns(input);
+    cudf::experimental::row::hash::preprocessed_table::create(input_tview, stream);
+  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input_tview)};
+  auto const has_nested_columns = cudf::detail::has_nested_columns(input_tview);
 
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
   auto const key_hasher =
@@ -165,21 +168,25 @@ std::unique_ptr<cudf::column> histogram(table_view const& input,
   auto const value_comp = nan_equal_comparator{};
   if (has_nested_columns) {
     auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+    map.insert(pair_iter, pair_iter + input.size(), key_hasher, key_equal, stream.value());
   } else {
     auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
+    map.insert(pair_iter, pair_iter + input.size(), key_hasher, key_equal, stream.value());
   }
 
-  // Gather the indices of distinct rows.
-  auto distinct_indices = cudf::make_numeric_column(data_type{type_to_id<size_type>()},
-                                                    static_cast<size_type>(map.get_size()),
-                                                    mask_state::UNALLOCATED,
-                                                    stream,
-                                                    mr);
-  map.retrieve_all(distinct_indices->mutable_view().begin<size_type>(),
-                   thrust::make_discard_iterator(),
-                   stream.value());
+  // Gather the indices of distinct rows and distinct rows.
+  auto distinct_indices = rmm::device_uvector<size_type>(
+    static_cast<size_type>(map.get_size()), stream, rmm::mr::get_current_device_resource());
+  map.retrieve_all(distinct_indices.begin(), thrust::make_discard_iterator(), stream.value());
+  auto distinct_rows =
+    std::move(cudf::detail::gather(input_tview,
+                                   distinct_indices,
+                                   out_of_bounds_policy::DONT_CHECK,
+                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                   stream,
+                                   mr)
+                ->release()
+                .front());
 
   // Count the number of occurences of each unique row.
   auto unique_counts = make_numeric_column(
@@ -188,37 +195,31 @@ std::unique_ptr<cudf::column> histogram(table_view const& input,
                   histogram_dispatcher{},
                   map,
                   std::move(preprocessed_input),
-                  input.num_rows(),
+                  input.size(),
                   has_nulls,
                   has_nested_columns,
                   unique_counts->mutable_view(),
                   stream);
 
-  std::vector<std::unique_ptr<column>> output_children;
-  output_children.emplace_back(std::move(distinct_indices));
-  output_children.emplace_back(std::move(unique_counts));
+  std::vector<std::unique_ptr<column>> struct_children;
+  struct_children.emplace_back(std::move(distinct_rows));
+  struct_children.emplace_back(std::move(unique_counts));
+  auto output_structs = make_structs_column(
+    static_cast<size_type>(map.get_size()), std::move(struct_children), 0, {}, stream, mr);
 
-  return make_structs_column(
-    static_cast<size_type>(map.get_size()), std::move(output_children), 0, {}, stream, mr);
+  return std::make_unique<cudf::list_scalar>(
+    std::move(*output_structs.release()), true, stream, mr);
 }
 
-std::unique_ptr<cudf::column> histogram(column_view const& input,
-                                        data_type const output_dtype,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
-{
-  return histogram(table_view{{input}}, output_dtype, stream, mr);
-}
-
-std::unique_ptr<cudf::column> merge_histogram(column_view const& input,
+std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(
     input.type().id() == type_id::STRUCT && input.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
-  CUDF_EXPECTS(cudf::is_integral(input.child(1).type()),
-               "The second child of the input column must be an integer type.");
+  CUDF_EXPECTS(input.child(1).type().id() == type_id::INT64,
+               "The second child of the input column must be INT64 type.");
 
   return nullptr;
 }

From 09f68afb295455b76c015e45c3065bcaa1d38c63 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 11:29:02 -0700
Subject: [PATCH 31/93] Add factory functions for histogram and merge histogram

---
 cpp/include/cudf/aggregation.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 359c53dff60..b4491b68da2 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -290,6 +290,11 @@ std::unique_ptr<Base> make_any_aggregation();
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_all_aggregation();
 
+/// Factory to create a HISTOGRAM aggregation
+/// @return A HISTOGRAM aggregation object
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_histogram_aggregation();
+
 /// Factory to create a SUM_OF_SQUARES aggregation
 /// @return A SUM_OF_SQUARES aggregation object
 template <typename Base = aggregation>
@@ -612,6 +617,13 @@ std::unique_ptr<Base> make_merge_sets_aggregation(
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_merge_m2_aggregation();
 
+/**
+ * @brief make_merge_m2_aggregation
+ * @return
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_merge_histogram_aggregation();
+
 /**
  * @brief Factory to create a COVARIANCE aggregation
  *

From f107d9876e8f78c9bffe8e7f7c1fa42ef6181a3d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 11:41:45 -0700
Subject: [PATCH 32/93] Fix aggregation dispatcher

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 345977384f3..930ec992384 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1461,6 +1461,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::COUNT_VALID>(std::forward<Ts>(args)...);
     case aggregation::COUNT_ALL:
       return f.template operator()<aggregation::COUNT_ALL>(std::forward<Ts>(args)...);
+    case aggregation::HISTOGRAM:
+      return f.template operator()<aggregation::HISTOGRAM>(std::forward<Ts>(args)...);
     case aggregation::ANY:
       return f.template operator()<aggregation::ANY>(std::forward<Ts>(args)...);
     case aggregation::ALL:
@@ -1504,6 +1506,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::MERGE_SETS>(std::forward<Ts>(args)...);
     case aggregation::MERGE_M2:
       return f.template operator()<aggregation::MERGE_M2>(std::forward<Ts>(args)...);
+    case aggregation::MERGE_HISTOGRAM:
+      return f.template operator()<aggregation::MERGE_HISTOGRAM>(std::forward<Ts>(args)...);
     case aggregation::COVARIANCE:
       return f.template operator()<aggregation::COVARIANCE>(std::forward<Ts>(args)...);
     case aggregation::CORRELATION:

From cc185d8b4b9fe555b72aa8be4537d7c99e7f6740 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 12:59:04 -0700
Subject: [PATCH 33/93] Fix bug

---
 cpp/src/reductions/histogram.cu | 68 ++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 6a6522413b8..f45f1199a39 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <cudf_test/column_utilities.hpp>
+
 #include <reductions/hash_reduce_by_row.cuh>
 #include <stream_compaction/stream_compaction_common.cuh>
 
@@ -77,10 +79,12 @@ struct reduce_func_builder {
   }
 };
 
-template <typename T>
 struct is_none_zero {
-  T const* data;
-  __device__ bool operator()(size_type const idx) const { return data[idx] != T{0}; }
+  template<typename Pair>
+  __device__ bool operator()(Pair const inp_pair) const
+  {
+    return thrust::get<1>(inp_pair) != 0;
+  }
 };
 
 struct histogram_dispatcher {
@@ -105,7 +109,8 @@ struct histogram_dispatcher {
     size_type num_rows,
     cudf::nullate::DYNAMIC has_nulls,
     bool has_nested_columns,
-    mutable_column_view const& output,
+    size_type* output_indices,
+    mutable_column_view const& output_count,
     rmm::cuda_stream_view stream) const
   {
     auto const reduction_results =
@@ -121,13 +126,28 @@ struct histogram_dispatcher {
                                        stream,
                                        rmm::mr::get_current_device_resource());
 
+    column_view cv = column_view(data_type{type_id::INT64},
+                                 (int)reduction_results.size(),
+                                 reduction_results.data(),
+                                 nullptr,
+                                 0);
+    printf("reduction result, num rows = %d\n", num_rows);
+    cudf::test::print(cv);
+
+    auto const input_it = thrust::make_zip_iterator(
+      thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
+
+    auto const output_it = thrust::make_zip_iterator(
+      thrust::make_tuple(output_indices, output_count.begin<OutputType>()));
+
+    thrust::copy_if(rmm::exec_policy(stream),
+                    input_it,
+                    input_it + num_rows,
+                    output_it,
+                    is_none_zero{});
+
     // Reduction results are either group sizes of equal rows, or `0`.
     // Thus, we only needs to extract the non-zero group sizes.
-    thrust::copy_if(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator(0),
-                    thrust::make_counting_iterator(num_rows),
-                    output.begin<OutputType>(),
-                    is_none_zero<OutputType>{reduction_results.begin()});
   }
 };
 
@@ -177,19 +197,10 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
   // Gather the indices of distinct rows and distinct rows.
   auto distinct_indices = rmm::device_uvector<size_type>(
     static_cast<size_type>(map.get_size()), stream, rmm::mr::get_current_device_resource());
-  map.retrieve_all(distinct_indices.begin(), thrust::make_discard_iterator(), stream.value());
-  auto distinct_rows =
-    std::move(cudf::detail::gather(input_tview,
-                                   distinct_indices,
-                                   out_of_bounds_policy::DONT_CHECK,
-                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                   stream,
-                                   mr)
-                ->release()
-                .front());
+  //  map.retrieve_all(distinct_indices.begin(), thrust::make_discard_iterator(), stream.value());
 
   // Count the number of occurences of each unique row.
-  auto unique_counts = make_numeric_column(
+  auto distinct_counts = make_numeric_column(
     output_dtype, static_cast<size_type>(map.get_size()), mask_state::UNALLOCATED, stream, mr);
   type_dispatcher(output_dtype,
                   histogram_dispatcher{},
@@ -198,12 +209,25 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                   input.size(),
                   has_nulls,
                   has_nested_columns,
-                  unique_counts->mutable_view(),
+                  distinct_indices.begin(),
+                  distinct_counts->mutable_view(),
                   stream);
 
+  auto distinct_rows =
+    std::move(cudf::detail::gather(input_tview,
+                                   distinct_indices,
+                                   out_of_bounds_policy::DONT_CHECK,
+                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                   stream,
+                                   mr)
+                ->release()
+                .front());
+  printf("reduction result 2\n");
+  cudf::test::print(distinct_counts->view());
+
   std::vector<std::unique_ptr<column>> struct_children;
   struct_children.emplace_back(std::move(distinct_rows));
-  struct_children.emplace_back(std::move(unique_counts));
+  struct_children.emplace_back(std::move(distinct_counts));
   auto output_structs = make_structs_column(
     static_cast<size_type>(map.get_size()), std::move(struct_children), 0, {}, stream, mr);
 

From 547be01d3939b3474c52a05acbf54c70250174c6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 12:59:21 -0700
Subject: [PATCH 34/93] Add working `HISTOGRAM` reduction test (other tests
 temporarily disabled)

---
 cpp/tests/reductions/reduction_tests.cpp | 45 ++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 2561f3f9886..c8ff6645b16 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -28,6 +28,7 @@
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/sorting.hpp>
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
@@ -119,6 +120,7 @@ struct ReductionTest : public cudf::test::BaseFixture {
   }
 };
 
+#if 0
 template <typename T>
 struct MinMaxReductionTest : public ReductionTest<T> {};
 
@@ -292,8 +294,10 @@ TYPED_TEST(SumReductionTest, Sum)
                  .second);
 }
 
-TYPED_TEST_SUITE(ReductionTest, cudf::test::NumericTypes);
+#endif
+TYPED_TEST_SUITE(ReductionTest, cudf::test::FloatingPointTypes);
 
+#if 0
 TYPED_TEST(ReductionTest, Product)
 {
   using T = TypeParam;
@@ -379,6 +383,43 @@ TYPED_TEST(ReductionTest, SumOfSquare)
             expected_null_value);
 }
 
+#endif
+
+TYPED_TEST(ReductionTest, Histogram)
+{
+  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
+  using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
+  using structs_col = cudf::test::structs_column_wrapper;
+
+  auto const agg = cudf::make_histogram_aggregation<reduce_aggregation>();
+
+  // Test without nulls.
+  {
+    auto const input    = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+    auto const expected = [] {
+      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
+      auto child2 = int64_data{2, 1, 1, 2, 4, 1};
+      return structs_col{{child1, child2}};
+    }();
+    //    auto const input    = col_data{1, 2, 3, 1, 2};
+    //    auto const expected = [] {
+    //      auto child1 = col_data{1, 2, 3};
+    //      auto child2 = int64_data{2, 2, 1};
+    //      return structs_col{{child1, child2}};
+    //    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
+}
+
+#if 0
 template <typename T>
 struct ReductionAnyAllTest : public ReductionTest<bool> {};
 using AnyAllTypes = cudf::test::Types<int32_t, float, bool>;
@@ -2936,5 +2977,5 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls)
                          *cudf::make_max_aggregation<reduce_aggregation>());
   }
 }
-
+#endif
 CUDF_TEST_PROGRAM_MAIN()

From 4d93b1e0bf14871562d7838235e50d0c3228f9b2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 13:18:06 -0700
Subject: [PATCH 35/93] Implement merge histogram

---
 cpp/src/reductions/histogram.cu | 87 ++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 34 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index f45f1199a39..e0cd1586756 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -29,6 +29,8 @@
 
 #include <cuda/atomic>
 
+#include <optional>
+
 namespace cudf::reduction::detail {
 
 namespace {
@@ -44,7 +46,7 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
             KeyHasher const& d_hasher,
             KeyEqual const& d_equal,
             OutputType* const d_output,
-            OutputType const* const d_partial_output = nullptr)
+            OutputType const* const d_partial_output)
     : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType>{d_map,
                                                                                     d_hasher,
                                                                                     d_equal,
@@ -68,19 +70,27 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
 /**
  * @brief The builder to construct an instance of `reduce_fn` functor.
  */
+template <typename OutputType>
 struct reduce_func_builder {
-  template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
+  OutputType const* const d_partial_output;
+
+  reduce_func_builder(OutputType const* const d_partial_output) : d_partial_output{d_partial_output}
+  {
+  }
+
+  template <typename MapView, typename KeyHasher, typename KeyEqual>
   auto build(MapView const& d_map,
              KeyHasher const& d_hasher,
              KeyEqual const& d_equal,
              OutputType* const d_output)
   {
-    return reduce_fn<MapView, KeyHasher, KeyEqual, OutputType>{d_map, d_hasher, d_equal, d_output};
+    return reduce_fn<MapView, KeyHasher, KeyEqual, OutputType>{
+      d_map, d_hasher, d_equal, d_output, d_partial_output};
   }
 };
 
 struct is_none_zero {
-  template<typename Pair>
+  template <typename Pair>
   __device__ bool operator()(Pair const inp_pair) const
   {
     return thrust::get<1>(inp_pair) != 0;
@@ -110,21 +120,23 @@ struct histogram_dispatcher {
     cudf::nullate::DYNAMIC has_nulls,
     bool has_nested_columns,
     size_type* output_indices,
-    mutable_column_view const& output_count,
+    mutable_column_view const& output_counts,
+    std::optional<column_view> const& partial_counts,
     rmm::cuda_stream_view stream) const
   {
-    auto const reduction_results =
-      cudf::detail::hash_reduce_by_row(map,
-                                       preprocessed_input,
-                                       num_rows,
-                                       has_nulls,
-                                       has_nested_columns,
-                                       null_equality::EQUAL,
-                                       nan_equality::ALL_EQUAL,
-                                       reduce_func_builder{},
-                                       OutputType{0},
-                                       stream,
-                                       rmm::mr::get_current_device_resource());
+    auto const reduction_results = cudf::detail::hash_reduce_by_row(
+      map,
+      preprocessed_input,
+      num_rows,
+      has_nulls,
+      has_nested_columns,
+      null_equality::EQUAL,
+      nan_equality::ALL_EQUAL,
+      reduce_func_builder<OutputType>{partial_counts ? partial_counts.value().begin<OutputType>()
+                                                     : nullptr},
+      OutputType{0},
+      stream,
+      rmm::mr::get_current_device_resource());
 
     column_view cv = column_view(data_type{type_id::INT64},
                                  (int)reduction_results.size(),
@@ -138,13 +150,10 @@ struct histogram_dispatcher {
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
 
     auto const output_it = thrust::make_zip_iterator(
-      thrust::make_tuple(output_indices, output_count.begin<OutputType>()));
+      thrust::make_tuple(output_indices, output_counts.begin<OutputType>()));
 
-    thrust::copy_if(rmm::exec_policy(stream),
-                    input_it,
-                    input_it + num_rows,
-                    output_it,
-                    is_none_zero{});
+    thrust::copy_if(
+      rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_none_zero{});
 
     // Reduction results are either group sizes of equal rows, or `0`.
     // Thus, we only needs to extract the non-zero group sizes.
@@ -153,7 +162,8 @@ struct histogram_dispatcher {
 
 }  // namespace
 
-std::unique_ptr<cudf::scalar> histogram(column_view const& input,
+std::unique_ptr<cudf::scalar> histogram(table_view const& input,
+                                        std::optional<column_view> const& partial_distinct_counts,
                                         data_type const output_dtype,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
@@ -163,17 +173,16 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                "The output type of histogram aggregation must be an 32/64bit integral type.");
 
   auto map = cudf::detail::hash_map_type{
-    compute_hash_table_size(input.size()),
+    compute_hash_table_size(input.num_rows()),
     cuco::empty_key{cudf::detail::COMPACTION_EMPTY_KEY_SENTINEL},
     cuco::empty_value{cudf::detail::COMPACTION_EMPTY_VALUE_SENTINEL},
     cudf::detail::hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value()};
 
-  auto const input_tview = table_view{{input}};
   auto const preprocessed_input =
-    cudf::experimental::row::hash::preprocessed_table::create(input_tview, stream);
-  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input_tview)};
-  auto const has_nested_columns = cudf::detail::has_nested_columns(input_tview);
+    cudf::experimental::row::hash::preprocessed_table::create(input, stream);
+  auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
+  auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
   auto const key_hasher =
@@ -188,10 +197,10 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
   auto const value_comp = nan_equal_comparator{};
   if (has_nested_columns) {
     auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.size(), key_hasher, key_equal, stream.value());
+    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
   } else {
     auto const key_equal = row_comp.equal_to<false>(has_nulls, null_equality::EQUAL, value_comp);
-    map.insert(pair_iter, pair_iter + input.size(), key_hasher, key_equal, stream.value());
+    map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
   }
 
   // Gather the indices of distinct rows and distinct rows.
@@ -206,15 +215,16 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                   histogram_dispatcher{},
                   map,
                   std::move(preprocessed_input),
-                  input.size(),
+                  input.num_rows(),
                   has_nulls,
                   has_nested_columns,
                   distinct_indices.begin(),
                   distinct_counts->mutable_view(),
+                  partial_distinct_counts,
                   stream);
 
   auto distinct_rows =
-    std::move(cudf::detail::gather(input_tview,
+    std::move(cudf::detail::gather(input,
                                    distinct_indices,
                                    out_of_bounds_policy::DONT_CHECK,
                                    cudf::detail::negative_index_policy::NOT_ALLOWED,
@@ -235,6 +245,14 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
     std::move(*output_structs.release()), true, stream, mr);
 }
 
+std::unique_ptr<cudf::scalar> histogram(column_view const& input,
+                                        data_type const output_dtype,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return histogram(table_view{{input}}, std::nullopt, output_dtype, stream, mr);
+}
+
 std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
@@ -245,7 +263,8 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
   CUDF_EXPECTS(input.child(1).type().id() == type_id::INT64,
                "The second child of the input column must be INT64 type.");
 
-  return nullptr;
+  return histogram(
+    table_view{{input.child(0)}}, input.child(1), data_type{type_id::INT64}, stream, mr);
 }
 
 }  // namespace cudf::reduction::detail

From 6d8be79456adf660df6b87a1ae6bc287a5c6ddb2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 13:18:12 -0700
Subject: [PATCH 36/93] Add test for merge histogram

---
 cpp/tests/reductions/reduction_tests.cpp | 39 ++++++++++++++++++++----
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index c8ff6645b16..47f276b8d07 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -401,12 +401,39 @@ TYPED_TEST(ReductionTest, Histogram)
       auto child2 = int64_data{2, 1, 1, 2, 4, 1};
       return structs_col{{child1, child2}};
     }();
-    //    auto const input    = col_data{1, 2, 3, 1, 2};
-    //    auto const expected = [] {
-    //      auto child1 = col_data{1, 2, 3};
-    //      auto child2 = int64_data{2, 2, 1};
-    //      return structs_col{{child1, child2}};
-    //    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
+}
+
+TYPED_TEST(ReductionTest, MergeHistogram)
+{
+  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
+  using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
+  using structs_col = cudf::test::structs_column_wrapper;
+
+  auto const agg = cudf::make_merge_histogram_aggregation<reduce_aggregation>();
+
+  // Test without nulls.
+  {
+    auto const input = [] {
+      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64_data{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      return structs_col{{child1, child2}};
+    }();
+
+    auto const expected = [] {
+      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
+      auto child2 = int64_data{5, 5, 4, 5, 8, 1};
+      return structs_col{{child1, child2}};
+    }();
     auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
     auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
     cudf::test::print(result_col);

From 2d085397ffea6bd14681038dc00fdbe21875cc41 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 13:20:45 -0700
Subject: [PATCH 37/93] Cleanup: comment out debug printing in histogram code
 and tests

---
 cpp/src/reductions/histogram.cu          | 10 +++++-----
 cpp/tests/reductions/reduction_tests.cpp |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index e0cd1586756..4550c65e033 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf_test/column_utilities.hpp>
+//#include <cudf_test/column_utilities.hpp>
 
 #include <reductions/hash_reduce_by_row.cuh>
 #include <stream_compaction/stream_compaction_common.cuh>
@@ -143,8 +143,8 @@ struct histogram_dispatcher {
                                  reduction_results.data(),
                                  nullptr,
                                  0);
-    printf("reduction result, num rows = %d\n", num_rows);
-    cudf::test::print(cv);
+//    printf("reduction result, num rows = %d\n", num_rows);
+//    cudf::test::print(cv);
 
     auto const input_it = thrust::make_zip_iterator(
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
@@ -232,8 +232,8 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
                                    mr)
                 ->release()
                 .front());
-  printf("reduction result 2\n");
-  cudf::test::print(distinct_counts->view());
+//  printf("reduction result 2\n");
+//  cudf::test::print(distinct_counts->view());
 
   std::vector<std::unique_ptr<column>> struct_children;
   struct_children.emplace_back(std::move(distinct_rows));
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 47f276b8d07..02bbafd1bef 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -403,11 +403,11 @@ TYPED_TEST(ReductionTest, Histogram)
     }();
     auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
     auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    cudf::test::print(result_col);
+    //    cudf::test::print(result_col);
 
     auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
     auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    cudf::test::print(sorted_result->get_column(0).view());
+    //    cudf::test::print(sorted_result->get_column(0).view());
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
@@ -436,11 +436,11 @@ TYPED_TEST(ReductionTest, MergeHistogram)
     }();
     auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
     auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    cudf::test::print(result_col);
+    //    cudf::test::print(result_col);
 
     auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
     auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    cudf::test::print(sorted_result->get_column(0).view());
+    //    cudf::test::print(sorted_result->get_column(0).view());
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }

From 7999c7eed9bbe9ab319c18dc5f94e06d25111e7c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 14:28:51 -0700
Subject: [PATCH 38/93] Cleanup: reformat commented-out debug code and reject
 nulls in `merge_histogram` input

---
 cpp/src/reductions/histogram.cu | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 4550c65e033..a4aba384c46 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-//#include <cudf_test/column_utilities.hpp>
+// #include <cudf_test/column_utilities.hpp>
 
 #include <reductions/hash_reduce_by_row.cuh>
 #include <stream_compaction/stream_compaction_common.cuh>
@@ -143,8 +143,8 @@ struct histogram_dispatcher {
                                  reduction_results.data(),
                                  nullptr,
                                  0);
-//    printf("reduction result, num rows = %d\n", num_rows);
-//    cudf::test::print(cv);
+    //    printf("reduction result, num rows = %d\n", num_rows);
+    //    cudf::test::print(cv);
 
     auto const input_it = thrust::make_zip_iterator(
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
@@ -232,8 +232,8 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
                                    mr)
                 ->release()
                 .front());
-//  printf("reduction result 2\n");
-//  cudf::test::print(distinct_counts->view());
+  //  printf("reduction result 2\n");
+  //  cudf::test::print(distinct_counts->view());
 
   std::vector<std::unique_ptr<column>> struct_children;
   struct_children.emplace_back(std::move(distinct_rows));
@@ -260,8 +260,9 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
   CUDF_EXPECTS(
     input.type().id() == type_id::STRUCT && input.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
-  CUDF_EXPECTS(input.child(1).type().id() == type_id::INT64,
-               "The second child of the input column must be INT64 type.");
+  CUDF_EXPECTS(input.child(1).type().id() == type_id::INT64 && !input.child(1).has_nulls(),
+               "The second child of the input column must be INT64 type and has no nulls.");
+  CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
 
   return histogram(
     table_view{{input.child(0)}}, input.child(1), data_type{type_id::INT64}, stream, mr);

From 2d47048a1d1cfaa03d1540365e1106d8ef587f5a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 14:32:22 -0700
Subject: [PATCH 39/93] Add tests with nulls

---
 cpp/tests/reductions/reduction_tests.cpp | 53 +++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 02bbafd1bef..c1ee5ec4e76 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -387,7 +387,7 @@ TYPED_TEST(ReductionTest, SumOfSquare)
 
 TYPED_TEST(ReductionTest, Histogram)
 {
-  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
+  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
   using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
   using structs_col = cudf::test::structs_column_wrapper;
 
@@ -411,6 +411,29 @@ TYPED_TEST(ReductionTest, Histogram)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
+
+  // Test with nulls.
+  {
+    using namespace cudf::test::iterators;
+    auto constexpr null{0};
+
+    auto const input    = col_data{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1},
+                                nulls_at({0, 6, 9, 12})};
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_data{4, 2, 1, 1, 2, 4, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
 }
 
 TYPED_TEST(ReductionTest, MergeHistogram)
@@ -444,6 +467,34 @@ TYPED_TEST(ReductionTest, MergeHistogram)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
+
+  // Test with nulls.
+  {
+    using namespace cudf::test::iterators;
+    auto constexpr null{0};
+
+    auto const input = [] {
+      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+                             nulls_at({2, 5, 8, 11, 15})};
+      auto child2 = int64_data{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      return structs_col{{child1, child2}};
+    }();
+
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_data{67, 5, 5, 4, 5, 8, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
 }
 
 #if 0

From 824dcad85be0a4588ea151213a76b2463e254254 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 14:52:59 -0700
Subject: [PATCH 40/93] Add sliced input tests

---
 cpp/tests/reductions/reduction_tests.cpp | 98 ++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index c1ee5ec4e76..e61db56ad52 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -412,6 +412,26 @@ TYPED_TEST(ReductionTest, Histogram)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
 
+  // Test without nulls, sliced input.
+  {
+    auto const input_original = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+    auto const input          = cudf::slice(input_original, {0, 7})[0];
+    auto const expected       = [] {
+      auto child1 = col_data{-3, 0, 1, 2, 5};
+      auto child2 = int64_data{1, 1, 1, 3, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
+
   // Test with nulls.
   {
     using namespace cudf::test::iterators;
@@ -434,6 +454,30 @@ TYPED_TEST(ReductionTest, Histogram)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
+
+  // Test with nulls, sliced input.
+  {
+    using namespace cudf::test::iterators;
+    auto constexpr null{0};
+
+    auto const input_original = col_data{
+      {null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, nulls_at({0, 6, 9, 12})};
+    auto const input    = cudf::slice(input_original, {0, 9})[0];
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_data{2, 1, 1, 1, 3, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
 }
 
 TYPED_TEST(ReductionTest, MergeHistogram)
@@ -468,6 +512,31 @@ TYPED_TEST(ReductionTest, MergeHistogram)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
 
+  // Test without nulls, sliced input.
+  {
+    auto const input_original = [] {
+      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64_data{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      return structs_col{{child1, child2}};
+    }();
+    auto const input = cudf::slice(input_original, {0, 7})[0];
+
+    auto const expected = [] {
+      auto child1 = col_data{-3, 0, 1, 2, 5};
+      auto child2 = int64_data{2, 4, 1, 5, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
+
   // Test with nulls.
   {
     using namespace cudf::test::iterators;
@@ -495,6 +564,35 @@ TYPED_TEST(ReductionTest, MergeHistogram)
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
   }
+
+  // Test with nulls, sliced input.
+  {
+    using namespace cudf::test::iterators;
+    auto constexpr null{0};
+
+    auto const input_original = [] {
+      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+                             nulls_at({2, 5, 8, 11, 15})};
+      auto child2 = int64_data{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      return structs_col{{child1, child2}};
+    }();
+    auto const input = cudf::slice(input_original, {0, 9})[0];
+
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_data{33, 2, 4, 1, 3, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
+    //    cudf::test::print(result_col);
+
+    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
+    //    cudf::test::print(sorted_result->get_column(0).view());
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+  }
 }
 
 #if 0

From 3fb43f488be2566b8f282565109b779712bb2e60 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 14:53:10 -0700
Subject: [PATCH 41/93] Fix sliced input

---
 cpp/src/reductions/histogram.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index a4aba384c46..1665ef2bfea 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -264,8 +264,12 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                "The second child of the input column must be INT64 type and has no nulls.");
   CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
 
-  return histogram(
-    table_view{{input.child(0)}}, input.child(1), data_type{type_id::INT64}, stream, mr);
+  auto const structs_cv = structs_column_view{input};
+  return histogram(table_view{{structs_cv.get_sliced_child(0, stream)}},
+                   structs_cv.get_sliced_child(1, stream),
+                   data_type{type_id::INT64},
+                   stream,
+                   mr);
 }
 
 }  // namespace cudf::reduction::detail

From ee229a00125c9a4c2edff32c73ca8d952c75b1e7 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 15:24:05 -0700
Subject: [PATCH 42/93] Add binding for `HISTOGRAM` and `MERGE_HISTOGRAM`
 aggregations

---
 .../main/java/ai/rapids/cudf/Aggregation.java | 24 ++++++++++++++++++-
 .../ai/rapids/cudf/GroupByAggregation.java    |  8 +++++++
 java/src/main/native/src/AggregationJni.cpp   |  5 ++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index d10329ca0f2..029017ae113 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -68,7 +68,9 @@ enum Kind {
         DENSE_RANK(29),
         PERCENT_RANK(30),
         TDIGEST(31), // This can take a delta argument for accuracy level
-        MERGE_TDIGEST(32); // This can take a delta argument for accuracy level
+        MERGE_TDIGEST(32), // This can take a delta argument for accuracy level
+        HISTOGRAM(33),
+        MERGE_HISTOGRAM(34);
 
         final int nativeId;
 
@@ -918,6 +920,26 @@ static TDigestAggregation mergeTDigest(int delta) {
         return new TDigestAggregation(Kind.MERGE_TDIGEST, delta);
     }
 
+    static final class HistogramAggregation extends NoParamAggregation {
+        private HistogramAggregation() {
+            super(Kind.HISTOGRAM);
+        }
+    }
+
+    static final class MergeHistogramAggregation extends NoParamAggregation {
+        private MergeHistogramAggregation() {
+            super(Kind.MERGE_HISTOGRAM);
+        }
+    }
+
+    static HistogramAggregation histogram() {
+        return new HistogramAggregation();
+    }
+
+    static MergeHistogramAggregation mergeHistogram() {
+        return new MergeHistogramAggregation();
+    }
+
     /**
      * Create one of the aggregations that only needs a kind, no other parameters. This does not
      * work for all types and for code safety reasons each kind is added separately.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 500d18f7eae..25bb716bd5a 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -315,4 +315,12 @@ public static GroupByAggregation createTDigest(int delta) {
   public static GroupByAggregation mergeTDigest(int delta) {
     return new GroupByAggregation(Aggregation.mergeTDigest(delta));
   }
+
+  public static GroupByAggregation histogram() {
+    return new GroupByAggregation(Aggregation.histogram());
+  }
+
+  public static GroupByAggregation mergeHistogram() {
+    return new GroupByAggregation(Aggregation.mergeHistogram());
+  }
 }
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index 6ac73282615..8984c27530d 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -90,6 +90,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv
         case 30: // ANSI SQL PERCENT_RANK
           return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE,
                                              {}, cudf::rank_percentage::ONE_NORMALIZED);
+        case 33: // HISTOGRAM
+          return cudf::make_histogram_aggregation();
+        case 34: // MERGE_HISTOGRAM
+          return cudf::make_merge_histogram_aggregation();
+
         default: throw std::logic_error("Unsupported No Parameter Aggregation Operation");
       }
     }();

From b71c7a8a99b0e257eb1f485e345904cf2313aac5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 17:33:34 -0700
Subject: [PATCH 43/93] Fix compiling issue

---
 cpp/src/reductions/hash_reduce_by_row.cuh | 2 +-
 cpp/src/reductions/histogram.cu           | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
index 35654b90bc0..bc42c1473c1 100644
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ b/cpp/src/reductions/hash_reduce_by_row.cuh
@@ -124,7 +124,7 @@ rmm::device_uvector<OutputType> hash_reduce_by_row(
 {
   auto const map_dview  = map.get_device_view();
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
+  auto const key_hasher = row_hasher.device_hasher(has_nulls);
   auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
   auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 1665ef2bfea..72ed3026580 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -174,8 +174,8 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
 
   auto map = cudf::detail::hash_map_type{
     compute_hash_table_size(input.num_rows()),
-    cuco::empty_key{cudf::detail::COMPACTION_EMPTY_KEY_SENTINEL},
-    cuco::empty_value{cudf::detail::COMPACTION_EMPTY_VALUE_SENTINEL},
+    cuco::empty_key{-1},
+    cuco::empty_value{std::numeric_limits<size_type>::min()},
     cudf::detail::hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value()};
 
@@ -185,9 +185,8 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher =
-    cudf::detail::experimental::compaction_hash(row_hasher.device_hasher(has_nulls));
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+  auto const key_hasher = row_hasher.device_hasher(has_nulls);
+  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
   auto const pair_iter = cudf::detail::make_counting_transform_iterator(
     size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); });

From 1edeb4ce90bb1eeef3691acd4a0b8f342924f4cb Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 13 Sep 2023 17:34:58 -0700
Subject: [PATCH 44/93] Remove header

---
 cpp/src/reductions/hash_reduce_by_row.cuh | 164 ----------------------
 cpp/src/reductions/histogram.cu           |   2 +-
 2 files changed, 1 insertion(+), 165 deletions(-)
 delete mode 100644 cpp/src/reductions/hash_reduce_by_row.cuh

diff --git a/cpp/src/reductions/hash_reduce_by_row.cuh b/cpp/src/reductions/hash_reduce_by_row.cuh
deleted file mode 100644
index bc42c1473c1..00000000000
--- a/cpp/src/reductions/hash_reduce_by_row.cuh
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stream_compaction/stream_compaction_common.cuh>
-
-#include <cudf/table/experimental/row_operators.cuh>
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/uninitialized_fill.h>
-
-namespace cudf::detail {
-
-/**
- * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are
- * rows that compared equal.
- *
- * TODO: We need to switch to use `static_reduction_map` when it is ready
- * (https://github.com/NVIDIA/cuCollections/pull/98).
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
-struct reduce_by_row_fn_base {
- protected:
-  MapView const d_map;
-  KeyHasher const d_hasher;
-  KeyEqual const d_equal;
-  OutputType* const d_output;
-
-  reduce_by_row_fn_base(MapView const& d_map,
-                        KeyHasher const& d_hasher,
-                        KeyEqual const& d_equal,
-                        OutputType* const d_output)
-    : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output}
-  {
-  }
-
-  /**
-   * @brief Return a pointer to the output array at the given index.
-   *
-   * @param idx The access index
-   * @return A pointer to the given index in the output array
-   */
-  __device__ OutputType* get_output_ptr(size_type const idx) const
-  {
-    auto const iter = d_map.find(idx, d_hasher, d_equal);
-
-    if (iter != d_map.end()) {
-      // Only one (undetermined) index value of the duplicate rows could be inserted into the map.
-      // As such, looking up for all indices of duplicate rows always returns the same value.
-      auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed);
-
-      // All duplicate rows will have concurrent access to this same output slot.
-      return &d_output[inserted_idx];
-    } else {
-      // All input `idx` values have been inserted into the map before.
-      // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if
-      // `d_equal(idx, idx) == false`.
-      // Such situations are due to comparing nulls or NaNs which are considered as always unequal.
-      // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct
-      // output slot.
-      return &d_output[idx];
-    }
-  }
-};
-
-/**
- * @brief Perform a reduction on groups of rows that are compared equal.
- *
- * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
- *
- * At the beginning of the operation, the entire output array is filled with a value given by
- * the `init` parameter. Then, the reduction result for each row group is written into the output
- * array at the index of an unspecified row in the group.
- *
- * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a
- *         reduction functor derived from `reduce_by_row_fn_base`
- * @tparam OutputType Type of the reduction results
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
- * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
- * @param init The initial value for reduction of each row group
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
- */
-template <typename ReduceFuncBuilder, typename OutputType>
-rmm::device_uvector<OutputType> hash_reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  ReduceFuncBuilder func_builder,
-  OutputType init,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
-{
-  auto const map_dview  = map.get_device_view();
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-  auto const row_comp   = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto reduction_results = rmm::device_uvector<OutputType>(num_rows, stream, mr);
-  thrust::uninitialized_fill(
-    rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init);
-
-  auto const reduce_by_row = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        thrust::make_counting_iterator(0),
-        thrust::make_counting_iterator(num_rows),
-        func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin()));
-    }
-  };
-
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    reduce_by_row(nan_equal_comparator{});
-  } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    reduce_by_row(nan_unequal_comparator{});
-  }
-
-  return reduction_results;
-}
-
-}  // namespace cudf::detail
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 72ed3026580..dc163b63d23 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -16,9 +16,9 @@
 
 // #include <cudf_test/column_utilities.hpp>
 
-#include <reductions/hash_reduce_by_row.cuh>
 #include <stream_compaction/stream_compaction_common.cuh>
 
+#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>

From 75c35c4864fcdc56fb1fd19f97717635fc961363 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Fri, 15 Sep 2023 09:55:48 -0700
Subject: [PATCH 45/93] Change test types

---
 cpp/tests/reductions/reduction_tests.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index e61db56ad52..83bf834251b 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -295,7 +295,12 @@ TYPED_TEST(SumReductionTest, Sum)
 }
 
 #endif
-TYPED_TEST_SUITE(ReductionTest, cudf::test::FloatingPointTypes);
+
+using TestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                                     cudf::test::FloatingPointTypes,
+                                     cudf::test::FixedPointTypes,
+                                     cudf::test::ChronoTypes>;
+TYPED_TEST_SUITE(ReductionTest, TestTypes);
 
 #if 0
 TYPED_TEST(ReductionTest, Product)

From c6c2c4336d834250cb0ed20a02cbe213679f5a95 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:33:58 -0700
Subject: [PATCH 46/93] Rewrite tests

---
 cpp/tests/reductions/reduction_tests.cpp | 114 ++++++++---------------
 1 file changed, 40 insertions(+), 74 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 83bf834251b..bb9db061602 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -390,7 +390,29 @@ TYPED_TEST(ReductionTest, SumOfSquare)
 
 #endif
 
-TYPED_TEST(ReductionTest, Histogram)
+template <typename T>
+struct ReductionHistogramTest : public cudf::test::BaseFixture {};
+
+using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                                              cudf::test::FloatingPointTypes,
+                                              cudf::test::FixedPointTypes,
+                                              cudf::test::ChronoTypes>;
+TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes);
+
+auto histogram_reduction(cudf::column_view const& input,
+                         std::unique_ptr<cudf::reduce_aggregation> const& agg)
+{
+  CUDF_EXPECTS(
+    agg->kind == cudf::aggregation::HISTOGRAM || agg->kind == cudf::aggregation::MERGE_HISTOGRAM,
+    "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM.");
+
+  auto const result_scalar = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
+  auto const result_col    = dynamic_cast<cudf::list_scalar*>(result_scalar.get())->view();
+  auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+  return std::move(cudf::gather(cudf::table_view{{result_col}}, *sort_order)->release().front());
+}
+
+TYPED_TEST(ReductionHistogramTest, Histogram)
 {
   using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
   using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
@@ -406,15 +428,8 @@ TYPED_TEST(ReductionTest, Histogram)
       auto child2 = int64_data{2, 1, 1, 2, 4, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test without nulls, sliced input.
@@ -426,15 +441,8 @@ TYPED_TEST(ReductionTest, Histogram)
       auto child2 = int64_data{1, 1, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test with nulls.
@@ -449,15 +457,8 @@ TYPED_TEST(ReductionTest, Histogram)
       auto child2 = int64_data{4, 2, 1, 1, 2, 4, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test with nulls, sliced input.
@@ -473,19 +474,12 @@ TYPED_TEST(ReductionTest, Histogram)
       auto child2 = int64_data{2, 1, 1, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 }
 
-TYPED_TEST(ReductionTest, MergeHistogram)
+TYPED_TEST(ReductionHistogramTest, MergeHistogram)
 {
   using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
   using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
@@ -506,15 +500,8 @@ TYPED_TEST(ReductionTest, MergeHistogram)
       auto child2 = int64_data{5, 5, 4, 5, 8, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test without nulls, sliced input.
@@ -531,15 +518,8 @@ TYPED_TEST(ReductionTest, MergeHistogram)
       auto child2 = int64_data{2, 4, 1, 5, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test with nulls.
@@ -559,15 +539,8 @@ TYPED_TEST(ReductionTest, MergeHistogram)
       auto child2 = int64_data{67, 5, 5, 4, 5, 8, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 
   // Test with nulls, sliced input.
@@ -588,15 +561,8 @@ TYPED_TEST(ReductionTest, MergeHistogram)
       auto child2 = int64_data{33, 2, 4, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
-    auto const result     = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-    auto const result_col = dynamic_cast<cudf::list_scalar*>(result.get())->view();
-    //    cudf::test::print(result_col);
-
-    auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-    auto const sorted_result = cudf::gather(cudf::table_view{{result_col}}, *sort_order);
-    //    cudf::test::print(sorted_result->get_column(0).view());
-
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, sorted_result->get_column(0).view());
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
   }
 }
 

From b5dd22a72eb6a91beb0ec6a7ec374315cf62abba Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:35:46 -0700
Subject: [PATCH 47/93] Misc

---
 cpp/tests/reductions/reduction_tests.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index bb9db061602..f6786a3a59c 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -393,6 +393,7 @@ TYPED_TEST(ReductionTest, SumOfSquare)
 template <typename T>
 struct ReductionHistogramTest : public cudf::test::BaseFixture {};
 
+// Avoid unsigned types, as the tests below have negative values in their input.
 using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
                                               cudf::test::FloatingPointTypes,
                                               cudf::test::FixedPointTypes,
@@ -408,7 +409,9 @@ auto histogram_reduction(cudf::column_view const& input,
 
   auto const result_scalar = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
   auto const result_col    = dynamic_cast<cudf::list_scalar*>(result_scalar.get())->view();
-  auto const sort_order    = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
+
+  // Sort the histogram based on the first column (unique input values).
+  auto const sort_order = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
   return std::move(cudf::gather(cudf::table_view{{result_col}}, *sort_order)->release().front());
 }
 

From 17b8975e09c1457214ed4a5530a101043054a3d5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:36:29 -0700
Subject: [PATCH 48/93] Cleanup

---
 cpp/tests/reductions/reduction_tests.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index f6786a3a59c..d0287dc749a 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -120,7 +120,6 @@ struct ReductionTest : public cudf::test::BaseFixture {
   }
 };
 
-#if 0
 template <typename T>
 struct MinMaxReductionTest : public ReductionTest<T> {};
 
@@ -294,15 +293,12 @@ TYPED_TEST(SumReductionTest, Sum)
                  .second);
 }
 
-#endif
-
 using TestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
                                      cudf::test::FloatingPointTypes,
                                      cudf::test::FixedPointTypes,
                                      cudf::test::ChronoTypes>;
 TYPED_TEST_SUITE(ReductionTest, TestTypes);
 
-#if 0
 TYPED_TEST(ReductionTest, Product)
 {
   using T = TypeParam;
@@ -388,8 +384,6 @@ TYPED_TEST(ReductionTest, SumOfSquare)
             expected_null_value);
 }
 
-#endif
-
 template <typename T>
 struct ReductionHistogramTest : public cudf::test::BaseFixture {};
 
@@ -569,7 +563,6 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   }
 }
 
-#if 0
 template <typename T>
 struct ReductionAnyAllTest : public ReductionTest<bool> {};
 using AnyAllTypes = cudf::test::Types<int32_t, float, bool>;
@@ -3127,5 +3120,5 @@ TEST_F(StructReductionTest, StructReductionMinMaxWithNulls)
                          *cudf::make_max_aggregation<reduce_aggregation>());
   }
 }
-#endif
+
 CUDF_TEST_PROGRAM_MAIN()

From c0b245f8057fd8e2e0b1b4235d392fd352b4768e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:37:34 -0700
Subject: [PATCH 49/93] Revert changes

---
 cpp/tests/reductions/reduction_tests.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index d0287dc749a..5665fdddfba 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -293,11 +293,7 @@ TYPED_TEST(SumReductionTest, Sum)
                  .second);
 }
 
-using TestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
-                                     cudf::test::FloatingPointTypes,
-                                     cudf::test::FixedPointTypes,
-                                     cudf::test::ChronoTypes>;
-TYPED_TEST_SUITE(ReductionTest, TestTypes);
+TYPED_TEST_SUITE(ReductionTest, cudf::test::NumericTypes);
 
 TYPED_TEST(ReductionTest, Product)
 {

From a8b3696652417279bb2e6c94acf45d18424b75e6 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:42:17 -0700
Subject: [PATCH 50/93] Add more assert statements

---
 cpp/tests/reductions/reduction_tests.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 5665fdddfba..f824695bbd5 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -398,7 +398,13 @@ auto histogram_reduction(cudf::column_view const& input,
     "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM.");
 
   auto const result_scalar = cudf::reduce(input, *agg, cudf::data_type{cudf::type_id::INT64});
-  auto const result_col    = dynamic_cast<cudf::list_scalar*>(result_scalar.get())->view();
+  EXPECT_EQ(result_scalar->is_valid(), true);
+
+  auto const result_list_scalar = dynamic_cast<cudf::list_scalar*>(result_scalar.get());
+  EXPECT_NE(result_list_scalar, nullptr);
+
+  auto const result_col = result_list_scalar->view();
+  EXPECT_EQ(result_col.num_children(), 2);
 
   // Sort the histogram based on the first column (unique input values).
   auto const sort_order = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});

From a7fee3082c57ae4dd42798ba0562df21b44e4d71 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:44:03 -0700
Subject: [PATCH 51/93] Clean up tests

---
 cpp/tests/reductions/reduction_tests.cpp | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index f824695bbd5..f23abf431ee 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -445,10 +445,9 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
   }
 
   // Test with nulls.
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
   {
-    using namespace cudf::test::iterators;
-    auto constexpr null{0};
-
     auto const input    = col_data{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1},
                                 nulls_at({0, 6, 9, 12})};
     auto const expected = [] {
@@ -462,9 +461,6 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
 
   // Test with nulls, sliced input.
   {
-    using namespace cudf::test::iterators;
-    auto constexpr null{0};
-
     auto const input_original = col_data{
       {null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, nulls_at({0, 6, 9, 12})};
     auto const input    = cudf::slice(input_original, {0, 9})[0];
@@ -522,10 +518,9 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   }
 
   // Test with nulls.
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
   {
-    using namespace cudf::test::iterators;
-    auto constexpr null{0};
-
     auto const input = [] {
       auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
                              nulls_at({2, 5, 8, 11, 15})};
@@ -544,9 +539,6 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
 
   // Test with nulls, sliced input.
   {
-    using namespace cudf::test::iterators;
-    auto constexpr null{0};
-
     auto const input_original = [] {
       auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
                              nulls_at({2, 5, 8, 11, 15})};

From 829017a3de7cb5285dbf522e67d32e5403a7b710 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 13:48:41 -0700
Subject: [PATCH 52/93] Add docs

---
 cpp/include/cudf/aggregation.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index b4491b68da2..75d693df9e0 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -618,8 +618,12 @@ template <typename Base = aggregation>
 std::unique_ptr<Base> make_merge_m2_aggregation();
 
 /**
- * @brief make_merge_m2_aggregation
- * @return
+ * @brief Factory to create a MERGE_HISTOGRAM aggregation
+ *
+ * Merges the results of `HISTOGRAM` aggregations on independent sets into a new `HISTOGRAM` value
+ * equivalent to if a single `HISTOGRAM` aggregation was done across all of the sets at once.
+ *
+ * @return A MERGE_HISTOGRAM aggregation object
  */
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_merge_histogram_aggregation();

From e53042e1608046d22ee3785ef6965f8e8caa526f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 14:36:23 -0700
Subject: [PATCH 53/93] Rewrite docs

---
 .../reduction/detail/reduction_functions.hpp  | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 804b79593da..9c85e754c2f 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -132,21 +132,33 @@ std::unique_ptr<scalar> all(column_view const& col,
                             rmm::mr::device_memory_resource* mr);
 
 /**
- * @brief
+ * @brief Compute frequency for each unique element in the input column.
  *
- * If all elements in input column are null, output scalar is null.
+ * The result histogram is stored in structs column having two children. The first child contains
+ * unique elements from the input, and the second child contains their corresponding frequencies.
+ *
+ * @throw cudf::logic_error if `output_dtype` is not an integral type
+ *
+ * @param input The column to compute histogram
+ * @param output_dtype Data type to store the element frequencies
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned scalar's device memory
+ * @return A list_scalar storing a structs column as the result histogram
  */
-std::unique_ptr<scalar> histogram(column_view const& col,
+std::unique_ptr<scalar> histogram(column_view const& input,
                                   data_type const output_dtype,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr);
 
 /**
- * @brief
+ * @brief Merge multiple histograms together.
  *
- * If all elements in input column are null, output scalar is null.
+ * @param input The input given as multiple histograms concatenated together
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned scalar's device memory
+ * @return A list_scalar storing the result histogram
  */
-std::unique_ptr<scalar> merge_histogram(column_view const& col,
+std::unique_ptr<scalar> merge_histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);
 

From 49608ab8277aa4fa73acde4ecbd8657eed96b5c8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 15:54:34 -0700
Subject: [PATCH 54/93] Add a helper file

---
 cpp/src/reductions/histogram_helpers.hpp | 49 ++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 cpp/src/reductions/histogram_helpers.hpp

diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
new file mode 100644
index 00000000000..64386c023e4
--- /dev/null
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <optional>
+
+namespace cudf::reduction::detail {
+
+/**
+ * @brief Compute the histogram for the input table.
+ *
+ * This is equivalent to counting the occurrences of each distinct row in the input.
+ *
+ * @param input The input table to compute histogram
+ * @param partial_counts An optional column containing counts for each row
+ * @param output_dtype The output type to store the count value
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate memory of the returned objects
+ * @return A pair containing the indices of the distinct rows in the input table and a column of
+ *         their corresponding counts
+ */
+std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
+  table_view const& input,
+  std::optional<column_view> const& partial_counts,
+  data_type const output_dtype,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
+
+}  // namespace cudf::reduction::detail

From 08aac0ea2a91dadbe888b5ecdd56aec399b5ba73 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 15:57:53 -0700
Subject: [PATCH 55/93] Rewrite histogram

---
 cpp/src/reductions/histogram.cu | 147 +++++++++++++++++---------------
 1 file changed, 79 insertions(+), 68 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index dc163b63d23..73343946792 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -14,13 +14,11 @@
  * limitations under the License.
  */
 
-// #include <cudf_test/column_utilities.hpp>
+#include <stream_compaction/stream_compaction_common.hpp>
 
-#include <stream_compaction/stream_compaction_common.cuh>
-
-#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/gather.hpp>
+#include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -36,7 +34,7 @@ namespace cudf::reduction::detail {
 namespace {
 
 /**
- * @brief The functor to compute the occurences of each unique rows in the input table.
+ * @brief The functor to accumulate the frequency of each distinct row in the input table.
  */
 template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
 struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType> {
@@ -58,12 +56,10 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
   // Count the number of rows in each group of rows that are compared equal.
   __device__ void operator()(size_type const idx) const
   {
-    cuda::atomic_ref<OutputType, cuda::thread_scope_device> count(*this->get_output_ptr(idx));
-    if (d_partial_output) {
-      count.fetch_add(d_partial_output[idx], cuda::std::memory_order_relaxed);
-    } else {
-      count.fetch_add(OutputType{1}, cuda::std::memory_order_relaxed);
-    }
+    auto const increment = d_partial_output ? d_partial_output[idx] : OutputType{1};
+    auto const count =
+      cuda::atomic_ref<OutputType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
+    count.fetch_add(increment, cuda::std::memory_order_relaxed);
   }
 };
 
@@ -89,20 +85,31 @@ struct reduce_func_builder {
   }
 };
 
+/**
+ * @brief Specialized functor to check for non-zero.
+ *
+ * The input must be given as Pair<T1, T2>. Only value of T2 is checked for non-zero.
+ */
 struct is_none_zero {
   template <typename Pair>
-  __device__ bool operator()(Pair const inp_pair) const
+  __device__ bool operator()(Pair const input) const
   {
-    return thrust::get<1>(inp_pair) != 0;
+    return thrust::get<1>(input) != 0;
   }
 };
 
+/**
+ * @brief Dispatcher functor to compute histogram in the given OutputType.
+ *
+ * The indices of distinct rows and their corresponding frequencies are written into two separate
+ * output buffers.
+ */
 struct histogram_dispatcher {
   template <typename OutputType>
   static bool constexpr is_supported()
   {
     // Currently only int64_t is requested by Spark-Rapids.
-    // More data type can be supported by enabling it below.
+    // More data types (integer only) can be supported by enabling them below.
     return std::is_same_v<OutputType, int64_t>;
   }
 
@@ -138,39 +145,55 @@ struct histogram_dispatcher {
       stream,
       rmm::mr::get_current_device_resource());
 
-    column_view cv = column_view(data_type{type_id::INT64},
-                                 (int)reduction_results.size(),
-                                 reduction_results.data(),
-                                 nullptr,
-                                 0);
-    //    printf("reduction result, num rows = %d\n", num_rows);
-    //    cudf::test::print(cv);
-
     auto const input_it = thrust::make_zip_iterator(
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
-
     auto const output_it = thrust::make_zip_iterator(
       thrust::make_tuple(output_indices, output_counts.begin<OutputType>()));
 
+    // Reduction results above are either group sizes of equal rows, or `0`.
+    // Thus, we need to extract the non-zero group sizes.
     thrust::copy_if(
       rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_none_zero{});
-
-    // Reduction results are either group sizes of equal rows, or `0`.
-    // Thus, we only needs to extract the non-zero group sizes.
   }
 };
 
+auto gather_histogram(table_view const& input,
+                      device_span<size_type const> distinct_indices,
+                      std::unique_ptr<column>&& distinct_counts,
+                      rmm::cuda_stream_view stream,
+                      rmm::mr::device_memory_resource* mr)
+{
+  auto distinct_rows =
+    std::move(cudf::detail::gather(input,
+                                   distinct_indices,
+                                   out_of_bounds_policy::DONT_CHECK,
+                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                   stream,
+                                   mr)
+                ->release()
+                .front());
+
+  std::vector<std::unique_ptr<column>> struct_children;
+  struct_children.emplace_back(std::move(distinct_rows));
+  struct_children.emplace_back(std::move(distinct_counts));
+  auto output_structs = make_structs_column(
+    static_cast<size_type>(distinct_indices.size()), std::move(struct_children), 0, {}, stream, mr);
+
+  return std::make_unique<cudf::list_scalar>(
+    std::move(*output_structs.release()), true, stream, mr);
+}
+
 }  // namespace
 
-std::unique_ptr<cudf::scalar> histogram(table_view const& input,
-                                        std::optional<column_view> const& partial_distinct_counts,
-                                        data_type const output_dtype,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
+  table_view const& input,
+  std::optional<column_view> const& partial_counts,
+  data_type const output_dtype,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(cudf::is_integral(output_dtype) &&
-                 (cudf::size_of(output_dtype) == 4 || cudf::size_of(output_dtype) == 8),
-               "The output type of histogram aggregation must be an 32/64bit integral type.");
+  CUDF_EXPECTS(cudf::is_integral(output_dtype),
+               "The output type of histogram aggregation must be an integral type.");
 
   auto map = cudf::detail::hash_map_type{
     compute_hash_table_size(input.num_rows()),
@@ -191,9 +214,11 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
   auto const pair_iter = cudf::detail::make_counting_transform_iterator(
     size_type{0}, [] __device__(size_type const i) { return cuco::make_pair(i, i); });
 
+  // Always compare NaNs as equal.
   using nan_equal_comparator =
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
   auto const value_comp = nan_equal_comparator{};
+
   if (has_nested_columns) {
     auto const key_equal = row_comp.equal_to<true>(has_nulls, null_equality::EQUAL, value_comp);
     map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
@@ -202,14 +227,14 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
     map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
   }
 
-  // Gather the indices of distinct rows and distinct rows.
+  // Gather the indices of distinct rows.
   auto distinct_indices = rmm::device_uvector<size_type>(
     static_cast<size_type>(map.get_size()), stream, rmm::mr::get_current_device_resource());
-  //  map.retrieve_all(distinct_indices.begin(), thrust::make_discard_iterator(), stream.value());
 
-  // Count the number of occurences of each unique row.
+  // Store the number of occurrences of each distinct row.
   auto distinct_counts = make_numeric_column(
     output_dtype, static_cast<size_type>(map.get_size()), mask_state::UNALLOCATED, stream, mr);
+
   type_dispatcher(output_dtype,
                   histogram_dispatcher{},
                   map,
@@ -219,29 +244,10 @@ std::unique_ptr<cudf::scalar> histogram(table_view const& input,
                   has_nested_columns,
                   distinct_indices.begin(),
                   distinct_counts->mutable_view(),
-                  partial_distinct_counts,
+                  partial_counts,
                   stream);
 
-  auto distinct_rows =
-    std::move(cudf::detail::gather(input,
-                                   distinct_indices,
-                                   out_of_bounds_policy::DONT_CHECK,
-                                   cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                   stream,
-                                   mr)
-                ->release()
-                .front());
-  //  printf("reduction result 2\n");
-  //  cudf::test::print(distinct_counts->view());
-
-  std::vector<std::unique_ptr<column>> struct_children;
-  struct_children.emplace_back(std::move(distinct_rows));
-  struct_children.emplace_back(std::move(distinct_counts));
-  auto output_structs = make_structs_column(
-    static_cast<size_type>(map.get_size()), std::move(struct_children), 0, {}, stream, mr);
-
-  return std::make_unique<cudf::list_scalar>(
-    std::move(*output_structs.release()), true, stream, mr);
+  return {std::move(distinct_indices), std::move(distinct_counts)};
 }
 
 std::unique_ptr<cudf::scalar> histogram(column_view const& input,
@@ -249,26 +255,31 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
-  return histogram(table_view{{input}}, std::nullopt, output_dtype, stream, mr);
+  auto const input_tv = table_view{{input}};
+  auto [distinct_indices, distinct_counts] =
+    table_histogram(input_tv, std::nullopt, output_dtype, stream, mr);
+  return gather_histogram(input_tv, distinct_indices, std::move(distinct_counts), stream, mr);
 }
 
 std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
   CUDF_EXPECTS(
     input.type().id() == type_id::STRUCT && input.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
-  CUDF_EXPECTS(input.child(1).type().id() == type_id::INT64 && !input.child(1).has_nulls(),
-               "The second child of the input column must be INT64 type and has no nulls.");
-  CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
+  CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
+               "The second child of the input column must be ingegral type and has no nulls.");
+
+  auto const structs_cv   = structs_column_view{input};
+  auto const input_values = structs_cv.get_sliced_child(0, stream);
+  auto const input_counts = structs_cv.get_sliced_child(1, stream);
 
-  auto const structs_cv = structs_column_view{input};
-  return histogram(table_view{{structs_cv.get_sliced_child(0, stream)}},
-                   structs_cv.get_sliced_child(1, stream),
-                   data_type{type_id::INT64},
-                   stream,
-                   mr);
+  auto const values_tv = table_view{{input_values}};
+  auto [distinct_indices, distinct_counts] =
+    table_histogram(values_tv, input_counts, data_type{type_id::INT64}, stream, mr);
+  return gather_histogram(values_tv, distinct_indices, std::move(distinct_counts), stream, mr);
 }
 
 }  // namespace cudf::reduction::detail

From aaaf3474c8028b92e6c84290c4eeb201cd7f96c8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 16:06:02 -0700
Subject: [PATCH 56/93] Add docs

---
 cpp/src/groupby/sort/aggregate.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index f59f2ab0271..7416f114671 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -546,6 +546,12 @@ void aggregate_result_functor::operator()<aggregation::MERGE_M2>(aggregation con
       get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr));
 }
 
+/**
+ * @brief Perform merging for multiple histograms that correspond to the same key value.
+ *
+ * The partial results input to this aggregation is a structs column that is (vertically)
+ * concatenated from multiple outputs of HISTOGRAM aggregations.
+ */
 template <>
 void aggregate_result_functor::operator()<aggregation::MERGE_HISTOGRAM>(aggregation const& agg)
 {

From 2f5b343f7cb223d58a4445f59211fb49118d77b0 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 16:07:42 -0700
Subject: [PATCH 57/93] Remove file

---
 cpp/src/reductions/histogram.cuh | 23 -----------------------
 1 file changed, 23 deletions(-)
 delete mode 100644 cpp/src/reductions/histogram.cuh

diff --git a/cpp/src/reductions/histogram.cuh b/cpp/src/reductions/histogram.cuh
deleted file mode 100644
index 5951b91a964..00000000000
--- a/cpp/src/reductions/histogram.cuh
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/aggregation.hpp>
-
-namespace cudf::reduction::detail {
-
-}  // namespace cudf::reduction::detail

From c11f939d3824d1f99800f58a95ef34ad73727305 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 16:39:25 -0700
Subject: [PATCH 58/93] Rewrite docs

---
 cpp/src/groupby/sort/aggregate.cpp        |  4 ++--
 cpp/src/groupby/sort/group_reductions.hpp | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 7416f114671..10c271f76f9 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -549,8 +549,8 @@ void aggregate_result_functor::operator()<aggregation::MERGE_M2>(aggregation con
 /**
  * @brief Perform merging for multiple histograms that correspond to the same key value.
  *
- * The partial results input to this aggregation is a structs column that is (vertically)
- * concatenated from multiple outputs of HISTOGRAM aggregations.
+ * The partial results input to this aggregation is a structs column that is concatenated from
+ * multiple outputs of HISTOGRAM aggregations.
  */
 template <>
 void aggregate_result_functor::operator()<aggregation::MERGE_HISTOGRAM>(aggregation const& agg)
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 8acf046324b..6ff0d9df47a 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -217,12 +217,20 @@ std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr);
 /**
- * @brief
+ * @brief Internal API to compute histogram for each group in @p values.
+ *
+ * The returned column is a lists column, each list corresponds to one input group and stores the
+ * histogram of the distinct elements in that group in the form of `STRUCT<value, count>`.
  *
  * @code{.pseudo}
+ * values       = [2, 1, 1, 3, 5, 2, 2, 3, 1, 4]
+ * group_labels = [0, 0, 0, 1, 1, 1, 1, 1, 2, 2]
+ * num_groups   = 3
+ *
+ * output = [[<1, 2>, <2, 1>], [<2, 2>, <3, 2>, <5, 1>], [<1, 1>, <4, 1>]]
  * @endcode
  *
- * @param values Grouped values to get valid count of
+ * @param values Grouped values to compute histogram
  * @param group_labels ID of group that the corresponding value belongs to
  * @param num_groups Number of groups ( unique values in @p group_labels )
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -460,9 +468,17 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
                                        rmm::mr::device_memory_resource* mr);
 
 /**
- * @brief
+ * @brief Internal API to merge multiple outputs of HISTOGRAM aggregations.
+ *
+ * The input values column should be given as a structs column in the form of
+ * `STRUCT<value, count>`.
  *
  * @code{.pseudo}
+ * values       = [<1, 2>, <2, 1>, <2, 2>, <3, 2>, <2, 1>, <1, 1>, <2, 1>]
+ * group_labels = [0,      0,      0,      1,      1,      1,      1]
+ * num_groups   = 2
+ *
+ * output = [[<1, 2>, <2, 3>], [<1, 1>, <2, 2>, <3, 2>]]
  * @endcode
  *
  * @param values Grouped values to get valid count of

From d10842e5e2bef044686178b15251be4cbb7d03f8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 16:43:22 -0700
Subject: [PATCH 59/93] Change docs

---
 cpp/src/groupby/sort/group_reductions.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 6ff0d9df47a..24d631d6f51 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -222,6 +222,8 @@ std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group
  * The returned column is a lists column, each list corresponds to one input group and stores the
  * histogram of the distinct elements in that group in the form of `STRUCT<value, count>`.
  *
+ * Note that the order of distinct elements in each output list is not specified.
+ *
  * @code{.pseudo}
  * values       = [2, 1, 1, 3, 5, 2, 2, 3, 1, 4]
  * group_labels = [0, 0, 0, 1, 1, 1, 1, 1, 2, 2]
@@ -472,6 +474,7 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
  *
  * The input values column should be given as a structs column in the form of
  * `STRUCT<value, count>`.
+ * After merging, the order of distinct elements in each output list is not specified.
  *
  * @code{.pseudo}
  * values       = [<1, 2>, <2, 1>, <2, 2>, <3, 2>, <2, 1>, <1, 1>, <2, 1>]

From 6abc7b509336627dfe5ffe64d2b04b1087df6f9b Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 21:50:40 -0700
Subject: [PATCH 60/93] Add headers

---
 cpp/src/reductions/histogram.cu          | 1 +
 cpp/src/reductions/histogram_helpers.hpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 73343946792..5f6386fedba 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/hash_reduce_by_row.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
index 64386c023e4..2a271ec70a5 100644
--- a/cpp/src/reductions/histogram_helpers.hpp
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -21,6 +21,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <optional>
 

From f833f58162003a7b52975cea4c3b3a3309237f1c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 21:50:53 -0700
Subject: [PATCH 61/93] Implement groupby histogram and merge histogram aggs

---
 cpp/src/groupby/sort/group_histogram.cu | 167 +++++++++++++-----------
 1 file changed, 92 insertions(+), 75 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 5123a9fb500..4cb9320f418 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -14,104 +14,121 @@
  * limitations under the License.
  */
 
+#include <lists/utilities.hpp>
+#include <reductions/histogram_helpers.hpp>
+
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/gather.hpp>
+#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/adjacent_difference.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/reduce.h>
+#include <rmm/device_buffer.hpp>
 
 namespace cudf::groupby::detail {
-std::unique_ptr<column> group_histogram(column_view const& values,
+
+// Fixed type for counting frequencies in historam.
+// This is to avoid using `target_type_t` which requires type_dispatcher.
+constexpr auto histogram_count_dtype = data_type{type_to_id<int64_t>()};
+
+namespace {
+auto make_empty_histogram(column_view const& values)
+{
+  std::vector<std::unique_ptr<column>> struct_children;
+  struct_children.emplace_back(empty_like(values));
+  struct_children.emplace_back(make_numeric_column(histogram_count_dtype, 0));
+  auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
+                                          0,
+                                          rmm::device_buffer{},
+                                          rmm::device_buffer{},
+                                          0,
+                                          std::move(struct_children));
+
+  std::vector<std::unique_ptr<column>> lists_children;
+  lists_children.emplace_back(make_numeric_column(data_type{type_to_id<size_type>()}, 0));
+  lists_children.emplace_back(std::move(structs));
+  return std::make_unique<column>(cudf::data_type{type_id::LIST},
+                                  0,
+                                  rmm::device_buffer{},
+                                  rmm::device_buffer{},
+                                  0,
+                                  std::move(lists_children));
+}
+
+std::unique_ptr<column> group_histogram(column_view const& input,
                                         cudf::device_span<size_type const> group_labels,
+                                        std::optional<column_view> const& partial_counts,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
-  CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
+  CUDF_EXPECTS(static_cast<size_t>(input.size()) == group_labels.size(),
                "Size of values column should be same as that of group labels");
 
-  auto result = make_numeric_column(
-    data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
-
-  if (num_groups == 0) { return result; }
-
-  if (values.nullable()) {
-    auto values_view = column_device_view::create(values, stream);
-
-    // make_validity_iterator returns a boolean iterator that sums to 1 (1+1=1)
-    // so we need to transform it to cast it to an integer type
-    auto bitmask_iterator =
-      thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view),
-                                      [] __device__(auto b) { return static_cast<size_type>(b); });
-
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          bitmask_iterator,
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
-  } else {
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          thrust::make_constant_iterator(1),
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
-  }
-
-  return result;
+  if (num_groups == 0) { return make_empty_histogram(input); }
+
+  auto const labels_cv      = column_view{data_type{type_to_id<size_type>()},
+                                     static_cast<size_type>(group_labels.size()),
+                                     group_labels.data(),
+                                     nullptr,
+                                     0};
+  auto const labeled_values = table_view{{labels_cv, input}};
+
+  auto [distinct_indices, distinct_counts] = cudf::reduction::detail::table_histogram(
+    labeled_values, partial_counts, histogram_count_dtype, stream, mr);
+  auto out_table = cudf::detail::gather(labeled_values,
+                                        distinct_indices,
+                                        out_of_bounds_policy::DONT_CHECK,
+                                        cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                        stream,
+                                        mr);
+
+  auto out_offsets = cudf::lists::detail::reconstruct_offsets(
+    out_table->get_column(0).view(), num_groups, stream, mr);
+
+  std::vector<std::unique_ptr<column>> struct_children;
+  struct_children.emplace_back(std::move(out_table->release().back()));
+  struct_children.emplace_back(std::move(distinct_counts));
+  auto out_structs = make_structs_column(
+    static_cast<size_type>(distinct_indices.size()), std::move(struct_children), 0, {}, stream, mr);
+
+  return make_lists_column(
+    num_groups, std::move(out_offsets), std::move(out_structs), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> group_merge_histogram(column_view const& values,
+}  // namespace
+
+std::unique_ptr<column> group_histogram(column_view const& input,
+                                        cudf::device_span<size_type const> group_labels,
+                                        size_type num_groups,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return group_histogram(input, group_labels, std::nullopt, num_groups, stream, mr);
+}
+
+std::unique_ptr<column> group_merge_histogram(column_view const& input,
                                               cudf::device_span<size_type const> group_labels,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
-  CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
-               "Size of values column should be same as that of group labels");
+  CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
+  CUDF_EXPECTS(
+    input.type().id() == type_id::STRUCT && input.num_children() == 2,
+    "The input of merge_histogram aggregation must be a struct column having two children.");
+  CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
+               "The second child of the input column must be ingegral type and has no nulls.");
+
+  if (num_groups == 0) { return empty_like(input); }
+
+  auto const structs_cv   = structs_column_view{input};
+  auto const input_values = structs_cv.get_sliced_child(0, stream);
+  auto const input_counts = structs_cv.get_sliced_child(1, stream);
 
-  auto result = make_numeric_column(
-    data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
-
-  if (num_groups == 0) { return result; }
-
-  if (values.nullable()) {
-    auto values_view = column_device_view::create(values, stream);
-
-    // make_validity_iterator returns a boolean iterator that sums to 1 (1+1=1)
-    // so we need to transform it to cast it to an integer type
-    auto bitmask_iterator =
-      thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view),
-                                      [] __device__(auto b) { return static_cast<size_type>(b); });
-
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          bitmask_iterator,
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
-  } else {
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          thrust::make_constant_iterator(1),
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
-  }
-
-  return result;
+  return group_histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
 }
 
 }  // namespace cudf::groupby::detail

From ef308e8c26ad8f3baabe8a3bd8b71482e2da444e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 21:53:26 -0700
Subject: [PATCH 62/93] Update header copyright

---
 cpp/src/groupby/sort/group_reductions.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 24d631d6f51..c1d42987906 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 70e624d4d355504c26f2789ae297f72cdb0b08ff Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Mon, 18 Sep 2023 21:53:32 -0700
Subject: [PATCH 63/93] Rename function

---
 cpp/src/groupby/sort/group_histogram.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 4cb9320f418..78b1db23700 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -56,12 +56,12 @@ auto make_empty_histogram(column_view const& values)
                                   std::move(lists_children));
 }
 
-std::unique_ptr<column> group_histogram(column_view const& input,
-                                        cudf::device_span<size_type const> group_labels,
-                                        std::optional<column_view> const& partial_counts,
-                                        size_type num_groups,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> histogram(column_view const& input,
+                                  cudf::device_span<size_type const> group_labels,
+                                  std::optional<column_view> const& partial_counts,
+                                  size_type num_groups,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
   CUDF_EXPECTS(static_cast<size_t>(input.size()) == group_labels.size(),
@@ -106,7 +106,7 @@ std::unique_ptr<column> group_histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
-  return group_histogram(input, group_labels, std::nullopt, num_groups, stream, mr);
+  return histogram(input, group_labels, std::nullopt, num_groups, stream, mr);
 }
 
 std::unique_ptr<column> group_merge_histogram(column_view const& input,
@@ -128,7 +128,7 @@ std::unique_ptr<column> group_merge_histogram(column_view const& input,
   auto const input_values = structs_cv.get_sliced_child(0, stream);
   auto const input_counts = structs_cv.get_sliced_child(1, stream);
 
-  return group_histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
+  return histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
 }
 
 }  // namespace cudf::groupby::detail

From ee91b2e3d5018056528c97a5301a6fa975ca57f5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 06:35:42 -0700
Subject: [PATCH 64/93] Fix typos

---
 cpp/src/groupby/sort/group_histogram.cu | 2 +-
 cpp/src/reductions/histogram.cu         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 78b1db23700..afb763a021b 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -120,7 +120,7 @@ std::unique_ptr<column> group_merge_histogram(column_view const& input,
     input.type().id() == type_id::STRUCT && input.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
   CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
-               "The second child of the input column must be ingegral type and has no nulls.");
+               "The second child of the input column must be integral type and has no nulls.");
 
   if (num_groups == 0) { return empty_like(input); }
 
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 5f6386fedba..046068b3f0d 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -232,7 +232,7 @@ std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogr
   auto distinct_indices = rmm::device_uvector<size_type>(
     static_cast<size_type>(map.get_size()), stream, rmm::mr::get_current_device_resource());
 
-  // Store the number of occurences of each distinct row.
+  // Store the number of occurrences of each distinct row.
   auto distinct_counts = make_numeric_column(
     output_dtype, static_cast<size_type>(map.get_size()), mask_state::UNALLOCATED, stream, mr);
 
@@ -271,7 +271,7 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
     input.type().id() == type_id::STRUCT && input.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
   CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
-               "The second child of the input column must be ingegral type and has no nulls.");
+               "The second child of the input column must be integral type and has no nulls.");
 
   auto const structs_cv   = structs_column_view{input};
   auto const input_values = structs_cv.get_sliced_child(0, stream);

From 7c51faa0cb6931b2efe734fb4558a7cd80ef32ac Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 09:25:50 -0700
Subject: [PATCH 65/93] Add file

---
 cpp/tests/CMakeLists.txt              |  1 +
 cpp/tests/groupby/histogram_tests.cpp | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 cpp/tests/groupby/histogram_tests.cpp

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a69dc9bf2f8..9645f322f81 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -116,6 +116,7 @@ ConfigureTest(
   groupby/covariance_tests.cpp
   groupby/groupby_test_util.cpp
   groupby/groups_tests.cpp
+  groupby/histogram_tests.cpp
   groupby/keys_tests.cpp
   groupby/lists_tests.cpp
   groupby/m2_tests.cpp
diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
new file mode 100644
index 00000000000..761a2abacae
--- /dev/null
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/groupby/groupby_test_util.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>

From 270bcb8493271c2992295b53e31d49ea627327bf Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 10:01:09 -0700
Subject: [PATCH 66/93] Add docs

---
 cpp/src/groupby/sort/group_histogram.cu  | 43 ++++++++++++++----------
 cpp/src/reductions/histogram_helpers.hpp |  4 +--
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index afb763a021b..e042d88f837 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -56,28 +56,32 @@ auto make_empty_histogram(column_view const& values)
                                   std::move(lists_children));
 }
 
-std::unique_ptr<column> histogram(column_view const& input,
-                                  cudf::device_span<size_type const> group_labels,
-                                  std::optional<column_view> const& partial_counts,
-                                  size_type num_groups,
-                                  rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> build_histogram(column_view const& values,
+                                        cudf::device_span<size_type const> group_labels,
+                                        std::optional<column_view> const& partial_counts,
+                                        size_type num_groups,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
-  CUDF_EXPECTS(static_cast<size_t>(input.size()) == group_labels.size(),
+  CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
                "Size of values column should be same as that of group labels");
 
-  if (num_groups == 0) { return make_empty_histogram(input); }
+  if (num_groups == 0) { return make_empty_histogram(values); }
 
+  // Attach group labels to the input values.
   auto const labels_cv      = column_view{data_type{type_to_id<size_type>()},
                                      static_cast<size_type>(group_labels.size()),
                                      group_labels.data(),
                                      nullptr,
                                      0};
-  auto const labeled_values = table_view{{labels_cv, input}};
+  auto const labeled_values = table_view{{labels_cv, values}};
 
+  // Build histogram for the labeled values.
   auto [distinct_indices, distinct_counts] = cudf::reduction::detail::table_histogram(
     labeled_values, partial_counts, histogram_count_dtype, stream, mr);
+
+  // Gather the distinct rows for output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
                                         distinct_indices,
                                         out_of_bounds_policy::DONT_CHECK,
@@ -85,6 +89,8 @@ std::unique_ptr<column> histogram(column_view const& input,
                                         stream,
                                         mr);
 
+  // Build offsets for the output lists column.
+  // Each list will be a histogram corresponding to each value group.
   auto out_offsets = cudf::lists::detail::reconstruct_offsets(
     out_table->get_column(0).view(), num_groups, stream, mr);
 
@@ -100,35 +106,36 @@ std::unique_ptr<column> histogram(column_view const& input,
 
 }  // namespace
 
-std::unique_ptr<column> group_histogram(column_view const& input,
+std::unique_ptr<column> group_histogram(column_view const& values,
                                         cudf::device_span<size_type const> group_labels,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
-  return histogram(input, group_labels, std::nullopt, num_groups, stream, mr);
+  return build_histogram(values, group_labels, std::nullopt, num_groups, stream, mr);
 }
 
-std::unique_ptr<column> group_merge_histogram(column_view const& input,
+std::unique_ptr<column> group_merge_histogram(column_view const& values,
                                               cudf::device_span<size_type const> group_labels,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
+  CUDF_EXPECTS(!values.has_nulls(), "The input column must not have nulls.");
   CUDF_EXPECTS(
-    input.type().id() == type_id::STRUCT && input.num_children() == 2,
+    values.type().id() == type_id::STRUCT && values.num_children() == 2,
     "The input of merge_histogram aggregation must be a struct column having two children.");
-  CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
+  CUDF_EXPECTS(cudf::is_integral(values.child(1).type()) && !values.child(1).has_nulls(),
                "The second child of the input column must be integral type and has no nulls.");
 
-  if (num_groups == 0) { return empty_like(input); }
+  if (num_groups == 0) { return empty_like(values); }
 
-  auto const structs_cv   = structs_column_view{input};
+  // The input values column is already in histogram format (i.e., column of Struct<value, count>).
+  auto const structs_cv   = structs_column_view{values};
   auto const input_values = structs_cv.get_sliced_child(0, stream);
   auto const input_counts = structs_cv.get_sliced_child(1, stream);
 
-  return histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
+  return build_histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
 }
 
 }  // namespace cudf::groupby::detail
diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
index 2a271ec70a5..3d3d548cc00 100644
--- a/cpp/src/reductions/histogram_helpers.hpp
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -37,8 +37,8 @@ namespace cudf::reduction::detail {
  * @param output_dtype The output type to store the count value
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate memory of the returned objects
- * @return A pair of array contains the indices of the distinct rows in the input table, and their
- *         corresponding distinct counts
+ * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
+ * table, and their corresponding distinct counts
  */
 std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
   table_view const& input,

From 6447877645a69acb157fc0c56072f19f0a96acbd Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 11:01:16 -0700
Subject: [PATCH 67/93] Add empty tests

---
 cpp/tests/reductions/reduction_tests.cpp | 29 +++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index f23abf431ee..34e1011d294 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -419,7 +419,18 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
 
   auto const agg = cudf::make_histogram_aggregation<reduce_aggregation>();
 
-  // Test without nulls.
+  // Empty input.
+  {
+    auto const input    = col_data{};
+    auto const expected = [] {
+      auto child1 = col_data{};
+      auto child2 = int64_data{};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
   {
     auto const input    = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
     auto const expected = [] {
@@ -482,6 +493,22 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
 
   auto const agg = cudf::make_merge_histogram_aggregation<reduce_aggregation>();
 
+  // Empty input.
+  {
+    auto const input = [] {
+      auto child1 = col_data{};
+      auto child2 = int64_data{};
+      return structs_col{{child1, child2}};
+    }();
+    auto const expected = [] {
+      auto child1 = col_data{};
+      auto child2 = int64_data{};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
   // Test without nulls.
   {
     auto const input = [] {

From c766e4354118ffa16f1f39af0050f0d3cdc66f7d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 12:13:01 -0700
Subject: [PATCH 68/93] Implement histogram tests

---
 cpp/tests/groupby/histogram_tests.cpp | 51 +++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 761a2abacae..0bcda05667a 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -21,4 +21,55 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/lists/sorting.hpp>
+#include <cudf/sorting.hpp>
+
+template <typename T>
+struct GroupbyHistogramTest : public cudf::test::BaseFixture {};
+
+template <typename T>
+struct GroupbyMergeHistogramTest : public cudf::test::BaseFixture {};
+
+// Avoid unsigned types, as the tests below have negative values in their input.
+using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                                              cudf::test::FloatingPointTypes,
+                                              cudf::test::FixedPointTypes,
+                                              cudf::test::ChronoTypes>;
+TYPED_TEST_SUITE(GroupbyHistogramTest, HistogramTestTypes);
+TYPED_TEST_SUITE(GroupbyMergeHistogramTest, HistogramTestTypes);
+
+auto groupby_histogram(cudf::column_view const& keys,
+                       cudf::column_view const& values,
+                       std::unique_ptr<cudf::groupby_aggregation>&& agg)
+{
+  CUDF_EXPECTS(
+    agg->kind == cudf::aggregation::HISTOGRAM || agg->kind == cudf::aggregation::MERGE_HISTOGRAM,
+    "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM.");
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests[0].values = values;
+  requests[0].aggregations.push_back(std::move(agg));
+
+  auto gb_obj              = cudf::groupby::groupby(cudf::table_view({keys}));
+  auto const agg_results   = gb_obj.aggregate(requests, cudf::test::get_default_stream());
+  auto const agg_histogram = agg_results.second[0].results[0]->view();
+  EXPECT_EQ(agg_histogram.type().id(), cudf::type_id::LIST);
+  EXPECT_EQ(agg_histogram.num_children(), 2);
+
+  auto const key_sort_order = cudf::sorted_order(agg_results.first->view(), {}, {});
+  auto sorted_keys =
+    std::move(cudf::gather(agg_results.first->view(), *key_sort_order)->release().front());
+  auto const sorted_vals = std::move(
+    cudf::gather(cudf::table_view({agg_results.second[0].results[0]->view()}), *key_sort_order)
+      ->release()
+      .front());
+  auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals},
+                                                   cudf::order::ASCENDING,
+                                                   cudf::null_order::BEFORE,
+                                                   rmm::mr::get_current_device_resource());
+
+  return std::pair{std::move(sorted_keys), std::move(sorted_histograms)};
+}

From baddf18786d37bfad1b41fb0bdb7b2dc666d0ac2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 20:22:37 -0700
Subject: [PATCH 69/93] Move tests

---
 cpp/tests/reductions/reduction_tests.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 34e1011d294..9ee7527b0f5 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -380,16 +380,6 @@ TYPED_TEST(ReductionTest, SumOfSquare)
             expected_null_value);
 }
 
-template <typename T>
-struct ReductionHistogramTest : public cudf::test::BaseFixture {};
-
-// Avoid unsigned types, as the tests below have negative values in their input.
-using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
-                                              cudf::test::FloatingPointTypes,
-                                              cudf::test::FixedPointTypes,
-                                              cudf::test::ChronoTypes>;
-TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes);
-
 auto histogram_reduction(cudf::column_view const& input,
                          std::unique_ptr<cudf::reduce_aggregation> const& agg)
 {
@@ -411,6 +401,16 @@ auto histogram_reduction(cudf::column_view const& input,
   return std::move(cudf::gather(cudf::table_view{{result_col}}, *sort_order)->release().front());
 }
 
+template <typename T>
+struct ReductionHistogramTest : public cudf::test::BaseFixture {};
+
+// Avoid unsigned types, as the tests below have negative values in their input.
+using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                                              cudf::test::FloatingPointTypes,
+                                              cudf::test::FixedPointTypes,
+                                              cudf::test::ChronoTypes>;
+TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes);
+
 TYPED_TEST(ReductionHistogramTest, Histogram)
 {
   using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam, int>;

From 0afad9cc94d272c6a85d52ad8f181f40162e791e Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 20:24:15 -0700
Subject: [PATCH 70/93] Rename alias

---
 cpp/tests/reductions/reduction_tests.cpp | 82 ++++++++++++------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 9ee7527b0f5..74bd39e11ea 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -413,18 +413,18 @@ TYPED_TEST_SUITE(ReductionHistogramTest, HistogramTestTypes);
 
 TYPED_TEST(ReductionHistogramTest, Histogram)
 {
-  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
-  using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
+  using data_col    = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+  using int64_col   = cudf::test::fixed_width_column_wrapper<int64_t>;
   using structs_col = cudf::test::structs_column_wrapper;
 
   auto const agg = cudf::make_histogram_aggregation<reduce_aggregation>();
 
   // Empty input.
   {
-    auto const input    = col_data{};
+    auto const input    = data_col{};
     auto const expected = [] {
-      auto child1 = col_data{};
-      auto child2 = int64_data{};
+      auto child1 = data_col{};
+      auto child2 = int64_col{};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -432,10 +432,10 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
   }
 
   {
-    auto const input    = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+    auto const input    = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
     auto const expected = [] {
-      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
-      auto child2 = int64_data{2, 1, 1, 2, 4, 1};
+      auto child1 = data_col{-3, -2, 0, 1, 2, 5};
+      auto child2 = int64_col{2, 1, 1, 2, 4, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -444,11 +444,11 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
 
   // Test without nulls, sliced input.
   {
-    auto const input_original = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+    auto const input_original = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
     auto const input          = cudf::slice(input_original, {0, 7})[0];
     auto const expected       = [] {
-      auto child1 = col_data{-3, 0, 1, 2, 5};
-      auto child2 = int64_data{1, 1, 1, 3, 1};
+      auto child1 = data_col{-3, 0, 1, 2, 5};
+      auto child2 = int64_col{1, 1, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -459,11 +459,11 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
   using namespace cudf::test::iterators;
   auto constexpr null{0};
   {
-    auto const input    = col_data{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1},
+    auto const input    = data_col{{null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1},
                                 nulls_at({0, 6, 9, 12})};
     auto const expected = [] {
-      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64_data{4, 2, 1, 1, 2, 4, 1};
+      auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_col{4, 2, 1, 1, 2, 4, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -472,12 +472,12 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
 
   // Test with nulls, sliced input.
   {
-    auto const input_original = col_data{
+    auto const input_original = data_col{
       {null, -3, 2, 1, 2, 0, null, 5, 2, null, -3, -2, null, 2, 1}, nulls_at({0, 6, 9, 12})};
     auto const input    = cudf::slice(input_original, {0, 9})[0];
     auto const expected = [] {
-      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64_data{2, 1, 1, 1, 3, 1};
+      auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_col{2, 1, 1, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -487,8 +487,8 @@ TYPED_TEST(ReductionHistogramTest, Histogram)
 
 TYPED_TEST(ReductionHistogramTest, MergeHistogram)
 {
-  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
-  using int64_data  = cudf::test::fixed_width_column_wrapper<int64_t>;
+  using data_col    = cudf::test::fixed_width_column_wrapper<TypeParam>;
+  using int64_col   = cudf::test::fixed_width_column_wrapper<int64_t>;
   using structs_col = cudf::test::structs_column_wrapper;
 
   auto const agg = cudf::make_merge_histogram_aggregation<reduce_aggregation>();
@@ -496,13 +496,13 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   // Empty input.
   {
     auto const input = [] {
-      auto child1 = col_data{};
-      auto child2 = int64_data{};
+      auto child1 = data_col{};
+      auto child2 = int64_col{};
       return structs_col{{child1, child2}};
     }();
     auto const expected = [] {
-      auto child1 = col_data{};
-      auto child2 = int64_data{};
+      auto child1 = data_col{};
+      auto child2 = int64_col{};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -512,14 +512,14 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   // Test without nulls.
   {
     auto const input = [] {
-      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
-      auto child2 = int64_data{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
       return structs_col{{child1, child2}};
     }();
 
     auto const expected = [] {
-      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
-      auto child2 = int64_data{5, 5, 4, 5, 8, 1};
+      auto child1 = data_col{-3, -2, 0, 1, 2, 5};
+      auto child2 = int64_col{5, 5, 4, 5, 8, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -529,15 +529,15 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   // Test without nulls, sliced input.
   {
     auto const input_original = [] {
-      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
-      auto child2 = int64_data{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      auto child1 = data_col{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
       return structs_col{{child1, child2}};
     }();
     auto const input = cudf::slice(input_original, {0, 7})[0];
 
     auto const expected = [] {
-      auto child1 = col_data{-3, 0, 1, 2, 5};
-      auto child2 = int64_data{2, 4, 1, 5, 1};
+      auto child1 = data_col{-3, 0, 1, 2, 5};
+      auto child2 = int64_col{2, 4, 1, 5, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -549,15 +549,15 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   auto constexpr null{0};
   {
     auto const input = [] {
-      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+      auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
                              nulls_at({2, 5, 8, 11, 15})};
-      auto child2 = int64_data{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
       return structs_col{{child1, child2}};
     }();
 
     auto const expected = [] {
-      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64_data{67, 5, 5, 4, 5, 8, 1};
+      auto child1 = data_col{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_col{67, 5, 5, 4, 5, 8, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -567,16 +567,16 @@ TYPED_TEST(ReductionHistogramTest, MergeHistogram)
   // Test with nulls, sliced input.
   {
     auto const input_original = [] {
-      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+      auto child1 = data_col{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
                              nulls_at({2, 5, 8, 11, 15})};
-      auto child2 = int64_data{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      auto child2 = int64_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
       return structs_col{{child1, child2}};
     }();
     auto const input = cudf::slice(input_original, {0, 9})[0];
 
     auto const expected = [] {
-      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64_data{33, 2, 4, 1, 3, 1};
+      auto child1 = data_col{{null, -3, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64_col{33, 2, 4, 1, 3, 1};
       return structs_col{{child1, child2}};
     }();
     auto const result = histogram_reduction(input, agg);
@@ -1089,10 +1089,10 @@ TEST_F(ReductionEmptyTest, empty_column)
   // test if null count is equal or greater than size of input
   // expect result.is_valid() is false
   int col_size = 5;
-  std::vector<T> col_data(col_size);
+  std::vector<T> data_col(col_size);
   std::vector<bool> valids(col_size, 0);
 
-  cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(col_data, valids);
+  cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(data_col, valids);
   CUDF_EXPECT_NO_THROW(statement(col_nulls));
 
   auto any_agg   = cudf::make_any_aggregation<cudf::reduce_aggregation>();

From c05e5956d74c9fa546f75cf774cb060015f939ed Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 21:47:54 -0700
Subject: [PATCH 71/93] Add target types

---
 cpp/include/cudf/detail/aggregation/aggregation.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 930ec992384..784f05a964e 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1192,6 +1192,12 @@ struct target_type_impl<Source, aggregation::COUNT_ALL> {
   using type = size_type;
 };
 
+// Use list for HISTOGRAM
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::HISTOGRAM> {
+  using type = list_view;
+};
+
 // Computing ANY of any type, use bool accumulator
 template <typename Source>
 struct target_type_impl<Source, aggregation::ANY> {
@@ -1370,6 +1376,12 @@ struct target_type_impl<SourceType, aggregation::MERGE_M2> {
   using type = struct_view;
 };
 
+// Use list for MERGE_HISTOGRAM
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::MERGE_HISTOGRAM> {
+  using type = list_view;
+};
+
 // Always use double for COVARIANCE
 template <typename SourceType>
 struct target_type_impl<SourceType, aggregation::COVARIANCE> {

From 86530538a55d6219ea3e8c5c18b8fa4a969f16a1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 21:54:29 -0700
Subject: [PATCH 72/93] Add empty return

---
 cpp/src/groupby/groupby.cu | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index ce1fc71968f..ec200b52bbd 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -110,6 +110,20 @@ struct empty_column_constructor {
         0, make_empty_column(type_to_id<size_type>()), empty_like(values), 0, {});
     }
 
+    if constexpr (k == aggregation::Kind::HISTOGRAM || k == aggregation::Kind::MERGE_HISTOGRAM) {
+        std::vector<std::unique_ptr<column>> struct_children;
+        struct_children.emplace_back(empty_like(values));
+        struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0));
+        auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
+                                                0,
+                                                rmm::device_buffer{},
+                                                rmm::device_buffer{},
+                                                0,
+                                                std::move(struct_children));
+      return make_lists_column(
+        0, make_empty_column(type_to_id<size_type>()), std::move(structs), 0, {});
+    }
+
     if constexpr (k == aggregation::Kind::RANK) {
       auto const& rank_agg = dynamic_cast<cudf::detail::rank_aggregation const&>(agg);
       if (rank_agg._method == cudf::rank_method::AVERAGE or

From 8d6fdfee9255a79245db4d3fb05ed0f689c5eb3a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Tue, 19 Sep 2023 21:55:19 -0700
Subject: [PATCH 73/93] Build empty histogram with `make_lists_column`

---
 cpp/src/groupby/sort/group_histogram.cu | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index e042d88f837..63e6952c3b5 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -44,16 +44,8 @@ auto make_empty_histogram(column_view const& values)
                                           rmm::device_buffer{},
                                           0,
                                           std::move(struct_children));
-
-  std::vector<std::unique_ptr<column>> lists_children;
-  lists_children.emplace_back(make_numeric_column(data_type{type_to_id<size_type>()}, 0));
-  lists_children.emplace_back(std::move(structs));
-  return std::make_unique<column>(cudf::data_type{type_id::LIST},
-                                  0,
-                                  rmm::device_buffer{},
-                                  rmm::device_buffer{},
-                                  0,
-                                  std::move(lists_children));
+  return make_lists_column(
+    0, make_empty_column(type_to_id<size_type>()), std::move(structs), 0, {});
 }
 
 std::unique_ptr<column> build_histogram(column_view const& values,

From d1fbda44fe501f5e2106db23491620dbb2d49da1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 09:55:39 -0700
Subject: [PATCH 74/93] Add more assertions

---
 cpp/tests/reductions/reduction_tests.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 74bd39e11ea..ed85da9e50b 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -393,12 +393,14 @@ auto histogram_reduction(cudf::column_view const& input,
   auto const result_list_scalar = dynamic_cast<cudf::list_scalar*>(result_scalar.get());
   EXPECT_NE(result_list_scalar, nullptr);
 
-  auto const result_col = result_list_scalar->view();
-  EXPECT_EQ(result_col.num_children(), 2);
+  auto const histogram = result_list_scalar->view();
+  EXPECT_EQ(histogram.num_children(), 2);
+  EXPECT_EQ(histogram.null_count(), 0);
+  EXPECT_EQ(histogram.child(1).null_count(), 0);
 
   // Sort the histogram based on the first column (unique input values).
-  auto const sort_order = cudf::sorted_order(cudf::table_view{{result_col.child(0)}}, {}, {});
-  return std::move(cudf::gather(cudf::table_view{{result_col}}, *sort_order)->release().front());
+  auto const sort_order = cudf::sorted_order(cudf::table_view{{histogram.child(0)}}, {}, {});
+  return std::move(cudf::gather(cudf::table_view{{histogram}}, *sort_order)->release().front());
 }
 
 template <typename T>

From 199d97b0019b6cf2ee5a6a51beb7b10db39821b5 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 10:09:02 -0700
Subject: [PATCH 75/93] Implement unit tests for groupby histogram

---
 cpp/tests/groupby/histogram_tests.cpp | 291 ++++++++++++++++++++++++--
 1 file changed, 269 insertions(+), 22 deletions(-)

diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 0bcda05667a..3ea0c07b68a 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -26,46 +26,45 @@
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
 
-template <typename T>
-struct GroupbyHistogramTest : public cudf::test::BaseFixture {};
-
-template <typename T>
-struct GroupbyMergeHistogramTest : public cudf::test::BaseFixture {};
-
-// Avoid unsigned types, as the tests below have negative values in their input.
-using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
-                                              cudf::test::FloatingPointTypes,
-                                              cudf::test::FixedPointTypes,
-                                              cudf::test::ChronoTypes>;
-TYPED_TEST_SUITE(GroupbyHistogramTest, HistogramTestTypes);
-TYPED_TEST_SUITE(GroupbyMergeHistogramTest, HistogramTestTypes);
+using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
+using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
+using structs_col = cudf::test::structs_column_wrapper;
 
 auto groupby_histogram(cudf::column_view const& keys,
                        cudf::column_view const& values,
-                       std::unique_ptr<cudf::groupby_aggregation>&& agg)
+                       cudf::aggregation::Kind agg_kind)
 {
   CUDF_EXPECTS(
-    agg->kind == cudf::aggregation::HISTOGRAM || agg->kind == cudf::aggregation::MERGE_HISTOGRAM,
+    agg_kind == cudf::aggregation::HISTOGRAM || agg_kind == cudf::aggregation::MERGE_HISTOGRAM,
     "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM.");
 
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back(cudf::groupby::aggregation_request());
   requests[0].values = values;
-  requests[0].aggregations.push_back(std::move(agg));
+  if (agg_kind == cudf::aggregation::HISTOGRAM) {
+    requests[0].aggregations.push_back(
+      cudf::make_histogram_aggregation<cudf::groupby_aggregation>());
+  } else {
+    requests[0].aggregations.push_back(
+      cudf::make_merge_histogram_aggregation<cudf::groupby_aggregation>());
+  }
 
   auto gb_obj              = cudf::groupby::groupby(cudf::table_view({keys}));
   auto const agg_results   = gb_obj.aggregate(requests, cudf::test::get_default_stream());
   auto const agg_histogram = agg_results.second[0].results[0]->view();
-  EXPECT_NE(agg_histogram.type().id(), cudf::type_id::LIST);
-  EXPECT_EQ(agg_histogram.num_children(), 2);
+  EXPECT_EQ(agg_histogram.type().id(), cudf::type_id::LIST);
+  EXPECT_EQ(agg_histogram.null_count(), 0);
+
+  auto const histograms = cudf::lists_column_view{agg_histogram}.child();
+  EXPECT_EQ(histograms.num_children(), 2);
+  EXPECT_EQ(histograms.null_count(), 0);
+  EXPECT_EQ(histograms.child(1).null_count(), 0);
 
   auto const key_sort_order = cudf::sorted_order(agg_results.first->view(), {}, {});
   auto sorted_keys =
     std::move(cudf::gather(agg_results.first->view(), *key_sort_order)->release().front());
-  auto const sorted_vals = std::move(
-    cudf::gather(cudf::table_view({agg_results.second[0].results[0]->view()}), *key_sort_order)
-      ->release()
-      .front());
+  auto const sorted_vals =
+    std::move(cudf::gather(cudf::table_view{{agg_histogram}}, *key_sort_order)->release().front());
   auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals},
                                                    cudf::order::ASCENDING,
                                                    cudf::null_order::BEFORE,
@@ -73,3 +72,251 @@ auto groupby_histogram(cudf::column_view const& keys,
 
   return std::pair{std::move(sorted_keys), std::move(sorted_histograms)};
 }
+
+template <typename T>
+struct GroupbyHistogramTest : public cudf::test::BaseFixture {};
+
+template <typename T>
+struct GroupbyMergeHistogramTest : public cudf::test::BaseFixture {};
+
+// Avoid unsigned types, as the tests below have negative values in their input.
+using HistogramTestTypes = cudf::test::Concat<cudf::test::Types<int8_t, int16_t, int32_t, int64_t>,
+                                              cudf::test::FloatingPointTypes,
+                                              cudf::test::FixedPointTypes,
+                                              cudf::test::ChronoTypes>;
+TYPED_TEST_SUITE(GroupbyHistogramTest, HistogramTestTypes);
+TYPED_TEST_SUITE(GroupbyMergeHistogramTest, HistogramTestTypes);
+
+TYPED_TEST(GroupbyHistogramTest, EmptyInput)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+
+  auto const keys   = int32s_col{};
+  auto const values = col_data{};
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
+
+  // The structure of the output is already verified in the function `groupby_histogram`.
+  ASSERT_EQ(res_histogram->size(), 0);
+}
+
+TYPED_TEST(GroupbyHistogramTest, SimpleInputNoNull)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+
+  // key = 0: values = [2, 2, -3, -2, 2]
+  // key = 1: values = [2, 0, 5, 2, 1]
+  // key = 2: values = [-3, 1, 1, 2, 2]
+  auto const keys               = int32s_col{2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2};
+  auto const values             = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2};
+  auto const expected_keys      = int32s_col{0, 1, 2};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2};
+      auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2};
+      return structs_col{{values, counts}};
+    }();
+    return cudf::make_lists_column(
+      3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+TYPED_TEST(GroupbyHistogramTest, SlicedInputNoNull)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+
+  auto const keys_original = int32s_col{2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2};
+  auto const values_original =
+    col_data{1, 2, 0, 2, 1, -3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2};
+
+  // key = 0: values = [2, 2, -3, -2, 2]
+  // key = 1: values = [2, 0, 5, 2, 1]
+  // key = 2: values = [-3, 1, 1, 2, 2]
+  auto const keys   = cudf::slice(keys_original, {5, 20})[0];
+  auto const values = cudf::slice(values_original, {5, 20})[0];
+
+  auto const expected_keys      = int32s_col{0, 1, 2};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{-3, -2, 2, 0, 1, 2, 5, -3, 1, 2};
+      auto counts = int64s_col{1, 1, 3, 1, 1, 2, 1, 1, 2, 2};
+      return structs_col{{values, counts}};
+    }();
+    return cudf::make_lists_column(
+      3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+TYPED_TEST(GroupbyHistogramTest, InputWithNulls)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
+
+  // key = 0: values = [-3, null, 2, null, 2]
+  // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1]
+  // key = 2: values = [null, 2, 0, -2, 2, null, 2]
+  auto const keys = int32s_col{2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2};
+  auto const values =
+    col_data{{null, -3, 2, 1, 2, null, 0, 5, 2, null, -3, -2, 2, null, 1, 2, null, 1, 2, 2},
+             nulls_at({0, 5, 9, 13, 16})};
+  auto const expected_keys      = int32s_col{0, 1, 2};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})};
+      auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3};
+      return structs_col{{values, counts}};
+    }();
+    return cudf::make_lists_column(
+      3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+TYPED_TEST(GroupbyHistogramTest, SlicedInputWithNulls)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
+
+  auto const keys_original =
+    int32s_col{1, 0, 2, 2, 0, 2, 0, 2, 1, 1, 1, 2, 1, 1, 0, 1, 2, 0, 0, 1, 2, 2, 1, 0, 2, 0, 1, 2};
+  auto const values_original =
+    col_data{{null, 1,  1,  2, 1,    null, -3, 2,    1, 2, null, 0,    5, 2,
+              null, -3, -2, 2, null, 1,    2,  null, 1, 2, 2,    null, 1, 2},
+             nulls_at({0, 5, 10, 14, 18, 21, 25})};
+
+  // key = 0: values = [-3, null, 2, null, 2]
+  // key = 1: values = [1, 2, null, 5, 2, -3, 1, 1]
+  // key = 2: values = [null, 2, 0, -2, 2, null, 2]
+  auto const keys   = cudf::slice(keys_original, {5, 25})[0];
+  auto const values = cudf::slice(values_original, {5, 25})[0];
+
+  auto const expected_keys      = int32s_col{0, 1, 2};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{{null, -3, 2, null, -3, 1, 2, 5, null, -2, 0, 2}, nulls_at({0, 3, 8})};
+      auto counts = int64s_col{2, 1, 2, 1, 1, 3, 2, 1, 2, 1, 1, 3};
+      return structs_col{{values, counts}};
+    }();
+    return cudf::make_lists_column(
+      3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+#if 0
+TYPED_TEST(GroupbyMergeHistogramTest, MergeHistogram)
+{
+  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
+  using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
+  using structs_col = cudf::test::structs_column_wrapper;
+
+  auto const agg = cudf::make_merge_histogram_aggregation<reduce_aggregation>();
+
+  // Empty input.
+  {
+    auto const input = [] {
+      auto child1 = col_data{};
+      auto child2 = int64s_col{};
+      return structs_col{{child1, child2}};
+    }();
+    auto const expected = [] {
+      auto child1 = col_data{};
+      auto child2 = int64s_col{};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test without nulls.
+  {
+    auto const input = [] {
+      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64s_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      return structs_col{{child1, child2}};
+    }();
+
+    auto const expected = [] {
+      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
+      auto child2 = int64s_col{5, 5, 4, 5, 8, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test without nulls, sliced input.
+  {
+    auto const input_original = [] {
+      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
+      auto child2 = int64s_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
+      return structs_col{{child1, child2}};
+    }();
+    auto const input = cudf::slice(input_original, {0, 7})[0];
+
+    auto const expected = [] {
+      auto child1 = col_data{-3, 0, 1, 2, 5};
+      auto child2 = int64s_col{2, 4, 1, 5, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with nulls.
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
+  {
+    auto const input = [] {
+      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+                             nulls_at({2, 5, 8, 11, 15})};
+      auto child2 = int64s_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      return structs_col{{child1, child2}};
+    }();
+
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64s_col{67, 5, 5, 4, 5, 8, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with nulls, sliced input.
+  {
+    auto const input_original = [] {
+      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
+                             nulls_at({2, 5, 8, 11, 15})};
+      auto child2 = int64s_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
+      return structs_col{{child1, child2}};
+    }();
+    auto const input = cudf::slice(input_original, {0, 9})[0];
+
+    auto const expected = [] {
+      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
+      auto child2 = int64s_col{33, 2, 4, 1, 3, 1};
+      return structs_col{{child1, child2}};
+    }();
+    auto const result = histogram_reduction(input, agg);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+}
+
+#endif

From 4b0983e810c1427868100be532cd8cb32b5b0d66 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 11:34:59 -0700
Subject: [PATCH 76/93] Reimplement merge histogram

---
 cpp/src/groupby/sort/group_histogram.cu   | 44 ++++++++++++++++++-----
 cpp/src/groupby/sort/group_reductions.hpp |  4 +--
 cpp/src/reductions/histogram.cu           |  5 ++-
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 63e6952c3b5..88724c54112 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -20,12 +20,15 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/gather.hpp>
+#include <cudf/detail/labeling/label_segments.cuh>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
 
+#include <thrust/gather.h>
+
 namespace cudf::groupby::detail {
 
 // Fixed type for counting frequencies in historam.
@@ -55,9 +58,9 @@ std::unique_ptr<column> build_histogram(column_view const& values,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
+  CUDF_EXPECTS(num_groups >= 0, "Number of groups cannot be negative.");
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
-               "Size of values column should be same as that of group labels");
+               "Size of values column should be same as that of group labels.");
 
   if (num_groups == 0) { return make_empty_histogram(values); }
 
@@ -108,26 +111,49 @@ std::unique_ptr<column> group_histogram(column_view const& values,
 }
 
 std::unique_ptr<column> group_merge_histogram(column_view const& values,
-                                              cudf::device_span<size_type const> group_labels,
+                                              cudf::device_span<size_type const> group_offsets,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  // The input must be a lists column without nulls.
   CUDF_EXPECTS(!values.has_nulls(), "The input column must not have nulls.");
+  CUDF_EXPECTS(values.type().id() == type_id::LIST,
+               "The input of MERGE_HISTOGRAM aggregation must be a lists column.");
+
+  // Child of the input lists column must be a structs column without nulls,
+  // and its second child is a count columns of integer type having no nulls.
+  auto const lists_cv     = lists_column_view{values};
+  auto const histogram_cv = lists_cv.get_sliced_child(stream);
+  CUDF_EXPECTS(!histogram_cv.has_nulls(), "Child of the input lists column must not have nulls.");
+  CUDF_EXPECTS(histogram_cv.type().id() == type_id::STRUCT && histogram_cv.num_children() == 2,
+               "The input column has invalid histograms structure.");
   CUDF_EXPECTS(
-    values.type().id() == type_id::STRUCT && values.num_children() == 2,
-    "The input of merge_histogram aggregation must be a struct column having two children.");
-  CUDF_EXPECTS(cudf::is_integral(values.child(1).type()) && !values.child(1).has_nulls(),
-               "The second child of the input column must be integral type and has no nulls.");
+    cudf::is_integral(histogram_cv.child(1).type()) && !histogram_cv.child(1).has_nulls(),
+    "The input column has invalid histograms structure.");
 
   if (num_groups == 0) { return empty_like(values); }
 
+  // Firstly concatenate the histograms corresponding to the same key values.
+  // That is equivalent to creating a new lists column (view) from the input lists column
+  // with new offsets as below.
+  auto new_offsets = rmm::device_uvector<size_type>(num_groups + 1, stream);
+  thrust::gather(rmm::exec_policy(stream),
+                 group_offsets.begin(),
+                 group_offsets.end(),
+                 lists_cv.offsets_begin(),
+                 new_offsets.begin());
+
+  auto key_labels = rmm::device_uvector<size_type>(histogram_cv.size(), stream);
+  cudf::detail::label_segments(
+    new_offsets.begin(), new_offsets.end(), key_labels.begin(), key_labels.end(), stream);
+
   // The input values column is already in histogram format (i.e., column of Struct<value, count>).
-  auto const structs_cv   = structs_column_view{values};
+  auto const structs_cv   = structs_column_view{histogram_cv};
   auto const input_values = structs_cv.get_sliced_child(0, stream);
   auto const input_counts = structs_cv.get_sliced_child(1, stream);
 
-  return build_histogram(input_values, group_labels, input_counts, num_groups, stream, mr);
+  return build_histogram(input_values, key_labels, input_counts, num_groups, stream, mr);
 }
 
 }  // namespace cudf::groupby::detail
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index c1d42987906..52ffb9fd9da 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -485,13 +485,13 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
  * @endcode
  *
  * @param values Grouped values to get valid count of
- * @param group_labels ID of group that the corresponding value belongs to
+ * @param group_offsets Offsets of groups' starting points within @p values.
  * @param num_groups Number of groups ( unique values in @p group_labels )
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  */
 std::unique_ptr<column> group_merge_histogram(column_view const& values,
-                                              cudf::device_span<size_type const> group_labels,
+                                              cudf::device_span<size_type const> group_offsets,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr);
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 046068b3f0d..67aea83eee5 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -267,9 +267,8 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
-  CUDF_EXPECTS(
-    input.type().id() == type_id::STRUCT && input.num_children() == 2,
-    "The input of merge_histogram aggregation must be a struct column having two children.");
+  CUDF_EXPECTS(input.type().id() == type_id::STRUCT && input.num_children() == 2,
+               "The input must be a structs column having two children.");
   CUDF_EXPECTS(cudf::is_integral(input.child(1).type()) && !input.child(1).has_nulls(),
                "The second child of the input column must be integral type and has no nulls.");
 

From 0a8a03dd313846880d09aee822f98d42563d8d99 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 13:40:03 -0700
Subject: [PATCH 77/93] Implement unit tests for merge histogram

---
 cpp/tests/groupby/histogram_tests.cpp | 237 +++++++++++++++++---------
 1 file changed, 152 insertions(+), 85 deletions(-)

diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 3ea0c07b68a..fb3faebb951 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -39,7 +39,7 @@ auto groupby_histogram(cudf::column_view const& keys,
     "Aggregation must be either HISTOGRAM or MERGE_HISTOGRAM.");
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = values;
   if (agg_kind == cudf::aggregation::HISTOGRAM) {
     requests[0].aggregations.push_back(
@@ -107,8 +107,9 @@ TYPED_TEST(GroupbyHistogramTest, SimpleInputNoNull)
   // key = 0: values = [2, 2, -3, -2, 2]
   // key = 1: values = [2, 0, 5, 2, 1]
   // key = 2: values = [-3, 1, 1, 2, 2]
-  auto const keys               = int32s_col{2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2};
-  auto const values             = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2};
+  auto const keys   = int32s_col{2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2};
+  auto const values = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2};
+
   auto const expected_keys      = int32s_col{0, 1, 2};
   auto const expected_histogram = [] {
     auto structs = [] {
@@ -119,6 +120,7 @@ TYPED_TEST(GroupbyHistogramTest, SimpleInputNoNull)
     return cudf::make_lists_column(
       3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{});
   }();
+
   auto const [res_keys, res_histogram] =
     groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
@@ -132,7 +134,6 @@ TYPED_TEST(GroupbyHistogramTest, SlicedInputNoNull)
   auto const keys_original = int32s_col{2, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 2};
   auto const values_original =
     col_data{1, 2, 0, 2, 1, -3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1, 2, 1, 2, 2};
-
   // key = 0: values = [2, 2, -3, -2, 2]
   // key = 1: values = [2, 0, 5, 2, 1]
   // key = 2: values = [-3, 1, 1, 2, 2]
@@ -149,6 +150,7 @@ TYPED_TEST(GroupbyHistogramTest, SlicedInputNoNull)
     return cudf::make_lists_column(
       3, int32s_col{0, 3, 7, 10}.release(), structs.release(), 0, rmm::device_buffer{});
   }();
+
   auto const [res_keys, res_histogram] =
     groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
@@ -168,6 +170,7 @@ TYPED_TEST(GroupbyHistogramTest, InputWithNulls)
   auto const values =
     col_data{{null, -3, 2, 1, 2, null, 0, 5, 2, null, -3, -2, 2, null, 1, 2, null, 1, 2, 2},
              nulls_at({0, 5, 9, 13, 16})};
+
   auto const expected_keys      = int32s_col{0, 1, 2};
   auto const expected_histogram = [] {
     auto structs = [] {
@@ -178,6 +181,7 @@ TYPED_TEST(GroupbyHistogramTest, InputWithNulls)
     return cudf::make_lists_column(
       3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{});
   }();
+
   auto const [res_keys, res_histogram] =
     groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
@@ -213,110 +217,173 @@ TYPED_TEST(GroupbyHistogramTest, SlicedInputWithNulls)
     return cudf::make_lists_column(
       3, int32s_col{0, 3, 8, 12}.release(), structs.release(), 0, rmm::device_buffer{});
   }();
+
   auto const [res_keys, res_histogram] =
     groupby_histogram(keys, values, cudf::aggregation::HISTOGRAM);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
 }
 
-#if 0
-TYPED_TEST(GroupbyMergeHistogramTest, MergeHistogram)
+TYPED_TEST(GroupbyMergeHistogramTest, EmptyInput)
 {
-  using col_data    = cudf::test::fixed_width_column_wrapper<TypeParam>;
-  using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
-  using structs_col = cudf::test::structs_column_wrapper;
-
-  auto const agg = cudf::make_merge_histogram_aggregation<reduce_aggregation>();
-
-  // Empty input.
-  {
-    auto const input = [] {
-      auto child1 = col_data{};
-      auto child2 = int64s_col{};
-      return structs_col{{child1, child2}};
-    }();
-    auto const expected = [] {
-      auto child1 = col_data{};
-      auto child2 = int64s_col{};
-      return structs_col{{child1, child2}};
-    }();
-    auto const result = histogram_reduction(input, agg);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
-  }
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
 
-  // Test without nulls.
-  {
-    auto const input = [] {
-      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
-      auto child2 = int64s_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
-      return structs_col{{child1, child2}};
+  auto const keys   = int32s_col{};
+  auto const values = col_data{};
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM);
+
+  // The structure of the output is already verified in the function `groupby_histogram`.
+  ASSERT_EQ(res_histogram->size(), 0);
+}
+
+TYPED_TEST(GroupbyMergeHistogramTest, SimpleInputNoNull)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+
+  // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]]
+  // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]]
+  auto const keys   = int32s_col{0, 1, 0, 1, 0};
+  auto const values = [] {
+    auto structs = [] {
+      auto values = col_data{-3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1};
+      auto counts = int64s_col{1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2};
+      return structs_col{{values, counts}};
     }();
+    return cudf::make_lists_column(
+      5, int32s_col{0, 3, 6, 8, 11, 14}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
 
-    auto const expected = [] {
-      auto child1 = col_data{-3, -2, 0, 1, 2, 5};
-      auto child2 = int64s_col{5, 5, 4, 5, 8, 1};
-      return structs_col{{child1, child2}};
+  auto const expected_keys      = int32s_col{0, 1};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2};
+      auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4};
+      return structs_col{{values, counts}};
     }();
-    auto const result = histogram_reduction(input, agg);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
-  }
+    return cudf::make_lists_column(
+      2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
 
-  // Test without nulls, sliced input.
-  {
-    auto const input_original = [] {
-      auto child1 = col_data{-3, 2, 1, 2, 0, 5, 2, -3, -2, 2, 1};
-      auto child2 = int64s_col{2, 1, 1, 2, 4, 1, 2, 3, 5, 3, 4};
-      return structs_col{{child1, child2}};
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputNoNull)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+
+  // key = 0: histograms = [[<-3, 1>, <-2, 1>, <2, 3>], [<0, 1>, <1, 1>], [<-3, 3>, <0, 1>, <1, 2>]]
+  // key = 1: histograms = [[<-2, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]]
+  auto const keys_original   = int32s_col{0, 1, 0, 1, 0, 1, 0};
+  auto const values_original = [] {
+    auto structs = [] {
+      auto values = col_data{0, 2, -3, 1, -3, -2, 2, -2, 1, 2, 0, 1, 0, 1, 2, -3, 0, 1};
+      auto counts = int64s_col{1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 1, 1, 2, 1, 2, 3, 1, 2};
+      return structs_col{{values, counts}};
     }();
-    auto const input = cudf::slice(input_original, {0, 7})[0];
+    return cudf::make_lists_column(7,
+                                   int32s_col{0, 2, 4, 7, 10, 12, 15, 18}.release(),
+                                   structs.release(),
+                                   0,
+                                   rmm::device_buffer{});
+  }();
+  auto const keys   = cudf::slice(keys_original, {2, 7})[0];
+  auto const values = cudf::slice(*values_original, {2, 7})[0];
 
-    auto const expected = [] {
-      auto child1 = col_data{-3, 0, 1, 2, 5};
-      auto child2 = int64s_col{2, 4, 1, 5, 1};
-      return structs_col{{child1, child2}};
+  auto const expected_keys      = int32s_col{0, 1};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{-3, -2, 0, 1, 2, -2, 0, 1, 2};
+      auto counts = int64s_col{4, 1, 2, 3, 3, 1, 2, 4, 4};
+      return structs_col{{values, counts}};
     }();
-    auto const result = histogram_reduction(input, agg);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
-  }
+    return cudf::make_lists_column(
+      2, int32s_col{0, 5, 9}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
+
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
 
-  // Test with nulls.
+TYPED_TEST(GroupbyMergeHistogramTest, InputWithNulls)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
   using namespace cudf::test::iterators;
   auto constexpr null{0};
-  {
-    auto const input = [] {
-      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
-                             nulls_at({2, 5, 8, 11, 15})};
-      auto child2 = int64s_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
-      return structs_col{{child1, child2}};
+
+  // key = 0: histograms = [[<null, 1>, <2, 3>], [<null, 2>, <1, 1>], [<0, 1>, <1, 2>]]
+  // key = 1: histograms = [[<null, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]]
+  auto const keys   = int32s_col{0, 1, 1, 0, 0};
+  auto const values = [] {
+    auto structs = [] {
+      auto values = col_data{{null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1}, nulls_at({0, 2, 8})};
+      auto counts = int64s_col{1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2};
+      return structs_col{{values, counts}};
     }();
+    return cudf::make_lists_column(
+      5, int32s_col{0, 2, 5, 8, 10, 12}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
 
-    auto const expected = [] {
-      auto child1 = col_data{{null, -3, -2, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64s_col{67, 5, 5, 4, 5, 8, 1};
-      return structs_col{{child1, child2}};
+  auto const expected_keys      = int32s_col{0, 1};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})};
+      auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4};
+      return structs_col{{values, counts}};
     }();
-    auto const result = histogram_reduction(input, agg);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
-  }
+    return cudf::make_lists_column(
+      2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
 
-  // Test with nulls, sliced input.
-  {
-    auto const input_original = [] {
-      auto child1 = col_data{{-3, 2, null, 1, 2, null, 0, 5, null, 2, -3, null, -2, 2, 1, null},
-                             nulls_at({2, 5, 8, 11, 15})};
-      auto child2 = int64s_col{2, 1, 12, 1, 2, 11, 4, 1, 10, 2, 3, 15, 5, 3, 4, 19};
-      return structs_col{{child1, child2}};
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}
+
+TYPED_TEST(GroupbyMergeHistogramTest, SlicedInputWithNulls)
+{
+  using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
+  using namespace cudf::test::iterators;
+  auto constexpr null{0};
+
+  // key = 0: histograms = [[<null, 1>, <2, 3>], [<null, 2>, <1, 1>], [<0, 1>, <1, 2>]]
+  // key = 1: histograms = [[<null, 1>, <1, 3>, <2, 2>], [<0, 2>, <1, 1>, <2, 2>]]
+  auto const keys_original   = int32s_col{0, 1, 0, 1, 1, 0, 0};
+  auto const values_original = [] {
+    auto structs = [] {
+      auto values = col_data{{null, 2, null, 1, null, 2, null, 1, 2, 0, 1, 2, null, 1, 0, 1},
+                             nulls_at({0, 2, 4, 6, 12})};
+      auto counts = int64s_col{1, 3, 2, 1, 1, 3, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2};
+      return structs_col{{values, counts}};
     }();
-    auto const input = cudf::slice(input_original, {0, 9})[0];
+    return cudf::make_lists_column(7,
+                                   int32s_col{0, 2, 4, 6, 9, 12, 14, 16}.release(),
+                                   structs.release(),
+                                   0,
+                                   rmm::device_buffer{});
+  }();
+  auto const keys   = cudf::slice(keys_original, {2, 7})[0];
+  auto const values = cudf::slice(*values_original, {2, 7})[0];
 
-    auto const expected = [] {
-      auto child1 = col_data{{null, -3, 0, 1, 2, 5}, null_at(0)};
-      auto child2 = int64s_col{33, 2, 4, 1, 3, 1};
-      return structs_col{{child1, child2}};
+  auto const expected_keys      = int32s_col{0, 1};
+  auto const expected_histogram = [] {
+    auto structs = [] {
+      auto values = col_data{{null, 0, 1, 2, null, 0, 1, 2}, nulls_at({0, 4})};
+      auto counts = int64s_col{3, 1, 3, 3, 1, 2, 4, 4};
+      return structs_col{{values, counts}};
     }();
-    auto const result = histogram_reduction(input, agg);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
-  }
-}
+    return cudf::make_lists_column(
+      2, int32s_col{0, 4, 8}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
 
-#endif
+  auto const [res_keys, res_histogram] =
+    groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_keys, *res_keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_histogram, *res_histogram);
+}

From 201d432a331527ff04458700a19844a2bf5ecc85 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 13:54:45 -0700
Subject: [PATCH 78/93] Fix empty output for merge histogram

---
 cpp/src/groupby/groupby.cu | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index ec200b52bbd..3ec40266b16 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -110,16 +110,16 @@ struct empty_column_constructor {
         0, make_empty_column(type_to_id<size_type>()), empty_like(values), 0, {});
     }
 
-    if constexpr (k == aggregation::Kind::HISTOGRAM || k == aggregation::Kind::MERGE_HISTOGRAM) {
-        std::vector<std::unique_ptr<column>> struct_children;
-        struct_children.emplace_back(empty_like(values));
-        struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0));
-        auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
-                                                0,
-                                                rmm::device_buffer{},
-                                                rmm::device_buffer{},
-                                                0,
-                                                std::move(struct_children));
+    if constexpr (k == aggregation::Kind::HISTOGRAM) {
+      std::vector<std::unique_ptr<column>> struct_children;
+      struct_children.emplace_back(empty_like(values));
+      struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0));
+      auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
+                                              0,
+                                              rmm::device_buffer{},
+                                              rmm::device_buffer{},
+                                              0,
+                                              std::move(struct_children));
       return make_lists_column(
         0, make_empty_column(type_to_id<size_type>()), std::move(structs), 0, {});
     }

From edf68160231d023f250c31cbdd8fdd615a57bb66 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 13:54:53 -0700
Subject: [PATCH 79/93] Fix empty input test

---
 cpp/tests/groupby/histogram_tests.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index fb3faebb951..3345c483d3a 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -229,9 +229,17 @@ TYPED_TEST(GroupbyMergeHistogramTest, EmptyInput)
   using col_data = cudf::test::fixed_width_column_wrapper<TypeParam, int>;
 
   auto const keys   = int32s_col{};
-  auto const values = col_data{};
+  auto const values = [] {
+    auto structs = [] {
+      auto values = col_data{};
+      auto counts = int64s_col{};
+      return structs_col{{values, counts}};
+    }();
+    return cudf::make_lists_column(
+      0, int32s_col{}.release(), structs.release(), 0, rmm::device_buffer{});
+  }();
   auto const [res_keys, res_histogram] =
-    groupby_histogram(keys, values, cudf::aggregation::MERGE_HISTOGRAM);
+    groupby_histogram(keys, *values, cudf::aggregation::MERGE_HISTOGRAM);
 
   // The structure of the output is already verified in the function `groupby_histogram`.
   ASSERT_EQ(res_histogram->size(), 0);

From 8ac649ecf7dfd23c9efdc5d674a3603bff0fe219 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 13:56:07 -0700
Subject: [PATCH 80/93] Remove comment

---
 cpp/src/groupby/sort/group_histogram.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 88724c54112..29a2debe741 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -32,7 +32,6 @@
 namespace cudf::groupby::detail {
 
 // Fixed type for counting frequencies in historam.
-// This is to avoid using `target_type_t` which requires type_dispatcher.
 constexpr auto histogram_count_dtype = data_type{type_to_id<int64_t>()};
 
 namespace {

From 04965fa2a972fed16ce7a9e57bf496c52d0c71c3 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:09:23 -0700
Subject: [PATCH 81/93] Cleanup

---
 cpp/src/groupby/sort/group_histogram.cu | 40 +++++++++----------------
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 29a2debe741..927cd43d040 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -35,20 +35,6 @@ namespace cudf::groupby::detail {
 constexpr auto histogram_count_dtype = data_type{type_to_id<int64_t>()};
 
 namespace {
-auto make_empty_histogram(column_view const& values)
-{
-  std::vector<std::unique_ptr<column>> struct_children;
-  struct_children.emplace_back(empty_like(values));
-  struct_children.emplace_back(make_numeric_column(histogram_count_dtype, 0));
-  auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
-                                          0,
-                                          rmm::device_buffer{},
-                                          rmm::device_buffer{},
-                                          0,
-                                          std::move(struct_children));
-  return make_lists_column(
-    0, make_empty_column(type_to_id<size_type>()), std::move(structs), 0, {});
-}
 
 std::unique_ptr<column> build_histogram(column_view const& values,
                                         cudf::device_span<size_type const> group_labels,
@@ -59,9 +45,7 @@ std::unique_ptr<column> build_histogram(column_view const& values,
 {
   CUDF_EXPECTS(num_groups >= 0, "Number of groups cannot be negative.");
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
-               "Size of values column should be same as that of group labels.");
-
-  if (num_groups == 0) { return make_empty_histogram(values); }
+               "Size of values column should be the same as that of group labels.");
 
   // Attach group labels to the input values.
   auto const labels_cv      = column_view{data_type{type_to_id<size_type>()},
@@ -75,7 +59,7 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   auto [distinct_indices, distinct_counts] = cudf::reduction::detail::table_histogram(
     labeled_values, partial_counts, histogram_count_dtype, stream, mr);
 
-  // Gather the distinct rows for output histogram.
+  // Gather the distinct rows for the output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
                                         distinct_indices,
                                         out_of_bounds_policy::DONT_CHECK,
@@ -83,8 +67,8 @@ std::unique_ptr<column> build_histogram(column_view const& values,
                                         stream,
                                         mr);
 
-  // Build offsets for the output lists column.
-  // Each list will be a histogram corresponding to each value group.
+  // Build offsets for the output lists column containing output histograms.
+  // Each list will be a histogram corresponding to one value group.
   auto out_offsets = cudf::lists::detail::reconstruct_offsets(
     out_table->get_column(0).view(), num_groups, stream, mr);
 
@@ -106,6 +90,9 @@ std::unique_ptr<column> group_histogram(column_view const& values,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
+  // Empty group should be handled before reaching here.
+  CUDF_EXPECTS(num_groups > 0, "Group should not be empty.");
+
   return build_histogram(values, group_labels, std::nullopt, num_groups, stream, mr);
 }
 
@@ -115,13 +102,16 @@ std::unique_ptr<column> group_merge_histogram(column_view const& values,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  // Empty group should be handled before reaching here.
+  CUDF_EXPECTS(num_groups > 0, "Group should not be empty.");
+
   // The input must be a lists column without nulls.
   CUDF_EXPECTS(!values.has_nulls(), "The input column must not have nulls.");
   CUDF_EXPECTS(values.type().id() == type_id::LIST,
                "The input of MERGE_HISTOGRAM aggregation must be a lists column.");
 
   // Child of the input lists column must be a structs column without nulls,
-  // and its second child is a count columns of integer type having no nulls.
+  // and its second child is a column of integer type having no nulls.
   auto const lists_cv     = lists_column_view{values};
   auto const histogram_cv = lists_cv.get_sliced_child(stream);
   CUDF_EXPECTS(!histogram_cv.has_nulls(), "Child of the input lists column must not have nulls.");
@@ -131,11 +121,9 @@ std::unique_ptr<column> group_merge_histogram(column_view const& values,
     cudf::is_integral(histogram_cv.child(1).type()) && !histogram_cv.child(1).has_nulls(),
     "The input column has invalid histograms structure.");
 
-  if (num_groups == 0) { return empty_like(values); }
-
-  // Firstly concatenate the histograms corresponding to the same key values.
+  // Concatenate the histograms corresponding to the same key values.
   // That is equivalent to creating a new lists column (view) from the input lists column
-  // with new offsets as below.
+  // with new offsets gathered as below.
   auto new_offsets = rmm::device_uvector<size_type>(num_groups + 1, stream);
   thrust::gather(rmm::exec_policy(stream),
                  group_offsets.begin(),
@@ -143,11 +131,11 @@ std::unique_ptr<column> group_merge_histogram(column_view const& values,
                  lists_cv.offsets_begin(),
                  new_offsets.begin());
 
+  // Generate labels for the new lists.
   auto key_labels = rmm::device_uvector<size_type>(histogram_cv.size(), stream);
   cudf::detail::label_segments(
     new_offsets.begin(), new_offsets.end(), key_labels.begin(), key_labels.end(), stream);
 
-  // The input values column is already in histogram format (i.e., column of Struct<value, count>).
   auto const structs_cv   = structs_column_view{histogram_cv};
   auto const input_values = structs_cv.get_sliced_child(0, stream);
   auto const input_counts = structs_cv.get_sliced_child(1, stream);

From 63ef1fa5e41a816d8e523167d6d1e1dbdb2307bf Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:17:51 -0700
Subject: [PATCH 82/93] Fix docs

---
 cpp/src/groupby/sort/group_reductions.hpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 52ffb9fd9da..3aa79f226a3 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -234,8 +234,8 @@ std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group
  *
  * @param values Grouped values to compute histogram
  * @param group_labels ID of group that the corresponding value belongs to
- * @param num_groups Number of groups ( unique values in @p group_labels )
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param num_groups Number of groups
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  */
 std::unique_ptr<column> group_histogram(column_view const& values,
@@ -472,22 +472,22 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
 /**
  * @brief Internal API to merge multiple output of HISTOGRAM aggregation.
  *
- * The input values column should be given as a structs column in the form of
- * `STRUCT<value, count>`.
+ * The input values column should be given as a lists column in the form of
+ * `LIST<STRUCT<value, count>>`.
  * After merging, the order of distinct elements in each output list is not specified.
  *
  * @code{.pseudo}
- * values       = [<1, 2>, <2, 1>, <2, 2>, <3, 2>, <2, 1>, <1, 1>, <2, 1>]
- * group_labels = [0,      0,      0,      1,      1,      1,      1]
- * num_groups = 2
+ * values        = [ [<1, 2>, <2, 1>], [<2, 2>], [<3, 2>, <2, 1>], [<1, 1>, <2, 1>] ]
+ * group_offsets = [ 0,                          2,                                 4]
+ * num_groups    = 2
  *
- * output = [[<1, 2>, <2, 3>], [<1, 1>, <2, 2>, <3, 3>]]]
+ * output = [[<1, 2>, <2, 3>], [<1, 1>, <2, 2>, <3, 2>]]
  * @endcode
  *
  * @param values Grouped values to get valid count of
- * @param group_offsets Offsets of groups' starting points within @p values.
- * @param num_groups Number of groups ( unique values in @p group_labels )
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param group_offsets Offsets of groups' starting points within @p values
+ * @param num_groups Number of groups
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  */
 std::unique_ptr<column> group_merge_histogram(column_view const& values,

From d31de2006f8119d639dd84b1596deace16cae130 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:31:03 -0700
Subject: [PATCH 83/93] Rewrite docs

---
 cpp/src/reductions/histogram_helpers.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
index 3d3d548cc00..521b633cfa3 100644
--- a/cpp/src/reductions/histogram_helpers.hpp
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -30,11 +30,11 @@ namespace cudf::reduction::detail {
 /**
  * @brief Compute the histogram for the input table.
  *
- * This is equivalent to do a distinct count for each unique rows in the input.
+ * This is equivalent to doing a distinct count for each unique row in the input.
  *
  * @param input The input table to compute histogram
- * @param partial_counts An optional column containing counts for each row
- * @param output_dtype The output type to store the count value
+ * @param partial_counts An optional column containing count for each row
+ * @param output_dtype The type to store count value
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate memory of the returned objects
  * @return A pair of array contains the (stable-order) indices of the distinct rows in the input

From 34a426854e5b9e5a9f959e4eea67a7c8bf77af9a Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:32:39 -0700
Subject: [PATCH 84/93] Rewrite histogram.cu

---
 cpp/src/reductions/histogram.cu | 81 +++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 67aea83eee5..ea2f9afe620 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -24,7 +24,9 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <thrust/iterator/discard_iterator.h>
+#include <thrust/copy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
 
 #include <cuda/atomic>
 
@@ -37,19 +39,19 @@ namespace {
 /**
  * @brief The functor to accumulate the frequency of each distinct rows in the input table.
  */
-template <typename MapView, typename KeyHasher, typename KeyEqual, typename OutputType>
-struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType> {
-  OutputType const* d_partial_output;
+template <typename MapView, typename KeyHasher, typename KeyEqual, typename CountType>
+struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType> {
+  CountType const* d_partial_output;
 
   reduce_fn(MapView const& d_map,
             KeyHasher const& d_hasher,
             KeyEqual const& d_equal,
-            OutputType* const d_output,
-            OutputType const* const d_partial_output)
-    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, OutputType>{d_map,
-                                                                                    d_hasher,
-                                                                                    d_equal,
-                                                                                    d_output},
+            CountType* const d_output,
+            CountType const* const d_partial_output)
+    : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, CountType>{d_map,
+                                                                                   d_hasher,
+                                                                                   d_equal,
+                                                                                   d_output},
       d_partial_output{d_partial_output}
   {
   }
@@ -57,9 +59,9 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
   // Count the number of rows in each group of rows that are compared equal.
   __device__ void operator()(size_type const idx) const
   {
-    auto const increment = d_partial_output ? d_partial_output[idx] : OutputType{1};
+    auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1};
     auto const count =
-      cuda::atomic_ref<OutputType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
+      cuda::atomic_ref<CountType, cuda::thread_scope_device>(*this->get_output_ptr(idx));
     count.fetch_add(increment, cuda::std::memory_order_relaxed);
   }
 };
@@ -67,11 +69,11 @@ struct reduce_fn : cudf::detail::reduce_by_row_fn_base<MapView, KeyHasher, KeyEq
 /**
  * @brief The builder to construct an instance of `reduce_fn` functor.
  */
-template <typename OutputType>
+template <typename CountType>
 struct reduce_func_builder {
-  OutputType const* const d_partial_output;
+  CountType const* const d_partial_output;
 
-  reduce_func_builder(OutputType const* const d_partial_output) : d_partial_output{d_partial_output}
+  reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output}
   {
   }
 
@@ -79,17 +81,15 @@ struct reduce_func_builder {
   auto build(MapView const& d_map,
              KeyHasher const& d_hasher,
              KeyEqual const& d_equal,
-             OutputType* const d_output)
+             CountType* const d_output)
   {
-    return reduce_fn<MapView, KeyHasher, KeyEqual, OutputType>{
+    return reduce_fn<MapView, KeyHasher, KeyEqual, CountType>{
       d_map, d_hasher, d_equal, d_output, d_partial_output};
   }
 };
 
 /**
- * @brief Specialized functor to check for non-zero.
- *
- * The input must be given as Pair<T1, T2>. Only value of T2 is checked for non-zero.
+ * @brief Specialized functor to check for non-zero of the second component of the input.
  */
 struct is_none_zero {
   template <typename Pair>
@@ -100,27 +100,28 @@ struct is_none_zero {
 };
 
 /**
- * @brief Dispatcher functor to compute histogram in the given OutputType.
+ * @brief Dispatcher functor to compute histogram with frequencies (aka element counts) stored in
+ * a buffer of type given by CountType.
  *
  * The indices of distinct rows and their corresponding frequencies are written into two separate
- * output buffer.
+ * output buffers.
  */
 struct histogram_dispatcher {
-  template <typename OutputType>
+  template <typename CountType>
   static bool constexpr is_supported()
   {
     // Currently only int64_t is requested by Spark-Rapids.
     // More data type (integer only) can be supported by enabling below.
-    return std::is_same_v<OutputType, int64_t>;
+    return std::is_same_v<CountType, int64_t>;
   }
 
-  template <typename OutputType, typename... Args>
-  std::enable_if_t<!is_supported<OutputType>(), void> operator()(Args&&...)
+  template <typename CountType, typename... Args>
+  std::enable_if_t<!is_supported<CountType>(), void> operator()(Args&&...)
   {
-    CUDF_FAIL("Unsupported output type in histogram aggregation.");
+    CUDF_FAIL("Unsupported count type in histogram aggregation.");
   }
 
-  template <typename OutputType, CUDF_ENABLE_IF(is_supported<OutputType>())>
+  template <typename CountType, CUDF_ENABLE_IF(is_supported<CountType>())>
   void operator()(
     cudf::detail::hash_map_type const& map,
     std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
@@ -132,6 +133,7 @@ struct histogram_dispatcher {
     std::optional<column_view> const& partial_counts,
     rmm::cuda_stream_view stream) const
   {
+    // Note that we consider nulls and NaNs as always equal.
     auto const reduction_results = cudf::detail::hash_reduce_by_row(
       map,
       preprocessed_input,
@@ -140,24 +142,35 @@ struct histogram_dispatcher {
       has_nested_columns,
       null_equality::EQUAL,
       nan_equality::ALL_EQUAL,
-      reduce_func_builder<OutputType>{partial_counts ? partial_counts.value().begin<OutputType>()
-                                                     : nullptr},
-      OutputType{0},
+      reduce_func_builder<CountType>{partial_counts ? partial_counts.value().begin<CountType>()
+                                                    : nullptr},
+      CountType{0},
       stream,
       rmm::mr::get_current_device_resource());
 
     auto const input_it = thrust::make_zip_iterator(
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
     auto const output_it = thrust::make_zip_iterator(
-      thrust::make_tuple(output_indices, output_counts.begin<OutputType>()));
+      thrust::make_tuple(output_indices, output_counts.begin<OutputTykpe>()));
 
     // Reduction results above are either group sizes of equal rows, or `0`.
-    // Thus, we need to extract the non-zero group sizes.
+    // The final output is non-zero group sizes only.
     thrust::copy_if(
       rmm::exec_policy(stream), input_it, input_it + num_rows, output_it, is_none_zero{});
   }
 };
 
+/**
+ * @brief Build a histogram by gathering distinct rows from the input table and their
+ * corresponding distinct counts.
+ *
+ * @param input The input table
+ * @param distinct_indices Indices of the distinct rows
+ * @param distinct_counts Distinct counts corresponding to the distinct rows
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned object's device memory
+ * @return A list_scalar storing the output histogram
+ */
 auto gather_histogram(table_view const& input,
                       device_span<size_type const> distinct_indices,
                       std::unique_ptr<column>&& distinct_counts,
@@ -194,7 +207,7 @@ std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogr
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(cudf::is_integral(output_dtype),
-               "The output type of histogram aggregation must be an integral type.");
+               "The output count type of histogram aggregation must be an integral type.");
 
   auto map = cudf::detail::hash_map_type{
     compute_hash_table_size(input.num_rows()),

From 502a3daccad8cda3d7b37f972e765a4e9e008ec1 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:33:24 -0700
Subject: [PATCH 85/93] Fix typo

---
 cpp/src/reductions/histogram.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index ea2f9afe620..b6078955d06 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -151,7 +151,7 @@ struct histogram_dispatcher {
     auto const input_it = thrust::make_zip_iterator(
       thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
     auto const output_it = thrust::make_zip_iterator(
-      thrust::make_tuple(output_indices, output_counts.begin<OutputTykpe>()));
+      thrust::make_tuple(output_indices, output_counts.begin<CountType>()));
 
     // Reduction results above are either group sizes of equal rows, or `0`.
     // The final output is non-zero group sizes only.

From 61377e0d586a5a4a93c20185e6e4de52e1686b5f Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:35:34 -0700
Subject: [PATCH 86/93] Fix header

---
 cpp/tests/groupby/histogram_tests.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 3345c483d3a..c5833f40cf2 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <tests/groupby/groupby_test_util.hpp>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
@@ -23,6 +21,7 @@
 
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
 

From dd72159f570bd3eedca1da33080e96a7aa810f5c Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 14:37:37 -0700
Subject: [PATCH 87/93] Revert changes

---
 cpp/tests/reductions/reduction_tests.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index ed85da9e50b..7644ac48892 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -1091,10 +1091,10 @@ TEST_F(ReductionEmptyTest, empty_column)
   // test if null count is equal or greater than size of input
   // expect result.is_valid() is false
   int col_size = 5;
-  std::vector<T> data_col(col_size);
+  std::vector<T> col_data(col_size);
   std::vector<bool> valids(col_size, 0);
 
-  cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(data_col, valids);
+  cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(col_data, valids);
   CUDF_EXPECT_NO_THROW(statement(col_nulls));
 
   auto any_agg   = cudf::make_any_aggregation<cudf::reduce_aggregation>();

From 424196b72ccb6a1358791427b6adb80ccc7749d8 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 21:55:04 -0700
Subject: [PATCH 88/93] Add empty input handling

---
 cpp/src/groupby/groupby.cu               | 19 ++++++++-----------
 cpp/src/reductions/histogram.cu          | 18 ++++++++++++++++++
 cpp/src/reductions/histogram_helpers.hpp | 11 +++++++++++
 cpp/src/reductions/reductions.cpp        | 11 +++++++++++
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 3ec40266b16..d237c5db41b 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <reductions/histogram_helpers.hpp>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
@@ -111,18 +113,13 @@ struct empty_column_constructor {
     }
 
     if constexpr (k == aggregation::Kind::HISTOGRAM) {
-      std::vector<std::unique_ptr<column>> struct_children;
-      struct_children.emplace_back(empty_like(values));
-      struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0));
-      auto structs = std::make_unique<column>(data_type{type_id::STRUCT},
-                                              0,
-                                              rmm::device_buffer{},
-                                              rmm::device_buffer{},
-                                              0,
-                                              std::move(struct_children));
-      return make_lists_column(
-        0, make_empty_column(type_to_id<size_type>()), std::move(structs), 0, {});
+      return make_lists_column(0,
+                               make_empty_column(type_to_id<size_type>()),
+                               cudf::reduction::detail::make_empty_histogram_like(values),
+                               0,
+                               {});
     }
+    if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); }
 
     if constexpr (k == aggregation::Kind::RANK) {
       auto const& rank_agg = dynamic_cast<cudf::detail::rank_aggregation const&>(agg);
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index b6078955d06..651be4e3e98 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -199,6 +199,19 @@ auto gather_histogram(table_view const& input,
 
 }  // namespace
 
+std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
+{
+  std::vector<std::unique_ptr<column>> struct_children;
+  struct_children.emplace_back(empty_like(values));
+  struct_children.emplace_back(make_numeric_column(data_type{type_id::INT64}, 0));
+  return std::make_unique<column>(data_type{type_id::STRUCT},
+                                  0,
+                                  rmm::device_buffer{},
+                                  rmm::device_buffer{},
+                                  0,
+                                  std::move(struct_children));
+}
+
 std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
   table_view const& input,
   std::optional<column_view> const& partial_counts,
@@ -269,6 +282,9 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
+  // Empty group should be handled before reaching here.
+  CUDF_EXPECTS(input.size() > 0, "Input should not be empty.");
+
   auto const input_tv = table_view{{input}};
   auto [distinct_indices, distinct_counts] =
     table_histogram(input_tv, std::nullopt, output_dtype, stream, mr);
@@ -279,6 +295,8 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  // Empty group should be handled before reaching here.
+  CUDF_EXPECTS(input.size() > 0, "Input should not be empty.");
   CUDF_EXPECTS(!input.has_nulls(), "The input column must not have nulls.");
   CUDF_EXPECTS(input.type().id() == type_id::STRUCT && input.num_children() == 2,
                "The input must be a structs column having two children.");
diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
index 521b633cfa3..62051b9240e 100644
--- a/cpp/src/reductions/histogram_helpers.hpp
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <memory>
 #include <optional>
 
 namespace cudf::reduction::detail {
@@ -47,4 +48,14 @@ std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogr
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Create an empty histogram column.
+ *
+ * A histogram column is a structs column `STRUCT<T, int64_t>` where T is type of the input
+ * values.
+ *
+ * @returns An empty histogram column
+ */
+std::unique_ptr<column> make_empty_histogram_like(column_view const& values);
+
 }  // namespace cudf::reduction::detail
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 8d19413190b..9e476742baa 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <reductions/histogram_helpers.hpp>
+
 #include <cudf/column/column.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/copy.hpp>
@@ -167,6 +169,15 @@ std::unique_ptr<scalar> reduce(column_view const& col,
       return tdigest::detail::make_empty_tdigest_scalar(stream, mr);
     }
 
+    if (agg.kind == aggregation::HISTOGRAM) {
+      return std::make_unique<list_scalar>(
+        std::move(*reduction::detail::make_empty_histogram_like(col)), true, stream, mr);
+    }
+    if (agg.kind == aggregation::MERGE_HISTOGRAM) {
+      return std::make_unique<list_scalar>(
+        std::move(*reduction::detail::make_empty_histogram_like(col.child(0))), true, stream, mr);
+    }
+
     if (output_dtype.id() == type_id::LIST) {
       if (col.type() == output_dtype) { return make_empty_scalar_like(col, stream, mr); }
       // Under some circumstance, the output type will become the List of input type,

From 26238dd8a39846ad2a578d700b41613c2ae1167b Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Wed, 20 Sep 2023 22:27:43 -0700
Subject: [PATCH 89/93] Rename function and change return type

---
 cpp/src/groupby/sort/group_histogram.cu  |  6 +++---
 cpp/src/reductions/histogram.cu          | 16 ++++++++--------
 cpp/src/reductions/histogram_helpers.hpp |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 927cd43d040..a3f0c5b1e47 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -56,12 +56,12 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   auto const labeled_values = table_view{{labels_cv, values}};
 
   // Build histogram for the labeled values.
-  auto [distinct_indices, distinct_counts] = cudf::reduction::detail::table_histogram(
+  auto [distinct_indices, distinct_counts] = cudf::reduction::detail::histogram_table(
     labeled_values, partial_counts, histogram_count_dtype, stream, mr);
 
   // Gather the distinct rows for the output histogram.
   auto out_table = cudf::detail::gather(labeled_values,
-                                        distinct_indices,
+                                        *distinct_indices,
                                         out_of_bounds_policy::DONT_CHECK,
                                         cudf::detail::negative_index_policy::NOT_ALLOWED,
                                         stream,
@@ -76,7 +76,7 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   struct_children.emplace_back(std::move(out_table->release().back()));
   struct_children.emplace_back(std::move(distinct_counts));
   auto out_structs = make_structs_column(
-    static_cast<size_type>(distinct_indices.size()), std::move(struct_children), 0, {}, stream, mr);
+    static_cast<size_type>(distinct_indices->size()), std::move(struct_children), 0, {}, stream, mr);
 
   return make_lists_column(
     num_groups, std::move(out_offsets), std::move(out_structs), 0, {}, stream, mr);
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 651be4e3e98..f4ae7fdb407 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -212,7 +212,7 @@ std::unique_ptr<column> make_empty_histogram_like(column_view const& values)
                                   std::move(struct_children));
 }
 
-std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>> histogram_table(
   table_view const& input,
   std::optional<column_view> const& partial_counts,
   data_type const output_dtype,
@@ -255,8 +255,8 @@ std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogr
   }
 
   // Gather the indices of distinct rows.
-  auto distinct_indices = rmm::device_uvector<size_type>(
-    static_cast<size_type>(map.get_size()), stream, rmm::mr::get_current_device_resource());
+  auto distinct_indices = std::make_unique<rmm::device_uvector<size_type>>(
+    static_cast<size_type>(map.get_size()), stream, mr);
 
   // Store the number of occurrences of each distinct row.
   auto distinct_counts = make_numeric_column(
@@ -269,7 +269,7 @@ std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogr
                   input.num_rows(),
                   has_nulls,
                   has_nested_columns,
-                  distinct_indices.begin(),
+                  distinct_indices->begin(),
                   distinct_counts->mutable_view(),
                   partial_counts,
                   stream);
@@ -287,8 +287,8 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
 
   auto const input_tv = table_view{{input}};
   auto [distinct_indices, distinct_counts] =
-    table_histogram(input_tv, std::nullopt, output_dtype, stream, mr);
-  return gather_histogram(input_tv, distinct_indices, std::move(distinct_counts), stream, mr);
+    histogram_table(input_tv, std::nullopt, output_dtype, stream, mr);
+  return gather_histogram(input_tv, *distinct_indices, std::move(distinct_counts), stream, mr);
 }
 
 std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
@@ -309,8 +309,8 @@ std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
 
   auto const values_tv = table_view{{input_values}};
   auto [distinct_indices, distinct_counts] =
-    table_histogram(values_tv, input_counts, data_type{type_id::INT64}, stream, mr);
-  return gather_histogram(values_tv, distinct_indices, std::move(distinct_counts), stream, mr);
+    histogram_table(values_tv, input_counts, data_type{type_id::INT64}, stream, mr);
+  return gather_histogram(values_tv, *distinct_indices, std::move(distinct_counts), stream, mr);
 }
 
 }  // namespace cudf::reduction::detail
diff --git a/cpp/src/reductions/histogram_helpers.hpp b/cpp/src/reductions/histogram_helpers.hpp
index 62051b9240e..0f830e71bbc 100644
--- a/cpp/src/reductions/histogram_helpers.hpp
+++ b/cpp/src/reductions/histogram_helpers.hpp
@@ -41,7 +41,7 @@ namespace cudf::reduction::detail {
  * @return A pair of array contains the (stable-order) indices of the distinct rows in the input
  * table, and their corresponding distinct counts
  */
-std::pair<rmm::device_uvector<size_type>, std::unique_ptr<column>> table_histogram(
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<column>> histogram_table(
   table_view const& input,
   std::optional<column_view> const& partial_counts,
   data_type const output_dtype,

From 76f77a00a7cd0b28f04f7958dfac00fc577e7f1d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 21 Sep 2023 09:40:23 -0700
Subject: [PATCH 90/93] Format

---
 cpp/src/groupby/sort/group_histogram.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index a3f0c5b1e47..5c02c57af10 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -75,8 +75,12 @@ std::unique_ptr<column> build_histogram(column_view const& values,
   std::vector<std::unique_ptr<column>> struct_children;
   struct_children.emplace_back(std::move(out_table->release().back()));
   struct_children.emplace_back(std::move(distinct_counts));
-  auto out_structs = make_structs_column(
-    static_cast<size_type>(distinct_indices->size()), std::move(struct_children), 0, {}, stream, mr);
+  auto out_structs = make_structs_column(static_cast<size_type>(distinct_indices->size()),
+                                         std::move(struct_children),
+                                         0,
+                                         {},
+                                         stream,
+                                         mr);
 
   return make_lists_column(
     num_groups, std::move(out_offsets), std::move(out_structs), 0, {}, stream, mr);

From 31093cd6971bdf9fb395697121566fa52f0c681d Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 21 Sep 2023 09:51:24 -0700
Subject: [PATCH 91/93] Add docs and reduction aggregations

---
 .../ai/rapids/cudf/GroupByAggregation.java     | 14 ++++++++++++++
 .../ai/rapids/cudf/ReductionAggregation.java   | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 25bb716bd5a..b1cc8b6b67e 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -316,10 +316,24 @@ public static GroupByAggregation mergeTDigest(int delta) {
     return new GroupByAggregation(Aggregation.mergeTDigest(delta));
   }
 
+  /**
+   * Histogram aggregation, computing the frequencies for each unique row.
+   *
+   * A histogram is given as a lists-of-structs column, in which the first struct child stores
+   * unique rows from the input values and the second child stores their frequencies.
+   *
+   * @return A lists of structs column in which each list contains a histogram corresponding to
+   *         an input key.
+   */
   public static GroupByAggregation histogram() {
     return new GroupByAggregation(Aggregation.histogram());
   }
 
+  /**
+   * MergeHistogram aggregation, to merge multiple histograms.
+   *
+   * @return A new histogram in which the frequencies of the unique rows are summed up.
+   */
   public static GroupByAggregation mergeHistogram() {
     return new GroupByAggregation(Aggregation.mergeHistogram());
   }
diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
index eab1c94fd2c..46ae65085cf 100644
--- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
@@ -286,4 +286,22 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual
     return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality));
   }
 
+  /**
+   * Create HistogramAggregation, computing the frequencies for each unique row.
+   *
+   * @return A structs column in which the first child stores unique rows from the input and the
+   *         second child stores their corresponding frequencies.
+   */
+  public static ReductionAggregation histogram() {
+    return new ReductionAggregation(Aggregation.histogram());
+  }
+
+  /**
+   * Create MergeHistogramAggregation, to merge multiple histograms.
+   *
+   * @return A new histogram in which the frequencies of the unique rows are summed up.
+   */
+  public static ReductionAggregation mergeHistogram() {
+    return new ReductionAggregation(Aggregation.mergeHistogram());
+  }
 }

From 2ce59d1820cb81121051c521df94ae3362f9ad22 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 21 Sep 2023 09:58:11 -0700
Subject: [PATCH 92/93] Update copyright years

---
 java/src/main/java/ai/rapids/cudf/Aggregation.java          | 2 +-
 java/src/main/java/ai/rapids/cudf/GroupByAggregation.java   | 2 +-
 java/src/main/java/ai/rapids/cudf/ReductionAggregation.java | 2 +-
 java/src/main/native/src/AggregationJni.cpp                 | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 029017ae113..379750bb0b7 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index b1cc8b6b67e..0fae33927b6 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
index 46ae65085cf..ba8ae379bae 100644
--- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index 8984c27530d..bc62e95c36a 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From ad09d30c41c684e526a1e933ccb58fc8b4b89467 Mon Sep 17 00:00:00 2001
From: Nghia Truong <nghiat@nvidia.com>
Date: Thu, 21 Sep 2023 10:12:42 -0700
Subject: [PATCH 93/93] Revert "Add binding for `HISTOGRAM` and
 `MERGE_HISTOGRAM` aggregations"

This reverts commit ee229a00125c9a4c2edff32c73ca8d952c75b1e7.
---
 .../main/java/ai/rapids/cudf/Aggregation.java | 24 +------------------
 .../ai/rapids/cudf/GroupByAggregation.java    |  8 -------
 java/src/main/native/src/AggregationJni.cpp   |  5 ----
 3 files changed, 1 insertion(+), 36 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 029017ae113..d10329ca0f2 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -68,9 +68,7 @@ enum Kind {
         DENSE_RANK(29),
         PERCENT_RANK(30),
         TDIGEST(31), // This can take a delta argument for accuracy level
-        MERGE_TDIGEST(32), // This can take a delta argument for accuracy level
-        HISTOGRAM(33),
-        MERGE_HISTOGRAM(34);
+        MERGE_TDIGEST(32); // This can take a delta argument for accuracy level
 
         final int nativeId;
 
@@ -920,26 +918,6 @@ static TDigestAggregation mergeTDigest(int delta) {
         return new TDigestAggregation(Kind.MERGE_TDIGEST, delta);
     }
 
-    static final class HistogramAggregation extends NoParamAggregation {
-        private HistogramAggregation() {
-            super(Kind.HISTOGRAM);
-        }
-    }
-
-    static final class MergeHistogramAggregation extends NoParamAggregation {
-        private MergeHistogramAggregation() {
-            super(Kind.MERGE_HISTOGRAM);
-        }
-    }
-
-    static HistogramAggregation histogram() {
-        return new HistogramAggregation();
-    }
-
-    static MergeHistogramAggregation mergeHistogram() {
-        return new MergeHistogramAggregation();
-    }
-
     /**
      * Create one of the aggregations that only needs a kind, no other parameters. This does not
      * work for all types and for code safety reasons each kind is added separately.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 25bb716bd5a..500d18f7eae 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -315,12 +315,4 @@ public static GroupByAggregation createTDigest(int delta) {
   public static GroupByAggregation mergeTDigest(int delta) {
     return new GroupByAggregation(Aggregation.mergeTDigest(delta));
   }
-
-  public static GroupByAggregation histogram() {
-    return new GroupByAggregation(Aggregation.histogram());
-  }
-
-  public static GroupByAggregation mergeHistogram() {
-    return new GroupByAggregation(Aggregation.mergeHistogram());
-  }
 }
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index 8984c27530d..6ac73282615 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -90,11 +90,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv
         case 30: // ANSI SQL PERCENT_RANK
           return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE,
                                              {}, cudf::rank_percentage::ONE_NORMALIZED);
-        case 33: // HISTOGRAM
-          return cudf::make_histogram_aggregation();
-        case 34: // MERGE_HISTOGRAM
-          return cudf::make_merge_histogram_aggregation();
-
         default: throw std::logic_error("Unsupported No Parameter Aggregation Operation");
       }
     }();