Fix xxhash32 implementation to avoid hash_combine steps.

rapidsai · Dec 17, 2024 · 1010775 · 1010775
1 parent 1508aad
commit 1010775
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 20 deletions.
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,23 +13,89 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/xxhash_32.cuh>
-#include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/tabulate.h>
 
-namespace cudf::hashing {
+namespace cudf {
+namespace hashing {
 namespace detail {
 
+namespace {
+
+using hash_value_type = uint32_t;
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * Calling the functor with a row index folds over the columns of the table with
+ * `cudf::detail::accumulate`, starting from the seed supplied at construction.
+ * For each column the element at that row is hashed by `element_hasher_adapter`,
+ * which receives the running value as its seed, so the result produced for one
+ * column becomes the seed used for the next column. The value left after the
+ * last column is the hash of the row.
+ *
+ * The null-checking policy, the table view and the seed are all stored by value
+ * as const members.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
+   *
+   * Invoked through `cudf::type_dispatcher` with the column, the row index, the
+   * null-checking policy and the seed to use for this element.
+   *
+   * For types where `column_device_view::has_element_accessor<T>()` holds:
+   *  - if the null-checking policy evaluates to true and the element is null,
+   *    the result is `cuda::std::numeric_limits<hash_value_type>::max()`, and
+   *    the seed is not used;
+   *  - otherwise the element is hashed with an `XXHash_32<T>` constructed from
+   *    the seed.
+   *
+   * Because the caller passes the running hash of the preceding columns as the
+   * seed, a null element replaces that running value with the sentinel instead
+   * of being mixed into it.
+   *
+   * For every other type the second overload calls `CUDF_UNREACHABLE`.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();
+      }
+      auto const hasher = XXHash_32<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
+    }
+  };
+
+  Nullate const _check_nulls;
+  table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+}  // namespace
+
 std::unique_ptr<column> xxhash_32(table_view const& input,
                                   uint32_t seed,
                                   rmm::cuda_stream_view stream,
@@ -45,14 +111,14 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
   if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
 
   bool const nullable   = has_nulls(input);
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream);
+  auto const input_view = table_device_view::create(input, stream);
   auto output_view      = output->mutable_view();
 
   // Compute the hash value for each row
   thrust::tabulate(rmm::exec_policy(stream),
                    output_view.begin<hash_value_type>(),
                    output_view.end<hash_value_type>(),
-                   row_hasher.device_hasher<XXHash_32>(nullable, seed));
+                   device_row_hasher(nullable, *input_view, seed));
 
   return output;
 }
@@ -68,4 +134,5 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
   return detail::xxhash_32(input, seed, stream, mr);
 }
 
-}  // namespace cudf::hashing
+}  // namespace hashing
+}  // namespace cudf
diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp
@@ -32,7 +32,7 @@ TEST_F(XXHash_32_Test, TestInteger)
   // Expected results were generated with the reference implementation:
   // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
   auto expected =
-    cudf::test::fixed_width_column_wrapper<uint32_t>({2802733858u, 3816402826u, 3721130582u});
+    cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
 }
 
@@ -46,7 +46,7 @@ TEST_F(XXHash_32_Test, TestDouble)
   // Expected results were generated with the reference implementation:
   // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
   auto expected =
-    cudf::test::fixed_width_column_wrapper<uint32_t>({635906976u, 1479683640u, 1813668619u});
+    cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
 }
@@ -61,7 +61,7 @@ TEST_F(XXHash_32_Test, StringType)
   // Expected results were generated with the reference implementation:
   // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
   auto expected =
-    cudf::test::fixed_width_column_wrapper<uint32_t>({2975112264u, 4267142293u, 4063988593u});
+    cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499099u});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
 }
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
@@ -1511,21 +1511,21 @@ def test_dataframe_hash_values_xxhash32():
     gdf["b"] = -gdf["a"]
     out_a = gdf["a"].hash_values(method="xxhash32", seed=0)
     expected_a = cudf.Series(
-        [2095779532, 667448960, 1266115603, 3401014672, 2654435768],
+        [3736311059, 2307980487, 2906647130, 746578903, 4294967295],
         dtype=np.uint32,
     )
     assert_eq(out_a, expected_a)
 
     out_b = gdf["b"].hash_values(method="xxhash32", seed=42)
     expected_b = cudf.Series(
-        [3730825784, 620821108, 3185936566, 3305307769, 2654438504],
+        [1076387279, 2261349915, 531498073, 650869264, 4294967295],
         dtype=np.uint32,
     )
     assert_eq(out_b, expected_b)
 
     out_df = gdf.hash_values(method="xxhash32", seed=0)
     expected_df = cudf.Series(
-        [2852293555, 2790935110, 2304542211, 57154089, 3449077662],
+        [1223721700, 2885793241, 1920811472, 1146715602, 4294967295],
         dtype=np.uint32,
     )
     assert_eq(out_df, expected_df)

diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py
@@ -49,12 +49,6 @@ def libcudf_mmh3_x86_32(binary):
     return hash_combine_32(seed, hashval)
 
 
-def libcudf_xxhash_32(binary):
-    seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
-    hashval = xxhash.xxh32(binary, seed).intdigest()
-    return hash_combine_32(seed, hashval)
-
-
 @pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
 def scalar_type(request):
     return request.param
@@ -109,7 +103,9 @@ def py_hasher(val):
 
 def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl):
     def py_hasher(val):
-        return libcudf_xxhash_32(scalar_to_binary(val))
+        return xxhash.xxh32(
+            scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+        ).intdigest()
 
     expect = pa.array(
         [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],