diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu index 349c7e5cede..7a864fbc98e 100644 --- a/cpp/src/hash/xxhash_32.cu +++ b/cpp/src/hash/xxhash_32.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,23 +13,89 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include +#include #include #include -#include #include #include +#include #include #include +#include #include -namespace cudf::hashing { +namespace cudf { +namespace hashing { namespace detail { +namespace { + +using hash_value_type = uint32_t; + +/** + * @brief Computes the hash value of a row in the given table. + * + * @tparam Nullate A cudf::nullate type describing whether to check for nulls. + */ +template +class device_row_hasher { + public: + device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed) + : _check_nulls(nulls), _table(t), _seed(seed) + { + } + + __device__ auto operator()(size_type row_index) const noexcept + { + return cudf::detail::accumulate( + _table.begin(), + _table.end(), + _seed, + [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); + }); + } + + /** + * @brief Computes the hash value of an element in the given column. + */ + class element_hasher_adapter { + public: + template ())> + __device__ hash_value_type operator()(column_device_view const& col, + size_type const row_index, + Nullate const _check_nulls, + hash_value_type const _seed) const noexcept + { + if (_check_nulls && col.is_null(row_index)) { + return cuda::std::numeric_limits::max(); + } + auto const hasher = XXHash_32{_seed}; + return hasher(col.element(row_index)); + } + + template ())> + __device__ hash_value_type operator()(column_device_view const&, + size_type const, + Nullate const, + hash_value_type const) const noexcept + { + CUDF_UNREACHABLE("Unsupported type for XXHash_32"); + } + }; + + Nullate const _check_nulls; + table_device_view const _table; + hash_value_type const _seed; +}; + +} // namespace + std::unique_ptr xxhash_32(table_view const& input, uint32_t seed, rmm::cuda_stream_view stream, @@ -45,14 +111,14 @@ std::unique_ptr xxhash_32(table_view const& input, if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } bool const nullable = has_nulls(input); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream); + auto const input_view = table_device_view::create(input, stream); auto output_view = output->mutable_view(); // Compute the hash value for each row thrust::tabulate(rmm::exec_policy(stream), output_view.begin(), output_view.end(), - row_hasher.device_hasher(nullable, seed)); + device_row_hasher(nullable, *input_view, seed)); return output; } @@ -68,4 +134,5 @@ std::unique_ptr xxhash_32(table_view const& input, return detail::xxhash_32(input, seed, stream, mr); } -} // namespace cudf::hashing +} // namespace hashing +} // namespace cudf diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp index cfecca141a8..3b3bce493e8 100644 --- a/cpp/tests/hashing/xxhash_32_test.cpp +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -32,7 +32,7 @@ TEST_F(XXHash_32_Test, TestInteger) // Expected results were generated with the reference implementation: // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h auto expected = - cudf::test::fixed_width_column_wrapper({2802733858u, 3816402826u, 3721130582u}); + cudf::test::fixed_width_column_wrapper({148298089u, 1161967057u, 1066694813u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); } @@ -46,7 +46,7 @@ TEST_F(XXHash_32_Test, TestDouble) // Expected results were generated with the reference implementation: // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h auto expected = - cudf::test::fixed_width_column_wrapper({635906976u, 1479683640u, 1813668619u}); + cudf::test::fixed_width_column_wrapper({2276435783u, 3120212431u, 3454197470u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); } @@ -61,7 +61,7 @@ TEST_F(XXHash_32_Test, StringType) // Expected results were generated with the reference implementation: // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h auto expected = - cudf::test::fixed_width_column_wrapper({2975112264u, 4267142293u, 4063988593u}); + cudf::test::fixed_width_column_wrapper({320624298u, 1612654309u, 1409499099u}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); } diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 79673303a9a..51de33576c0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1511,21 +1511,21 @@ def test_dataframe_hash_values_xxhash32(): gdf["b"] = -gdf["a"] out_a = gdf["a"].hash_values(method="xxhash32", seed=0) expected_a = cudf.Series( - [2095779532, 667448960, 1266115603, 3401014672, 2654435768], + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], dtype=np.uint32, ) assert_eq(out_a, expected_a) out_b = gdf["b"].hash_values(method="xxhash32", seed=42) expected_b = cudf.Series( - [3730825784, 620821108, 3185936566, 3305307769, 2654438504], + [1076387279, 2261349915, 531498073, 650869264, 4294967295], dtype=np.uint32, ) assert_eq(out_b, expected_b) out_df = gdf.hash_values(method="xxhash32", seed=0) expected_df = cudf.Series( - [2852293555, 2790935110, 2304542211, 57154089, 3449077662], + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], dtype=np.uint32, ) assert_eq(out_df, expected_df) diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 94496fe7a0b..87d21618a75 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -49,12 +49,6 @@ def libcudf_mmh3_x86_32(binary): return hash_combine_32(seed, hashval) -def libcudf_xxhash_32(binary): - seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - hashval = xxhash.xxh32(binary, seed).intdigest() - return hash_combine_32(seed, hashval) - - @pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()]) def scalar_type(request): return request.param @@ -109,7 +103,9 @@ def py_hasher(val): def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): - return libcudf_xxhash_32(scalar_to_binary(val)) + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() expect = pa.array( [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],