Skip to content

Commit 1010775

Browse files
committed
Fix xxhash32 implementation to avoid hash_combine steps.
1 parent 1508aad commit 1010775

File tree

4 files changed: 83 additions and 20 deletions.

cpp/src/hash/xxhash_32.cu

Lines changed: 74 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -13,23 +13,89 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16-
1716
#include <cudf/column/column_factories.hpp>
1817
#include <cudf/detail/nvtx/ranges.hpp>
18+
#include <cudf/detail/utilities/algorithm.cuh>
1919
#include <cudf/hashing/detail/hashing.hpp>
2020
#include <cudf/hashing/detail/xxhash_32.cuh>
21-
#include <cudf/table/experimental/row_operators.cuh>
2221
#include <cudf/table/table_device_view.cuh>
2322
#include <cudf/utilities/memory_resource.hpp>
23+
#include <cudf/utilities/span.hpp>
2424

2525
#include <rmm/cuda_stream_view.hpp>
2626
#include <rmm/exec_policy.hpp>
2727

28+
#include <cuda/std/limits>
2829
#include <thrust/tabulate.h>
2930

30-
namespace cudf::hashing {
31+
namespace cudf {
32+
namespace hashing {
3133
namespace detail {
3234

35+
namespace {
36+
37+
using hash_value_type = uint32_t;
38+
39+
/**
 * @brief Functor that produces the XXHash_32 value of one table row.
 *
 * Called with a row index, it walks the table's columns through
 * `cudf::detail::accumulate`, starting from the caller-supplied seed. Each column's
 * element is hashed with the value accumulated so far passed in as that hasher's seed.
 *
 * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
 */
template <typename Nullate>
class device_row_hasher {
 public:
  /**
   * @brief Per-column visitor invoked through `cudf::type_dispatcher`.
   *
   * Hashes the element at a given row of a single column.
   */
  class element_hasher_adapter {
   public:
    /**
     * @brief Overload for types that have no element accessor; must never be reached.
     */
    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const&,
                                          size_type const,
                                          Nullate const,
                                          hash_value_type const) const noexcept
    {
      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
    }

    /**
     * @brief Hashes `col[row_index]` for types that have an element accessor.
     *
     * @param col Column holding the element
     * @param row_index Row of the element to hash
     * @param check_nulls Whether the null state of the element must be inspected
     * @param seed Seed handed to the `XXHash_32` functor
     * @return The element's hash, or the largest `hash_value_type` value for a null
     */
    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const& col,
                                          size_type const row_index,
                                          Nullate const check_nulls,
                                          hash_value_type const seed) const noexcept
    {
      bool const is_null_element = check_nulls && col.is_null(row_index);
      if (not is_null_element) {
        XXHash_32<T> const element_hash{seed};
        return element_hash(col.element<T>(row_index));
      }
      // A null yields a fixed sentinel; the incoming seed does not influence it.
      return cuda::std::numeric_limits<hash_value_type>::max();
    }
  };

  /**
   * @brief Stores the null-check policy, the table view and the initial seed.
   */
  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
    : _check_nulls(nulls), _table(t), _seed(seed)
  {
  }

  /**
   * @brief Computes the hash of the row at `row_index`.
   *
   * @param row_index Index of the row to hash
   * @return The value accumulated after every column has been visited
   */
  __device__ auto operator()(size_type row_index) const noexcept
  {
    // One accumulation step: dispatch on the column's type and hash its element,
    // seeding the hasher with the value accumulated so far.
    auto const hash_next_column =
      [row_index, nulls = _check_nulls] __device__(auto running_hash, auto col) {
        return cudf::type_dispatcher(
          col.type(), element_hasher_adapter{}, col, row_index, nulls, running_hash);
      };
    return cudf::detail::accumulate(_table.begin(), _table.end(), _seed, hash_next_column);
  }

  Nullate const _check_nulls;       ///< Null-check policy forwarded to every element hasher
  table_device_view const _table;   ///< Table whose rows are hashed
  hash_value_type const _seed;      ///< Initial value of the accumulation
};
96+
97+
} // namespace
98+
3399
std::unique_ptr<column> xxhash_32(table_view const& input,
34100
uint32_t seed,
35101
rmm::cuda_stream_view stream,
@@ -45,14 +111,14 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
45111
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
46112

47113
bool const nullable = has_nulls(input);
48-
auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream);
114+
auto const input_view = table_device_view::create(input, stream);
49115
auto output_view = output->mutable_view();
50116

51117
// Compute the hash value for each row
52118
thrust::tabulate(rmm::exec_policy(stream),
53119
output_view.begin<hash_value_type>(),
54120
output_view.end<hash_value_type>(),
55-
row_hasher.device_hasher<XXHash_32>(nullable, seed));
121+
device_row_hasher(nullable, *input_view, seed));
56122

57123
return output;
58124
}
@@ -68,4 +134,5 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
68134
return detail::xxhash_32(input, seed, stream, mr);
69135
}
70136

71-
} // namespace cudf::hashing
137+
} // namespace hashing
138+
} // namespace cudf

cpp/tests/hashing/xxhash_32_test.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ TEST_F(XXHash_32_Test, TestInteger)
3232
// Expected results were generated with the reference implementation:
3333
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
3434
auto expected =
35-
cudf::test::fixed_width_column_wrapper<uint32_t>({2802733858u, 3816402826u, 3721130582u});
35+
cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
3636
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
3737
}
3838

@@ -46,7 +46,7 @@ TEST_F(XXHash_32_Test, TestDouble)
4646
// Expected results were generated with the reference implementation:
4747
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
4848
auto expected =
49-
cudf::test::fixed_width_column_wrapper<uint32_t>({635906976u, 1479683640u, 1813668619u});
49+
cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
5050

5151
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
5252
}
@@ -61,7 +61,7 @@ TEST_F(XXHash_32_Test, StringType)
6161
// Expected results were generated with the reference implementation:
6262
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
6363
auto expected =
64-
cudf::test::fixed_width_column_wrapper<uint32_t>({2975112264u, 4267142293u, 4063988593u});
64+
cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499099u});
6565

6666
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
6767
}

python/cudf/cudf/tests/test_dataframe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1511,21 +1511,21 @@ def test_dataframe_hash_values_xxhash32():
15111511
gdf["b"] = -gdf["a"]
15121512
out_a = gdf["a"].hash_values(method="xxhash32", seed=0)
15131513
expected_a = cudf.Series(
1514-
[2095779532, 667448960, 1266115603, 3401014672, 2654435768],
1514+
[3736311059, 2307980487, 2906647130, 746578903, 4294967295],
15151515
dtype=np.uint32,
15161516
)
15171517
assert_eq(out_a, expected_a)
15181518

15191519
out_b = gdf["b"].hash_values(method="xxhash32", seed=42)
15201520
expected_b = cudf.Series(
1521-
[3730825784, 620821108, 3185936566, 3305307769, 2654438504],
1521+
[1076387279, 2261349915, 531498073, 650869264, 4294967295],
15221522
dtype=np.uint32,
15231523
)
15241524
assert_eq(out_b, expected_b)
15251525

15261526
out_df = gdf.hash_values(method="xxhash32", seed=0)
15271527
expected_df = cudf.Series(
1528-
[2852293555, 2790935110, 2304542211, 57154089, 3449077662],
1528+
[1223721700, 2885793241, 1920811472, 1146715602, 4294967295],
15291529
dtype=np.uint32,
15301530
)
15311531
assert_eq(out_df, expected_df)

python/pylibcudf/pylibcudf/tests/test_hashing.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,6 @@ def libcudf_mmh3_x86_32(binary):
4949
return hash_combine_32(seed, hashval)
5050

5151

52-
def libcudf_xxhash_32(binary):
53-
seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
54-
hashval = xxhash.xxh32(binary, seed).intdigest()
55-
return hash_combine_32(seed, hashval)
56-
57-
5852
@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
5953
def scalar_type(request):
6054
return request.param
@@ -109,7 +103,9 @@ def py_hasher(val):
109103

110104
def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl):
111105
def py_hasher(val):
112-
return libcudf_xxhash_32(scalar_to_binary(val))
106+
return xxhash.xxh32(
107+
scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
108+
).intdigest()
113109

114110
expect = pa.array(
115111
[py_hasher(val) for val in pa_scalar_input_column.to_pylist()],

0 commit comments

Comments
 (0)