Skip to content

Commit

Permalink
Fix xxhash32 implementation to avoid hash_combine steps.
Browse files Browse the repository at this point in the history
  • Loading branch information
bdice committed Dec 17, 2024
1 parent 1508aad commit 1010775
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 20 deletions.
81 changes: 74 additions & 7 deletions cpp/src/hash/xxhash_32.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -13,23 +13,89 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/xxhash_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/std/limits>
#include <thrust/tabulate.h>

namespace cudf::hashing {
namespace cudf {
namespace hashing {
namespace detail {

namespace {

using hash_value_type = uint32_t;

/**
 * @brief Row-hashing functor: folds the XXHash_32 of every column element in a row into a
 * single hash value.
 *
 * The seed supplied at construction is the initial running value. Each column's element is
 * hashed with the running value from the preceding columns passed in as its seed, and the
 * result becomes the new running value.
 *
 * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
 */
template <typename Nullate>
class device_row_hasher {
 public:
  /**
   * @brief Hashes a single column element; invoked through cudf::type_dispatcher.
   */
  class element_hasher_adapter {
   public:
    /**
     * @brief Overload for types that have an element accessor.
     *
     * Returns the maximum `hash_value_type` value when null checking is enabled and the
     * element is null; otherwise returns `XXHash_32<T>` (seeded with `seed`) applied to the
     * element.
     */
    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const& col,
                                          size_type const row_index,
                                          Nullate const check_nulls,
                                          hash_value_type const seed) const noexcept
    {
      // `&&` short-circuits, so the null mask is only consulted when null checking is enabled.
      bool const element_is_null = check_nulls && col.is_null(row_index);
      if (!element_is_null) {
        auto const hasher = XXHash_32<T>{seed};
        return hasher(col.element<T>(row_index));
      }
      return cuda::std::numeric_limits<hash_value_type>::max();
    }

    /**
     * @brief Overload for types without an element accessor; these are not supported.
     */
    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
    __device__ hash_value_type operator()(column_device_view const&,
                                          size_type const,
                                          Nullate const,
                                          hash_value_type const) const noexcept
    {
      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
    }
  };

  /**
   * @brief Stores the null-checking policy, the table view, and the initial seed.
   */
  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
    : _check_nulls(nulls), _table(t), _seed(seed)
  {
  }

  /**
   * @brief Computes the hash of the row at `row_index` across all columns of the table.
   */
  __device__ auto operator()(size_type row_index) const noexcept
  {
    // Binary operation for the fold: hash this column's element, seeded by the running hash.
    auto const fold_column =
      [row_index, nulls = _check_nulls] __device__(auto running_hash, auto col) {
        return cudf::type_dispatcher(
          col.type(), element_hasher_adapter{}, col, row_index, nulls, running_hash);
      };
    return cudf::detail::accumulate(_table.begin(), _table.end(), _seed, fold_column);
  }

  Nullate const _check_nulls;
  table_device_view const _table;
  hash_value_type const _seed;
};

} // namespace

std::unique_ptr<column> xxhash_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
Expand All @@ -45,14 +111,14 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

bool const nullable = has_nulls(input);
auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream);
auto const input_view = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

// Compute the hash value for each row
thrust::tabulate(rmm::exec_policy(stream),
output_view.begin<hash_value_type>(),
output_view.end<hash_value_type>(),
row_hasher.device_hasher<XXHash_32>(nullable, seed));
device_row_hasher(nullable, *input_view, seed));

return output;
}
Expand All @@ -68,4 +134,5 @@ std::unique_ptr<column> xxhash_32(table_view const& input,
return detail::xxhash_32(input, seed, stream, mr);
}

} // namespace cudf::hashing
} // namespace hashing
} // namespace cudf
6 changes: 3 additions & 3 deletions cpp/tests/hashing/xxhash_32_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ TEST_F(XXHash_32_Test, TestInteger)
// Expected results were generated with the reference implementation:
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
auto expected =
cudf::test::fixed_width_column_wrapper<uint32_t>({2802733858u, 3816402826u, 3721130582u});
cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
}

Expand All @@ -46,7 +46,7 @@ TEST_F(XXHash_32_Test, TestDouble)
// Expected results were generated with the reference implementation:
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
auto expected =
cudf::test::fixed_width_column_wrapper<uint32_t>({635906976u, 1479683640u, 1813668619u});
cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
}
Expand All @@ -61,7 +61,7 @@ TEST_F(XXHash_32_Test, StringType)
// Expected results were generated with the reference implementation:
// https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
auto expected =
cudf::test::fixed_width_column_wrapper<uint32_t>({2975112264u, 4267142293u, 4063988593u});
cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499099u});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
}
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1511,21 +1511,21 @@ def test_dataframe_hash_values_xxhash32():
gdf["b"] = -gdf["a"]
out_a = gdf["a"].hash_values(method="xxhash32", seed=0)
expected_a = cudf.Series(
[2095779532, 667448960, 1266115603, 3401014672, 2654435768],
[3736311059, 2307980487, 2906647130, 746578903, 4294967295],
dtype=np.uint32,
)
assert_eq(out_a, expected_a)

out_b = gdf["b"].hash_values(method="xxhash32", seed=42)
expected_b = cudf.Series(
[3730825784, 620821108, 3185936566, 3305307769, 2654438504],
[1076387279, 2261349915, 531498073, 650869264, 4294967295],
dtype=np.uint32,
)
assert_eq(out_b, expected_b)

out_df = gdf.hash_values(method="xxhash32", seed=0)
expected_df = cudf.Series(
[2852293555, 2790935110, 2304542211, 57154089, 3449077662],
[1223721700, 2885793241, 1920811472, 1146715602, 4294967295],
dtype=np.uint32,
)
assert_eq(out_df, expected_df)
Expand Down
10 changes: 3 additions & 7 deletions python/pylibcudf/pylibcudf/tests/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@ def libcudf_mmh3_x86_32(binary):
return hash_combine_32(seed, hashval)


def libcudf_xxhash_32(binary):
    """Hash ``binary`` with xxh32 and combine the digest with the seed.

    The libcudf default hash seed is used both as the xxh32 seed and as the
    first argument to ``hash_combine_32``.
    """
    default_seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
    digest = xxhash.xxh32(binary, default_seed).intdigest()
    return hash_combine_32(default_seed, digest)


@pytest.fixture(params=[pa.int64(), pa.float64(), pa.string(), pa.bool_()])
def scalar_type(request):
return request.param
Expand Down Expand Up @@ -109,7 +103,9 @@ def py_hasher(val):

def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl):
def py_hasher(val):
return libcudf_xxhash_32(scalar_to_binary(val))
return xxhash.xxh32(
scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
).intdigest()

expect = pa.array(
[py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
Expand Down

0 comments on commit 1010775

Please sign in to comment.