From e4d51b2ae5368c926ef4fd689bb7c71e0241b80e Mon Sep 17 00:00:00 2001 From: Naim <110031745+naimnv@users.noreply.github.com> Date: Thu, 20 Jun 2024 00:33:41 +0200 Subject: [PATCH] compute cosine similarity for vertex pairs (#4482) compute cosine similarity for vertex pairs Authors: - Naim (https://github.com/naimnv) Approvers: - Seunghwa Kang (https://github.com/seunghwak) URL: https://github.com/rapidsai/cugraph/pull/4482 --- cpp/CMakeLists.txt | 2 + cpp/include/cugraph/algorithms.hpp | 87 +++++++++++ cpp/src/link_prediction/cosine_mg.cu | 122 +++++++++++++++ cpp/src/link_prediction/cosine_sg.cu | 122 +++++++++++++++ .../cosine_similarity_impl.cuh | 82 ++++++++++ cpp/src/link_prediction/jaccard_impl.cuh | 8 +- cpp/src/link_prediction/overlap_impl.cuh | 8 +- cpp/src/link_prediction/similarity_impl.cuh | 122 ++++++++++----- cpp/src/link_prediction/sorensen_impl.cuh | 8 +- .../link_prediction/mg_similarity_test.cpp | 28 ++++ .../mg_weighted_similarity_test.cpp | 28 ++++ .../link_prediction/similarity_compare.cpp | 141 +++++++++++++----- .../link_prediction/similarity_compare.hpp | 46 ++++++ cpp/tests/link_prediction/similarity_test.cu | 24 +++ .../weighted_similarity_test.cpp | 24 +++ 15 files changed, 764 insertions(+), 88 deletions(-) create mode 100644 cpp/src/link_prediction/cosine_mg.cu create mode 100644 cpp/src/link_prediction/cosine_sg.cu create mode 100644 cpp/src/link_prediction/cosine_similarity_impl.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7dca3d983a5..93f43b9ec56 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -193,9 +193,11 @@ set(CUGRAPH_SOURCES src/link_prediction/jaccard_sg.cu src/link_prediction/sorensen_sg.cu src/link_prediction/overlap_sg.cu + src/link_prediction/cosine_sg.cu src/link_prediction/jaccard_mg.cu src/link_prediction/sorensen_mg.cu src/link_prediction/overlap_mg.cu + src/link_prediction/cosine_mg.cu src/layout/legacy/force_atlas2.cu src/converters/legacy/COOtoCSR.cu 
src/community/legacy/spectral_clustering.cu diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index cc42399f091..bce484ece20 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -2082,6 +2082,37 @@ rmm::device_uvector jaccard_coefficients( std::tuple, raft::device_span> vertex_pairs, bool do_expensive_check = false); +/** + * @brief Compute Cosine similarity coefficient + * + * Similarity is computed for every pair of vertices specified. Note that + * similarity algorithms expect a symmetric graph. + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param edge_weight_view Optional view object holding edge weights for @p graph_view. If @p + * edge_weight_view.has_value() == true, use the weights associated with the graph. If false, assume + * a weight of 1 for all edges. + * @param vertex_pairs tuple of device spans defining the vertex pairs to compute similarity for + * In a multi-gpu context each vertex pair should be local to this GPU. + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
+ * @return similarity coefficient for the corresponding @p vertex_pairs + */ +template +rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check = false); + /** * @brief Compute Sorensen similarity coefficient * @@ -2202,6 +2233,62 @@ std:: std::optional topk, bool do_expensive_check = false); +/** + * @brief Compute Cosine all pairs similarity coefficient + * + * Similarity is computed for all pairs of vertices. Note that in a sparse + * graph, many of the vertex pairs will have a score of zero. We actually + * compute similarity only for vertices that are two hop neighbors within + * the graph, since vertices that are not two hop neighbors will have + * a score of 0. + * + * If @p vertices is specified we will compute similarity on two hop + * neighbors of the @p vertices. If @p vertices is not specified it will + * compute similarity on all two hop neighbors in the graph. + * + * If @p topk is specified only the top @p topk scoring vertex pairs + * will be returned, if not specified then scores for all computed vertex pairs + * will be returned. + * + * Note the list of two hop neighbors in the entire graph might be a large + * number of vertex pairs. If the graph is dense enough it could be as large + * as the number of vertices squared, which might run out of memory. + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Graph view object. + * @param edge_weight_view Optional view object holding edge weights for @p graph_view. If @p + * edge_weight_view.has_value() == true, use the weights associated with the graph. If false, assume + * a weight of 1 for all edges. + * @param vertices optional device span defining the seed vertices. In a multi-gpu context the + * vertices should be local to this GPU. + * @param topk optional specification of how many of the top scoring vertex pairs should be + * returned + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + * @return tuple containing three device vectors (v1, v2, score) of the same length. Corresponding + * elements in the vectors identify a result, v1 identifying a vertex in the graph, v2 identifying + * one of v1's two hop neighbors, and the score identifying the similarity score between v1 and v2. + * If @p topk was specified then the vectors will be no longer than @p topk elements. In a + * multi-gpu context, if @p topk is specified all results will return on GPU rank 0, otherwise they + * will be returned on the local GPU for vertex v1. + */ +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check = false); + /** * @brief Compute Sorensen similarity coefficient * diff --git a/cpp/src/link_prediction/cosine_mg.cu b/cpp/src/link_prediction/cosine_mg.cu new file mode 100644 index 00000000000..71b8e7030df --- /dev/null +++ b/cpp/src/link_prediction/cosine_mg.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "link_prediction/cosine_similarity_impl.cuh" + +namespace cugraph { + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> 
edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/link_prediction/cosine_sg.cu b/cpp/src/link_prediction/cosine_sg.cu new file mode 100644 index 
00000000000..bf0cb79d802 --- /dev/null +++ b/cpp/src/link_prediction/cosine_sg.cu @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "link_prediction/cosine_similarity_impl.cuh" + +namespace cugraph { + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool 
do_expensive_check); + +template rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check); + +template std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool 
do_expensive_check); + +} // namespace cugraph diff --git a/cpp/src/link_prediction/cosine_similarity_impl.cuh b/cpp/src/link_prediction/cosine_similarity_impl.cuh new file mode 100644 index 00000000000..831f202c5e9 --- /dev/null +++ b/cpp/src/link_prediction/cosine_similarity_impl.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "link_prediction/similarity_impl.cuh" + +#include + +#include + +namespace cugraph { +namespace detail { + +template +struct cosine_functor_t { + // Cosine score = sum_of_product_of_a_and_b / (norm_a * norm_b). When the product of the norms is + // zero (e.g. both norms were accumulated over an empty neighbor intersection) the ratio would be + // 0/0 = NaN, so a similarity of 0 is returned instead. reserved_param is not used here. + weight_t __device__ compute_score(weight_t norm_a, + weight_t norm_b, + weight_t sum_of_product_of_a_and_b, + weight_t reserved_param) const + { + weight_t const denominator = norm_a * norm_b; + return denominator == weight_t{0} ? weight_t{0} : sum_of_product_of_a_and_b / denominator; + } +}; + +} // namespace detail + +template +rmm::device_uvector cosine_similarity_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); + + return detail::similarity(handle, + graph_view, + edge_weight_view, + vertex_pairs, + detail::cosine_functor_t{}, + detail::coefficient_t::COSINE, + do_expensive_check); +} + +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + cosine_similarity_all_pairs_coefficients( + raft::handle_t const& handle, + graph_view_t const& graph_view, 
+ std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); + + return detail::all_pairs_similarity(handle, + graph_view, + edge_weight_view, + vertices, + topk, + detail::cosine_functor_t{}, + detail::coefficient_t::COSINE, + do_expensive_check); +} + +} // namespace cugraph diff --git a/cpp/src/link_prediction/jaccard_impl.cuh b/cpp/src/link_prediction/jaccard_impl.cuh index d8cfcf19b4f..f889f143c43 100644 --- a/cpp/src/link_prediction/jaccard_impl.cuh +++ b/cpp/src/link_prediction/jaccard_impl.cuh @@ -24,8 +24,8 @@ namespace cugraph { namespace detail { +template struct jaccard_functor_t { - template weight_t __device__ compute_score(weight_t weight_a, weight_t weight_b, weight_t weight_a_intersect_b, @@ -53,7 +53,8 @@ rmm::device_uvector jaccard_coefficients( graph_view, edge_weight_view, vertex_pairs, - detail::jaccard_functor_t{}, + detail::jaccard_functor_t{}, + detail::coefficient_t::JACCARD, do_expensive_check); } @@ -75,7 +76,8 @@ std:: edge_weight_view, vertices, topk, - detail::jaccard_functor_t{}, + detail::jaccard_functor_t{}, + detail::coefficient_t::JACCARD, do_expensive_check); } diff --git a/cpp/src/link_prediction/overlap_impl.cuh b/cpp/src/link_prediction/overlap_impl.cuh index 38e654453ff..f3a3319309e 100644 --- a/cpp/src/link_prediction/overlap_impl.cuh +++ b/cpp/src/link_prediction/overlap_impl.cuh @@ -24,8 +24,8 @@ namespace cugraph { namespace detail { +template struct overlap_functor_t { - template weight_t __device__ compute_score(weight_t weight_a, weight_t weight_b, weight_t weight_a_intersect_b, @@ -53,7 +53,8 @@ rmm::device_uvector overlap_coefficients( graph_view, edge_weight_view, vertex_pairs, - detail::overlap_functor_t{}, + detail::overlap_functor_t{}, + detail::coefficient_t::OVERLAP, do_expensive_check); } @@ -75,7 +76,8 @@ std:: edge_weight_view, vertices, topk, - detail::overlap_functor_t{}, + 
detail::overlap_functor_t{}, + detail::coefficient_t::OVERLAP, do_expensive_check); } diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 00f7bc6cbe7..6a8882dcfab 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -36,6 +36,8 @@ namespace cugraph { namespace detail { +enum class coefficient_t { JACCARD, SORENSEN, OVERLAP, COSINE }; + template rmm::device_uvector similarity( raft::handle_t const& handle, @@ -43,6 +45,7 @@ rmm::device_uvector similarity( std::optional> edge_weight_view, std::tuple, raft::device_span> vertex_pairs, functor_t functor, + coefficient_t coeff, bool do_expensive_check = false) { using GraphViewType = graph_view_t; @@ -88,58 +91,86 @@ rmm::device_uvector similarity( vertex_pairs_begin, vertex_pairs_begin + num_vertex_pairs, weighted_out_degrees.begin(), - [functor] __device__(auto a, - auto b, - auto weight_a, - auto weight_b, - auto intersection, - auto intersected_properties_a, - auto intersected_properties_b) { - weight_t sum_of_min_weight_a_intersect_b = weight_t{0}; - weight_t sum_of_max_weight_a_intersect_b = weight_t{0}; - weight_t sum_of_intersected_a = weight_t{0}; - weight_t sum_of_intersected_b = weight_t{0}; - - auto pair_first = thrust::make_zip_iterator(intersected_properties_a.data(), - intersected_properties_b.data()); - thrust::tie(sum_of_min_weight_a_intersect_b, - sum_of_max_weight_a_intersect_b, - sum_of_intersected_a, - sum_of_intersected_b) = - thrust::transform_reduce( + [functor, coeff] __device__(auto a, + auto b, + auto weight_a, + auto weight_b, + auto intersection, + auto intersected_properties_a, + auto intersected_properties_b) { + if (coeff == coefficient_t::COSINE) { + weight_t norm_a = weight_t{0}; + weight_t norm_b = weight_t{0}; + weight_t sum_of_product_of_a_and_b = weight_t{0}; + + auto pair_first = thrust::make_zip_iterator(intersected_properties_a.data(), + 
intersected_properties_b.data()); + thrust::tie(norm_a, norm_b, sum_of_product_of_a_and_b) = thrust::transform_reduce( thrust::seq, pair_first, pair_first + intersected_properties_a.size(), [] __device__(auto property_pair) { auto prop_a = thrust::get<0>(property_pair); auto prop_b = thrust::get<1>(property_pair); - return thrust::make_tuple(min(prop_a, prop_b), max(prop_a, prop_b), prop_a, prop_b); + return thrust::make_tuple(prop_a * prop_a, prop_b * prop_b, prop_a * prop_b); }, - thrust::make_tuple(weight_t{0}, weight_t{0}, weight_t{0}, weight_t{0}), + thrust::make_tuple(weight_t{0}, weight_t{0}, weight_t{0}), [] __device__(auto lhs, auto rhs) { return thrust::make_tuple(thrust::get<0>(lhs) + thrust::get<0>(rhs), thrust::get<1>(lhs) + thrust::get<1>(rhs), - thrust::get<2>(lhs) + thrust::get<2>(rhs), - thrust::get<3>(lhs) + thrust::get<3>(rhs)); + thrust::get<2>(lhs) + thrust::get<2>(rhs)); }); - weight_t sum_of_uniq_a = weight_a - sum_of_intersected_a; - weight_t sum_of_uniq_b = weight_b - sum_of_intersected_b; - - sum_of_max_weight_a_intersect_b += sum_of_uniq_a + sum_of_uniq_b; - - return functor.compute_score(static_cast(weight_a), - static_cast(weight_b), - static_cast(sum_of_min_weight_a_intersect_b), - static_cast(sum_of_max_weight_a_intersect_b)); + return functor.compute_score(static_cast(sqrt(norm_a)), + static_cast(sqrt(norm_b)), + static_cast(sum_of_product_of_a_and_b), + weight_t{1.0}); + + } else { + weight_t sum_of_min_weight_a_intersect_b = weight_t{0}; + weight_t sum_of_max_weight_a_intersect_b = weight_t{0}; + weight_t sum_of_intersected_a = weight_t{0}; + weight_t sum_of_intersected_b = weight_t{0}; + + auto pair_first = thrust::make_zip_iterator(intersected_properties_a.data(), + intersected_properties_b.data()); + thrust::tie(sum_of_min_weight_a_intersect_b, + sum_of_max_weight_a_intersect_b, + sum_of_intersected_a, + sum_of_intersected_b) = + thrust::transform_reduce( + thrust::seq, + pair_first, + pair_first + 
intersected_properties_a.size(), + [] __device__(auto property_pair) { + auto prop_a = thrust::get<0>(property_pair); + auto prop_b = thrust::get<1>(property_pair); + return thrust::make_tuple(min(prop_a, prop_b), max(prop_a, prop_b), prop_a, prop_b); + }, + thrust::make_tuple(weight_t{0}, weight_t{0}, weight_t{0}, weight_t{0}), + [] __device__(auto lhs, auto rhs) { + return thrust::make_tuple(thrust::get<0>(lhs) + thrust::get<0>(rhs), + thrust::get<1>(lhs) + thrust::get<1>(rhs), + thrust::get<2>(lhs) + thrust::get<2>(rhs), + thrust::get<3>(lhs) + thrust::get<3>(rhs)); + }); + + weight_t sum_of_uniq_a = weight_a - sum_of_intersected_a; + weight_t sum_of_uniq_b = weight_b - sum_of_intersected_b; + + sum_of_max_weight_a_intersect_b += sum_of_uniq_a + sum_of_uniq_b; + + return functor.compute_score(static_cast(weight_a), + static_cast(weight_b), + static_cast(sum_of_min_weight_a_intersect_b), + static_cast(sum_of_max_weight_a_intersect_b)); + } }, similarity_score.begin(), do_expensive_check); - return similarity_score; } else { rmm::device_uvector similarity_score(num_vertex_pairs, handle.get_stream()); - auto out_degrees = graph_view.compute_out_degrees(handle); per_v_pair_transform_dst_nbr_intersection( @@ -149,17 +180,23 @@ rmm::device_uvector similarity( vertex_pairs_begin, vertex_pairs_begin + num_vertex_pairs, out_degrees.begin(), - [functor] __device__( + [functor, coeff] __device__( auto v1, auto v2, auto v1_degree, auto v2_degree, auto intersection, auto, auto) { - return functor.compute_score( - static_cast(v1_degree), - static_cast(v2_degree), - static_cast(intersection.size()), - static_cast(v1_degree + v2_degree - intersection.size())); + if (coeff == coefficient_t::COSINE) { + return functor.compute_score(weight_t{1}, + weight_t{1}, + intersection.size() >= 1 ? 
weight_t{1} : weight_t{0}, + weight_t{1}); + } else { + return functor.compute_score( + static_cast(v1_degree), + static_cast(v2_degree), + static_cast(intersection.size()), + static_cast(v1_degree + v2_degree - intersection.size())); + } }, similarity_score.begin(), do_expensive_check); - return similarity_score; } } @@ -174,6 +211,7 @@ all_pairs_similarity(raft::handle_t const& handle, std::optional> vertices, std::optional topk, functor_t functor, + coefficient_t coeff, bool do_expensive_check = false) { using GraphViewType = graph_view_t; @@ -391,6 +429,7 @@ all_pairs_similarity(raft::handle_t const& handle, std::make_tuple(raft::device_span{v1.data(), v1.size()}, raft::device_span{v2.data(), v2.size()}), functor, + coeff, do_expensive_check); // Add a remove_if to remove items that are less than the last topk element @@ -581,6 +620,7 @@ all_pairs_similarity(raft::handle_t const& handle, std::make_tuple(raft::device_span{v1.data(), v1.size()}, raft::device_span{v2.data(), v2.size()}), functor, + coeff, do_expensive_check); return std::make_tuple(std::move(v1), std::move(v2), std::move(score)); diff --git a/cpp/src/link_prediction/sorensen_impl.cuh b/cpp/src/link_prediction/sorensen_impl.cuh index af99732a45e..064e59d9f3d 100644 --- a/cpp/src/link_prediction/sorensen_impl.cuh +++ b/cpp/src/link_prediction/sorensen_impl.cuh @@ -24,8 +24,8 @@ namespace cugraph { namespace detail { +template struct sorensen_functor_t { - template weight_t __device__ compute_score(weight_t weight_a, weight_t weight_b, weight_t weight_a_intersect_b, @@ -53,7 +53,8 @@ rmm::device_uvector sorensen_coefficients( graph_view, edge_weight_view, vertex_pairs, - detail::sorensen_functor_t{}, + detail::sorensen_functor_t{}, + detail::coefficient_t::SORENSEN, do_expensive_check); } @@ -75,7 +76,8 @@ std:: edge_weight_view, vertices, topk, - detail::sorensen_functor_t{}, + detail::sorensen_functor_t{}, + detail::coefficient_t::SORENSEN, do_expensive_check); } diff --git 
a/cpp/tests/link_prediction/mg_similarity_test.cpp b/cpp/tests/link_prediction/mg_similarity_test.cpp index 3a71f8ee221..8f674e6a6de 100644 --- a/cpp/tests/link_prediction/mg_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_similarity_test.cpp @@ -252,6 +252,34 @@ TEST_P(Tests_MGSimilarity_Rmat, CheckInt64Int64FloatOverlap) override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_overlap_t{}); } +TEST_P(Tests_MGSimilarity_File, CheckInt32Int32FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGSimilarity_Rmat, CheckInt32Int32FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGSimilarity_Rmat, CheckInt32Int64FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGSimilarity_Rmat, CheckInt64Int64FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGSimilarity_File, diff --git a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp index 3d891484818..192caa5227e 100644 --- a/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp +++ b/cpp/tests/link_prediction/mg_weighted_similarity_test.cpp @@ -262,6 +262,34 @@ TEST_P(Tests_MGWeightedSimilarity_Rmat, CheckInt64Int64FloatOverlap) override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_overlap_t{}); } +TEST_P(Tests_MGWeightedSimilarity_File, CheckInt32Int32FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + 
override_File_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGWeightedSimilarity_Rmat, CheckInt32Int32FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGWeightedSimilarity_Rmat, CheckInt32Int64FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_MGWeightedSimilarity_Rmat, CheckInt64Int64FloatFloatCosine) +{ + auto param = GetParam(); + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGWeightedSimilarity_File, diff --git a/cpp/tests/link_prediction/similarity_compare.cpp b/cpp/tests/link_prediction/similarity_compare.cpp index b5da85cd36f..3c9459e988a 100644 --- a/cpp/tests/link_prediction/similarity_compare.cpp +++ b/cpp/tests/link_prediction/similarity_compare.cpp @@ -161,33 +161,56 @@ void weighted_similarity_compare( ++intersected_weight_idx; }); - weight_t sum_intersected_weights_v1 = - std::accumulate(intersected_weights_v1.begin(), intersected_weights_v1.end(), 0.0); - weight_t sum_intersected_weights_v2 = - std::accumulate(intersected_weights_v2.begin(), intersected_weights_v2.end(), 0.0); - - weight_t sum_of_uniq_weights_v1 = weighted_vertex_degrees[v1] - sum_intersected_weights_v1; - weight_t sum_of_uniq_weights_v2 = weighted_vertex_degrees[v2] - sum_intersected_weights_v2; - - weight_t min_weight_v1_intersect_v2 = weight_t{0}; - weight_t max_weight_v1_intersect_v2 = weight_t{0}; - - std::for_each( - thrust::make_zip_iterator(intersected_weights_v1.begin(), intersected_weights_v2.begin()), - thrust::make_zip_iterator(intersected_weights_v1.end(), intersected_weights_v2.end()), - [&min_weight_v1_intersect_v2, - 
&max_weight_v1_intersect_v2](thrust::tuple w1_w2) { - min_weight_v1_intersect_v2 += std::min(thrust::get<0>(w1_w2), thrust::get<1>(w1_w2)); - max_weight_v1_intersect_v2 += std::max(thrust::get<0>(w1_w2), thrust::get<1>(w1_w2)); - }); - - max_weight_v1_intersect_v2 += (sum_of_uniq_weights_v1 + sum_of_uniq_weights_v2); - auto expected_score = test_functor.compute_score(weighted_vertex_degrees[v1], - weighted_vertex_degrees[v2], - min_weight_v1_intersect_v2, - max_weight_v1_intersect_v2); - EXPECT_TRUE(compare_functor(score, expected_score)) - << "score mismatch, got " << score << ", expected " << expected_score; + if (test_functor.is_jaccard_or_sorensen_or_overlap) { + weight_t sum_intersected_weights_v1 = + std::accumulate(intersected_weights_v1.begin(), intersected_weights_v1.end(), 0.0); + weight_t sum_intersected_weights_v2 = + std::accumulate(intersected_weights_v2.begin(), intersected_weights_v2.end(), 0.0); + + weight_t sum_of_uniq_weights_v1 = weighted_vertex_degrees[v1] - sum_intersected_weights_v1; + weight_t sum_of_uniq_weights_v2 = weighted_vertex_degrees[v2] - sum_intersected_weights_v2; + + weight_t min_weight_v1_intersect_v2 = weight_t{0}; + weight_t max_weight_v1_intersect_v2 = weight_t{0}; + + std::for_each( + thrust::make_zip_iterator(intersected_weights_v1.begin(), intersected_weights_v2.begin()), + thrust::make_zip_iterator(intersected_weights_v1.end(), intersected_weights_v2.end()), + [&min_weight_v1_intersect_v2, + &max_weight_v1_intersect_v2](thrust::tuple w1_w2) { + min_weight_v1_intersect_v2 += std::min(thrust::get<0>(w1_w2), thrust::get<1>(w1_w2)); + max_weight_v1_intersect_v2 += std::max(thrust::get<0>(w1_w2), thrust::get<1>(w1_w2)); + }); + + max_weight_v1_intersect_v2 += (sum_of_uniq_weights_v1 + sum_of_uniq_weights_v2); + auto expected_score = test_functor.compute_score(weighted_vertex_degrees[v1], + weighted_vertex_degrees[v2], + min_weight_v1_intersect_v2, + max_weight_v1_intersect_v2); + EXPECT_TRUE(compare_functor(score, 
expected_score)) + << "score mismatch, got " << score << ", expected " << expected_score; + } else { + weight_t norm_v1 = weight_t{0}; + weight_t norm_v2 = weight_t{0}; + weight_t v1_dot_v2 = weight_t{0}; + + std::for_each( + thrust::make_zip_iterator(intersected_weights_v1.begin(), intersected_weights_v2.begin()), + thrust::make_zip_iterator(intersected_weights_v1.end(), intersected_weights_v2.end()), + [&norm_v1, &norm_v2, &v1_dot_v2](thrust::tuple w1_w2) { + auto x = thrust::get<0>(w1_w2); + auto y = thrust::get<1>(w1_w2); + + norm_v1 += x * x; + norm_v2 += y * y; + v1_dot_v2 += x * y; + }); + + auto expected_score = test_functor.compute_score( + std::sqrt(norm_v1), std::sqrt(norm_v2), v1_dot_v2, weight_t{1.0}); + EXPECT_TRUE(compare_functor(score, expected_score)) + << "score mismatch, got " << score << ", expected " << expected_score; + } }); } @@ -249,15 +272,27 @@ void similarity_compare( graph_dst.begin() + v2_end, intersection.begin()); - auto expected_score = test_functor.compute_score( - static_cast(vertex_degrees[v1]), - static_cast(vertex_degrees[v2]), - static_cast(std::distance(intersection.begin(), intersection_end)), - static_cast(vertex_degrees[v1] + vertex_degrees[v2] - - std::distance(intersection.begin(), intersection_end))); - - EXPECT_TRUE(compare_functor(score, expected_score)) - << "score mismatch, got " << score << ", expected " << expected_score; + if (test_functor.is_jaccard_or_sorensen_or_overlap) { + auto expected_score = test_functor.compute_score( + static_cast(vertex_degrees[v1]), + static_cast(vertex_degrees[v2]), + static_cast(std::distance(intersection.begin(), intersection_end)), + static_cast(vertex_degrees[v1] + vertex_degrees[v2] - + std::distance(intersection.begin(), intersection_end))); + + EXPECT_TRUE(compare_functor(score, expected_score)) + << "score mismatch, got " << score << ", expected " << expected_score; + + } else { + auto expected_score = + test_functor.compute_score(weight_t{1}, + weight_t{1}, + 
intersection.size() >= 1 ? weight_t{1} : weight_t{0}, + weight_t{1}); + + EXPECT_TRUE(compare_functor(score, expected_score)) + << "score mismatch, got " << score << ", expected " << expected_score; + } }); } @@ -269,6 +304,14 @@ template void similarity_compare( std::vector& result_score, test_jaccard_t const& test_functor); +template void similarity_compare( + int32_t num_vertices, + std::tuple&, std::vector&, std::optional>&> + edge_list, + std::tuple&, std::vector&> vertex_pairs, + std::vector& result_score, + test_cosine_t const& test_functor); + template void similarity_compare( int32_t num_vertices, std::tuple&, std::vector&, std::optional>&> @@ -293,6 +336,14 @@ template void similarity_compare( std::vector& result_score, test_jaccard_t const& test_functor); +template void similarity_compare( + int64_t num_vertices, + std::tuple&, std::vector&, std::optional>&> + edge_list, + std::tuple&, std::vector&> vertex_pairs, + std::vector& result_score, + test_cosine_t const& test_functor); + template void similarity_compare( int64_t num_vertices, std::tuple&, std::vector&, std::optional>&> @@ -309,8 +360,6 @@ template void similarity_compare( std::vector& result_score, test_overlap_t const& test_functor); -//// - template void weighted_similarity_compare( int32_t num_vertices, std::tuple&, std::vector&, std::optional>&> @@ -319,6 +368,14 @@ template void weighted_similarity_compare( std::vector& result_score, test_jaccard_t const& test_functor); +template void weighted_similarity_compare( + int32_t num_vertices, + std::tuple&, std::vector&, std::optional>&> + edge_list, + std::tuple&, std::vector&> vertex_pairs, + std::vector& result_score, + test_cosine_t const& test_functor); + template void weighted_similarity_compare( int32_t num_vertices, std::tuple&, std::vector&, std::optional>&> @@ -343,6 +400,14 @@ template void weighted_similarity_compare( std::vector& result_score, test_jaccard_t const& test_functor); +template void weighted_similarity_compare( + int64_t 
num_vertices, + std::tuple&, std::vector&, std::optional>&> + edge_list, + std::tuple&, std::vector&> vertex_pairs, + std::vector& result_score, + test_cosine_t const& test_functor); + template void weighted_similarity_compare( int64_t num_vertices, std::tuple&, std::vector&, std::optional>&> diff --git a/cpp/tests/link_prediction/similarity_compare.hpp b/cpp/tests/link_prediction/similarity_compare.hpp index eed0a82fe7e..d34916c0bae 100644 --- a/cpp/tests/link_prediction/similarity_compare.hpp +++ b/cpp/tests/link_prediction/similarity_compare.hpp @@ -28,6 +28,7 @@ namespace test { struct test_jaccard_t { std::string testname{"Jaccard"}; + bool is_jaccard_or_sorensen_or_overlap{true}; template weight_t compute_score(weight_t weight_a, @@ -69,6 +70,7 @@ struct test_jaccard_t { struct test_sorensen_t { std::string testname{"Sorensen"}; + bool is_jaccard_or_sorensen_or_overlap{true}; template weight_t compute_score(weight_t weight_a, @@ -110,6 +112,7 @@ struct test_sorensen_t { struct test_overlap_t { std::string testname{"Overlap"}; + bool is_jaccard_or_sorensen_or_overlap{true}; template weight_t compute_score(weight_t weight_a, @@ -149,6 +152,49 @@ struct test_overlap_t { } }; +struct test_cosine_t { + std::string testname{"Cosine"}; + bool is_jaccard_or_sorensen_or_overlap{false}; + + template + weight_t compute_score(weight_t norm_a, + weight_t norm_b, + weight_t sum_of_product_of_a_and_b, + weight_t reserved_param) const + { + if (std::abs(static_cast(norm_a * norm_b)) < + double{2} / std::numeric_limits::max()) { + return weight_t{0}; + } else { + return sum_of_product_of_a_and_b / (norm_a * norm_b); + } + } + + template + auto run( + raft::handle_t const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs, + bool use_weights) const + { + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs, true); + } + + template + auto run(raft::handle_t 
const& handle, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + bool use_weights, + std::optional topk) const + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + template void similarity_compare( vertex_t num_vertices, diff --git a/cpp/tests/link_prediction/similarity_test.cu b/cpp/tests/link_prediction/similarity_test.cu index feeea257597..5547c4bd0c0 100644 --- a/cpp/tests/link_prediction/similarity_test.cu +++ b/cpp/tests/link_prediction/similarity_test.cu @@ -283,6 +283,30 @@ TEST_P(Tests_Similarity_Rmat, CheckInt64Int64FloatOverlap) override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_overlap_t{}); } +TEST_P(Tests_Similarity_File, CheckInt32Int32FloatCosine) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt32Int32FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt32Int64FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt64Int64FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + INSTANTIATE_TEST_SUITE_P( file_test, Tests_Similarity_File, diff --git a/cpp/tests/link_prediction/weighted_similarity_test.cpp b/cpp/tests/link_prediction/weighted_similarity_test.cpp index 570369a0236..2450e7d6376 100644 --- a/cpp/tests/link_prediction/weighted_similarity_test.cpp +++ b/cpp/tests/link_prediction/weighted_similarity_test.cpp @@ -293,6 +293,30 @@ TEST_P(Tests_Similarity_Rmat, CheckInt64Int64FloatOverlap) override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), 
cugraph::test::test_overlap_t{}); } +TEST_P(Tests_Similarity_File, CheckInt32Int32FloatCosine) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt32Int32FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt32Int64FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + +TEST_P(Tests_Similarity_Rmat, CheckInt64Int64FloatCosine) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), cugraph::test::test_cosine_t{}); +} + INSTANTIATE_TEST_SUITE_P( file_test, Tests_Similarity_File,