diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 5760d2098aa..859eaca7f3b 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -236,6 +236,15 @@ typedef enum cugraph_compression_type_t { cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t** options, cugraph_error_t** error); +/** + * @ingroup samplingC + * @brief Set flag to retain seeds (original sources) + * + * @param options - opaque pointer to the sampling options + * @param value - Boolean value to assign to the option + */ +void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value); + /** * @ingroup samplingC * @brief Set flag to renumber results @@ -335,6 +344,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options); * output. If specified then the all data from @p label_list[i] will be shuffled to rank @p. This * cannot be specified unless @p start_vertex_labels is also specified * label_to_comm_rank[i]. If not specified then the output data will not be shuffled between ranks. + * @param [in] label_offsets Device array of the offsets for each label in the seed list. This + * parameter is only used with the retain_seeds option. * @param [in] fanout Host array defining the fan out at each step in the sampling algorithm. * We only support fanout values of type INT32 * @param [in/out] rng_state State of the random number generator, updated with each call @@ -354,6 +365,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, diff --git a/cpp/src/c_api/uniform_neighbor_sampling.cpp b/cpp/src/c_api/uniform_neighbor_sampling.cpp index 100e81a5bd2..45609fc0e01 100644 --- a/cpp/src/c_api/uniform_neighbor_sampling.cpp +++ b/cpp/src/c_api/uniform_neighbor_sampling.cpp @@ -40,6 +40,7 @@ struct cugraph_sampling_options_t { bool_t renumber_results_{FALSE}; cugraph_compression_type_t compression_type_{cugraph_compression_type_t::COO}; bool_t compress_per_hop_{FALSE}; + bool_t retain_seeds_{FALSE}; }; struct cugraph_sample_result_t { @@ -68,6 +69,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertex_labels_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_list_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_to_comm_rank_{nullptr}; + cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_offsets_{nullptr}; cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr}; cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_sampling_options_t options_{}; @@ -81,6 +83,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct cugraph_type_erased_device_array_view_t const* start_vertex_labels, cugraph_type_erased_device_array_view_t const* label_list, cugraph_type_erased_device_array_view_t const* label_to_comm_rank, + cugraph_type_erased_device_array_view_t const* label_offsets, cugraph_type_erased_host_array_view_t const* fan_out, cugraph_rng_state_t* rng_state, cugraph::c_api::cugraph_sampling_options_t options, @@ -99,6 +102,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct label_to_comm_rank_( reinterpret_cast( label_to_comm_rank)), + label_offsets_( + reinterpret_cast( + label_offsets)), fan_out_( reinterpret_cast(fan_out)), rng_state_(reinterpret_cast(rng_state)), @@ -267,8 +273,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct std::move(edge_id), std::move(edge_type), std::move(hop), - std::nullopt, - std::nullopt, + options_.retain_seeds_ + ? std::make_optional(raft::device_span{ + start_vertices_->as_type(), start_vertices_->size_}) + : std::nullopt, + options_.retain_seeds_ ? std::make_optional(raft::device_span{ + label_offsets_->as_type(), label_offsets_->size_}) + : std::nullopt, offsets ? std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, @@ -304,8 +315,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct std::move(edge_id), std::move(edge_type), std::move(hop), - std::nullopt, - std::nullopt, + options_.retain_seeds_ + ? std::make_optional(raft::device_span{ + start_vertices_->as_type(), start_vertices_->size_}) + : std::nullopt, + options_.retain_seeds_ ? std::make_optional(raft::device_span{ + label_offsets_->as_type(), label_offsets_->size_}) + : std::nullopt, offsets ? std::make_optional( raft::device_span{offsets->data(), offsets->size()}) : std::nullopt, @@ -402,6 +418,12 @@ extern "C" cugraph_error_code_t cugraph_sampling_options_create( return CUGRAPH_SUCCESS; } +extern "C" void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value) +{ + auto internal_pointer = reinterpret_cast(options); + internal_pointer->retain_seeds_ = value; +} + extern "C" void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options, bool_t value) { @@ -871,6 +893,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, @@ -878,6 +901,13 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( cugraph_sample_result_t** result, cugraph_error_t** error) { + auto options_cpp = *reinterpret_cast(options); + + CAPI_EXPECTS((!options_cpp.retain_seeds_) || (label_offsets != nullptr), + CUGRAPH_INVALID_INPUT, + "must specify label_offsets if retain_seeds is true", + *error); + CAPI_EXPECTS((start_vertex_labels == nullptr) || (reinterpret_cast( start_vertex_labels) @@ -911,16 +941,16 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample( "fan_out should be of type int", *error); - uniform_neighbor_sampling_functor functor{ - handle, - graph, - start_vertices, - start_vertex_labels, - label_list, - label_to_comm_rank, - fan_out, - rng_state, - *reinterpret_cast(options), - do_expensive_check}; + uniform_neighbor_sampling_functor functor{handle, + graph, + start_vertices, + start_vertex_labels, + label_list, + label_to_comm_rank, + label_offsets, + fan_out, + rng_state, + std::move(options_cpp), + do_expensive_check}; return cugraph::c_api::run_algorithm(graph, functor, result, error); } diff --git a/cpp/tests/c_api/create_graph_test.c b/cpp/tests/c_api/create_graph_test.c index 758624a89e9..41b8691e79c 100644 --- a/cpp/tests/c_api/create_graph_test.c +++ b/cpp/tests/c_api/create_graph_test.c @@ -268,6 +268,7 @@ int test_create_sg_graph_csr() NULL, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c index a32056a9f15..3d8fb02ed46 100644 --- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -133,6 +133,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle d_start_labels_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, @@ -565,6 +566,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle) d_label_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, @@ -841,6 +843,7 @@ int test_uniform_neighbor_sample_alex_bug(const cugraph_resource_handle_t* handl d_start_labels_view, d_label_list_view, d_label_to_output_comm_rank_view, + NULL, h_fan_out_view, rng_state, sampling_options, @@ -1099,6 +1102,7 @@ int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* ha d_start_labels_view, d_label_list_view, d_label_to_output_comm_rank_view, + NULL, h_fan_out_view, rng_state, sampling_options, diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 94f0f788354..451dbca51a7 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -140,6 +140,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle d_start_labels_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, @@ -661,6 +662,7 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha d_start_labels_view, NULL, NULL, + NULL, h_fan_out_view, rng_state, sampling_options, diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 86b33594ed7..eafadfa4ff0 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -71,6 +71,8 @@ def uniform_neighbor_sample( prior_sources_behavior: str = None, deduplicate_sources: bool = False, renumber: bool = False, + retain_seeds: bool = False, + label_offsets: Sequence = None, use_legacy_names: bool = True, # deprecated compress_per_hop: bool = False, compression: str = "COO", @@ -142,6 +144,15 @@ def uniform_neighbor_sample( will return the renumber map and renumber map offsets as an additional dataframe. + retain_seeds: bool, optional (default=False) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + + label_offsets: integer sequence, optional (default=None) + Offsets of each label within the start vertex list. + Only used if retain_seeds is True. Required if retain_seeds + is True. + use_legacy_names: bool, optional (default=True) Whether to use the legacy column names (sources, destinations). If True, will use "sources" and "destinations" as the column names. @@ -342,13 +353,15 @@ def uniform_neighbor_sample( else None, h_fan_out=fanout_vals, with_replacement=with_replacement, - do_expensive_check=False, + do_expensive_check=True, with_edge_properties=with_edge_properties, random_state=random_state, prior_sources_behavior=prior_sources_behavior, deduplicate_sources=deduplicate_sources, return_hops=return_hops, renumber=renumber, + retain_seeds=retain_seeds, + label_offsets=label_offsets, compression=compression, compress_per_hop=compress_per_hop, return_dict=True, diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py index 560b80993d9..304ead6fea9 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample.py @@ -963,6 +963,46 @@ def test_uniform_neighbor_sample_csr_csc_local(hops, seed): assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])]) +def test_uniform_neighbor_sample_retain_seeds(): + src = cupy.array([0, 1, 2, 3, 4, 5], dtype="int64") + dst = cupy.array([2, 3, 1, 7, 5, 6], dtype="int64") + + seeds = cupy.array([6, 0, 1, 7], dtype="int64") + batch = cupy.array([0, 0, 1, 1], dtype="int32") + batch_offsets = cupy.array([0, 2, 4], dtype="int64") + + fanout = [2, 2] + + df = cudf.DataFrame({"src": src, "dst": dst}) + + G = cugraph.MultiGraph(directed=True) + G.from_cudf_edgelist(df, source="src", destination="dst") + + batch_df = cudf.DataFrame({"seeds": seeds, "batch": batch}) + batch_offsets_s = cudf.Series(batch_offsets, name="batch_offsets") + results, offsets, renumber_map = cugraph.uniform_neighbor_sample( + G, + batch_df, + fanout, + with_replacement=False, + with_edge_properties=True, + with_batch_ids=True, + random_state=62, + return_offsets=True, + label_offsets=batch_offsets_s, + return_hops=True, + prior_sources_behavior="exclude", + deduplicate_sources=True, + renumber=True, + retain_seeds=True, + compress_per_hop=False, + ) + + assert offsets.renumber_map_offsets.dropna().values_host.tolist() == [0, 4, 7] + assert renumber_map.renumber_map.values_host[[0, 1]].tolist() == [0, 6] + assert renumber_map.renumber_map.values_host[[4, 5]].tolist() == [1, 7] + + @pytest.mark.sg @pytest.mark.skip(reason="needs to be written!") def test_uniform_neighbor_sample_dcsr_dcsc_global(): diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index b0e7ffaf82d..4da7c4328fd 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -292,6 +292,12 @@ cdef extern from "cugraph_c/algorithms.h": bool_t value, ) + cdef void \ + cugraph_sampling_set_retain_seeds( + cugraph_sampling_options_t* options, + bool_t value, + ) + cdef void \ cugraph_sampling_set_with_replacement( cugraph_sampling_options_t* options, diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd index c32b57f8621..dbd3ef4b7e1 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/sampling_algorithms.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -50,6 +50,7 @@ cdef extern from "cugraph_c/sampling_algorithms.h": const cugraph_type_erased_device_array_view_t* start_vertex_labels, const cugraph_type_erased_device_array_view_t* label_list, const cugraph_type_erased_device_array_view_t* label_to_comm_rank, + const cugraph_type_erased_device_array_view_t* label_offsets, const cugraph_type_erased_host_array_view_t* fan_out, cugraph_rng_state_t* rng_state, const cugraph_sampling_options_t* options, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index b4145a80095..f002622f497 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -49,6 +49,7 @@ from pylibcugraph._cugraph_c.algorithms cimport ( cugraph_sampling_set_renumber_results, cugraph_sampling_set_compress_per_hop, cugraph_sampling_set_compression_type, + cugraph_sampling_set_retain_seeds, ) from pylibcugraph._cugraph_c.sampling_algorithms cimport ( cugraph_uniform_neighbor_sample, @@ -89,10 +90,12 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, batch_id_list=None, label_list=None, label_to_output_comm_rank=None, + label_offsets=None, prior_sources_behavior=None, deduplicate_sources=False, return_hops=False, renumber=False, + retain_seeds=False, compression='COO', compress_per_hop=False, random_state=None, @@ -143,6 +146,9 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, worker that should hold results for that batch id. Defaults to NULL (does nothing) + label_offsets: list[int] (Optional) + Offsets of each label within the start vertex list. + prior_sources_behavior: str (Optional) Options are "carryover", and "exclude". Default will leave the source list as-is. @@ -160,6 +166,11 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, per-batch basis and return the renumber map and batch offsets in additional to the standard returns. + retain_seeds: bool (Optional) + If True, will retain the original seeds (original source vertices) + in the output even if they do not have outgoing neighbors. + Defaults to False. + compression: str (Optional) Options: COO (default), CSR, CSC, DCSR, DCSR Sets the compression format for the returned samples. @@ -210,6 +221,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, assert_CAI_type(batch_id_list, "batch_id_list", True) assert_CAI_type(label_list, "label_list", True) assert_CAI_type(label_to_output_comm_rank, "label_to_output_comm_rank", True) + assert_CAI_type(label_offsets, "label_offsets", True) assert_AI_type(h_fan_out, "h_fan_out") cdef cugraph_sample_result_t* result_ptr @@ -234,6 +246,11 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cai_label_to_output_comm_rank_ptr = \ label_to_output_comm_rank.__cuda_array_interface__['data'][0] + cdef uintptr_t cai_label_offsets_ptr + if label_offsets is not None: + cai_label_offsets_ptr = \ + label_offsets.__cuda_array_interface__['data'][0] + cdef uintptr_t ai_fan_out_ptr = \ h_fan_out.__array_interface__["data"][0] @@ -270,6 +287,17 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, get_c_type_from_numpy_type(label_to_output_comm_rank.dtype) ) + cdef cugraph_type_erased_device_array_view_t* label_offsets_ptr = NULL + if retain_seeds: + if label_offsets is None: + raise ValueError("Must provide label offsets if retain_seeds is True") + label_offsets_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_label_offsets_ptr, + len(label_offsets), + get_c_type_from_numpy_type(label_offsets.dtype) + ) + cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ cugraph_type_erased_host_array_view_create( ai_fan_out_ptr, @@ -323,6 +351,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_sampling_set_renumber_results(sampling_options, c_renumber) cugraph_sampling_set_compression_type(sampling_options, compression_behavior_e) cugraph_sampling_set_compress_per_hop(sampling_options, c_compress_per_hop) + cugraph_sampling_set_retain_seeds(sampling_options, retain_seeds) error_code = cugraph_uniform_neighbor_sample( c_resource_handle_ptr, @@ -331,6 +360,7 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, batch_id_ptr, label_list_ptr, label_to_output_comm_rank_ptr, + label_offsets_ptr, fan_out_ptr, rng_state_ptr, sampling_options, @@ -347,6 +377,8 @@ def uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_type_erased_host_array_view_free(fan_out_ptr) if batch_id_list is not None: cugraph_type_erased_device_array_view_free(batch_id_ptr) + if label_offsets is not None: + cugraph_type_erased_device_array_view_free(label_offsets_ptr) # Have the SamplingResult instance assume ownership of the result data. result = SamplingResult()