Skip to content

Commit

Permalink
[FEA] Support Seed Retention for Sampling with Renumbering (rapidsai#…
Browse files Browse the repository at this point in the history
…4355)

Exposes the ability to retain seeds even if they have no outgoing edges (and therefore are not sampled).  Required to fix the current bug in cuGraph-PyG involving batch size and dropping seeds.

Currently, this functionality can't be exposed through the MG Python API (rapidsai#4358) but exposing it through the pylibcugraph API is sufficient to resolve this issue.  This PR does expose it through the SG Python API.

Authors:
  - Alex Barghi (https://github.com/alexbarghi-nv)
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Seunghwa Kang (https://github.com/seunghwak)
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Naim (https://github.com/naimnv)
  - Rick Ratzel (https://github.com/rlratzel)

URL: rapidsai#4355
  • Loading branch information
alexbarghi-nv authored May 14, 2024
1 parent 79acec9 commit 45371cb
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 17 deletions.
12 changes: 12 additions & 0 deletions cpp/include/cugraph_c/sampling_algorithms.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,15 @@ typedef enum cugraph_compression_type_t {
cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t** options,
cugraph_error_t** error);

/**
* @ingroup samplingC
* @brief Set flag to retain seeds (original sources)
*
* When the flag is set, the seed vertices are retained in the output even if they have no
* outgoing edges (and therefore are not sampled). The flag defaults to FALSE. When it is
* TRUE, cugraph_uniform_neighbor_sample() requires its label_offsets argument to be
* specified and reports CUGRAPH_INVALID_INPUT otherwise.
*
* @param options - opaque pointer to the sampling options
* @param value - Boolean value to assign to the option
*/
void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value);

/**
* @ingroup samplingC
* @brief Set flag to renumber results
Expand Down Expand Up @@ -335,6 +344,8 @@ void cugraph_sampling_options_free(cugraph_sampling_options_t* options);
* output. If specified then all the data from @p label_list[i] will be shuffled to rank
* @p label_to_comm_rank[i]. This cannot be specified unless @p start_vertex_labels is also
* specified. If not specified then the output data will not be shuffled between ranks.
* @param [in] label_offsets Device array of the offsets for each label in the seed list. This
* parameter is only used with the retain_seeds option.
* @param [in] fanout Host array defining the fan out at each step in the sampling algorithm.
* We only support fanout values of type INT32
* @param [in/out] rng_state State of the random number generator, updated with each call
Expand All @@ -354,6 +365,7 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
const cugraph_type_erased_device_array_view_t* start_vertex_labels,
const cugraph_type_erased_device_array_view_t* label_list,
const cugraph_type_erased_device_array_view_t* label_to_comm_rank,
const cugraph_type_erased_device_array_view_t* label_offsets,
const cugraph_type_erased_host_array_view_t* fan_out,
cugraph_rng_state_t* rng_state,
const cugraph_sampling_options_t* options,
Expand Down
60 changes: 45 additions & 15 deletions cpp/src/c_api/uniform_neighbor_sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ struct cugraph_sampling_options_t {
bool_t renumber_results_{FALSE};
cugraph_compression_type_t compression_type_{cugraph_compression_type_t::COO};
bool_t compress_per_hop_{FALSE};
bool_t retain_seeds_{FALSE};
};

struct cugraph_sample_result_t {
Expand Down Expand Up @@ -68,6 +69,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertex_labels_{nullptr};
cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_list_{nullptr};
cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_to_comm_rank_{nullptr};
cugraph::c_api::cugraph_type_erased_device_array_view_t const* label_offsets_{nullptr};
cugraph::c_api::cugraph_type_erased_host_array_view_t const* fan_out_{nullptr};
cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr};
cugraph::c_api::cugraph_sampling_options_t options_{};
Expand All @@ -81,6 +83,7 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
cugraph_type_erased_device_array_view_t const* start_vertex_labels,
cugraph_type_erased_device_array_view_t const* label_list,
cugraph_type_erased_device_array_view_t const* label_to_comm_rank,
cugraph_type_erased_device_array_view_t const* label_offsets,
cugraph_type_erased_host_array_view_t const* fan_out,
cugraph_rng_state_t* rng_state,
cugraph::c_api::cugraph_sampling_options_t options,
Expand All @@ -99,6 +102,9 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
label_to_comm_rank_(
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
label_to_comm_rank)),
label_offsets_(
reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
label_offsets)),
fan_out_(
reinterpret_cast<cugraph::c_api::cugraph_type_erased_host_array_view_t const*>(fan_out)),
rng_state_(reinterpret_cast<cugraph::c_api::cugraph_rng_state_t*>(rng_state)),
Expand Down Expand Up @@ -267,8 +273,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
std::move(edge_id),
std::move(edge_type),
std::move(hop),
std::nullopt,
std::nullopt,
options_.retain_seeds_
? std::make_optional(raft::device_span<vertex_t const>{
start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
: std::nullopt,
options_.retain_seeds_ ? std::make_optional(raft::device_span<size_t const>{
label_offsets_->as_type<size_t>(), label_offsets_->size_})
: std::nullopt,
offsets ? std::make_optional(
raft::device_span<size_t const>{offsets->data(), offsets->size()})
: std::nullopt,
Expand Down Expand Up @@ -304,8 +315,13 @@ struct uniform_neighbor_sampling_functor : public cugraph::c_api::abstract_funct
std::move(edge_id),
std::move(edge_type),
std::move(hop),
std::nullopt,
std::nullopt,
options_.retain_seeds_
? std::make_optional(raft::device_span<vertex_t const>{
start_vertices_->as_type<vertex_t>(), start_vertices_->size_})
: std::nullopt,
options_.retain_seeds_ ? std::make_optional(raft::device_span<size_t const>{
label_offsets_->as_type<size_t>(), label_offsets_->size_})
: std::nullopt,
offsets ? std::make_optional(
raft::device_span<size_t const>{offsets->data(), offsets->size()})
: std::nullopt,
Expand Down Expand Up @@ -402,6 +418,12 @@ extern "C" cugraph_error_code_t cugraph_sampling_options_create(
return CUGRAPH_SUCCESS;
}

// C API setter: stores `value` in the retain_seeds_ field of the sampling options object
// behind the opaque `options` handle.
extern "C" void cugraph_sampling_set_retain_seeds(cugraph_sampling_options_t* options, bool_t value)
{
  // The public handle is opaque; view it as the internal C++ options struct and assign in place.
  reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t*>(options)->retain_seeds_ = value;
}

extern "C" void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options,
bool_t value)
{
Expand Down Expand Up @@ -871,13 +893,21 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
const cugraph_type_erased_device_array_view_t* start_vertex_labels,
const cugraph_type_erased_device_array_view_t* label_list,
const cugraph_type_erased_device_array_view_t* label_to_comm_rank,
const cugraph_type_erased_device_array_view_t* label_offsets,
const cugraph_type_erased_host_array_view_t* fan_out,
cugraph_rng_state_t* rng_state,
const cugraph_sampling_options_t* options,
bool_t do_expensive_check,
cugraph_sample_result_t** result,
cugraph_error_t** error)
{
auto options_cpp = *reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options);

CAPI_EXPECTS((!options_cpp.retain_seeds_) || (label_offsets != nullptr),
CUGRAPH_INVALID_INPUT,
"must specify label_offsets if retain_seeds is true",
*error);

CAPI_EXPECTS((start_vertex_labels == nullptr) ||
(reinterpret_cast<cugraph::c_api::cugraph_type_erased_device_array_view_t const*>(
start_vertex_labels)
Expand Down Expand Up @@ -911,16 +941,16 @@ cugraph_error_code_t cugraph_uniform_neighbor_sample(
"fan_out should be of type int",
*error);

uniform_neighbor_sampling_functor functor{
handle,
graph,
start_vertices,
start_vertex_labels,
label_list,
label_to_comm_rank,
fan_out,
rng_state,
*reinterpret_cast<cugraph::c_api::cugraph_sampling_options_t const*>(options),
do_expensive_check};
uniform_neighbor_sampling_functor functor{handle,
graph,
start_vertices,
start_vertex_labels,
label_list,
label_to_comm_rank,
label_offsets,
fan_out,
rng_state,
std::move(options_cpp),
do_expensive_check};
return cugraph::c_api::run_algorithm(graph, functor, result, error);
}
1 change: 1 addition & 0 deletions cpp/tests/c_api/create_graph_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ int test_create_sg_graph_csr()
NULL,
NULL,
NULL,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down
4 changes: 4 additions & 0 deletions cpp/tests/c_api/mg_uniform_neighbor_sample_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle
d_start_labels_view,
NULL,
NULL,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down Expand Up @@ -565,6 +566,7 @@ int test_uniform_neighbor_from_alex(const cugraph_resource_handle_t* handle)
d_label_view,
NULL,
NULL,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down Expand Up @@ -841,6 +843,7 @@ int test_uniform_neighbor_sample_alex_bug(const cugraph_resource_handle_t* handl
d_start_labels_view,
d_label_list_view,
d_label_to_output_comm_rank_view,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down Expand Up @@ -1099,6 +1102,7 @@ int test_uniform_neighbor_sample_sort_by_hop(const cugraph_resource_handle_t* ha
d_start_labels_view,
d_label_list_view,
d_label_to_output_comm_rank_view,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down
2 changes: 2 additions & 0 deletions cpp/tests/c_api/uniform_neighbor_sample_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ int generic_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle
d_start_labels_view,
NULL,
NULL,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down Expand Up @@ -661,6 +662,7 @@ int test_uniform_neighbor_sample_with_labels(const cugraph_resource_handle_t* ha
d_start_labels_view,
NULL,
NULL,
NULL,
h_fan_out_view,
rng_state,
sampling_options,
Expand Down
15 changes: 14 additions & 1 deletion python/cugraph/cugraph/sampling/uniform_neighbor_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def uniform_neighbor_sample(
prior_sources_behavior: str = None,
deduplicate_sources: bool = False,
renumber: bool = False,
retain_seeds: bool = False,
label_offsets: Sequence = None,
use_legacy_names: bool = True, # deprecated
compress_per_hop: bool = False,
compression: str = "COO",
Expand Down Expand Up @@ -142,6 +144,15 @@ def uniform_neighbor_sample(
will return the renumber map and renumber map offsets
as an additional dataframe.
retain_seeds: bool, optional (default=False)
If True, will retain the original seeds (original source vertices)
in the output even if they do not have outgoing neighbors.
label_offsets: integer sequence, optional (default=None)
Offsets of each label within the start vertex list.
Only used if retain_seeds is True. Required if retain_seeds
is True.
use_legacy_names: bool, optional (default=True)
Whether to use the legacy column names (sources, destinations).
If True, will use "sources" and "destinations" as the column names.
Expand Down Expand Up @@ -342,13 +353,15 @@ def uniform_neighbor_sample(
else None,
h_fan_out=fanout_vals,
with_replacement=with_replacement,
do_expensive_check=False,
do_expensive_check=True,
with_edge_properties=with_edge_properties,
random_state=random_state,
prior_sources_behavior=prior_sources_behavior,
deduplicate_sources=deduplicate_sources,
return_hops=return_hops,
renumber=renumber,
retain_seeds=retain_seeds,
label_offsets=label_offsets,
compression=compression,
compress_per_hop=compress_per_hop,
return_dict=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,46 @@ def test_uniform_neighbor_sample_csr_csc_local(hops, seed):
assert 1 == len(el[(el.src == majors.iloc[i]) & (el.dst == minors.iloc[i])])


@pytest.mark.sg
def test_uniform_neighbor_sample_retain_seeds():
    """Seeds must appear in the renumber map even when they have no outgoing edges.

    The graph has edges 0->2, 1->3, 2->1, 3->7, 4->5 and 5->6, so vertices 6 and 7
    never occur as a source. They are nevertheless used as seeds (one per batch)
    and, with ``retain_seeds=True``, are expected to show up in the renumber map
    of their batch.
    """
    src = cupy.array([0, 1, 2, 3, 4, 5], dtype="int64")
    dst = cupy.array([2, 3, 1, 7, 5, 6], dtype="int64")

    # Two batches of two seeds each; seeds 6 and 7 have no outgoing edges.
    seeds = cupy.array([6, 0, 1, 7], dtype="int64")
    batch = cupy.array([0, 0, 1, 1], dtype="int32")
    # Offsets of each batch within ``seeds``: batch 0 -> [0, 2), batch 1 -> [2, 4).
    batch_offsets = cupy.array([0, 2, 4], dtype="int64")

    fanout = [2, 2]

    df = cudf.DataFrame({"src": src, "dst": dst})

    G = cugraph.MultiGraph(directed=True)
    G.from_cudf_edgelist(df, source="src", destination="dst")

    batch_df = cudf.DataFrame({"seeds": seeds, "batch": batch})
    batch_offsets_s = cudf.Series(batch_offsets, name="batch_offsets")
    results, offsets, renumber_map = cugraph.uniform_neighbor_sample(
        G,
        batch_df,
        fanout,
        with_replacement=False,
        with_edge_properties=True,
        with_batch_ids=True,
        random_state=62,
        return_offsets=True,
        label_offsets=batch_offsets_s,
        return_hops=True,
        prior_sources_behavior="exclude",
        deduplicate_sources=True,
        renumber=True,
        retain_seeds=True,
        compress_per_hop=False,
    )

    # Batch 0 occupies renumber map entries [0, 4) and batch 1 entries [4, 7).
    assert offsets.renumber_map_offsets.dropna().values_host.tolist() == [0, 4, 7]
    # The first two entries of each batch's renumber map are that batch's seeds,
    # including the seeds without outgoing edges (6 and 7).
    assert renumber_map.renumber_map.values_host[[0, 1]].tolist() == [0, 6]
    assert renumber_map.renumber_map.values_host[[4, 5]].tolist() == [1, 7]


@pytest.mark.sg
@pytest.mark.skip(reason="needs to be written!")
def test_uniform_neighbor_sample_dcsr_dcsc_global():
Expand Down
6 changes: 6 additions & 0 deletions python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,12 @@ cdef extern from "cugraph_c/algorithms.h":
bool_t value,
)

# Setter for the retain_seeds flag of a sampling options object: assigns `value`
# to the option on `options`.
# NOTE(review): the C prototype shown in this change is declared in
# cugraph_c/sampling_algorithms.h; presumably the header named by the enclosing
# `cdef extern from` block pulls it in -- confirm.
cdef void \
cugraph_sampling_set_retain_seeds(
cugraph_sampling_options_t* options,
bool_t value,
)

cdef void \
cugraph_sampling_set_with_replacement(
cugraph_sampling_options_t* options,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -50,6 +50,7 @@ cdef extern from "cugraph_c/sampling_algorithms.h":
const cugraph_type_erased_device_array_view_t* start_vertex_labels,
const cugraph_type_erased_device_array_view_t* label_list,
const cugraph_type_erased_device_array_view_t* label_to_comm_rank,
const cugraph_type_erased_device_array_view_t* label_offsets,
const cugraph_type_erased_host_array_view_t* fan_out,
cugraph_rng_state_t* rng_state,
const cugraph_sampling_options_t* options,
Expand Down
Loading

0 comments on commit 45371cb

Please sign in to comment.