-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsparsegraph.py
395 lines (330 loc) · 15.2 KB
/
sparsegraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
import warnings
from typing import Dict, Union, Tuple, Any
import numpy as np
import scipy.sparse as sp
__all__ = ['SparseGraph']
sparse_graph_properties = [
'adj_matrix', 'attr_matrix', 'labels',
'node_names', 'attr_names', 'class_names',
'metadata']
class SparseGraph:
"""Attributed labeled graph stored in sparse matrix form.
Parameters
----------
adj_matrix
Adjacency matrix in CSR format. Shape [num_nodes, num_nodes]
attr_matrix
Attribute matrix in CSR or numpy format. Shape [num_nodes, num_attr]
labels
Array, where each entry represents respective node's label(s). Shape [num_nodes]
Alternatively, CSR matrix with labels in one-hot format. Shape [num_nodes, num_classes]
node_names
Names of nodes (as strings). Shape [num_nodes]
attr_names
Names of the attributes (as strings). Shape [num_attr]
class_names
Names of the class labels (as strings). Shape [num_classes]
metadata
Additional metadata such as text.
"""
def __init__(
self, adj_matrix: sp.spmatrix,
attr_matrix: Union[np.ndarray, sp.spmatrix] = None,
labels: Union[np.ndarray, sp.spmatrix] = None,
node_names: np.ndarray = None,
attr_names: np.ndarray = None,
class_names: np.ndarray = None,
metadata: Any = None):
# Make sure that the dimensions of matrices / arrays all agree
if sp.isspmatrix(adj_matrix):
adj_matrix = adj_matrix.tocsr().astype(np.float32)
else:
raise ValueError("Adjacency matrix must be in sparse format (got {0} instead)."
.format(type(adj_matrix)))
if adj_matrix.shape[0] != adj_matrix.shape[1]:
raise ValueError("Dimensions of the adjacency matrix don't agree.")
if attr_matrix is not None:
if sp.isspmatrix(attr_matrix):
attr_matrix = attr_matrix.tocsr().astype(np.float32)
elif isinstance(attr_matrix, np.ndarray):
attr_matrix = attr_matrix.astype(np.float32)
else:
raise ValueError("Attribute matrix must be a sp.spmatrix or a np.ndarray (got {0} instead)."
.format(type(attr_matrix)))
if attr_matrix.shape[0] != adj_matrix.shape[0]:
raise ValueError("Dimensions of the adjacency and attribute matrices don't agree.")
if labels is not None:
if labels.shape[0] != adj_matrix.shape[0]:
raise ValueError("Dimensions of the adjacency matrix and the label vector don't agree.")
if node_names is not None:
if len(node_names) != adj_matrix.shape[0]:
raise ValueError("Dimensions of the adjacency matrix and the node names don't agree.")
if attr_names is not None:
if len(attr_names) != attr_matrix.shape[1]:
raise ValueError("Dimensions of the attribute matrix and the attribute names don't agree.")
self.adj_matrix = adj_matrix
self.attr_matrix = attr_matrix
self.labels = labels
self.node_names = node_names
self.attr_names = attr_names
self.class_names = class_names
self.metadata = metadata
def num_nodes(self) -> int:
"""Get the number of nodes in the graph.
"""
return self.adj_matrix.shape[0]
def num_edges(self) -> int:
"""Get the number of edges in the graph.
For undirected graphs, (i, j) and (j, i) are counted as _two_ edges.
"""
return self.adj_matrix.nnz
def get_neighbors(self, idx: int) -> np.ndarray:
"""Get the indices of neighbors of a given node.
Parameters
----------
idx
Index of the node whose neighbors are of interest.
"""
return self.adj_matrix[idx].indices
def get_edgeid_to_idx_array(self) -> np.ndarray:
"""Return a Numpy Array that maps edgeids to the indices in the adjacency matrix.
Returns
-------
np.ndarray
The i'th entry contains the x- and y-coordinates of edge i in the adjacency matrix.
Shape [num_edges, 2]
"""
return np.transpose(self.adj_matrix.nonzero())
def is_directed(self) -> bool:
"""Check if the graph is directed (adjacency matrix is not symmetric).
"""
return (self.adj_matrix != self.adj_matrix.T).sum() != 0
def to_undirected(self) -> 'SparseGraph':
"""Convert to an undirected graph (make adjacency matrix symmetric).
"""
idx = self.get_edgeid_to_idx_array().T
ridx = np.ravel_multi_index(idx, self.adj_matrix.shape)
ridx_rev = np.ravel_multi_index(idx[::-1], self.adj_matrix.shape)
# Get duplicate edges (self-loops and opposing edges)
dup_ridx = ridx[np.isin(ridx, ridx_rev)]
dup_idx = np.unravel_index(dup_ridx, self.adj_matrix.shape)
# Check if the adjacency matrix weights are symmetric (if nonzero)
if len(dup_ridx) > 0 and not np.allclose(self.adj_matrix[dup_idx], self.adj_matrix[dup_idx[::-1]]):
raise ValueError("Adjacency matrix weights of opposing edges differ.")
# Create symmetric matrix
new_adj_matrix = self.adj_matrix + self.adj_matrix.T
if len(dup_ridx) > 0:
new_adj_matrix[dup_idx] = (new_adj_matrix[dup_idx] - self.adj_matrix[dup_idx]).A1
self.adj_matrix = new_adj_matrix
return self
def is_weighted(self) -> bool:
"""Check if the graph is weighted (edge weights other than 1).
"""
return np.any(np.unique(self.adj_matrix[self.adj_matrix.nonzero()].A1) != 1)
def to_unweighted(self) -> 'SparseGraph':
"""Convert to an unweighted graph (set all edge weights to 1).
"""
self.adj_matrix.data = np.ones_like(self.adj_matrix.data)
return self
def is_connected(self) -> bool:
"""Check if the graph is connected.
"""
return sp.csgraph.connected_components(self.adj_matrix, return_labels=False) == 1
def has_self_loops(self) -> bool:
"""Check if the graph has self-loops.
"""
return not np.allclose(self.adj_matrix.diagonal(), 0)
def __repr__(self) -> str:
props = []
for prop_name in sparse_graph_properties:
prop = getattr(self, prop_name)
if prop is not None:
if prop_name == 'metadata':
props.append(prop_name)
else:
shape_string = 'x'.join([str(x) for x in prop.shape])
props.append("{} ({})".format(prop_name, shape_string))
dir_string = 'Directed' if self.is_directed() else 'Undirected'
weight_string = 'weighted' if self.is_weighted() else 'unweighted'
conn_string = 'connected' if self.is_connected() else 'disconnected'
loop_string = 'has self-loops' if self.has_self_loops() else 'no self-loops'
return ("<{}, {} and {} SparseGraph with {} edges ({}). Data: {}>"
.format(dir_string, weight_string, conn_string,
self.num_edges(), loop_string,
', '.join(props)))
# Quality of life (shortcuts)
def standardize(
self, make_unweighted: bool = True,
make_undirected: bool = True,
no_self_loops: bool = True,
select_lcc: bool = True
) -> 'SparseGraph':
"""Perform common preprocessing steps: remove self-loops, make unweighted/undirected, select LCC.
All changes are done inplace.
Parameters
----------
make_unweighted
Whether to set all edge weights to 1.
make_undirected
Whether to make the adjacency matrix symmetric. Can only be used if make_unweighted is True.
no_self_loops
Whether to remove self loops.
select_lcc
Whether to select the largest connected component of the graph.
"""
G = self
if make_unweighted and G.is_weighted():
G = G.to_unweighted()
if make_undirected and G.is_directed():
G = G.to_undirected()
if no_self_loops and G.has_self_loops():
G = remove_self_loops(G)
if select_lcc and not G.is_connected():
G = largest_connected_components(G, 1)
return G
def unpack(self) -> Tuple[sp.csr_matrix,
Union[np.ndarray, sp.csr_matrix],
Union[np.ndarray, sp.csr_matrix]]:
"""Return the (A, X, E, z) quadruplet.
"""
return self.adj_matrix, self.attr_matrix, self.labels
def to_flat_dict(self) -> Dict[str, Any]:
"""Return flat dictionary containing all SparseGraph properties.
"""
data_dict = {}
for key in sparse_graph_properties:
val = getattr(self, key)
if sp.isspmatrix(val):
data_dict['{}.data'.format(key)] = val.data
data_dict['{}.indices'.format(key)] = val.indices
data_dict['{}.indptr'.format(key)] = val.indptr
data_dict['{}.shape'.format(key)] = val.shape
else:
data_dict[key] = val
return data_dict
@staticmethod
def from_flat_dict(data_dict: Dict[str, Any]) -> 'SparseGraph':
"""Initialize SparseGraph from a flat dictionary.
"""
init_dict = {}
del_entries = []
# Construct sparse matrices
for key in data_dict.keys():
if key.endswith('_data') or key.endswith('.data'):
if key.endswith('_data'):
sep = '_'
warnings.warn(
"The separator used for sparse matrices during export (for .npz files) "
"is now '.' instead of '_'. Please update (re-save) your stored graphs.",
DeprecationWarning, stacklevel=2)
else:
sep = '.'
matrix_name = key[:-5]
mat_data = key
mat_indices = '{}{}indices'.format(matrix_name, sep)
mat_indptr = '{}{}indptr'.format(matrix_name, sep)
mat_shape = '{}{}shape'.format(matrix_name, sep)
if matrix_name == 'adj' or matrix_name == 'attr':
warnings.warn(
"Matrices are exported (for .npz files) with full names now. "
"Please update (re-save) your stored graphs.",
DeprecationWarning, stacklevel=2)
matrix_name += '_matrix'
init_dict[matrix_name] = sp.csr_matrix(
(data_dict[mat_data],
data_dict[mat_indices],
data_dict[mat_indptr]),
shape=data_dict[mat_shape])
del_entries.extend([mat_data, mat_indices, mat_indptr, mat_shape])
# Delete sparse matrix entries
for del_entry in del_entries:
del data_dict[del_entry]
# Load everything else
for key, val in data_dict.items():
if ((val is not None) and (None not in val)):
init_dict[key] = val
# Check if the dictionary contains only entries in sparse_graph_properties
unknown_keys = [key for key in init_dict.keys() if key not in sparse_graph_properties]
if len(unknown_keys) > 0:
raise ValueError("Input dictionary contains keys that are not SparseGraph properties ({})."
.format(unknown_keys))
return SparseGraph(**init_dict)
def create_subgraph(
sparse_graph: SparseGraph,
_sentinel: None = None,
nodes_to_remove: np.ndarray = None,
nodes_to_keep: np.ndarray = None
) -> SparseGraph:
"""Create a graph with the specified subset of nodes.
Exactly one of (nodes_to_remove, nodes_to_keep) should be provided, while the other stays None.
Note that to avoid confusion, it is required to pass node indices as named arguments to this function.
The subgraph partially points to the old graph's data.
Parameters
----------
sparse_graph
Input graph.
_sentinel
Internal, to prevent passing positional arguments. Do not use.
nodes_to_remove
Indices of nodes that have to removed.
nodes_to_keep
Indices of nodes that have to be kept.
Returns
-------
SparseGraph
Graph with specified nodes removed.
"""
# Check that arguments are passed correctly
if _sentinel is not None:
raise ValueError("Only call `create_subgraph` with named arguments',"
" (nodes_to_remove=...) or (nodes_to_keep=...).")
if nodes_to_remove is None and nodes_to_keep is None:
raise ValueError("Either nodes_to_remove or nodes_to_keep must be provided.")
elif nodes_to_remove is not None and nodes_to_keep is not None:
raise ValueError("Only one of nodes_to_remove or nodes_to_keep must be provided.")
elif nodes_to_remove is not None:
nodes_to_keep = [i for i in range(sparse_graph.num_nodes()) if i not in nodes_to_remove]
elif nodes_to_keep is not None:
nodes_to_keep = sorted(nodes_to_keep)
else:
raise RuntimeError("This should never happen.")
sparse_graph.adj_matrix = sparse_graph.adj_matrix[nodes_to_keep][:, nodes_to_keep]
if sparse_graph.attr_matrix is not None:
sparse_graph.attr_matrix = sparse_graph.attr_matrix[nodes_to_keep]
if sparse_graph.labels is not None:
sparse_graph.labels = sparse_graph.labels[nodes_to_keep]
if sparse_graph.node_names is not None:
sparse_graph.node_names = sparse_graph.node_names[nodes_to_keep]
return sparse_graph
def largest_connected_components(sparse_graph: SparseGraph, n_components: int = 1) -> SparseGraph:
"""Select the largest connected components in the graph.
Changes are returned in a partially new SparseGraph.
Parameters
----------
sparse_graph
Input graph.
n_components
Number of largest connected components to keep.
Returns
-------
SparseGraph
Subgraph of the input graph where only the nodes in largest n_components are kept.
"""
_, component_indices = sp.csgraph.connected_components(sparse_graph.adj_matrix)
component_sizes = np.bincount(component_indices)
components_to_keep = np.argsort(component_sizes)[::-1][:n_components] # reverse order to sort descending
nodes_to_keep = [
idx for (idx, component) in enumerate(component_indices) if component in components_to_keep
]
return create_subgraph(sparse_graph, nodes_to_keep=nodes_to_keep)
def remove_self_loops(sparse_graph: SparseGraph) -> SparseGraph:
"""Remove self loops (diagonal entries in the adjacency matrix).
Changes are returned in a partially new SparseGraph.
"""
num_self_loops = (~np.isclose(sparse_graph.adj_matrix.diagonal(), 0)).sum()
if num_self_loops > 0:
sparse_graph.adj_matrix = sparse_graph.adj_matrix.tolil()
sparse_graph.adj_matrix.setdiag(0)
sparse_graph.adj_matrix = sparse_graph.adj_matrix.tocsr()
warnings.warn("{0} self loops removed".format(num_self_loops))
return sparse_graph