-
Notifications
You must be signed in to change notification settings - Fork 96
/
Copy patheditdist.jl
299 lines (266 loc) · 10.3 KB
/
editdist.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
"""
edit_distance(G₁::AbstractGraph, G₂::AbstractGraph)
Compute the edit distance between graphs `G₁` and `G₂`. Return the minimum
edit cost and edit path to transform graph `G₁` into graph `G₂``.
An edit path consists of a sequence of pairs of vertices ``(u,v) ∈ [0,|G₁|] × [0,|G₂|]``
representing vertex operations:
- ``(0,v)``: insertion of vertex ``v ∈ G₂``
- ``(u,0)``: deletion of vertex ``u ∈ G₁``
- ``(u>0,v>0)``: substitution of vertex ``u ∈ G₁`` by vertex ``v ∈ G₂``
### Optional Arguments
- `vertex_insert_cost::Function=v->0.`
- `vertex_delete_cost::Function=u->0.`
- `vertex_subst_cost::Function=(u, v)->0.`
- `edge_insert_cost::Function=e->1.`
- `edge_delete_cost::Function=e->1.`
- `edge_subst_cost::Function=(e1, e2)->0.`
The algorithm will always try to match two edges if it can, so if it is
preferrable to delete two edges rather than match these, it should be
reflected in the `edge_subst_cost` function.
By default, the algorithm uses constant operation costs. The
user can provide classical Minkowski costs computed from vertex
labels μ₁ (for G₁) and μ₂ (for G₂) in order to further guide the
search, for example:
```
edit_distance(G₁, G₂, subst_cost=MinkowskiCost(μ₁, μ₂))
```
- `heuristic::Function=DefaultEditHeuristic`: a custom heuristic provided to the A*
search in case the default heuristic is not satisfactory.
### Performance
- Given two graphs ``|G₁| < |G₂|``, `edit_distance(G₁, G₂)` is faster to
compute than `edit_distance(G₂, G₁)`. Consider swapping the arguments
if involved costs are equivalent.
- The use of a heuristic can improve performance considerably.
- Exploit vertex attributes when designing operation costs.
### References
- RIESEN, K., 2015. Structural Pattern Recognition with Graph Edit Distance: Approximation Algorithms and Applications. (Chapter 2)
### Author
- Júlio Hoffimann Mendes ([email protected])
# Examples
```jldoctest
julia> using Graphs
julia> g1 = SimpleDiGraph([0 1 0 0 0; 0 0 1 0 0; 1 0 0 1 0; 0 0 0 0 1; 0 0 0 1 0]);
julia> g2 = SimpleDiGraph([0 1 0; 0 0 1; 1 0 0]);
julia> edit_distance(g1, g2)
(3.0, Tuple[(1, 3), (2, 1), (3, 2), (4, 0), (5, 0)])
```
"""
function edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph;
vertex_insert_cost=nothing,
vertex_delete_cost=nothing,
vertex_subst_cost=nothing,
edge_insert_cost=nothing,
edge_delete_cost=nothing,
edge_subst_cost=nothing,
heuristic=nothing,
)
if isnothing(vertex_insert_cost) &&
isnothing(vertex_delete_cost) &&
isnothing(vertex_subst_cost) &&
isnothing(edge_insert_cost) &&
isnothing(edge_delete_cost) &&
isnothing(edge_subst_cost) &&
isnothing(heuristic)
heuristic = default_edit_heuristic
end
vertex_insert_cost = something(vertex_insert_cost, v -> 0.0)
vertex_delete_cost = something(vertex_delete_cost, v -> 0.0)
vertex_subst_cost = something(vertex_subst_cost, (u, v) -> 0.0)
edge_insert_cost = something(edge_insert_cost, e -> 1.0)
edge_delete_cost = something(edge_delete_cost, e -> 1.0)
edge_subst_cost = something(edge_subst_cost, (e1, e2) -> 0.0)
heuristic = something(heuristic, (λ, G₁, G₂) -> 0.0)
return _edit_distance(
G₁::AbstractGraph,
G₂::AbstractGraph,
vertex_insert_cost,
vertex_delete_cost,
vertex_subst_cost,
edge_insert_cost,
edge_delete_cost,
edge_subst_cost,
heuristic,
)
end
function _edit_distance(
G₁::AbstractGraph{T},
G₂::AbstractGraph{U},
vertex_insert_cost::Function,
vertex_delete_cost::Function,
vertex_subst_cost::Function,
edge_insert_cost::Function,
edge_delete_cost::Function,
edge_subst_cost::Function,
heuristic::Function,
) where {T<:Integer,U<:Integer}
isdirected = is_directed(G₁) || is_directed(G₂)
# compute the cost on edges due to associate u1 to v1 and u2 to v2
# u2 and v2 can eventually be 0
function association_cost(u1, u2, v1, v2)
cost = 0.0
if has_edge(G₁, u1, u2)
if has_edge(G₂, v1, v2)
cost += edge_subst_cost(Edge(u1, u2), Edge(v1, v2))
else
cost += edge_delete_cost(Edge(u1, u2))
end
else
if has_edge(G₂, v1, v2)
cost += edge_insert_cost(Edge(v1, v2))
end
end
if isdirected && u1 != u2
if has_edge(G₁, u2, u1)
if has_edge(G₂, v2, v1)
cost += edge_subst_cost(Edge(u2, u1), Edge(v2, v1))
else
cost += edge_delete_cost(Edge(u2, u1))
end
else
if has_edge(G₂, v2, v1)
cost += edge_insert_cost(Edge(v2, v1))
end
end
end
return cost
end
# A* search heuristic
h(λ) = heuristic(λ, G₁, G₂)
# initialize open set
OPEN = PriorityQueue{Vector{Tuple},Float64}()
for v in vertices(G₂)
enqueue!(OPEN, [(T(1), v)], vertex_subst_cost(1, v) + h([(T(1), v)]))
end
enqueue!(OPEN, [(T(1), U(0))], vertex_delete_cost(1) + h([(T(1), U(0))]))
c = 0
while true
# minimum (partial) edit path
λ, cost = peek(OPEN)
c += 1
dequeue!(OPEN)
if is_complete_path(λ, G₁, G₂)
return cost, λ
else
u1, _ = λ[end]
u1 += T(1)
vs = setdiff(vertices(G₂), [v for (u, v) in λ])
if u1 <= nv(G₁) # there are still vertices to process in G₁?
# we try every possible assignment of v1
for v1 in vs
λ⁺ = [λ; (u1, v1)]
new_cost = cost + vertex_subst_cost(u1, v1) + h(λ⁺) - h(λ)
for (u2, v2) in λ
new_cost += association_cost(u1, u2, v1, v2)
end
new_cost += association_cost(u1, u1, v1, v1) # handle self-loops
enqueue!(OPEN, λ⁺, new_cost)
end
# we try deleting v1
λ⁺ = [λ; (u1, U(0))]
new_cost = cost + vertex_delete_cost(u1) + h(λ⁺) - h(λ)
for u2 in outneighbors(G₁, u1)
# edges deleted later when assigning v2
u2 > u1 && continue
new_cost += edge_delete_cost(Edge(u1, u2))
end
if isdirected
for u2 in inneighbors(G₁, u1)
# edges deleted later when assigning v2, and we should not count a self loop twice
u2 >= u1 && continue
new_cost += edge_delete_cost(Edge(u2, u1))
end
end
enqueue!(OPEN, λ⁺, new_cost)
else
# add remaining vertices of G₂ to the path by deleting them
λ⁺ = [λ; [(T(0), v) for v in vs]]
new_cost = cost + sum(vertex_insert_cost, vs)
for v1 in vs
for v2 in outneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
new_cost += edge_insert_cost(Edge(v1, v2))
end
if isdirected
for v2 in inneighbors(G₂, v1)
(v2 > v1 && v2 in vs) && continue # these edges will be deleted later
v1 == v2 && continue # we should not count a self loop twice
new_cost += edge_insert_cost(Edge(v2, v1))
end
end
end
enqueue!(OPEN, λ⁺, new_cost + h(λ⁺) - h(λ))
end
end
end
end
function is_complete_path(λ, G₁, G₂)
us = Set()
vs = Set()
for (u, v) in λ
push!(us, u)
push!(vs, v)
end
delete!(us, 0)
delete!(vs, 0)
return length(us) == nv(G₁) && length(vs) == nv(G₂)
end
# edit_distance(G₁::AbstractGraph, G₂::AbstractGraph) =
# edit_distance(G₁, G₂,
# vertex_insert_cost=v -> 0.,
# vertex_delete_cost=u -> 0.,
# vertex_subst_cost=(u, v) -> 0.,
# edge_insert_cost=e -> 1.,
# edge_delete_cost=e -> 1.,
# edge_subst_cost=(e1, e2) -> 0.,
# heuristic=default_edit_heuristic)
"""
compute an upper bound on the number of edges that can still be affected
"""
function default_edit_heuristic(λ, G₁::AbstractGraph, G₂::AbstractGraph)
us = setdiff(1:nv(G₁), [u for (u, v) in λ])
vs = setdiff(1:nv(G₂), [v for (u, v) in λ])
total_free_edges_g1 = 0
total_free_edges_g2 = 0
if !isempty(us)
total_free_edges_g1 = sum(u -> outdegree(G₁, u), us)
end
if !isempty(vs)
total_free_edges_g2 = sum(v -> outdegree(G₂, v), vs)
end
for (u1, v1) in λ
(u1 == 0 || v1 == 0) && continue
total_free_edges_g1 += count(u2 -> u2 in us, outneighbors(G₁, u1))
total_free_edges_g2 += count(v2 -> v2 in vs, outneighbors(G₂, v1))
end
if !is_directed(G₁) && !is_directed(G₂)
total_free_edges_g1 = total_free_edges_g1 / 2
total_free_edges_g2 = total_free_edges_g2 / 2
end
return abs(total_free_edges_g1 - total_free_edges_g2)
end
#-------------------------
# Edit path cost functions
#-------------------------
"""
MinkowskiCost(μ₁, μ₂; p::Real=1)
For labels μ₁ on the vertices of graph G₁ and labels μ₂ on the vertices
of graph G₂, compute the p-norm cost of substituting vertex u ∈ G₁ by
vertex v ∈ G₂.
### Optional Arguments
`p=1`: the p value for p-norm calculation.
"""
function MinkowskiCost(μ₁::AbstractVector, μ₂::AbstractVector; p::Real=1)
return (u, v) -> norm(μ₁[u] - μ₂[v], p)
end
"""
BoundedMinkowskiCost(μ₁, μ₂)
Return value similar to [`MinkowskiCost`](@ref), but ensure costs smaller than 2τ.
### Optional Arguments
`p=1`: the p value for p-norm calculation.
`τ=1`: value specifying half of the upper limit of the Minkowski cost.
"""
function BoundedMinkowskiCost(μ₁::AbstractVector, μ₂::AbstractVector; p::Real=1, τ::Real=1)
return (u, v) -> 1 / (1 / (2τ) + exp(-norm(μ₁[u] - μ₂[v], p)))
end