-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_with_scc.py
270 lines (238 loc) · 7.59 KB
/
search_with_scc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#
# Exam search functions
#
def multi_lookup(index, query):
keywords = query
urls = []
for match in index[keywords[0]]:
if len(keywords) == 1:
union(urls, [match[0]])
else:
union(urls, find(index, keywords[1:], match[1]+1))
return urls
def find(index, keywords, i):
matches = []
for match in index[keywords[0]]:
if match[1] == i:
if len(keywords) == 1:
union(matches, [match[0]])
else:
union(matches, find(index, keywords[1:], match[1]+1))
return matches
def crawl_web(seed): # returns index, graph of inlinks
tocrawl = [seed]
crawled = []
graph = {} # <url>, [list of pages it links to]
index = {}
while tocrawl:
page = tocrawl.pop()
if page not in crawled:
content = get_page(page)
add_page_to_index(index, page, content)
outlinks = get_all_links(content)
graph[page] = outlinks
union(tocrawl, outlinks)
crawled.append(page)
return index, graph
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
return None, 0
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
return url, end_quote
def get_all_links(page):
links = []
while True:
url, endpos = get_next_target(page)
if url:
links.append(url)
page = page[endpos:]
else:
break
return links
def union(a, b):
for e in b:
if e not in a:
a.append(e)
def add_page_to_index(index, url, content):
words = content.split()
for i in range(len(words)):
word = words[i]
add_to_index(index, word, url, i)
def add_to_index(index, keyword, url, i):
if keyword in index:
index[keyword].append([url, i])
else:
index[keyword] = [[url, i]]
def lookup(index, keyword):
if keyword in index:
return index[keyword]
else:
return None
def get_page(url):
if url in cache:
return cache[url]
else:
print "Page not in cache: " + url
return None
#
# Functions for computing strongly connected components (web pages)
#
# notmalize the graph by adding missing nodes
def normilize_graph(graph):
normalized_graph = {}
for u in graph:
for v in graph[u]:
if v not in normalized_graph:
normalized_graph[v] = []
if u not in normalized_graph:
normalized_graph[u] = []
normalized_graph[u].append(v)
return normalized_graph
# reverse the directed of all graph edges
def reverse_graph(graph):
reversed_graph = {}
for u in graph:
for v in graph[u]:
if v not in reversed_graph:
reversed_graph[v] = []
if u not in reversed_graph:
reversed_graph[u] = []
reversed_graph[v].append(u)
return reversed_graph
# helper function that uses depth first search to timestamp all vertices
def dfs_timestamp(graph, explored, timestamps, i):
global timestamp
explored[i] = 1
for v in graph[i]:
if v not in explored:
dfs_timestamp(graph, explored, timestamps, v)
timestamp += 1
timestamps[timestamp] = i
# timestamp all vertices of the graph
def timestamp_graph(graph):
explored = {}
timestamps = {}
for v in sorted(graph.keys(), reverse=True):
if v not in explored:
dfs_timestamp(graph, explored, timestamps, v)
return timestamps
# helper fuction that uses depth firts search to set leader vertex of all graph
# verticies
def dfs_leader(graph, explored, leaders, i, leader):
explored[i] = 1
leaders[i] = leader
for v in graph[i]:
if v not in explored:
dfs_leader(graph, explored, leaders, v, leader)
# set leaders for all the vertices of the graph
def graph_leaders(graph, timestamps):
explored = {}
leaders = {}
for t in sorted(timestamps.keys(), reverse=True):
i = timestamps[t]
if i not in explored:
leader = i
dfs_leader(graph,explored, leaders, i, leader)
return leaders
# Ohh, nooo ... a global variable :(
timestamp = 0
# high level fuction for setting leaders of all graph verticies
def compute_leaders(graph):
G = normilize_graph(graph)
Grev = reverse_graph(G)
timestamps = timestamp_graph(G)
return graph_leaders(Grev, timestamps)
# sample web geaph with two clustes of pages
# one cluster about python snakes
# the other cluster about python language
cache = {
'http://udacity.com/pythons.html': """
python snake
python language
<a href="http://udacity.com/python/snakes2.html">A</a><br>
<a href="http://udacity.com/python/language6.html">A</a><br>
<a href="http://udacity.com/python/language7.html">A</a><br>
""",
# cluster about python snakes
'http://udacity.com/python/snakes1.html': """
python snake
<a href="http://udacity.com/python/snakes2.html">A</a><br>
""",
'http://udacity.com/python/snakes2.html': """
python snake
<a href="http://udacity.com/python/snakes3.html">A</a><br>
<a href="http://udacity.com/python/snakes4.html">A</a><br>
""",
'http://udacity.com/python/snakes3.html': """
python snake
<a href="http://udacity.com/python/snakes1.html">A</a><br>
""",
'http://udacity.com/python/snakes4.html': """
python snake
""",
# cluster about python language
'http://udacity.com/python/language6.html': """
python language
<a href="http://udacity.com/python/language7.html">A</a><br>
""",
'http://udacity.com/python/language7.html': """
python language
<a href="http://udacity.com/python/language8.html">A</a><br>
<a href="http://udacity.com/python/language9.html">A</a><br>
""",
'http://udacity.com/python/language8.html': """
python language
<a href="http://udacity.com/python/language6.html">A</a><br>
""",
'http://udacity.com/python/language9.html': """
python language
""",
}
# crawl and index the webpages
index, graph = crawl_web('http://udacity.com/pythons.html')
# find strongly connected web pages
leaders = compute_leaders(graph)
# print "index", index
# print "graph", graph
# print "leaders", leaders
# search
keywords = ['python', 'language']
# keywords = ['python', 'snake']
results = multi_lookup(index, keywords)
search_results = []
for result in results:
leader = leaders[result]
connected = []
for c in leaders.keys():
if leaders[c] == leader and c != result:
connected.append(c)
search_results.append({'url': result, 'related': connected })
# print results
print "Search results for", keywords
for r in sorted(search_results, key = lambda result: len(result['related']),
reverse=True):
print r['url']
if r['related']:
print " Strongly connected web pages:"
for c in r['related']:
print " ", c
#sample output
#zeus:~/projects/CS101 (master)$ python search_with_scc.py
#Search results for ['python', 'language']
#http://udacity.com/python/language7.html
# Strongly connected web pages:
# http://udacity.com/python/language8.html
# http://udacity.com/python/language6.html
#http://udacity.com/python/language8.html
# Strongly connected web pages:
# http://udacity.com/python/language6.html
# http://udacity.com/python/language7.html
#http://udacity.com/python/language6.html
# Strongly connected web pages:
# http://udacity.com/python/language8.html
# http://udacity.com/python/language7.html
#http://udacity.com/python/language9.html
#http://udacity.com/pythons.html