Skip to content

Commit b00fc37

Browse files
committed
Final solution
1 parent ecade47 commit b00fc37

File tree

4 files changed

+349
-319
lines changed

4 files changed

+349
-319
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
# lcs-suffix
1+
# lcs-suffix
2+
3+
Solution to the longest common substring problem for large inputs. Use sol.py for the fastest algorithm.

filelcsDP.py old_sol_DP.py

+11-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import sys
22
import os
3-
import time
3+
# import time
44

55
# Reference DP Solution
66

@@ -35,7 +35,7 @@ def get_lcs_offsets(f1_data, f2_data):
3535
return maxlen, [min(offset_1, offset_2), max(offset_1, offset_2)]
3636

3737

38-
start = time.time()
38+
# start = time.time()
3939

4040
filenames = sys.argv[1:]
4141

@@ -58,13 +58,13 @@ def get_lcs_offsets(f1_data, f2_data):
5858
print("ERROR: FILE '{}' DOES NOT EXIST.".format(name2))
5959
f2_len = os.path.getsize(name2)
6060
f2_data = (f2_len, name2, f2.read())
61-
print(name1, name2)
62-
comp_start = time.time()
61+
# print(name1, name2)
62+
# comp_start = time.time()
6363
lcs_len, lcs_offset = get_lcs_offsets(max(f1_data, f2_data), min(f1_data, f2_data))
64-
comp_end = time.time()
65-
print(lcs_offset, lcs_len)
66-
print("Elapsed time for computation: {} seconds".format(comp_end - comp_start))
67-
print()
64+
# comp_end = time.time()
65+
# print(lcs_offset, lcs_len)
66+
# print("Elapsed time for computation: {} seconds".format(comp_end - comp_start))
67+
# print()
6868
if lcs_len > 0:
6969
if lcs_len > maxlen:
7070
maxlen = lcs_len
@@ -78,6 +78,6 @@ def get_lcs_offsets(f1_data, f2_data):
7878
for off in offsets:
7979
print("File name: {}, Offset where sequence begins: {}".format(off[0], off[1]))
8080

81-
end = time.time()
82-
print()
83-
print("DP Computation: {} seconds".format(end - start))
81+
# end = time.time()
82+
# print()
83+
# print("DP Computation: {} seconds".format(end - start))

old_sol_suffix.py

+276
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
import sys
2+
# import time
3+
4+
""" Manber-Myers suffix array construction - O(n*log^2(n)) - inspired by GeeksForGeeks """
5+
6+
def build_suffix_arr(string):
    """ Constructs and returns the suffix array and LCP array

    `string` is indexed by position, and its elements are compared with each other and with the
    integer -1 (used below as the "past the end" marker). It is therefore expected to be a
    sequence of integers >= 0, e.g. a tuple of ints or a bytes object.

    Returns a pair (suffix_array, lcp_array):
      - suffix_array[r] is the start index of the r-th smallest suffix
      - lcp_array[r] is the length of the longest common prefix of the suffixes at ranks r and r+1
        (so it has len(string)-1 entries, or none for an empty string)
    """
    length = len(string)

    # During computation, every suffix is represented by three numbers: the rank of its first half,
    # the rank of its second half, and the index it corresponds to.
    # The index is irrelevant to sorting in construction of the suffix array, so it is at the end of
    # the list so it naturally works with python's sort.
    suffs = [
        [string[i], string[i+1] if i < length-1 else -1, i]
        for i in range(length)
    ]
    suffs.sort()

    # Invariant at the top of each pass: suffs is sorted by the first k symbols of every suffix.
    k = 2
    inds = [0] * length
    while k < length:
        curr_rank = 0
        prev_rank = suffs[0][0]
        suffs[0][0] = curr_rank
        inds[suffs[0][2]] = 0
        no_change = True

        # Re-rank: neighbours with identical (first, second) keys share a rank.
        for i in range(1, length):
            if suffs[i][0] == prev_rank and suffs[i][1] == suffs[i-1][1]:
                suffs[i][0] = curr_rank
                no_change = False
            else:
                prev_rank = suffs[i][0]
                curr_rank += 1
                suffs[i][0] = curr_rank
            inds[suffs[i][2]] = i

        # The second key of a suffix is the rank of the suffix starting k symbols later.
        for i in range(length):
            next_ind = suffs[i][2] + k
            suffs[i][1] = suffs[inds[next_ind]][0] if next_ind < length else -1

        # Every suffix already has a distinct rank, so the order is final.
        if no_change:
            break
        suffs.sort()
        k *= 2

    suffs_arr = [s[2] for s in suffs]

    # `inds` was filled in BEFORE the last sort (or never, when the loop did not run), so it is not
    # guaranteed to be the inverse of the final suffix array. Kasai's algorithm needs the exact
    # inverse, otherwise it starts comparisons too far in and over-counts. Rebuild it here.
    for position, start in enumerate(suffs_arr):
        inds[start] = position

    return suffs_arr, compute_lcp_arr(string, suffs_arr, inds)


""" LCP construction with Kasai's algorithm - O(n) """

def compute_lcp_arr(string, suffs, rank=None):
    """ Constructs the LCP array

    `suffs` is the suffix array of `string`; `rank` must be its inverse (rank[suffs[r]] == r) and is
    computed with compute_rank when omitted. Entry r of the result is the longest common prefix of
    the suffixes at positions r and r+1 of `suffs`.
    """
    if rank is None:
        rank = compute_rank(suffs)
    lcp_arr = [0] * (len(suffs)-1)
    last_lcp = 0
    for suffix_rank in rank:
        # Skip computation if the rank corresponds to last element in suffix array (no successor)
        if suffix_rank == len(lcp_arr):
            last_lcp = 0
            continue
        # The next suffix in text order shares at least last_lcp-1 symbols with its successor,
        # so the comparison can resume from there instead of from zero.
        next_lcp = compute_lcp(string, suffs[suffix_rank], suffs[suffix_rank + 1], max(0, last_lcp-1))
        last_lcp = next_lcp
        lcp_arr[suffix_rank] = next_lcp
    return lcp_arr

def compute_lcp(string, suff1, suff2, start):
    """ Computes the LCP of two given suffixes

    `suff1` and `suff2` are start indices into `string`; the first `start` symbols are assumed to
    match already and are not re-compared.
    """
    assert start >= 0
    lcp = start
    s1 = min(suff1, suff2) + start
    s2 = max(suff1, suff2) + start
    # s1 <= s2, so bounding s2 is enough to keep both indices in range.
    while s2 < len(string) and string[s1] == string[s2]:
        lcp += 1
        s1 += 1
        s2 += 1
    return lcp

def compute_rank(suffs):
    """ Computes rank array (inverse of suffix array) - Not needed in current implementation """
    rank = [0] * len(suffs)
    for position, start in enumerate(suffs):
        rank[start] = position
    return rank
92+
93+
94+
""" Misc. functions to help compute LCS """
95+
96+
def get_type(ind_to_type, index):
    """ Determines what file a given position in the string comes from

    `ind_to_type` maps every position of the combined string to a file number, so this is a
    direct index lookup (no search is performed).
    """
    return ind_to_type[index]
99+
100+
def get_offset(sentinels, file_ind, str_ind):
    """ Converts an index into the combined string to an offset inside file number `file_ind`

    The offset is counted from the position right after sentinels[file_ind].
    """
    boundary = sentinels[file_ind]
    return str_ind - (boundary + 1)
103+
104+
105+
""" Process Input """
106+
107+
# At least two input files are required (argv[0] is the script itself).
if len(sys.argv) <= 2:
    print("Usage: python filelcs.py <file> <file> ... <file>")
    # sys.exit instead of the site-provided exit(), which is meant for interactive use
    sys.exit()

filenames = sys.argv[1:]
ind_to_type = []
sentinels = [0] * (len(filenames) + 1)
# Placeholder for "imaginary" sentinel at beginning of string
sentinels[0] = -1
# Sentinel will range from 0 - len(filenames)-1. In the case of the 10 sample files, sentinels will be 0-9
cur_sentinel = 0

# Read bytes in, inject separating sentinels starting from 0.
# Values are collected in a list and converted to a tuple once at the end, so the data read so far
# is not re-copied for every additional file.
combined = []
for i, name in enumerate(filenames):
    try:
        with open(name, "rb") as f:
            string = f.read()
    except FileNotFoundError:
        print("ERROR: FILE '{}' DOES NOT EXIST.".format(name))
        sys.exit()

    # Convert all bytes of the file to integers, shifted up by the number of sentinels needed so
    # that no byte value can collide with a sentinel value, then terminate the file with its sentinel
    combined.extend(byte + len(filenames) for byte in string)
    combined.append(cur_sentinel)
    # Position of this file's sentinel in the combined string
    sentinels[i+1] = len(combined) - 1
    # Every position of this file (including its sentinel) belongs to file number cur_sentinel
    ind_to_type.extend([cur_sentinel] * (len(string)+1))
    cur_sentinel += 1

string_nums = tuple(combined)

# Sanity checks: the final sentinel is len(filenames)-1 and all sentinels were used
assert string_nums[-1] == len(filenames)-1
assert cur_sentinel == len(filenames)
138+
139+
""" Build Data Structures """
140+
141+
# start = time.time()
142+
suffs, lcp = build_suffix_arr(string_nums)
# end = time.time()
# print("Suffix array + LCP construction took {} seconds".format(end - start))


""" Find LCS """

# start = time.time()

longest = 0
lcp_ind = 0

# Look for the largest LCP value whose two neighbouring suffixes come from different files.
# The scan starts at index len(filenames); the sentinel values are smaller than every shifted
# byte value, so the entries before that index presumably all belong to sentinel suffixes.
for cur_pos in range(len(filenames), len(lcp)):
    if lcp[cur_pos] <= longest:
        continue
    if get_type(ind_to_type, suffs[cur_pos]) == get_type(ind_to_type, suffs[cur_pos+1]):
        continue
    longest, lcp_ind = lcp[cur_pos], cur_pos

if longest == 0:
    print("There is no common sequence of bytes in the given files.")
else:
    print("Length of longest shared strand of bytes: {}".format(longest))
    cur_type = get_type(ind_to_type, suffs[lcp_ind])
    files_checked = {cur_type}
    offsets = [[filenames[cur_type], get_offset(sentinels, cur_type, suffs[lcp_ind])]]

    # Walk forward through the run of LCP entries equal to the maximum, recording the first
    # occurrence found for each file not reported yet; stop once every file has been seen.
    cur_lcp_ind = lcp_ind
    while cur_lcp_ind < len(lcp) and lcp[cur_lcp_ind] == longest and len(files_checked) < len(filenames):
        neighbour = suffs[cur_lcp_ind+1]
        cur_type = get_type(ind_to_type, neighbour)
        if cur_type not in files_checked:
            files_checked.add(cur_type)
            offsets.append([filenames[cur_type], get_offset(sentinels, cur_type, neighbour)])
        cur_lcp_ind += 1

    for off in offsets:
        print("File name: {}, Offset where sequence begins: {}".format(off[0], off[1]))
176+
177+
# end = time.time()
178+
# print("LCS Computation: {} seconds".format(end - start))
179+
180+
181+
182+
183+
184+
""" Manber-Myers with radix sort - O(nlogn) - turns out to be consistently slower in empirical tests """
185+
186+
# def counting_sort_ranks(arr, sort_ind):
187+
# largest = max(arr, key=lambda e: e[sort_ind])
188+
# counts = [0] * (largest[sort_ind] + 1)
189+
# out = [None] * len(arr)
190+
# for elem in arr:
191+
# counts[elem[sort_ind]] += 1
192+
# # Make cumulative
193+
# for i in range(1, len(counts)):
194+
# counts[i] += counts[i-1]
195+
# # Construct output
196+
# for elem in reversed(arr):
197+
# counts[elem[sort_ind]] -= 1
198+
# out[counts[elem[sort_ind]]] = elem
199+
200+
# return out
201+
202+
# def radix_sort_ranks(arr):
203+
# arr = counting_sort_ranks(arr, 1)
204+
# # print(arr)
205+
# arr = counting_sort_ranks(arr, 0)
206+
# # print(arr)
207+
# return arr
208+
209+
# def build_suffix_arr_radix(string):
210+
# length = len(string)
211+
212+
# # During computation, every suffix is represented by three numbers: the rank of its first half, the rank of its second half, and the index it corresponds to.
213+
# # The index is irrelevant to sorting in construction of the suffix array, so it is at the end of the list so it naturally works with python's sort.
214+
# suffs = [[0, 0, 0] for _ in range(length)]
215+
216+
# for i in range(length):
217+
# suffs[i][2]= i
218+
# # Shift numbers up by 1 so that indicator for end of suffix can be 0
219+
# suffs[i][0] = string[i]+1
220+
# suffs[i][1] = string[i+1]+1 if i < length-1 else 0
221+
222+
# suffs = radix_sort_ranks(suffs)
223+
224+
# k = 2
225+
# inds = [0] * length
226+
# while k < length:
227+
# curr_rank = 0
228+
# prev_rank = suffs[0][0]
229+
# suffs[0][0] = curr_rank
230+
# inds[suffs[0][2]] = 0
231+
# no_change = True
232+
233+
# for i in range(1, length):
234+
# if suffs[i][0] == prev_rank and suffs[i][1] == suffs[i-1][1]:
235+
# suffs[i][0] = curr_rank
236+
# no_change = False
237+
# else:
238+
# prev_rank = suffs[i][0]
239+
# curr_rank += 1
240+
# suffs[i][0] = curr_rank
241+
# inds[suffs[i][2]] = i
242+
243+
# for i in range(length):
244+
# next_ind = suffs[i][2] + k
245+
# suffs[i][1] = suffs[inds[next_ind]][0] if next_ind < length else 0
246+
247+
# if no_change:
248+
# break
249+
# suffs = radix_sort_ranks(suffs)
250+
# k *= 2
251+
252+
# suffs_arr = [s[2] for s in suffs]
253+
# return suffs_arr, compute_lcp_arr(string, suffs_arr, inds)
254+
255+
256+
257+
# import random
258+
# test = []
259+
# for _ in range(100000):
260+
# test.append([random.randint(0, 10000), random.randint(0, 10000)])
261+
262+
263+
# print("Normal start:")
264+
# start = time.time()
265+
# normal = sorted(test)
266+
# end = time.time()
267+
# print("Normal sort: {} seconds".format(end-start))
268+
269+
# print()
270+
# print("Radix start:")
271+
# start = time.time()
272+
# rad = radix_sort_ranks(test)
273+
# end = time.time()
274+
# print("Radix sort: {} seconds".format(end-start))
275+
276+
# print("Normal == Radix: {}".format(normal == rad))

0 commit comments

Comments
 (0)