Skip to content

Commit 5f5e434

Browse files
committed
First apparently functional solution with suffix arrays
0 parents  commit 5f5e434

14 files changed

+501
-0
lines changed

Problem - Longest Strand.md

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
Given a large number of binary files, write a program that finds the
3+
longest strand of bytes that is identical between two or more files
4+
5+
Use the test set attached (files sample.*)
6+
7+
The program should display:
8+
- the length of the strand
9+
- the names of the files in which the longest strand appears
10+
- the offset where the strand appears in each file

filelcsDP.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import sys
2+
import os
3+
4+
# Reference DP Solution
5+
6+
# At least two input files are required (sys.argv[0] is the script itself),
# otherwise there is no pair of files to compare.
if len(sys.argv) <= 2:
    # Show the name the script was actually invoked with rather than a
    # hard-coded (and outdated) file name.
    print("Usage: python {} <file> <file> ... <file>".format(sys.argv[0]))
    # sys.exit is always available, unlike the interactive `exit` helper that
    # the `site` module injects; a usage error is reported with status 1.
    sys.exit(1)
9+
10+
11+
# def get_lcs_offsets(f1_data, f2_data):
12+
# """ f2 is the smaller file """
13+
# f1_len, f1_name, f1 = f1_data
14+
# f2_len, f2_name, f2 = f2_data
15+
# matrix = [[0] * (f2_len+1), [0] * (f2_len+2)]
16+
# f1.seek(0)
17+
# maxlen = 0
18+
# of1 = 0
19+
# of2 = 0
20+
# for i in range(1, f1_len+1):
21+
# b1 = f1.read(1)
22+
# f2.seek(0)
23+
# # print(i)
24+
# for j in range(1, f2_len+1):
25+
# b2 = f2.read(1)
26+
# if b1 == b2:
27+
# num = matrix[(i%2) - 1][j-1] + 1
28+
# matrix[i%2][j] = num
29+
# if num > maxlen:
30+
# maxlen = num
31+
# of1 = i
32+
# of2 = j
33+
# offset_1 = (f1_name, of1)
34+
# offset_2 = (f2_name, of2)
35+
# return maxlen, [min(offset_1, offset_2), max(offset_1, offset_2)]
36+
37+
def get_lcs_offsets(f1_data, f2_data):
    """Find the longest run of bytes shared by two files.

    Uses the classic longest-common-substring dynamic programme, keeping only
    the previous and the current row of the table.

    Args:
        f1_data: ``(length, name, file)`` tuple for the first file; ``file``
            must support ``seek`` and ``read`` and yield ``length`` bytes.
        f2_data: Same shape as ``f1_data``; the caller passes the smaller file
            here so that the DP rows are as short as possible.

    Returns:
        ``(maxlen, [offset_a, offset_b])`` where ``maxlen`` is the length of
        the longest common run and each offset is a ``(name, index)`` tuple,
        ``index`` being the 0-based position of the LAST byte of the run in
        that file. The two offsets are ordered by ``(name, index)``. When
        nothing matches, ``maxlen`` is 0 and both indices are 0.
    """
    f1_len, f1_name, f1 = f1_data
    f2_len, f2_name, f2 = f2_data

    f1.seek(0)
    f1_str = f1.read()
    f2.seek(0)
    f2_str = f2.read()

    maxlen = 0
    of1 = 0
    of2 = 0
    # prev_row[j] = length of the common run ending at f1[i-2] and f2[j-1].
    prev_row = [0] * (f2_len + 1)
    for i in range(1, f1_len + 1):
        b1 = f1_str[i - 1]
        # Start every row from zeros: a mismatch must leave 0 in its cell.
        # Reusing an old row without clearing it lets stale run lengths from
        # two rows back leak into later matches and inflate the result.
        cur_row = [0] * (f2_len + 1)
        for j in range(1, f2_len + 1):
            if b1 == f2_str[j - 1]:
                num = prev_row[j - 1] + 1
                cur_row[j] = num
                if num > maxlen:
                    maxlen = num
                    of1 = i - 1
                    of2 = j - 1
        prev_row = cur_row

    offset_1 = (f1_name, of1)
    offset_2 = (f2_name, of2)
    return maxlen, [min(offset_1, offset_2), max(offset_1, offset_2)]
64+
65+
66+
filenames = sys.argv[1:]


def _open_or_exit(name):
    """Open *name* for binary reading; report a missing file and exit."""
    try:
        return open(name, "rb")
    except FileNotFoundError:
        print("ERROR: FILE '{}' DOES NOT EXIST.".format(name))
        sys.exit(1)


maxlen = 0
offsets = []
# Compare every unordered pair of input files exactly once.
for i, name1 in enumerate(filenames[:-1]):
    # `with` guarantees the handle is closed even when we exit early because
    # a later file turns out to be missing.
    with _open_or_exit(name1) as f1:
        f1_data = (os.path.getsize(name1), name1, f1)
        for name2 in filenames[i + 1:]:
            with _open_or_exit(name2) as f2:
                f2_data = (os.path.getsize(name2), name2, f2)
                # get_lcs_offsets wants the larger file first. Order on
                # (size, name) only, so a tie never falls through to
                # comparing the file objects (which raises TypeError).
                if f1_data[:2] >= f2_data[:2]:
                    larger, smaller = f1_data, f2_data
                else:
                    larger, smaller = f2_data, f1_data
                lcs_len, lcs_offset = get_lcs_offsets(larger, smaller)
            if lcs_len > 0:
                if lcs_len > maxlen:
                    maxlen = lcs_len
                    offsets = lcs_offset
                # NOTE(review): an equally long match that shares its first
                # file with the current best is assumed to be the same strand
                # showing up in one more file; the bytes are not compared.
                elif lcs_len == maxlen and lcs_offset[0][0] == offsets[0][0]:
                    offsets.append(lcs_offset[1])

print("Length of longest shared strand of bytes: {}".format(maxlen))
for off in offsets:
    # off[1] is the index of the strand's last byte, so the strand begins
    # maxlen - 1 bytes earlier.
    print("File name: {}, Offset where sequence begins: {}".format(off[0], off[1] - maxlen + 1))

0 commit comments

Comments
 (0)