Skip to content

Commit 295da8b

Browse files
committed
Lingering file committed
1 parent b00fc37 commit 295da8b

File tree

1 file changed

+268
-0
lines changed

1 file changed

+268
-0
lines changed

sais.py

+268
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
""" SA-IS"""
2+
3+
BYTESIZE = 256
4+
5+
def naive_build_suff_arr(string):
    """Build a suffix array by sorting every suffix directly.

    Returns the start offsets ``0 .. len(string)`` (the last offset is the
    empty suffix) ordered by the slice ``string[offset:]``.  Serves as a
    simple reference implementation to compare other builders against.
    """
    offsets = list(range(len(string) + 1))
    # Each offset is ranked by the suffix that begins there.
    offsets.sort(key=lambda start: string[start:])
    return offsets
7+
8+
9+
10+
def build_type_map(string):
    """Classify every suffix of ``string`` as S-type or L-type.

    Returns a list of ``len(string) + 1`` booleans: ``True`` marks an S-type
    position, ``False`` an L-type position.  The extra final entry stands for
    the empty suffix and is always S-type; the last real character is always
    L-type.
    """
    length = len(string)
    # Everything starts as L-type except the trailing empty suffix.
    type_flags = [False] * length + [True]

    # Scan right to left, starting at the second-to-last character: a
    # position is S-type when its character is smaller than the next one, and
    # inherits the next position's type when the two characters are equal.
    for pos in reversed(range(length - 1)):
        here = string[pos]
        after = string[pos + 1]
        if here < after:
            type_flags[pos] = True
        elif here == after:
            type_flags[pos] = type_flags[pos + 1]

    return type_flags
23+
24+
def is_LMS(is_S_typemap, index):
    """Tell whether ``index`` is a left-most S-type (LMS) position.

    A position is LMS when it is S-type and the position immediately before
    it is L-type.  Index 0 has no predecessor and is never LMS.
    """
    if index == 0:
        return False
    return is_S_typemap[index] and not is_S_typemap[index - 1]
26+
27+
28+
def print_type_LMS(string):
    """Print ``string`` with its S/L type row and LMS marker row underneath.

    ``string`` must provide ``decode`` (e.g. ``bytes``) and contain ASCII
    data.  Three lines are written to stdout: the decoded text, one ``S`` or
    ``L`` per position (including the empty suffix), and a ``^`` under every
    LMS position.
    """
    print(string.decode("ascii"))

    type_flags = build_type_map(string)
    print("".join("S" if flag else "L" for flag in type_flags))

    markers = (
        "^" if is_LMS(type_flags, pos) else " "
        for pos in range(len(type_flags))
    )
    print("".join(markers))
40+
41+
def is_equal_lms(string, is_S_typemap, indA, indB):
    """Return True when the LMS substrings starting at indA and indB match.

    Both indices are assumed to be LMS positions.  Two LMS substrings are
    equal when, from their start up to and including the next LMS position,
    they contain the same characters and have LMS markers at the same
    offsets.

    The empty suffix (index ``len(string)``) never equals anything when it
    is passed as a start index.  If the scan of one substring runs into the
    empty suffix, the substrings are equal only when both scans are at that
    same index; the original code tried to read ``string[len(string)]`` in
    that situation and raised ``IndexError``.
    """
    end = len(string)
    if indA == end or indB == end:
        return False

    pos = 0
    while True:
        a_pos = indA + pos
        b_pos = indB + pos

        # The empty suffix has no character to compare, so decide here
        # before anything is indexed: a substring ending at the sentinel can
        # only match another one ending at the very same position.
        if a_pos == end or b_pos == end:
            return a_pos == b_pos

        a_is_LMS = is_LMS(is_S_typemap, a_pos)
        b_is_LMS = is_LMS(is_S_typemap, b_pos)

        # Reached the end of one LMS substring but not the other
        if a_is_LMS != b_is_LMS:
            return False

        # Characters are different
        if string[a_pos] != string[b_pos]:
            return False

        # Both reached the next LMS position with everything equal so far
        if pos > 0 and a_is_LMS and b_is_LMS:
            return True
        pos += 1
63+
64+
def calc_bucket_sizes(string, alphabet_size):
    """Count how often each symbol occurs in ``string``.

    ``string`` must yield integers usable as indices into a list of length
    ``alphabet_size``.  Returns a list where entry ``c`` is the number of
    occurrences of symbol ``c``.
    """
    counts = [0 for _ in range(alphabet_size)]
    for symbol in string:
        counts[symbol] = counts[symbol] + 1
    return counts
69+
70+
def calc_bucket_heads(bucket_sizes):
    """Return the first suffix-array slot of every bucket.

    Slot 0 is reserved for the empty suffix, so the first bucket starts at
    slot 1 and each following bucket starts right after the previous one.
    """
    heads = []
    next_free = 1
    for size in bucket_sizes:
        heads.append(next_free)
        next_free += size
    return heads
77+
78+
def calc_bucket_tails(bucket_sizes):
    """Return the last suffix-array slot of every bucket.

    The tail of a bucket is the running total of all bucket sizes up to and
    including that bucket (slot 0 belongs to the empty suffix, so no extra
    offset is needed).
    """
    tails = []
    filled = 0
    for size in bucket_sizes:
        filled += size
        tails.append(filled)
    return tails
85+
86+
def print_suff_arr(arr, pos=None):
    """Print ``arr`` as space-separated, zero-padded two-digit numbers.

    When ``pos`` is given, a second line is printed that places ``^^``
    under the entry at index ``pos``.
    """
    print(" ".join("%02d" % each for each in arr))

    if pos is not None:
        # Number cells are two characters wide, so the blank cells have to
        # be two characters as well or the marker drifts left of its entry.
        print(" ".join(
            "^^" if index == pos else "  "
            for index in range(len(arr))
        ))
94+
95+
def build_suffix_arr_SAIS(string, alphabet_size):
    """Build the complete suffix array of ``string`` with SA-IS.

    ``string`` is a sequence of integer symbols (e.g. ``bytes``) that are
    valid indices into a list of length ``alphabet_size``.  Returns a list of
    ``len(string) + 1`` suffix start offsets whose first entry is the empty
    suffix.  Every stage prints its intermediate arrays, and a blank line is
    printed after each stage.
    """
    type_flags = build_type_map(string)
    sizes = calc_bucket_sizes(string, alphabet_size)

    def induce(partial):
        # Induced sorting: L-type pass, then S-type pass, each followed by a
        # blank separator line.  ``partial`` is updated in place.
        sort_L_type(string, partial, sizes, type_flags)
        print()
        sort_S_type(string, partial, sizes, type_flags)
        print()

    # Stage 1: rough placement of the LMS suffixes, then induce the rest.
    guess = approx_LMS_sort(string, sizes, type_flags)
    print()
    induce(guess)

    # Stage 2: name the LMS substrings and sort the reduced string (this may
    # call back into this function).
    reduced, reduced_alphabet, lms_offsets = summarize_suff_arr(string, guess, type_flags)
    print()
    reduced_order = build_summ_suff_arr(reduced, reduced_alphabet)
    print()

    # Stage 3: place the LMS suffixes in their exact order and induce again.
    result = final_LMS_sort(string, sizes, type_flags, reduced_order, lms_offsets)
    print()
    induce(result)

    return result
118+
119+
def approx_LMS_sort(string, bucket_sizes, is_S_typemap):
    """Create a first-guess suffix array containing only the LMS suffixes.

    Every LMS position is dropped at the current tail of the bucket of its
    first character, so LMS suffixes are ordered by first character only.
    All other slots hold ``-1``, except slot 0 which holds the empty suffix.
    The array is printed before and after the placement and then returned.
    """
    length = len(string)
    guess = [-1] * (length + 1)
    # The empty suffix is lexicographically smallest
    guess[0] = length
    print_suff_arr(guess)

    free_slot = calc_bucket_tails(bucket_sizes)

    # Bucket sort by first character, LMS positions only
    for start, symbol in enumerate(string):
        if is_LMS(is_S_typemap, start):
            guess[free_slot[symbol]] = start
            free_slot[symbol] -= 1

    print_suff_arr(guess)
    return guess
138+
139+
def sort_L_type(string, suff_arr, bucket_sizes, is_S_typemap):
    """Place the L-type suffixes into ``suff_arr`` with a left-to-right scan.

    For every entry, the suffix starting one position earlier is examined;
    if it is L-type it is written at the current head of the bucket of its
    first character and that head moves right.  ``suff_arr`` is modified in
    place and printed at the end; nothing is returned.
    """
    next_head = calc_bucket_heads(bucket_sizes)

    # Scan by index: entries written further right during this pass are
    # visited later in the same pass.
    for slot in range(len(suff_arr)):
        prev = suff_arr[slot] - 1
        # prev < 0 covers both unfilled slots (-1) and suffix 0, which has
        # no predecessor.
        if prev >= 0 and not is_S_typemap[prev]:
            symbol = string[prev]
            suff_arr[next_head[symbol]] = prev
            next_head[symbol] += 1

    print_suff_arr(suff_arr)
152+
153+
def sort_S_type(string, suff_arr, bucket_sizes, is_S_typemap):
    """Place the S-type suffixes into ``suff_arr`` with a right-to-left scan.

    For every entry, the suffix starting one position earlier is examined;
    if it is S-type it is written at the current tail of the bucket of its
    first character and that tail moves left.  ``suff_arr`` is modified in
    place and printed at the end; nothing is returned.
    """
    next_tail = calc_bucket_tails(bucket_sizes)

    # Scan by index from the right: entries written further left during
    # this pass are visited later in the same pass.
    for slot in range(len(suff_arr) - 1, -1, -1):
        prev = suff_arr[slot] - 1
        # prev < 0 covers both unfilled slots (-1) and suffix 0, which has
        # no predecessor.
        if prev >= 0 and is_S_typemap[prev]:
            symbol = string[prev]
            suff_arr[next_tail[symbol]] = prev
            next_tail[symbol] -= 1

    print_suff_arr(suff_arr)
166+
167+
def summarize_suff_arr(string, approx_suff_arr, is_S_typemap):
    """Reduce ``string`` to a summary string of LMS-substring names.

    Walks ``approx_suff_arr`` in order and gives every LMS suffix a name
    (an integer); consecutive LMS substrings that compare equal share a
    name.  The empty suffix always gets name 0.

    Returns ``(summ_str, summ_alph_size, summ_suff_inds)``: the names in
    text order, the number of distinct names, and for each summary symbol
    the offset in ``string`` it came from.  Intermediate arrays are printed.
    """
    sentinel = len(string)
    names_by_pos = [-1] * (sentinel + 1)

    # The empty suffix is the first LMS suffix and receives name 0.
    name = 0
    names_by_pos[sentinel] = name
    prev_lms = sentinel
    print_suff_arr(names_by_pos)

    # Slot 0 (the empty suffix) has been handled above.
    for candidate in approx_suff_arr[1:]:
        if is_LMS(is_S_typemap, candidate):
            # A new name is needed only when this LMS substring differs from
            # the previous one in sorted order.
            if not is_equal_lms(string, is_S_typemap, prev_lms, candidate):
                name += 1
            prev_lms = candidate
            names_by_pos[candidate] = name
    # NOTE(review): assumed to run once after the loop, like the other
    # passes in this file - confirm against the original indentation.
    print_suff_arr(names_by_pos)

    # Collect the named positions in text order.
    named = [(pos, tag) for pos, tag in enumerate(names_by_pos) if tag != -1]
    summ_suff_inds = [pos for pos, _ in named]
    summ_str = [tag for _, tag in named]

    summ_alph_size = name + 1
    print()
    print_suff_arr(summ_str)
    print()
    print_suff_arr(summ_suff_inds)
    return summ_str, summ_alph_size, summ_suff_inds
201+
202+
def build_summ_suff_arr(summ_str, summ_alph_size):
    """Return the suffix array of the summary string.

    When every symbol of ``summ_str`` is unique (alphabet size equals the
    length) each symbol is its own rank and the array is filled in directly.
    Otherwise the summary string is sorted by a recursive SA-IS call.  The
    result has ``len(summ_str) + 1`` entries, starting with the empty suffix.
    """
    length = len(summ_str)
    if summ_alph_size != length:
        # Duplicate names remain: recursively build the suffix array.
        return build_suffix_arr_SAIS(summ_str, summ_alph_size)

    order = [-1] * (length + 1)
    order[0] = length
    # Slot 0 is taken by the empty suffix, so rank r lands in slot r + 1.
    for position, rank in enumerate(summ_str):
        order[rank + 1] = position
    return order
213+
214+
def final_LMS_sort(string, bucket_sizes, is_S_typemap, summ_suff_arr, summ_suff_indices):
    """Seed a fresh suffix array with the LMS suffixes in their sorted order.

    ``summ_suff_arr`` is the suffix array of the summary string and
    ``summ_suff_indices`` maps summary positions back to offsets in
    ``string``.  The LMS suffixes are written to the tails of their
    first-character buckets, largest first.  All other slots hold ``-1``,
    except slot 0 which holds the empty suffix.  The array is printed and
    returned.
    """
    length = len(string)
    result = [-1] * (length + 1)
    result[0] = length
    print_suff_arr(result)

    free_slot = calc_bucket_tails(bucket_sizes)

    # Entries 0 and 1 of summ_suff_arr are skipped; only ranks 2 and up are
    # placed, walking from the highest rank down so each bucket fills from
    # its tail.
    for rank in reversed(range(2, len(summ_suff_arr))):
        origin = summ_suff_indices[summ_suff_arr[rank]]
        symbol = string[origin]
        result[free_slot[symbol]] = origin
        free_slot[symbol] -= 1
    # NOTE(review): assumed to run once after the loop, like the other
    # passes in this file - confirm against the original indentation.
    print_suff_arr(result)

    return result
228+
229+
230+
231+
232+
if __name__ == "__main__":
    # Demo run, executed only when this file is started as a script so that
    # importing the module neither prints anything nor terminates the
    # importing interpreter.
    #
    # Shows the S/L types and LMS positions of a sample input, builds its
    # suffix array with SA-IS and prints the naive result underneath for
    # comparison.
    string = b"caabcaac"
    print_type_LMS(string)
    print()

    suff_arr = build_suffix_arr_SAIS(string, BYTESIZE)
    print()
    print_suff_arr(suff_arr)
    print()

    print_suff_arr(naive_build_suff_arr(string))

0 commit comments

Comments
 (0)