"""SA-IS suffix array construction, with each stage traced to stdout."""

# Number of distinct values a single byte can take (2**8); presumably the
# alphabet size to pass when the input is a ``bytes`` object.
BYTESIZE = 256
def naive_build_suff_arr(string):
    """Return the suffix array of ``string`` by sorting the suffixes directly.

    The result has ``len(string) + 1`` entries because the empty suffix
    (start index ``len(string)``) is included.

    Every comparison slices out a whole suffix, so this is only suitable as a
    reference implementation for small inputs.
    """
    return sorted(range(len(string) + 1), key=lambda start: string[start:])
def build_type_map(string):
    """Classify every suffix of ``string`` as S-type or L-type.

    Returns a list of ``len(string) + 1`` booleans, one per suffix start index
    (the last entry is the empty suffix): ``True`` means S-type, ``False``
    means L-type.
    """
    length = len(string)
    is_S_typemap = [False] * (length + 1)

    # The empty suffix is S-type. The last real character keeps the default
    # ``False`` (L-type), since the loop below never visits it.
    is_S_typemap[-1] = True

    # Right-to-left scan: a suffix is S-type when its first character is
    # smaller than the next one, or equal to it while the next suffix is
    # itself S-type. For an empty or one-character string the range is empty.
    for i in range(length - 2, -1, -1):
        here, after = string[i], string[i + 1]
        if here < after or (here == after and is_S_typemap[i + 1]):
            is_S_typemap[i] = True

    return is_S_typemap
def is_LMS(is_S_typemap, index):
    """Return True if ``index`` is a left-most S-type (LMS) position.

    A position is LMS when it is S-type and the position immediately before it
    is L-type. Index 0 has no predecessor and is therefore never LMS.
    """
    if index == 0:
        return False
    return is_S_typemap[index] and not is_S_typemap[index - 1]
def print_type_LMS(string):
    """Print ``string`` followed by its type row and its LMS marker row.

    ``string`` must be a ``bytes``-like object containing ASCII only, because
    it is decoded for display. Three lines are written to stdout:

    1. the decoded text,
    2. one ``S`` or ``L`` per suffix (including the empty suffix),
    3. a ``^`` under every LMS position and a space everywhere else.
    """
    print(string.decode("ascii"))

    is_S_typemap = build_type_map(string)
    print("".join("S" if is_S else "L" for is_S in is_S_typemap))
    print("".join(
        "^" if is_LMS(is_S_typemap, i) else " "
        for i in range(len(is_S_typemap))
    ))
def is_equal_lms(string, is_S_typemap, indA, indB):
    """Return True if the LMS substrings starting at ``indA`` and ``indB`` match.

    Both indices are assumed to be LMS positions. An LMS substring runs from
    its LMS position up to and including the next LMS position. Two substrings
    are equal when they have the same length and the same characters.

    The empty suffix (index ``len(string)``) acts as a unique sentinel: it is
    never equal to a real character, so a substring that ends on it can only
    equal itself.
    """
    end = len(string)

    # Nothing else is equal to the empty suffix.
    if indA == end or indB == end:
        return False

    pos = 0
    while True:
        a, b = indA + pos, indB + pos
        a_is_LMS = is_LMS(is_S_typemap, a)
        b_is_LMS = is_LMS(is_S_typemap, b)

        # One substring reached its closing LMS position before the other.
        if a_is_LMS != b_is_LMS:
            return False

        # One side has run onto the sentinel. There is no character to read
        # there (indexing would raise IndexError), and the sentinel differs
        # from every real character, so the substrings are equal only if both
        # sides are on it.
        if a == end or b == end:
            return a == b

        # Characters are different.
        if string[a] != string[b]:
            return False

        # Both sides reached their closing LMS position with matching
        # characters all the way through.
        if pos > 0 and a_is_LMS:
            return True

        pos += 1
def calc_bucket_sizes(string, alphabet_size):
    """Count how many times each symbol occurs in ``string``.

    ``string`` must iterate as integers (``bytes`` or a list of ints), each
    usable as an index into a list of length ``alphabet_size``. Returns a list
    of ``alphabet_size`` counts, indexed by symbol.

    Raises ``IndexError`` if a symbol is ``>= alphabet_size``.
    """
    sizes = [0] * alphabet_size
    for symbol in string:
        sizes[symbol] += 1
    return sizes
def calc_bucket_heads(bucket_sizes):
    """Return, for each symbol, the first suffix-array slot of its bucket.

    Slot 0 of the suffix array is kept for the empty suffix, so the first
    bucket starts at slot 1 and every later bucket starts right after the
    previous one ends.
    """
    heads = []
    next_free = 1
    for size in bucket_sizes:
        heads.append(next_free)
        next_free += size
    return heads
def calc_bucket_tails(bucket_sizes):
    """Return, for each symbol, the last suffix-array slot of its bucket.

    The value for a symbol is the running total of all bucket sizes up to and
    including that symbol. Because slot 0 is kept for the empty suffix, that
    total is the (inclusive) index of the bucket's final slot.
    """
    tails = []
    filled = 0
    for size in bucket_sizes:
        filled += size
        tails.append(filled)
    return tails
def print_suff_arr(arr, pos=None):
    """Print ``arr`` on one line as space-separated, zero-padded numbers.

    Each value is formatted with ``%02d``, so ``-1`` (used as the empty-slot
    marker) prints as ``-1``. If ``pos`` is given, a second line is printed
    with ``^^`` underneath the cell at index ``pos``.
    """
    print(" ".join("%02d" % each for each in arr))

    if pos is not None:
        # Filler cells are two characters wide, like the "^^" marker and the
        # two-digit numbers above, so the marker lines up under its cell.
        print(" ".join(
            "^^" if each == pos else "  "
            for each in range(len(arr))
        ))
def build_suffix_arr_SAIS(string, alphabet_size):
    """Build the complete suffix array of ``string`` with SA-IS.

    ``string`` is a sequence of integer symbols (``bytes`` or a list of ints),
    each smaller than ``alphabet_size``. The returned list has
    ``len(string) + 1`` entries: the suffix start indices in sorted order,
    beginning with the empty suffix (index ``len(string)``).

    Every stage prints its intermediate arrays to stdout, with a blank line
    between stages.
    """
    is_S_typemap = build_type_map(string)
    bucket_sizes = calc_bucket_sizes(string, alphabet_size)

    # Pass 1: drop the LMS suffixes into their buckets in arbitrary order and
    # induce-sort the rest. This sorts the LMS substrings.
    approx_suff_arr = approx_LMS_sort(string, bucket_sizes, is_S_typemap)
    print()
    sort_L_type(string, approx_suff_arr, bucket_sizes, is_S_typemap)
    print()
    sort_S_type(string, approx_suff_arr, bucket_sizes, is_S_typemap)
    print()

    # Name the LMS substrings, then sort the (shorter) string of names,
    # recursing if some names repeat.
    summ_str, summ_alph_size, summ_suff_indices = summarize_suff_arr(
        string, approx_suff_arr, is_S_typemap
    )
    print()
    summ_suff_arr = build_summ_suff_arr(summ_str, summ_alph_size)
    print()

    # Pass 2: place the LMS suffixes in their true order and induce-sort
    # again to obtain the final array.
    final_suff_arr = final_LMS_sort(
        string, bucket_sizes, is_S_typemap, summ_suff_arr, summ_suff_indices
    )
    print()
    sort_L_type(string, final_suff_arr, bucket_sizes, is_S_typemap)
    print()
    sort_S_type(string, final_suff_arr, bucket_sizes, is_S_typemap)
    print()

    return final_suff_arr
def approx_LMS_sort(string, bucket_sizes, is_S_typemap):
    """Return a suffix array holding only LMS suffixes, bucketed by first symbol.

    Slots that receive no suffix stay ``-1``. LMS suffixes that share a first
    symbol are written from the tail of their bucket in text order, so inside
    a bucket their relative order is not yet meaningful.

    The array is printed after the empty suffix is placed and after each LMS
    suffix is placed.
    """
    approx_suff_arr = [-1] * (len(string) + 1)
    # Empty string is lexicographically smallest
    approx_suff_arr[0] = len(string)
    print_suff_arr(approx_suff_arr)
    bucket_tails = calc_bucket_tails(bucket_sizes)

    # Bucket sort by first char - only LMS substrings
    for i, char_num in enumerate(string):
        if not is_LMS(is_S_typemap, i):
            continue

        approx_suff_arr[bucket_tails[char_num]] = i
        bucket_tails[char_num] -= 1

        # NOTE(review): assumed to be a per-placement trace (inside the loop);
        # confirm against the intended debug output.
        print_suff_arr(approx_suff_arr)
    return approx_suff_arr
def sort_L_type(string, suff_arr, bucket_sizes, is_S_typemap):
    """Induce the positions of all L-type suffixes into ``suff_arr`` in place.

    Scans ``suff_arr`` left to right. For every suffix found, if the suffix
    starting one position earlier is L-type, it is written at the head of its
    first-symbol bucket. Returns ``None``; the array is printed after each
    insertion.
    """
    bucket_heads = calc_bucket_heads(bucket_sizes)

    # Iterating the list while writing to it is deliberate: entries written
    # ahead of the cursor must be visited later in this same scan, and a list
    # iterator reads each slot at the moment it reaches it.
    for suff in suff_arr:
        L_suff = suff - 1
        # Negative covers both an empty slot (-1) and suffix 0, which has no
        # predecessor.
        if L_suff < 0 or is_S_typemap[L_suff]:
            continue

        char_num = string[L_suff]
        suff_arr[bucket_heads[char_num]] = L_suff
        bucket_heads[char_num] += 1

        # NOTE(review): assumed to be a per-insertion trace (inside the loop);
        # confirm against the intended debug output.
        print_suff_arr(suff_arr)
def sort_S_type(string, suff_arr, bucket_sizes, is_S_typemap):
    """Induce the positions of all S-type suffixes into ``suff_arr`` in place.

    Scans ``suff_arr`` right to left. For every suffix found, if the suffix
    starting one position earlier is S-type, it is written at the tail of its
    first-symbol bucket (overwriting whatever was there). Returns ``None``;
    the array is printed after each insertion.
    """
    bucket_tails = calc_bucket_tails(bucket_sizes)

    # As in the L-type pass, the live list is scanned on purpose: entries
    # written behind the cursor must still be visited by this scan.
    for suff in reversed(suff_arr):
        prev_suff = suff - 1
        # Negative covers both an empty slot (-1) and suffix 0, which has no
        # predecessor.
        if prev_suff < 0 or not is_S_typemap[prev_suff]:
            continue

        char_num = string[prev_suff]
        suff_arr[bucket_tails[char_num]] = prev_suff
        bucket_tails[char_num] -= 1

        # NOTE(review): assumed to be a per-insertion trace (inside the loop);
        # confirm against the intended debug output.
        print_suff_arr(suff_arr)
def summarize_suff_arr(string, approx_suff_arr, is_S_typemap):
    """Name the LMS substrings and build the summary string from the names.

    ``approx_suff_arr`` is walked in order. Each LMS suffix found gets the
    same name as the previous LMS suffix when their LMS substrings are equal,
    and the next unused name otherwise. The empty suffix always gets name 0.

    Returns ``(summ_str, summ_alph_size, summ_suff_inds)``:

    * ``summ_str`` - the names, listed in text order of their LMS positions,
    * ``summ_alph_size`` - number of distinct names,
    * ``summ_suff_inds`` - the LMS positions in ``string``, parallel to
      ``summ_str``.

    The name table and both result lists are printed along the way.
    """
    lms_names = [-1] * (len(string) + 1)

    # The empty suffix is handled up front: it is the first LMS suffix and
    # takes the smallest name.
    cur_name = 0
    last_LMS_ind = len(string)
    lms_names[last_LMS_ind] = cur_name
    print_suff_arr(lms_names)

    # Slot 0 is skipped; the empty suffix was named above.
    for suff_ind in approx_suff_arr[1:]:
        if not is_LMS(is_S_typemap, suff_ind):
            continue
        if not is_equal_lms(string, is_S_typemap, last_LMS_ind, suff_ind):
            cur_name += 1
        last_LMS_ind = suff_ind
        lms_names[suff_ind] = cur_name
        # NOTE(review): assumed to be a per-name trace (inside the loop);
        # confirm against the intended debug output.
        print_suff_arr(lms_names)

    # Reading the name table left to right yields the names in text order.
    summ_suff_inds = []
    summ_str = []
    for ind, name in enumerate(lms_names):
        if name != -1:
            summ_suff_inds.append(ind)
            summ_str.append(name)

    summ_alph_size = cur_name + 1
    print()
    print_suff_arr(summ_str)
    print()
    print_suff_arr(summ_suff_inds)
    return summ_str, summ_alph_size, summ_suff_inds
def build_summ_suff_arr(summ_str, summ_alph_size):
    """Return the suffix array of the summary string ``summ_str``.

    The result has ``len(summ_str) + 1`` entries and starts with the empty
    suffix, in the same layout ``build_suffix_arr_SAIS`` produces.
    """
    if summ_alph_size != len(summ_str):
        # Some names repeat, so the order is not yet determined.
        # Recursively make suffix array of new string
        return build_suffix_arr_SAIS(summ_str, summ_alph_size)

    # Every name occurs exactly once, so a name is already the rank of the
    # suffix that starts with it. Shift by one for the empty suffix in slot 0.
    summ_suff_arr = [-1] * (len(summ_str) + 1)
    summ_suff_arr[0] = len(summ_str)
    for i, rank_num in enumerate(summ_str):
        summ_suff_arr[rank_num + 1] = i
    return summ_suff_arr
def final_LMS_sort(string, bucket_sizes, is_S_typemap, summ_suff_arr, summ_suff_indices):
    """Return a suffix array holding only the LMS suffixes, in their true order.

    ``summ_suff_arr`` is the suffix array of the summary string and
    ``summ_suff_indices`` maps a summary position back to a position in
    ``string``. Slots that receive no suffix stay ``-1``. ``is_S_typemap`` is
    accepted but not used here.

    The array is printed after the empty suffix is placed and after each LMS
    suffix is placed.
    """
    suff_arr = [-1] * (len(string) + 1)
    suff_arr[0] = len(string)
    print_suff_arr(suff_arr)
    bucket_tails = calc_bucket_tails(bucket_sizes)

    # Walk the summary suffix array from largest to smallest so that each
    # bucket is filled from its tail downwards. The walk stops before index 1:
    # entry 0 is the summary's own empty suffix, and entry 1 is expected to be
    # the summary symbol of the original empty suffix, already in slot 0.
    for i in range(len(summ_suff_arr) - 1, 1, -1):
        str_ind = summ_suff_indices[summ_suff_arr[i]]
        char_num = string[str_ind]
        suff_arr[bucket_tails[char_num]] = str_ind
        bucket_tails[char_num] -= 1
        # NOTE(review): assumed to be a per-placement trace (inside the loop);
        # confirm against the intended debug output.
        print_suff_arr(suff_arr)

    return suff_arr
if __name__ == "__main__":
    # Demo: trace SA-IS on a small sample and print the naively sorted suffix
    # array underneath for comparison. Guarded so that importing this module
    # does not print anything or terminate the importing process.
    # Another handy sample input: b'rikki-tikki-tikka'
    string = b"caabcaac"
    print_type_LMS(string)
    print()

    suff_arr = build_suffix_arr_SAIS(string, BYTESIZE)
    print()
    print_suff_arr(suff_arr)
    print()

    print_suff_arr(naive_build_suff_arr(string))