-
Notifications
You must be signed in to change notification settings - Fork 557
/
ksa.c
242 lines (229 loc) · 8.63 KB
/
ksa.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
/*
* Copyright (c) 2008 Yuta Mori All Rights Reserved.
* 2011 Attractive Chaos <[email protected]>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/* This is a library for constructing the suffix array for a string containing
* multiple sentinels with sentinels all represented by 0. The last symbol in
* the string must be a sentinel. The library is modified from an early version
* of Yuta Mori's SAIS library, but is slower than the lastest SAIS by about
* 30%, partly due to the recent optimization Yuta has applied and partly due
* to the extra comparisons between sentinels. This is not the first effort in
* supporting multi-sentinel strings, but is probably the easiest to use. */
#include <stdlib.h>
#ifdef _KSA64
#include <stdint.h>
typedef int64_t saint_t;
#define SAINT_MAX INT64_MAX
#define SAIS_CORE ksa_core64
#define SAIS_BWT ksa_bwt64
#define SAIS_MAIN ksa_sa64
#else
#include <limits.h>
typedef int saint_t;
#define SAINT_MAX INT_MAX
#define SAIS_CORE ksa_core
#define SAIS_BWT ksa_bwt
#define SAIS_MAIN ksa_sa
#endif
/* T is of type "const unsigned char*". If T[i] is a sentinel, chr(i) takes a negative value */
#define chr(i) (cs == sizeof(saint_t) ? ((const saint_t *)T)[i] : (T[i]? (saint_t)T[i] : i - SAINT_MAX))
/** Count the occurrences of each symbol */
static void getCounts(const unsigned char *T, saint_t *C, saint_t n, saint_t k, int cs)
{
saint_t i;
for (i = 0; i < k; ++i) C[i] = 0;
for (i = 0; i < n; ++i) {
saint_t c = chr(i);
++C[c > 0? c : 0];
}
}
/**
* Find the end of each bucket
*
* @param C occurrences computed by getCounts(); input
* @param B start/end of each bucket; output
* @param k size of alphabet
* @param end compute the end of bucket if true; otherwise compute the end
*/
static inline void getBuckets(const saint_t *C, saint_t *B, saint_t k, saint_t end)
{
saint_t i, sum = 0;
if (end) for (i = 0; i < k; ++i) sum += C[i], B[i] = sum;
else for (i = 0; i < k; ++i) sum += C[i], B[i] = sum - C[i];
}
/** Induced sort */
static void induceSA(const unsigned char *T, saint_t *SA, saint_t *C, saint_t *B, saint_t n, saint_t k, saint_t cs)
{
saint_t *b, i, j;
saint_t c0, c1;
/* left-to-right induced sort (for L-type) */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 0); /* find starts of buckets */
for (i = 0, b = 0, c1 = -1; i < n; ++i) {
j = SA[i], SA[i] = ~j;
if (0 < j) { /* >0 if j-1 is L-type; <0 if S-type; ==0 undefined */
--j;
if ((c0 = chr(j)) != c1) {
B[c1 > 0? c1 : 0] = b - SA;
c1 = c0;
b = SA + B[c1 > 0? c1 : 0];
}
*b++ = (0 < j && chr(j - 1) < c1) ? ~j : j;
}
}
/* right-to-left induced sort (for S-type) */
if (C == B) getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = n - 1, b = 0, c1 = -1; 0 <= i; --i) {
if (0 < (j = SA[i])) { /* the prefix is S-type */
--j;
if ((c0 = chr(j)) != c1) {
B[c1 > 0? c1 : 0] = b - SA;
c1 = c0;
b = SA + B[c1 > 0? c1 : 0];
}
if (c0 > 0) *--b = (j == 0 || chr(j - 1) > c1) ? ~j : j;
} else SA[i] = ~j; /* if L-type, change the sign */
}
}
/**
* Recursively construct the suffix array for a string containing multiple
* sentinels. NULL is taken as the sentinel.
*
* @param T NULL terminated input string (there can be multiple NULLs)
* @param SA output suffix array
* @param fs working space available in SA (typically 0 when first called)
* @param n length of T, including the trailing NULL
* @param k size of the alphabet (typically 256 when first called)
* @param cs # bytes per element in T; 1 or sizeof(saint_t) (typically 1 when first called)
*
* @return 0 upon success
*/
int SAIS_CORE(const unsigned char *T, saint_t *SA, saint_t fs, saint_t n, saint_t k, int cs)
{
saint_t *C, *B;
saint_t i, j, c, m, q, qlen, name;
saint_t c0, c1;
/* STAGE I: reduce the problem by at least 1/2 sort all the S-substrings */
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
else {
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
B = cs == 1? C + k : C;
}
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = 0; i < n; ++i) SA[i] = 0;
/* mark L and S (the t array in Nong et al.), and keep the positions of LMS in the buckets */
for (i = n - 2, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1); c==1 if in an S run */
else if (c) SA[--B[c1 > 0? c1 : 0]] = i + 1, c = 0;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C);
/* pack all the sorted LMS into the first m items of SA
2*m must be not larger than n (see Nong et al. for the proof) */
for (i = 0, m = 0; i < n; ++i) {
saint_t p = SA[i];
if (p == n - 1) SA[m++] = p;
else if (0 < p && chr(p - 1) > (c0 = chr(p))) {
for (j = p + 1; j < n && c0 == (c1 = chr(j)); ++j);
if (j < n && c0 < c1) SA[m++] = p;
}
}
for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */
/* store the length of all substrings */
for (i = n - 2, j = n, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1; /* c1 = chr(i+1) */
else if (c) SA[m + ((i + 1) >> 1)] = j - i - 1, j = i + 1, c = 0;
}
/* find the lexicographic names of all substrings */
for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
saint_t p = SA[i], plen = SA[m + (p >> 1)], diff = 1;
if (plen == qlen) {
for (j = 0; j < plen && chr(p + j) == chr(q + j); j++);
if (j == plen) diff = 0;
}
if (diff) ++name, q = p, qlen = plen;
SA[m + (p >> 1)] = name;
}
/* STAGE II: solve the reduced problem; recurse if names are not yet unique */
if (name < m) {
saint_t *RA = SA + n + fs - m - 1;
for (i = n - 1, j = m - 1; m <= i; --i)
if (SA[i] != 0) RA[j--] = SA[i];
RA[m] = 0; // add a sentinel; in the resulting SA, SA[0]==m always stands
if (SAIS_CORE((unsigned char *)RA, SA, fs + n - m * 2 - 2, m + 1, name + 1, sizeof(saint_t)) != 0) return -2;
for (i = n - 2, j = m - 1, c = 1, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) {
if ((c0 = chr(i)) < c1 + c) c = 1;
else if (c) RA[j--] = i + 1, c = 0; /* get p1 */
}
for (i = 0; i < m; ++i) SA[i] = RA[SA[i+1]]; /* get index */
}
/* STAGE III: induce the result for the original problem */
if (k <= fs) C = SA + n, B = (k <= fs - k) ? C + k : C;
else {
if ((C = (saint_t*)malloc(k * (1 + (cs == 1)) * sizeof(saint_t))) == NULL) return -2;
B = cs == 1? C + k : C;
}
/* put all LMS characters into their buckets */
getCounts(T, C, n, k, cs);
getBuckets(C, B, k, 1); /* find ends of buckets */
for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */
for (i = m - 1; 0 <= i; --i) {
j = SA[i], SA[i] = 0;
c = chr(j);
SA[--B[c > 0? c : 0]] = j;
}
induceSA(T, SA, C, B, n, k, cs);
if (fs < k) free(C);
return 0;
}
/**
* Construct the suffix array for a NULL terminated string possibly containing
* multiple sentinels (NULLs).
*
* @param T[0..n-1] NULL terminated input string
* @param SA[0..n-1] output suffix array
* @param n length of the given string, including NULL
* @param k size of the alphabet including the sentinel; no more than 256
* @return 0 upon success
*/
int SAIS_MAIN(const unsigned char *T, saint_t *SA, saint_t n, int k)
{
if (T == NULL || SA == NULL || T[n - 1] != '\0' || n <= 0) return -1;
if (k < 0 || k > 256) k = 256;
return SAIS_CORE(T, SA, 0, n, (saint_t)k, 1);
}
int SAIS_BWT(unsigned char *T, saint_t n, int k)
{
saint_t *SA, i;
int ret;
if ((SA = malloc(n * sizeof(saint_t))) == 0) return -1;
if ((ret = SAIS_MAIN(T, SA, n, k)) != 0) return ret;
for (i = 0; i < n; ++i)
if (SA[i]) SA[i] = T[SA[i] - 1]; // if SA[i]==0, SA[i]=0
for (i = 0; i < n; ++i) T[i] = SA[i];
free(SA);
return 0;
}