Skip to content

Commit d04598c

Browse files
committed
Further improvements for casefolding
1 parent 40cd160 commit d04598c

File tree

6 files changed

+436
-31
lines changed

6 files changed

+436
-31
lines changed

REQUIRE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
julia 0.6 2-
2-
ModuleInterfaceTools 0.1.6
2+
ModuleInterfaceTools 0.1.7
33
StrAPI 0.1.8
44
CharSetEncodings 0.1.8

src/CaseTables.jl

+321
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
__precompile__(true)
2+
"""
3+
Case folding tables for Unicode characters
4+
5+
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
6+
Licensed under MIT License, see LICENSE.md
7+
"""
8+
module CaseTables
9+
10+
"""
    CaseTable

Immutable bundle of the top-level index tables and 128-bit flag words built by
`case_tables()`.  The fields are positional constructor arguments, so their order
matters.
"""
struct CaseTable
    # Per-1024-code-point chunk: index into the offset-table vector (0 = no entry).
    # `l_tab`/`u_tab` have 128 slots; `t_tab` has 64.
    l_tab::NTuple{128,UInt8}
    u_tab::NTuple{128,UInt8}
    t_tab::NTuple{64,UInt8}

    # Per-512-code-point range: index into the bitmap vector (0 = all bits clear)
    is_l_tab::NTuple{256,UInt8}
    is_u_tab::NTuple{256,UInt8}
    can_l_tab::NTuple{256,UInt8}
    can_u_tab::NTuple{256,UInt8}

    # One bit per 512-code-point range: something in the range can be case-converted
    can_l_flg::UInt128
    can_u_flg::UInt128
    can_t_flg::UInt128
    can_sl_flg::UInt128
    can_su_flg::UInt128

    # One bit per 512-code-point range: the range contains lower/uppercase characters
    is_l_flg::UInt128
    is_u_flg::UInt128
    is_sl_flg::UInt128
    is_su_flg::UInt128

    # One bit per 512-code-point range: a conversion changes the UTF-8 encoded length
    siz_l_flg::UInt128
    siz_u_flg::UInt128
    # Last code point recorded whose lower/uppercase form has a different UTF-8 length
    max_siz_l::UInt32
    max_siz_u::UInt32
end
36+
37+
# Before Julia 0.7 the predicates were named `islower` / `isupper`; alias them so the
# rest of this module can use the newer names unconditionally.
@static if VERSION < v"0.7-"
    const islowercase = islower
    const isuppercase = isupper
end
38+
39+
# Calculate tables (later move these to library built based on Unicode tables with BinaryBuilder,
40+
# loaded by BinaryProvider)
41+
42+
# For speed, we want to know which characters can be lowercased, uppercased, or titlecased
43+
# We also want to know which ones grow or shrink when encoded as UTF-8
44+
45+
# UTF-8 encoded length (1 to 4 bytes) of code point `cp`: one byte, plus one more for
# each of the thresholds 0x7f, 0x7ff and 0xffff that `cp` exceeds (branch-free).
@inline utf8_len(cp::Unsigned) = 1 + (cp > 0x7f) + (cp > 0x7ff) + (cp > 0xffff)
46+
# UTF-8 encoded length of a `Char`: reduce it to a `UInt32` with `%` and defer to the
# `Unsigned` method.
@inline utf8_len(ch::Char) = utf8_len(ch%UInt32)
47+
48+
# A single all-zero 32-bit word
const _z4 = zero(UInt32)

# Sixteen zero words: the value of an empty (no bits set) bitmap tuple
const zerotup = ntuple(i -> _z4, 16)
52+
53+
# Intern the bitmap currently held in the scratch vector `tmp` into `vec`.
#
# Returns 0x0 when `tmp` equals `zerotup` (nothing is stored).  Otherwise `tmp` is
# zeroed for reuse and the 1-based position of the matching tuple in `vec` is returned
# as a `UInt8`, appending the tuple first if no equal one is already present.
function _add_to_bv!(vec, tmp)
    key = (tmp...,)
    if key == zerotup
        return 0x0
    end
    # The snapshot is taken, so the scratch buffer can be cleared for the next range
    fill!(tmp, 0%UInt32)
    idx = 0
    for entry in vec
        idx += 1
        entry == key && return UInt8(idx)
    end
    push!(vec, key)
    return UInt8(length(vec))
end
63+
64+
# case tables should be:
65+
# 128-bit bitmap to say whether a particular range of 512 code points can be folded
66+
# 64-byte NTuple{32,UInt16} giving the mapped value for each character in a block of 32
67+
# Build every case-mapping lookup table by scanning code points with Base's
# `lowercase`, `uppercase`, `titlecase`, `islowercase` and `isuppercase`.
#
# Returns a 6-tuple `(CaseTable, tupvec, offvec, bitvec, sizvecl, sizvecu)`, where the
# five vectors built below are converted to tuples.
#
# Calls `error` when an assumption of the table layout does not hold: a UTF-8 size
# change outside -2:1, titlecase and uppercase of different UTF-8 size, an SLP
# character whose titlecase differs from its uppercase or whose mappings are not
# 4 bytes long, or any case mapping at or above 0x20000.
function case_tables()
68+
bitvec = NTuple{16,UInt32}[] # each element represents 512 characters
69+
tupvec = NTuple{32,UInt16}[] # each element represents 32 characters
70+
offvec = NTuple{32,UInt8}[] # each element maps 1024 character range to offsets in tupvec
71+
72+
# Per 32-character block: `base => bits`, two bits per character holding `diff & 3`,
# where `diff` is the UTF-8 length change when lower/uppercasing
sizvecl = Pair{UInt16,UInt64}[]
73+
sizvecu = Pair{UInt16,UInt64}[]
74+
75+
# Top level tables (one slot per 1024 characters; values are indexes into offvec)
76+
l_tab = fill(0x0, 128)
77+
u_tab = fill(0x0, 128)
78+
t_tab = fill(0x0, 64)
79+
80+
# The elements of these are all offsets into bitvec
81+
is_l_tab = fill(0x0, 256)
82+
is_u_tab = fill(0x0, 256)
83+
can_l_tab = fill(0x0, 256)
84+
can_u_tab = fill(0x0, 256)
85+
is_l_flg = is_u_flg = is_sl_flg = is_su_flg = 0%UInt128
86+
87+
# These tables get reused, stored in offvec
88+
l_off = fill(0x0, 32)
89+
u_off = fill(0x0, 32)
90+
t_off = fill(0x0, 32)
91+
92+
# These tables get reused, stored in tupvec
93+
tmp_l = fill(0x0000, 32)
94+
tmp_u = fill(0x0000, 32)
95+
tmp_t = fill(0x0000, 32)
96+
97+
# These tables get reused, stored in bitvec
98+
bit_is_l = fill(0%UInt32, 16)
99+
bit_is_u = fill(0%UInt32, 16)
100+
bit_can_l = fill(0%UInt32, 16)
101+
bit_can_u = fill(0%UInt32, 16)
102+
103+
# Handle BMP (0x0080-0xffff in blocks of 32; 0x00-0x7f and 0xd800-0xdfff are not scanned)
104+
l_mid = u_mid = t_mid = false
105+
can_l_flg = can_u_flg = can_t_flg = 0%UInt128
106+
siz_l_flg = siz_u_flg = 0%UInt128
107+
max_siz_l = max_siz_u = 0%UInt32
108+
tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32
109+
for rng in (0x0080:0x20:0xd7e0, 0xe000:0x20:0xffe0), base in rng
110+
# 1-based index of the 1024-character chunk containing this block
hipos = (base >>> 10) + 1
111+
# Flag bit for the 512-character range containing this block
hibit = UInt128(1) << (base >>> 9)
112+
# 1-based position (1-32) of this block within its 1024-character chunk
midbits = ((base >>> 5) & 0x1f) + 1
113+
114+
# Handle block of 32 characters
115+
l_flg = u_flg = t_flg = false
116+
diff_l = diff_u = 0%UInt64
117+
for off = 0x00:0x1f
118+
cp = UInt16(base + off)
119+
lowbit = UInt32(1) << off
120+
ch = Char(cp)
121+
cl = UInt16(lowercase(ch))
122+
cu = UInt16(uppercase(ch))
123+
ct = UInt16(titlecase(ch))
124+
tmp_l[off+1] = cl
125+
tmp_u[off+1] = cu
126+
tmp_t[off+1] = ct
127+
islowercase(ch) && (tmp_is_l |= lowbit)
128+
isuppercase(ch) && (tmp_is_u |= lowbit)
129+
# Nothing more to record when no case mapping changes the character
cp === cl === cu === ct && continue
130+
sizc = utf8_len(ch)
131+
sizu = utf8_len(cu)
132+
sizt = utf8_len(ct)
133+
if cl !== cp
134+
l_flg = true
135+
tmp_can_l |= lowbit
136+
diff = utf8_len(cl) - sizc
137+
-2 <= diff <= 1 || error("Size difference for $cp -> $cl is $diff")
138+
# Pack the non-zero size change into two bits and remember the latest such code point
diff == 0 || (diff_l |= (UInt64(diff & 3)<<(off<<1)); max_siz_l = cp)
139+
end
140+
if cu !== cp
141+
u_flg = true
142+
tmp_can_u |= lowbit
143+
diff = sizu - sizc
144+
-2 <= diff <= 1 || error("Size difference for $cp -> $cu is $diff")
145+
diff == 0 || (diff_u |= (UInt64(diff & 3)<<(off<<1)); max_siz_u = cp)
146+
end
147+
# A separate titlecase table is only needed where titlecase differs from uppercase
if ct !== cu
148+
t_flg = true
149+
sizu == sizt ||
150+
error("Titlecase and Uppercase are not same size in UTF-8: " *
151+
"$cp $cu:$sizu $ct:$sizt")
152+
end
153+
end
154+
155+
# Slot (1-16) of this block's 32-bit word within the 512-character bitmap
bitoff = ((base >>> 5) & 0xf) + 1
156+
bit_is_l[bitoff] = tmp_is_l
157+
bit_is_u[bitoff] = tmp_is_u
158+
bit_can_l[bitoff] = tmp_can_l
159+
bit_can_u[bitoff] = tmp_can_u
160+
161+
tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32
162+
163+
if l_flg
164+
can_l_flg |= hibit
165+
l_mid = true # Have at least one in this set of blocks
166+
push!(tupvec, tuple(tmp_l...))
167+
l_off[midbits] = UInt8(length(tupvec))
168+
end
169+
if u_flg
170+
can_u_flg |= hibit
171+
u_mid = true
172+
push!(tupvec, tuple(tmp_u...))
173+
u_off[midbits] = UInt8(length(tupvec))
174+
end
175+
if t_flg
176+
can_t_flg |= hibit
177+
t_mid = true
178+
push!(tupvec, tuple(tmp_t...))
179+
t_off[midbits] = UInt8(length(tupvec))
180+
else
181+
# No titlecase-specific mapping in this block: share the uppercase entry
t_off[midbits] = u_off[midbits]
182+
end
183+
184+
diff_l == 0 || (siz_l_flg |= hibit; push!(sizvecl, base => diff_l))
185+
diff_u == 0 || (siz_u_flg |= hibit; push!(sizvecu, base => diff_u))
186+
187+
if bitoff == 16
188+
# End of a 512-character range: store each bitmap in bitvec (the helper also
# zeroes the scratch vector) and flag the range when the bitmap was non-zero
189+
pos = (base >>> 9) + 1
190+
(is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l)) == 0 || (is_l_flg |= hibit)
191+
(is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u)) == 0 || (is_u_flg |= hibit)
192+
(can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l)) == 0 || (can_l_flg |= hibit)
193+
(can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u)) == 0 || (can_u_flg |= hibit)
194+
end
195+
196+
# Check for end of chunk
197+
midbits == 32 || continue
198+
199+
if l_mid
200+
push!(offvec, tuple(l_off...))
201+
l_tab[hipos] = UInt8(length(offvec))
202+
fill!(l_off, 0x0)
203+
l_mid = false
204+
end
205+
if u_mid
206+
push!(offvec, tuple(u_off...))
207+
u_tab[hipos] = UInt8(length(offvec))
208+
fill!(u_off, 0x0)
209+
u_mid = false
210+
end
211+
# NOTE(review): unlike l_off/u_off, t_off is not cleared here; each slot is
# reassigned for every block above, so presumably that is intentional - confirm
if t_mid
212+
push!(offvec, tuple(t_off...))
213+
t_tab[hipos] = UInt8(length(offvec))
214+
t_mid = false
215+
else
216+
t_tab[hipos] = u_tab[hipos]
217+
end
218+
end
219+
220+
# Handle SLP (code points 0x10000-0x1ffff; `base` and stored values are relative to 0x10000)
221+
can_sl_flg = can_su_flg = 0%UInt128
222+
223+
for base in 0x0000:0x20:0xffe0
224+
# SLP chunks use slots 65-128 of l_tab / u_tab
hipos = (base >>> 10) + 65
225+
hibit = (1%UInt128) << ((base >>> 9) & 0x7f)
226+
midbits = ((base >>> 5) & 0x1f) + 1
227+
228+
# Handle block of 32 characters
229+
l_flg = u_flg = false
230+
for off = 0x00:0x1f
231+
cp = UInt32(0x10000 + base + off)
232+
ch = Char(cp)
233+
cl = UInt32(lowercase(ch))
234+
cu = UInt32(uppercase(ch))
235+
ct = UInt32(titlecase(ch))
236+
lowbit = UInt32(1) << off
237+
islowercase(ch) && (tmp_is_l |= lowbit)
238+
isuppercase(ch) && (tmp_is_u |= lowbit)
239+
# Only the low 16 bits of each mapping are stored
tmp_l[off+1] = cl%UInt16
240+
tmp_u[off+1] = cu%UInt16
241+
cp === cl === cu === ct && continue
242+
ct == cu || error("titlecase: $cp -> $ct not same as uppercase $cu")
243+
# NOTE(review): sizc is not read again in this loop
sizc = 4
244+
sizl = utf8_len(cl)
245+
sizu = utf8_len(cu)
246+
sizl == sizu == 4 || error("UTF-8 sizes not 4: $cp: $cl=$sizl, $cu=$sizu")
247+
if cl !== cp
248+
tmp_can_l |= lowbit
249+
l_flg = true
250+
end
251+
if cu !== cp
252+
tmp_can_u |= lowbit
253+
u_flg = true
254+
end
255+
end
256+
257+
if l_flg
258+
l_mid = true
259+
push!(tupvec, tuple(tmp_l...))
260+
l_off[midbits] = UInt8(length(tupvec))
261+
end
262+
if u_flg
263+
u_mid = true
264+
push!(tupvec, tuple(tmp_u...))
265+
u_off[midbits] = UInt8(length(tupvec))
266+
end
267+
268+
bitoff = ((base >>> 5) & 0xf) + 1
269+
bit_is_l[bitoff] = tmp_is_l
270+
bit_is_u[bitoff] = tmp_is_u
271+
bit_can_l[bitoff] = tmp_can_l
272+
bit_can_u[bitoff] = tmp_can_u
273+
274+
tmp_is_l = tmp_is_u = tmp_can_l = tmp_can_u = 0%UInt32
275+
276+
if bitoff == 16
277+
# End of a 512-character range: store the bitmaps; SLP ranges use slots 129-256
278+
pos = (base >>> 9) + 129
279+
(is_l_tab[pos] = _add_to_bv!(bitvec, bit_is_l)) == 0 || (is_sl_flg |= hibit)
280+
(is_u_tab[pos] = _add_to_bv!(bitvec, bit_is_u)) == 0 || (is_su_flg |= hibit)
281+
(can_l_tab[pos] = _add_to_bv!(bitvec, bit_can_l)) == 0 || (can_sl_flg |= hibit)
282+
(can_u_tab[pos] = _add_to_bv!(bitvec, bit_can_u)) == 0 || (can_su_flg |= hibit)
283+
end
284+
285+
# Check for end of chunk
286+
midbits == 32 || continue
287+
288+
if l_mid
289+
push!(offvec, tuple(l_off...))
290+
l_tab[hipos] = UInt8(length(offvec))
291+
fill!(l_off, 0x0)
292+
l_mid = false
293+
end
294+
if u_mid
295+
push!(offvec, tuple(u_off...))
296+
u_tab[hipos] = UInt8(length(offvec))
297+
fill!(u_off, 0x0)
298+
u_mid = false
299+
end
300+
end
301+
302+
# Check that there are no upper / lower / title case characters above SLP
303+
for cp in 0x20000:0x10ffff
304+
ch = Char(cp)
305+
ch == lowercase(ch) == uppercase(ch) == titlecase(ch) ||
306+
error("$cp has lower/upper/titlecase")
307+
end
308+
309+
# Freeze everything into immutable values: the CaseTable plus the five vectors as tuples
(CaseTable(tuple(l_tab...), tuple(u_tab...), tuple(t_tab...),
310+
tuple(is_l_tab...), tuple(is_u_tab...),
311+
tuple(can_l_tab...), tuple(can_u_tab...),
312+
can_l_flg, can_u_flg, can_t_flg, can_sl_flg, can_su_flg,
313+
is_l_flg, is_u_flg, is_sl_flg, is_su_flg,
314+
siz_l_flg, siz_u_flg, max_siz_l, max_siz_u),
315+
tuple(tupvec...), tuple(offvec...), tuple(bitvec...),
316+
tuple(sizvecl...), tuple(sizvecu...))
317+
end
318+
319+
# Build all tables once, when the module body is evaluated, and bind each element of
# the returned 6-tuple to its own module-level constant.
const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()
320+
321+
end # module CaseTables

src/ChrBase.jl

+4-3
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,15 @@ using ModuleInterfaceTools
2121
LatinChars, ByteChars, WideChars, AbsChar, bytoff, chroff, chrdiff, utf_trail,
2222
codepoint_cse, codepoint_rng, codepoint_adj, utf8proc_error,
2323
write_utf8, write_utf16, _write_utf8_2, _write_utf8_3, _write_utf8_4, _write_ucs2,
24-
_lowercase_l, _uppercase_l, _lowercase_u, _uppercase_u, _titlecase_u,
25-
_islower_a, _islower_l, _islower_u, _isupper_a, _isupper_l, _isupper_al, _isupper_u,
26-
_can_upper, _can_upper_l
24+
_lowercase_l, _uppercase_l,
25+
_is_lower_a, _is_lower_l, _is_lower_al, _is_lower_ch,
26+
_is_upper_a, _is_upper_l, _is_upper_al, _is_upper_ch
2727

2828
@api develop! _isvalid_chr
2929

3030
include("core.jl")
3131
@static V6_COMPAT && include("compat.jl")
32+
include("CaseTables.jl"); using .CaseTables
3233
include("casefold.jl")
3334
include("io.jl")
3435
include("traits.jl")

0 commit comments

Comments
 (0)