Skip to content

Commit b8e550a

Browse files
committed
Update iterate/next for utf8
1 parent ee7a6bb commit b8e550a

File tree

3 files changed

+40
-28
lines changed

3 files changed

+40
-28
lines changed

src/support.jl

+29-19
Original file line numberDiff line numberDiff line change
@@ -935,29 +935,39 @@ repeat(ch::UTF32Chr, cnt::Integer) = _repeat(UTF32CSE, ch, cnt)
935935
=#
936936

937937
function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr,_LatinChr}}
938-
cnt == 0 && return empty_str(ASCIICSE)
939-
cnt < 0 && repeaterr(cnt)
940-
cu = ch%UInt8
941-
buf, pnt = _allocate(UInt8, cnt)
942-
_memset(pnt, cu, cnt)
943-
Str((C == ASCIIChr || cu <= 0x7f) ? ASCIICSE : (C == _LatinChr ? _LatinCSE : LatinCSE), buf)
938+
if cnt > 0
939+
cu = ch%UInt8
940+
buf, pnt = _allocate(UInt8, cnt)
941+
_memset(pnt, cu, cnt)
942+
if C == ASCIIChr || cu <= 0x7f
943+
Str(ASCIICSE, buf)
944+
elseif C == _LatinChr
945+
Str(_LatinCSE, buf)
946+
else
947+
Str(LatinCSE, buf)
948+
end
949+
else
950+
cnt == 0 ? empty_ascii : repeaterr(cnt)
951+
end
944952
end
945953

946954
function repeat(ch::C, cnt::Integer) where {C<:Union{UCS2Chr,UTF32Chr}}
947-
cnt == 0 && return empty_str(ASCIICSE)
948-
cnt < 0 && repeaterr(cnt)
949-
if ch%UInt32 <= 0xff
950-
buf, pnt = _allocate(UInt8, cnt)
951-
cnt == 1 && set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
952-
Str(ifelse(ch%UInt8 <= 0x7f, ASCIICSE, LatinCSE), buf)
953-
elseif C == UCS2Chr || ch%UInt32 <= 0xffff
954-
buf, pnt = _allocate(UInt16, cnt)
955-
cnt == 1 && set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
956-
Str(UCS2CSE, buf)
955+
if cnt > 0
956+
if ch%UInt32 <= 0xff
957+
buf, pnt = _allocate(UInt8, cnt)
958+
cnt == 1 ? set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
959+
ch%UInt8 <= 0x7f ? Str(ASCIICSE, buf) : Str(LatinCSE, buf)
960+
elseif C == UCS2Chr || ch%UInt32 <= 0xffff
961+
buf, pnt = _allocate(UInt16, cnt)
962+
cnt == 1 ? set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
963+
Str(UCS2CSE, buf)
964+
else
965+
buf, pnt = _allocate(UInt32, cnt)
966+
cnt == 1 ? set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
967+
Str(UTF32CSE, buf)
968+
end
957969
else
958-
buf, pnt = _allocate(UInt32, cnt)
959-
cnt == 1 && set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
960-
Str(UTF32CSE, buf)
970+
cnt == 0 ? empty_ascii : repeaterr(cnt)
961971
end
962972
end
963973

src/utf16.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ end
186186

187187
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF16, pos::Int) where {T}
188188
@boundscheck pos <= ncodeunits(str) || boundserr(str, pos)
189-
_iterate(MultiCU(), T, str, pos)
189+
iterate(str, pos)
190190
end
191191

192192
@inline _thisind(::MultiCU, str::MS_UTF16, len, pnt, pos) =

src/utf8.jl

+10-8
Original file line numberDiff line numberDiff line change
@@ -361,27 +361,29 @@ function _iterate_utf8(ch, str, pnt, pos)
361361
end
362362
end
363363

364-
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
365-
pos > ncodeunits(str) && return nothing
366-
@boundscheck pos <= 0 && boundserr(str, pos)
364+
@inline function _iterate_utf8(str, pos)
367365
@preserve str begin
368366
pnt = pointer(str) + pos - 1
369367
ch = get_codeunit(pnt)
370368
ch <= 0x7f ? (UTF32Chr(ch), pos + 1) : _iterate_utf8(ch, str, pnt, pos)
371369
end
372370
end
373371

372+
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
373+
pos > ncodeunits(str) && return nothing
374+
@boundscheck pos <= 0 && boundserr(str, pos)
375+
_iterate_utf8(str, pos)
376+
end
377+
374378
_iterate(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =
375379
iterate(str.data, pos)
376380
_iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}}, pos::Int) where {T} =
377381
iterate(SubString(str.string.data, str.offset + pos, str.offset + ncodeunits(str)), 1)
378382

379383
# Gets next codepoint
380-
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8,
381-
pos::Int) where {T<:Chr}
382-
len = ncodeunits(str)
383-
@boundscheck 0 < pos <= len || boundserr(str, pos)
384-
_iterate(MultiCU(), T, str, pos)
384+
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8, pos::Int) where {T<:Chr}
385+
@boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
386+
_iterate_utf8(str, pos)
385387
end
386388

387389
_next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =

0 commit comments

Comments
 (0)