-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
StringUtils - string handling improvements
- Loading branch information
1 parent
db27c81
commit e829998
Showing
4 changed files
with
190 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,18 @@ | ||
# StringUtils | ||
|
||
[![Build Status](https://travis-ci.org/ScottPJones/StringUtils.jl.svg?branch=master)](https://travis-ci.org/ScottPJones/StringUtils.jl) | ||
|
||
The StringUtils package is a work-in-progress, where I am placing various improvements to the | ||
String functionality in the Julia language. | ||
|
||
Currently, it adds a Swift style string macro, `u"..."`, which uses the Swift syntax for | ||
interpolation, i.e. `\(expression)`. This means that you never have to worry about strings with | ||
the $ character in them, which is rather frequent in some applications. | ||
Also, Unicode sequences are represented as in Swift, i.e. as `\u{hexdigits}`, where there | ||
can be from 1 to 6 hex digits. This syntax eliminates having to worry about always outputting | ||
4 or 8 hex digits, to prevent problems with 0-9,A-F,a-f characters immediately following. | ||
Finally, I have added two new ways of representing characters in the literal string, | ||
`\:emojiname:` and `\{latexname}`. | ||
This makes life a lot easier when you want to keep the text of a program in ASCII, and | ||
it also lets you write programs using characters that might not even display | ||
correctly in your editor. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,12 @@ | ||
module StringUtils | ||
""" | ||
String utilities and performance improvements | ||
# package code goes here | ||
Copyright 2016 Gandalf Software, Inc., Scott P. Jones | ||
Licensed under MIT License, see LICENSE.md | ||
""" | ||
module StringUtils | ||
export @u_str, @sinterpolate | ||
export s_unescape_string, s_escape_string, s_print_unescaped, s_print_escaped | ||
|
||
end # module | ||
include("literals.jl") | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,160 @@ | ||
# Licensed under MIT License, see LICENSE.md | ||
# Copyright 2016 Gandalf Software, Inc., Scott P. Jones | ||
|
||
""" | ||
String macro with more Swift-like syntax, plus support for emojis and LaTeX names | ||
""" | ||
macro u_str(str)
    # The literal is processed at macro-expansion time: s_interp_parse returns
    # either a plain string or a `sprint` call expression, which becomes the
    # macro's expansion.
    return s_interp_parse(str)
end
|
||
""" | ||
Interpolates one or more strings using more Swift-like syntax | ||
julia> x = "World"; @sinterpolate "Hello \\(x)" | ||
"Hello World" | ||
""" | ||
macro sinterpolate(args...)
    # Every macro argument is forwarded unchanged to s_interp_parse, whose
    # result becomes the macro's expansion.
    return s_interp_parse(args...)
end
|
||
""" | ||
Handle Unicode character constant, of form \\u{<hexdigits>} | ||
""" | ||
function s_parse_unicode(io, s, i)
    # Parse the `{<hexdigits>}` part of a `\u{...}` escape and print the
    # resulting character to `io`.
    #   io : output stream receiving the decoded character
    #   s  : the string being scanned
    #   i  : index of the character right after the `u`
    # Returns the index just past the closing `}`.
    # Throws ArgumentError for a truncated escape, a missing `{` or `}`, more
    # than 6 or zero hex digits, a non-hex digit, a surrogate code point, or a
    # value above 0x10ffff.
    done(s,i) && throw(ArgumentError("Incomplete \\u{...} in $(repr(s))"))
    c, i = next(s, i)
    c != '{' && throw(ArgumentError("\\u missing opening { in $(repr(s))"))
    done(s,i) && throw(ArgumentError("Incomplete \\u{...} in $(repr(s))"))
    c, i = next(s, i)
    n::UInt32 = 0   # code point accumulated so far
    k = 0           # number of hex digits consumed
    while c != '}'
        # `c` is not the closing brace, so running out of input here means
        # the escape was never terminated.
        done(s, i) && throw(ArgumentError("\\u{ missing closing } in $(repr(s))"))
        (k += 1) > 6 && throw(ArgumentError("Unicode constant too long in $(repr(s))"))
        # Compute the digit value as an integer (Char - Char yields an integer)
        # so that no out-of-range Char is ever constructed along the way.
        digit = '0' <= c <= '9' ? c - '0' :
                'a' <= c <= 'f' ? c - 'a' + 10 :
                'A' <= c <= 'F' ? c - 'A' + 10 :
                throw(ArgumentError("Invalid hex digit $(repr(c)) in \\u{...} in $(repr(s))"))
        n = (n << 4) + UInt32(digit)
        c, i = next(s,i)
    end
    k == 0 && throw(ArgumentError("\\u{} has no hex digits in $(repr(s))"))
    # Reject UTF-16 surrogates and values beyond the Unicode range.
    ((0x0d800 <= n <= 0x0dfff) || n > 0x10ffff) &&
        throw(ArgumentError("Invalid Unicode character constant $(repr(s))"))
    print(io, Char(n))
    i
end
|
||
""" | ||
Handle Emoji character, of form \\:<name>: | ||
""" | ||
function s_parse_emoji(io, s, i)
    # Parse the `<name>:` part of a `\:<name>:` escape and print the matching
    # emoji to `io`.
    #   io : output stream receiving the emoji
    #   s  : the string being scanned
    #   i  : index of the character right after the `\:` introducer
    # Returns the index just past the closing `:`.
    # Throws ArgumentError when the closing `:` is missing or the name is not
    # found in Base.REPLCompletions.emoji_symbols.
    beg = i-2 # index of the backslash; the lookup key includes `\:` and the final `:`
    # The escape may be the very last thing in the string (e.g. "\\:"), in
    # which case there is nothing left to read.
    done(s, i) && throw(ArgumentError("\\: missing closing : in $(repr(s))"))
    c, i = next(s, i)
    while c != ':'
        done(s, i) && throw(ArgumentError("\\: missing closing : in $(repr(s))"))
        c, i = next(s, i)
    end
    # An empty string is the "not found" marker for the table lookup.
    emojistr = get(Base.REPLCompletions.emoji_symbols, s[beg:i-1], "")
    emojistr == "" && throw(ArgumentError("Invalid Emoji name in $(repr(s))"))
    print(io, emojistr)
    i
end
|
||
""" | ||
Handle LaTeX character/string, of form \\{<name>} | ||
""" | ||
function s_parse_latex(io, s, i)
    # Parse the `<name>}` part of a `\{<name>}` escape and print the matching
    # symbol to `io`.
    #   io : output stream receiving the symbol
    #   s  : the string being scanned
    #   i  : index of the character right after the `\{` introducer
    # Returns the index just past the closing `}`.
    # Throws ArgumentError when the closing `}` is missing or the name is not
    # found in Base.REPLCompletions.latex_symbols.
    beg = i # index of the first character of the name
    # The escape may be the very last thing in the string (e.g. "\\{"), in
    # which case there is nothing left to read.
    done(s, i) && throw(ArgumentError("\\{ missing closing } in $(repr(s))"))
    c, i = next(s, i)
    while c != '}'
        done(s, i) && throw(ArgumentError("\\{ missing closing } in $(repr(s))"))
        c, i = next(s, i)
    end
    # `i` is now past the `}`, so the name ends at i-2; the table is looked up
    # with a leading backslash prepended to the name.
    latexstr = get(Base.REPLCompletions.latex_symbols, string("\\", s[beg:i-2]), "")
    latexstr == "" && throw(ArgumentError("Invalid LaTex name in $(repr(s))"))
    print(io, latexstr)
    i
end
|
||
""" | ||
Unescaping support for string interpolation parsing: print `s` to `io` with its backslash escape sequences decoded | ||
Based on code resurrected from Julia base: | ||
https://github.com/JuliaLang/julia/blob/deab8eabd7089e2699a8f3a9598177b62cbb1733/base/string.jl | ||
""" | ||
function s_print_unescaped(io, s::AbstractString)
    # Write `s` to `io`, replacing backslash escape sequences by the
    # characters they denote:
    #   \u{<hex>}   -> handled by s_parse_unicode
    #   \:<name>:   -> handled by s_parse_emoji
    #   \{<name>}   -> handled by s_parse_latex
    #   \0 \\ \" \' \a \b \t \n \v \f \r \e -> the corresponding single character
    # Any other character after a backslash throws ArgumentError.
    # A backslash that is the last character of `s` is printed unchanged.
    i = start(s)
    while !done(s,i)
        c, i = next(s,i)
        if !done(s,i) && c == '\\'
            c, i = next(s,i)
            if c == 'u'
                i = s_parse_unicode(io, s, i)
            elseif c == ':' # Emoji
                i = s_parse_emoji(io, s, i)
            elseif c == '{' # LaTex
                i = s_parse_latex(io, s, i)
            else
                # `\\` must be accepted here: s_print_escaped writes a literal
                # backslash as `\\`, so rejecting it would break the round trip.
                c = (c == '0' ? '\0' :
                     c == '\\' ? '\\' :
                     c == '"' ? '"' :
                     c == '\'' ? '\'' :
                     c == 'a' ? '\a' :
                     c == 'b' ? '\b' :
                     c == 't' ? '\t' :
                     c == 'n' ? '\n' :
                     c == 'v' ? '\v' :
                     c == 'f' ? '\f' :
                     c == 'r' ? '\r' :
                     c == 'e' ? '\e' :
                     throw(ArgumentError("Invalid \\$c sequence in $(repr(s))")))
                # Every replacement above is a single-byte character.
                write(io, UInt8(c))
            end
        else
            print(io, c)
        end
    end
end
|
||
function s_unescape_string(s::AbstractString)
    # Collect the output of s_print_unescaped into a string; endof(s) is the
    # leading size argument given to sprint.
    return sprint(endof(s), s_print_unescaped, s)
end
|
||
function s_print_escaped(io, s::AbstractString, esc::AbstractString)
    # Write `s` to `io` in escaped form. Each character is handled by the
    # first rule that matches:
    #   NUL, ESC, backslash     -> \0, \e, \\
    #   any character in `esc`  -> backslash followed by the character
    #   '\a' through '\r'       -> backslash plus the matching letter of "abtnvfr"
    #   printable characters    -> unchanged
    #   everything else         -> \u{<hex code point>}
    for ch in s
        if ch == '\0'
            print(io, "\\0")
        elseif ch == '\e'
            print(io, "\\e")
        elseif ch == '\\'
            print(io, "\\\\")
        elseif ch in esc
            print(io, '\\', ch)
        elseif '\a' <= ch <= '\r'
            # '\a' has code 7, so subtracting 6 gives a 1-based index into the letters
            print(io, '\\', "abtnvfr"[Int(ch)-6])
        elseif isprint(ch)
            print(io, ch)
        else
            print(io, "\\u{", hex(ch), "}")
        end
    end
end
|
||
function s_escape_string(s::AbstractString)
    # Collect the output of s_print_escaped into a string, with the double
    # quote as the only extra character to backslash-escape.
    return sprint(endof(s), s_print_escaped, s, "\"")
end
|
||
function s_interp_parse(s::AbstractString, unescape::Function, p::Function)
    # Split `s` into literal pieces and `\(expression)` interpolations.
    #   unescape : applied to every non-empty literal piece
    #   p        : function handed to `sprint` in the generated call
    # Returns the single literal when the whole input is one ByteString piece,
    # otherwise the expression `sprint(p, pieces...)`.
    # Throws ParseError when an interpolated expression is incomplete.
    parts = []                   # mixed literals and escaped expressions
    litstart = pos = start(s)    # start of the pending literal / scan position
    while !done(s, pos)
        ch, nxt = next(s, pos)
        if !(ch == '\\' && !done(s, nxt) && s[nxt] == '(')
            # Ordinary character: keep scanning.
            pos = nxt
            continue
        end
        # Found `\(`: flush the literal text collected so far.
        literal = s[litstart:pos-1]
        isempty(literal) || push!(parts, unescape(literal))
        # Parse one expression starting at the `(`; `pos` moves past it.
        ex, pos = parse(s, nxt, greedy=false)
        if isa(ex, Expr) && is(ex.head, :continue)
            throw(ParseError("Incomplete expression"))
        end
        push!(parts, esc(ex))
        litstart = pos
    end
    # Flush any literal text after the last interpolation.
    if !isempty(s[litstart:end])
        push!(parts, unescape(s[litstart:pos-1]))
    end
    if length(parts) == 1 && isa(parts[1], ByteString)
        return parts[1]
    end
    return Expr(:call, :sprint, p, parts...)
end
|
||
# Two-argument form: the pieces are combined with `print`.
s_interp_parse(s::AbstractString, u::Function) = s_interp_parse(s, u, print)

# One-argument form: literal pieces are unescaped with s_unescape_string and
# must be valid UTF-8, otherwise ArgumentError is thrown.
function s_interp_parse(s::AbstractString)
    checked_unescape = x -> begin
        # Unescape once and reuse the result for both the check and the value.
        str = s_unescape_string(x)
        isvalid(UTF8String, str) || throw(ArgumentError("Invalid UTF-8 sequence"))
        str
    end
    return s_interp_parse(s, checked_unescape)
end