Skip to content

Commit

Permalink
StringUtils - string handling improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jan 7, 2016
1 parent db27c81 commit e829998
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 5 deletions.
7 changes: 5 additions & 2 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
The StringUtils.jl package is licensed under the MIT "Expat" License:

> Copyright (c) 2016: ScottPJones.
>
Copyright (c) 2016 Gandalf Software, Inc. (Scott P. Jones)

Portions based on code that is part of Julia, licensed under the MIT license,
and also Eric Forgy's StringInterpolations.jl package.

> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# StringUtils

[![Build Status](https://travis-ci.org/ScottPJones/StringUtils.jl.svg?branch=master)](https://travis-ci.org/ScottPJones/StringUtils.jl)

The StringUtils package is a work in progress, where I am placing various improvements to the
string functionality of the Julia language.

Currently, it adds a Swift style string macro, `u"..."`, which uses the Swift syntax for
interpolation, i.e. `\(expression)`. This means that you never have to worry about strings with
the $ character in them, which is rather frequent in some applications.
Also, Unicode sequences are represented as in Swift, i.e. as `\u{hexdigits}`, where there
can be from 1 to 6 hex digits. This syntax eliminates having to worry about always outputting
4 or 8 hex digits, to prevent problems with 0-9,A-F,a-f characters immediately following.
Finally, I have added two new ways of representing characters in the literal string,
`\:emojiname:` and `\{latexname}`.
This makes life a lot easier when you want to keep the text of a program in ASCII, and
it also lets you write programs that use characters which might not even display
correctly in your editor.
13 changes: 10 additions & 3 deletions src/StringUtils.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
module StringUtils
"""
String utilities and performance improvements
# package code goes here
Copyright 2016 Gandalf Software, Inc., Scott P. Jones
Licensed under MIT License, see LICENSE.md
"""
module StringUtils
export @u_str, @sinterpolate
export s_unescape_string, s_escape_string, s_print_unescaped, s_print_escaped

end # module
include("literals.jl")
end
160 changes: 160 additions & 0 deletions src/literals.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Licensed under MIT License, see LICENSE.md
# Copyright 2016 Gandalf Software, Inc., Scott P. Jones

"""
String macro with more Swift-like syntax, plus support for emojis and LaTeX names
"""
macro u_str(literal)
    # The text of the u"..." literal is handed to s_interp_parse; whatever it
    # returns (a plain string or a call expression) becomes the macro result.
    return s_interp_parse(literal)
end

"""
Interpolates one or more strings using more Swift-like syntax
julia> x = "World"; @sinterpolate "Hello \\(x)"
"Hello World"
"""
macro sinterpolate(args...)
    # Every macro argument is forwarded unchanged to s_interp_parse, and its
    # return value is used as the macro expansion.
    return s_interp_parse(args...)
end

"""
Handle Unicode character constant, of form \\u{<hexdigits>}
"""
function s_parse_unicode(io, s, i)
    # Parse the {<hexdigits>} part of a \u{...} escape.
    #
    # io : stream the decoded character is printed to
    # s  : string being unescaped
    # i  : index in `s` of the character expected to be the opening '{'
    #
    # Returns the index just past the closing '}'.
    # Throws ArgumentError when the escape is truncated, lacks '{' or '}',
    # has no digits, has more than 6 digits, contains a non-hex digit, or
    # denotes a surrogate (0xd800-0xdfff) or a value above 0x10ffff.
    done(s, i) && throw(ArgumentError("Incomplete \\u{...} in $(repr(s))"))
    c, i = next(s, i)
    c != '{' && throw(ArgumentError("\\u missing opening { in $(repr(s))"))
    done(s, i) && throw(ArgumentError("Incomplete \\u{...} in $(repr(s))"))
    c, i = next(s, i)
    n::UInt32 = 0   # code point accumulated so far
    k = 0           # number of hex digits consumed
    while c != '}'
        # `c` is not the closing brace, so running out of input here means
        # the brace is missing.
        done(s, i) && throw(ArgumentError("\\u{ missing closing } in $(repr(s))"))
        (k += 1) > 6 && throw(ArgumentError("Unicode constant too long in $(repr(s))"))
        # `<<` binds tighter than `+`, so this is (n<<4) + value_of_digit(c).
        # A character outside 0-9, a-f, A-F is an invalid digit (previously
        # this was misreported as a missing closing brace).
        n = n<<4 + c - ('0' <= c <= '9' ? '0' :
                        'a' <= c <= 'f' ? 'a' - 10 :
                        'A' <= c <= 'F' ? 'A' - 10 :
                        throw(ArgumentError(
                            "\\u{ has invalid hex digit $(repr(c)) in $(repr(s))")))
        c, i = next(s, i)
    end
    k == 0 && throw(ArgumentError("\\u{} has no hex digits in $(repr(s))"))
    ((0x0d800 <= n <= 0x0dfff) || n > 0x10ffff) &&
        throw(ArgumentError("Invalid Unicode character constant $(repr(s))"))
    print(io, Char(n))
    return i
end

"""
Handle Emoji character, of form \\:<name>:
"""
function s_parse_emoji(io, s, i)
    # Parse the <name>: part of a \:<name>: escape and print the emoji.
    #
    # io : stream the emoji string is printed to
    # s  : string being unescaped
    # i  : index in `s` just past the opening ':' (the caller has already
    #      consumed the backslash and the colon)
    #
    # Returns the index just past the closing ':'.
    # Throws ArgumentError when the closing ':' is missing (including when
    # the input ends right after the opening ':') or the name is unknown.
    beg = i-2 # index of the backslash; '\\' and ':' are one code unit each
    while true
        # Checked before every read, including the first one, so truncated
        # input raises ArgumentError rather than an out-of-bounds error.
        done(s, i) && throw(ArgumentError("\\: missing closing : in $(repr(s))"))
        c, i = next(s, i)
        c == ':' && break
    end
    # The lookup key is the whole escape, backslash and both colons included.
    emojistr = get(Base.REPLCompletions.emoji_symbols, s[beg:i-1], "")
    emojistr == "" && throw(ArgumentError("Invalid Emoji name in $(repr(s))"))
    print(io, emojistr)
    return i
end

"""
Handle LaTeX character/string, of form \\{<name>}
"""
function s_parse_latex(io, s, i)
    # Parse the <name>} part of a \{<name>} escape and print its replacement.
    #
    # io : stream the replacement string is printed to
    # s  : string being unescaped
    # i  : index in `s` just past the opening '{'
    #
    # Returns the index just past the closing '}'.
    # Throws ArgumentError when the closing '}' is missing (including when
    # the input ends right after the opening '{') or the name is unknown.
    beg = i # first character of the name
    while true
        # Checked before every read, including the first one, so truncated
        # input raises ArgumentError rather than an out-of-bounds error.
        # The message names the \{ ... } form being parsed (it previously
        # repeated the emoji parser's "\: missing closing :" text).
        done(s, i) && throw(ArgumentError("\\{ missing closing } in $(repr(s))"))
        c, i = next(s, i)
        c == '}' && break
    end
    # `i` is now past '}', so the name ends at i-2; the table is keyed by
    # backslash + name.
    latexstr = get(Base.REPLCompletions.latex_symbols, string("\\", s[beg:i-2]), "")
    latexstr == "" && throw(ArgumentError("Invalid LaTex name in $(repr(s))"))
    print(io, latexstr)
    return i
end

"""
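Print `s` to `io` with backslash escape sequences replaced by the characters they denote.
Part of the string interpolation parsing code.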
Based on code resurrected from Julia base:
https://github.com/JuliaLang/julia/blob/deab8eabd7089e2699a8f3a9598177b62cbb1733/base/string.jl
"""
function s_print_unescaped(io, s::AbstractString)
    # Copy `s` to `io`, translating backslash escapes:
    #   \u{hex}   -> handled by s_parse_unicode
    #   \:name:   -> handled by s_parse_emoji
    #   \{name}   -> handled by s_parse_latex
    #   \0 \\ \" \' \a \b \t \n \v \f \r \e -> the single character they denote
    # Any other escape throws ArgumentError. A backslash that is the very
    # last character of `s` is copied through unchanged.
    i = start(s)
    while !done(s, i)
        c, i = next(s, i)
        if !done(s, i) && c == '\\'
            c, i = next(s, i)
            if c == 'u'
                i = s_parse_unicode(io, s, i)
            elseif c == ':' # Emoji
                i = s_parse_emoji(io, s, i)
            elseif c == '{' # LaTex
                i = s_parse_latex(io, s, i)
            else
                # '\\' is accepted so that text produced by s_print_escaped,
                # which writes a backslash as two backslashes, can be read
                # back; it was previously rejected as an invalid sequence.
                c = (c == '0'  ? '\0' :
                     c == '\\' ? '\\' :
                     c == '"'  ? '"'  :
                     c == '\'' ? '\'' :
                     c == 'a'  ? '\a' :
                     c == 'b'  ? '\b' :
                     c == 't'  ? '\t' :
                     c == 'n'  ? '\n' :
                     c == 'v'  ? '\v' :
                     c == 'f'  ? '\f' :
                     c == 'r'  ? '\r' :
                     c == 'e'  ? '\e' :
                     throw(ArgumentError("Invalid \\$c sequence in $(repr(s))")))
                # Every result above is ASCII, so it is written as one byte.
                write(io, UInt8(c))
            end
        else
            print(io, c)
        end
    end
end

function s_unescape_string(s::AbstractString)
    # Collect the output of s_print_unescaped into a string; endof(s) is the
    # leading size argument given to sprint.
    return sprint(endof(s), s_print_unescaped, s)
end

function s_print_escaped(io, s::AbstractString, esc::AbstractString)
    # Write `s` to `io` in escaped form. The checks run in this order, and the
    # first match decides the output for a character:
    #   NUL -> \0, ESC -> \e, backslash -> two backslashes,
    #   any character contained in `esc` -> backslash + that character,
    #   '\a' through '\r' -> backslash + the matching letter of "abtnvfr",
    #   printable characters -> unchanged,
    #   everything else -> \u{<hex code>}.
    for ch in s
        if ch == '\0'
            print(io, "\\0")
        elseif ch == '\e'
            print(io, "\\e")
        elseif ch == '\\'
            print(io, "\\\\")
        elseif ch in esc
            print(io, '\\', ch)
        elseif '\a' <= ch <= '\r'
            # '\a' is code 7, so subtracting 6 yields index 1 ('a').
            print(io, '\\', "abtnvfr"[Int(ch)-6])
        elseif isprint(ch)
            print(io, ch)
        else
            print(io, "\\u{", hex(ch), "}")
        end
    end
end

function s_escape_string(s::AbstractString)
    # Collect the output of s_print_escaped into a string, with the double
    # quote as the only extra character to backslash-escape; endof(s) is the
    # leading size argument given to sprint.
    return sprint(endof(s), s_print_escaped, s, "\"")
end

function s_interp_parse(s::AbstractString, unescape::Function, p::Function)
    # Split `s` into literal text and \(expression) interpolations.
    #
    # s        : raw text of the string literal
    # unescape : applied to every non-empty literal segment
    # p        : function placed as the first argument of the generated
    #            `sprint` call
    #
    # Returns the unescaped string itself when `s` holds a single literal
    # segment that is a ByteString, "" when `s` is empty, and otherwise the
    # expression `sprint(p, parts...)` in which interpolated expressions are
    # wrapped with `esc`.
    # Throws ParseError when an interpolated expression is incomplete.
    #
    # NOTE(review): the scan advances one character at a time, so an escaped
    # backslash directly followed by "(" is still read as the start of an
    # interpolation - confirm whether that is intended.
    sx = []
    i = j = start(s)   # i: start of the pending literal, j: scan position
    while !done(s, j)
        c, k = next(s, j)
        if c == '\\' && !done(s, k) && s[k] == '('
            # Handle interpolation: first flush the literal text before it.
            # prevind keeps the slice end on a character boundary.
            literal = s[i:prevind(s, j)]
            isempty(literal) || push!(sx, unescape(literal))
            # Parse one expression starting at the "(".
            ex, j = parse(s, k, greedy=false)
            if isa(ex, Expr) && ex.head === :continue
                throw(ParseError("Incomplete expression"))
            end
            push!(sx, esc(ex))
            i = j
        else
            j = k
        end
    end
    # Trailing literal: test and push the same slice (the emptiness test used
    # s[i:end] while the push used s[i:j-1]).
    tail = s[i:end]
    isempty(tail) || push!(sx, unescape(tail))
    # Nothing to print: return the literal instead of building sprint(p).
    isempty(sx) && return ""
    return length(sx) == 1 && isa(sx[1], ByteString) ? sx[1] :
           Expr(:call, :sprint, p, sx...)
end

# Two-argument form: `print` is used as the function given to `sprint`.
s_interp_parse(s::AbstractString, u::Function) = s_interp_parse(s, u, print)

# One-argument form: literal segments are unescaped with s_unescape_string and
# must form valid UTF-8, otherwise ArgumentError is thrown.
function s_interp_parse(s::AbstractString)
    # Unescape each segment once and reuse the result for both the validity
    # check and the returned value (it was previously unescaped twice).
    function checked_unescape(x)
        unescaped = s_unescape_string(x)
        isvalid(UTF8String, unescaped) ||
            throw(ArgumentError("Invalid UTF-8 sequence"))
        return unescaped
    end
    return s_interp_parse(s, checked_unescape)
end

0 comments on commit e829998

Please sign in to comment.