Skip to content

Commit 14fc90e

Browse files
Copilot and yegor256 committed
Fix Unicode surrogate pair parsing in strings
Co-authored-by: yegor256 <[email protected]>
1 parent 0323da7 commit 14fc90e

File tree

3 files changed

+31
-8
lines changed

3 files changed

+31
-8
lines changed

foo.phi

Lines changed: 0 additions & 3 deletions
This file was deleted.

src/Parser.hs

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -95,9 +95,24 @@ unicodeEscape = do
9595
hexDigits <- count 4 hexDigitChar
9696
case readHex hexDigits of
9797
[(n, "")] ->
98-
if (n >= 0 && n <= 0x10FFFF) && not (n >= 0xD800 && n <= 0xDFFF)
99-
then return (chr n)
100-
else fail ("Invalid Unicode code point: \\u" ++ hexDigits)
98+
if n >= 0xD800 && n <= 0xDBFF
99+
then -- High surrogate, look for low surrogate
100+
do
101+
_ <- string "\\u"
102+
lowHexDigits <- count 4 hexDigitChar
103+
case readHex lowHexDigits of
104+
[(low, "")] ->
105+
if low >= 0xDC00 && low <= 0xDFFF
106+
then -- Valid surrogate pair, combine them
107+
let codePoint = 0x10000 + ((n - 0xD800) * 0x400) + (low - 0xDC00)
108+
in return (chr codePoint)
109+
else fail ("Invalid low surrogate: \\u" ++ lowHexDigits)
110+
_ -> fail ("Invalid low surrogate hex: \\u" ++ lowHexDigits)
111+
else if n >= 0xDC00 && n <= 0xDFFF
112+
then fail ("Unexpected low surrogate: \\u" ++ hexDigits)
113+
else if n >= 0 && n <= 0x10FFFF
114+
then return (chr n)
115+
else fail ("Invalid Unicode code point: \\u" ++ hexDigits)
101116

102117
function :: Parser String
103118
function = lexeme $ do

test/ParserSpec.hs

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,13 @@ spec = do
3737
("{[[foo ↦ QQ]]}", Just (Program (ExFormation [BiTau (AtLabel "foo") (ExDispatch (ExDispatch ExGlobal (AtLabel "org")) (AtLabel "eolang")), BiVoid AtRho])))
3838
]
3939

40+
describe "parse unicode surrogate pairs" $
41+
forM_
42+
[ "{⟦ j$org ↦ \"org/eolang/larger/\\uD835\\uDF11\" ⟧}",
43+
"Q -> \"\\uD835\\uDF11\""
44+
]
45+
(\prog -> it prog (parseProgram prog `shouldSatisfy` isRight))
46+
4047
describe "parse expression" $
4148
test
4249
parseExpression
@@ -241,7 +248,8 @@ spec = do
241248
"[[x -> -42, y -> +34]]",
242249
"⟦x ↦ Φ.org.eolang(z ↦ ξ.f, x ↦ α0, φ ↦ ρ, t ↦ φ, first ↦ ⟦ λ ⤍ Function_name, Δ ⤍ 42- ⟧)⟧",
243250
"[[x -> 1.00e+3, y -> 2.32e-4]]",
244-
"[[ x -> \"\\u0001\\u0001\"]]"
251+
"[[ x -> \"\\u0001\\u0001\"]]",
252+
"[[ x -> \"\\uD835\\uDF11\"]]"
245253
]
246254
(\expr -> it expr (parseExpression expr `shouldSatisfy` isRight))
247255

@@ -267,7 +275,10 @@ spec = do
267275
"Q.x(y(~1) -> [[]])",
268276
"Q.x(1, 2, !B)",
269277
"Q.x(~1 -> Q.y, x -> 5, !B1)",
270-
"Q.x(𝐵1, 𝜏0 -> $, x -> 𝑒)"
278+
"Q.x(𝐵1, 𝜏0 -> $, x -> 𝑒)",
279+
"[[ x -> \"\\uD800\"]]",
280+
"[[ x -> \"\\uDFFF\"]]",
281+
"[[ x -> \"\\uD835\\u0041\"]]"
271282
]
272283
)
273284

0 commit comments

Comments (0)