typst · emilyyyylime · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 20, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -7,5 +7,5 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@stable
-      - run: cargo build
+      - run: cargo build --features unicode_names2
       - run: cargo test
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,3 +9,6 @@ readme = "README.md"
 license = "Apache-2.0"
 categories = ["encoding", "text-processing"]
 keywords = ["unicode", "symbols"]
+
+[build-dependencies]
+unicode_names2 = { version = "1.3.0", optional = true }
diff --git a/build.rs b/build.rs
@@ -86,8 +86,8 @@ fn tokenize(line: &str) -> StrResult<Line> {
         return Ok(Line::Blank);
     }
 
-    let (head, tail) = match line.split_once(' ') {
-        Some((a, b)) => (a, Some(b)),
+    let (head, tail) = match line.split_once(char::is_whitespace) {
+        Some((a, b)) => (a, Some(b.trim_start())),
         None => (line, None),
     };
 
@@ -121,10 +121,29 @@ fn validate_ident(string: &str) -> StrResult<()> {
 /// Extracts either a single char or parses a U+XXXX escape.
 fn decode_char(text: &str) -> StrResult<char> {
     if let Some(hex) = text.strip_prefix("U+") {
-        u32::from_str_radix(hex, 16)
+        let (hex, name) = match hex.split_once(char::is_whitespace) {
+            Some((hex, name)) => (hex, Some(name.trim_start())),
+            None => (hex, None),
+        };
+
+        let ch = u32::from_str_radix(hex, 16)
             .ok()
-            .and_then(|n| char::try_from(n).ok())
-            .ok_or_else(|| format!("invalid unicode escape {text:?}"))
+            .and_then(|n| char::from_u32(n))
+            .ok_or_else(|| format!("invalid unicode escape {hex:?}"))?;
+
+        #[cfg_attr(not(feature = "unicode_names2"), expect(unused_variables))]
+        if let Some(name) = name {
+            #[cfg(feature = "unicode_names2")]
+            if unicode_names2::character(name) != Some(ch) {
+                return Err(format!(
+                    "Incorrect name supplied for character U+{hex}: '{name}'{}",
+                    unicode_names2::name(ch)
+                        .map_or("".to_string(), |name| format!(" (expected {name})"))
+                ));
+            }
+        }
+
+        Ok(ch)
     } else {
         let mut chars = text.chars();
         match (chars.next(), chars.next()) {

diff --git a/src/modules/sym.txt b/src/modules/sym.txt
@@ -1,25 +1,25 @@
-// Control.
-wj U+2060
-zwj U+200D
-zwnj U+200C
-zws U+200B
-lrm U+200E
-rlm U+200F
+// Layout control.
+wj   U+2060  Word joiner
+zwj  U+200D  Zero width joiner
+zwnj U+200C  Zero width non-joiner
+zws  U+200B  Zero width space
+lrm  U+200E  Left-to-right mark
+rlm  U+200F  Right-to-left mark
 
 // Spaces.
-space U+20
-  .nobreak U+A0
-  .nobreak.narrow U+202F
-  .en U+2002
-  .quad U+2003
-  .third U+2004
-  .quarter U+2005
-  .sixth U+2006
-  .med U+205F
-  .fig U+2007
-  .punct U+2008
-  .thin U+2009
-  .hair U+200A
+space U+20        Space
+  .nobreak U+A0   No-break space
+  .nobreak.narrow U+202F Narrow no-break space
+  .en U+2002      En space
+  .quad U+2003    Em space
+  .third U+2004   Three-per-em space
+  .quarter U+2005 Four-per-em space
+  .sixth U+2006   Six-per-em space
+  .fig U+2007     Figure space
+  .punct U+2008   Punctuation space
+  .thin U+2009    Thin space
+  .hair U+200A    Hair space
+  .med U+205F     Medium mathematical space
 
 // Delimiters.
 paren
@@ -30,9 +30,9 @@ paren
   .t ⏜
   .b ⏝
 brace
-  .l U+7B
+  .l U+7B  Left curly bracket
   .l.double ⦃
-  .r U+7D
+  .r U+7D  Right curly bracket
   .r.double ⦄
   .t ⏞
   .b ⏟
@@ -130,14 +130,14 @@ dash
   .wave.double 〰
 dot
   .op ⋅
-  .basic U+2E
+  .basic U+2E  Full stop
   .c ·
   .circle ⊙
   .circle.big ⨀
   .square ⊡
   .double ¨
-  .triple U+20DB
-  .quad U+20DC
+  .triple U+20DB Combining three dots above
+  .quad U+20DC   Combining four dots above
 excl !
   .double ‼
   .inv ¡
@@ -149,10 +149,10 @@ quest ?
 interrobang ‽
 hash #
 hyph ‐
-  .minus U+2D
-  .nobreak U+2011
+  .minus U+2D     Hyphen-minus
+  .nobreak U+2011 Non-breaking hyphen
+  .soft U+AD      Soft hyphen
   .point ‧
-  .soft U+AD
 numero №
 percent %
 permille ‰