diff --git a/src/tokenizer/newmm.rs b/src/tokenizer/newmm.rs
index 8d3f221..7909cc7 100644
--- a/src/tokenizer/newmm.rs
+++ b/src/tokenizer/newmm.rs
@@ -42,12 +42,15 @@ const TEXT_SCAN_END: usize = TEXT_SCAN_POINT + TEXT_SCAN_RIGHT;
 
 type CharacterIndex = usize;
 
-const NON_THAI_READABLE_PATTERN: &[&str; 5] = &[
+const NON_THAI_READABLE_PATTERN: &[&str; 6] = &[
     r"(?x)^[-a-zA-Z]+",
     r"(?x)^[0-9]+([,\.][0-9]+)*",
     r"(?x)^[๐-๙]+([,\.][๐-๙]+)*",
     r"(?x)^[\ \t]+",
     r"(?x)^\r?\n",
+    // One character from U+0E00-U+0E7F, or a space/tab/CR/LF. Verbose mode `(?x)`
+    // drops unescaped whitespace even inside a character class, hence `\ `.
+    r"(?x)^[\u0E00-\u0E7F\ \t\r\n]",
 ];
 
 lazy_static! {