diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml index 3df77ac..c6b53b9 100644 --- a/charabia/Cargo.toml +++ b/charabia/Cargo.toml @@ -31,7 +31,7 @@ unicode-normalization = "0.1.23" irg-kvariants = { path = "../irg-kvariants", version = "=0.1.1" } [features] -default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"] +default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "khmer", "vietnamese", "swedish-recomposition", "turkish", "german-segmentation"] # allow chinese specialized tokenization chinese = ["chinese-segmentation", "chinese-normalization"] diff --git a/charabia/README.md b/charabia/README.md index 4ee9770..e5244de 100644 --- a/charabia/README.md +++ b/charabia/README.md @@ -17,6 +17,7 @@ Charabia provides a simple API to segment, normalize, or tokenize (segment + nor | Script / Language | specialized segmentation | specialized normalization | Segmentation Performance level | Tokenization Performance level | |---------------------|-------------------------------------------------------------------------------|---------------------------|-------------------|---| | **Latin** | ✅ CamelCase segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec | +| **Latin** - **German** | ✅ CamelCase segmentation + German word segmentation | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + [nonspacing-marks](https://www.compart.com/en/unicode/category/Mn) removal + `Ð vs Đ` spoofing normalization + `ı` normalization | 🟩 ~23MiB/sec | 🟨 ~9MiB/sec | | **Greek** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase + final sigma normalization | 🟩 ~27MiB/sec | 🟨 
~8MiB/sec | | **Cyrillic** - **Georgian** | ❌ | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + lowercase | 🟩 ~27MiB/sec | 🟨 ~9MiB/sec | | **Chinese** **CMN** 🇨🇳 | ✅ [jieba](https://github.com/messense/jieba-rs) | ✅ [compatibility decomposition](https://unicode.org/reports/tr15/) + kvariant conversion | 🟨 ~10MiB/sec | 🟧 ~5MiB/sec | diff --git a/charabia/src/detection/mod.rs b/charabia/src/detection/mod.rs index 3bfa5d5..87335e5 100644 --- a/charabia/src/detection/mod.rs +++ b/charabia/src/detection/mod.rs @@ -27,7 +27,11 @@ impl<'o, 'al> StrDetection<'o, 'al> { let inner = self.inner; self.language = match self.language.take() { Some(lang) => Some(lang), - None => Self::detect_lang(inner, self.allow_list), + None => match self.allow_list { + Some([unique_language]) => Some(*unique_language), + None if Self::detect_script(inner) == Script::Latin => None, + _otherwise => Self::detect_lang(inner, self.allow_list), + }, }; self.language diff --git a/charabia/src/detection/script_language.rs b/charabia/src/detection/script_language.rs index 1c904d8..ab73a7b 100644 --- a/charabia/src/detection/script_language.rs +++ b/charabia/src/detection/script_language.rs @@ -10,6 +10,7 @@ macro_rules! make_language { ($($language:tt), +) => { #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize, PartialOrd, Ord)] pub enum Language { + Zho, $($language),+, } impl From<whatlang::Lang> for Language { @@ -23,6 +24,7 @@ impl From<Language> for whatlang::Lang { fn from(other: Language) -> whatlang::Lang { match other { + Language::Zho => whatlang::Lang::Cmn, $(Language::$language => whatlang::Lang::$language), +, } } @@ -31,12 +33,16 @@ macro_rules!
make_language { impl Language { pub fn code(&self) -> &'static str { match self { + Language::Zho => "zho", $(Language::$language => whatlang::Lang::$language.code()), +, } } pub fn from_code<S: AsRef<str>>(code: S) -> Option<Language> { - whatlang::Lang::from_code(code.as_ref()).map(Language::from) + match code.as_ref() { + "zho" => Some(Language::Zho), + _ => whatlang::Lang::from_code(code.as_ref()).map(Language::from), + } } } }; diff --git a/charabia/src/normalizer/chinese.rs b/charabia/src/normalizer/chinese.rs index c1e4a1b..5e3ac86 100644 --- a/charabia/src/normalizer/chinese.rs +++ b/charabia/src/normalizer/chinese.rs @@ -38,7 +38,8 @@ impl CharNormalizer for ChineseNormalizer { } fn should_normalize(&self, token: &Token) -> bool { - token.script == Script::Cj && matches!(token.language, None | Some(Language::Cmn)) + token.script == Script::Cj + && matches!(token.language, None | Some(Language::Cmn) | Some(Language::Zho)) } } @@ -74,7 +75,7 @@ mod test { char_end: 5, byte_end: 15, script: Script::Cj, - language: Some(Language::Cmn), + language: Some(Language::Zho), ..Default::default() }, ] @@ -111,7 +112,7 @@ mod test { byte_end: 15, char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]), script: Script::Cj, - language: Some(Language::Cmn), + language: Some(Language::Zho), ..Default::default() }, ] @@ -147,7 +148,7 @@ mod test { byte_end: 15, char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]), script: Script::Cj, - language: Some(Language::Cmn), + language: Some(Language::Zho), kind: TokenKind::Word, ..Default::default() }, @@ -182,7 +183,7 @@ mod test { byte_end: 15, char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), script: Script::Cj, - language: Some(Language::Cmn), + language: Some(Language::Zho), ..Default::default() }, ] @@ -223,7 +224,7 @@ mod test { byte_end: 15, char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 4)]), script: Script::Cj, - language: Some(Language::Cmn), + language: Some(Language::Zho), }, ]
} diff --git a/charabia/src/segmenter/latin/mod.rs b/charabia/src/segmenter/latin/mod.rs index 7f978c1..b7b7855 100644 --- a/charabia/src/segmenter/latin/mod.rs +++ b/charabia/src/segmenter/latin/mod.rs @@ -27,12 +27,18 @@ mod test { const TEXT: &str = "The quick (\"brown\") fox can’t jump 32.3 feet, right? Brr, it's 29.3°F! camelCase kebab-case snake_case"; + + #[rustfmt::skip] + #[cfg(feature = "latin-camelcase")] const SEGMENTED: &[&str] = &[ "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t", " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it", "'", "s", " ", "29", ".", "3°F", "!", " ", "camel", "Case", " ", "kebab", "-", "case", " ", "snake", "_", "case", ]; + + #[rustfmt::skip] + #[cfg(feature = "latin-camelcase")] const TOKENIZED: &[&str] = &[ "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t", " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it", @@ -40,5 +46,23 @@ mod test { "snake", "_", "case", ]; + #[rustfmt::skip] + #[cfg(not(feature = "latin-camelcase"))] + const SEGMENTED: &[&str] = &[ + "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "’", "t", + " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "Brr", ", ", "it", + "'", "s", " ", "29", ".", "3°F", "!", " ", "camelCase", " ", "kebab", "-", "case", " ", + "snake", "_", "case", + ]; + + #[rustfmt::skip] + #[cfg(not(feature = "latin-camelcase"))] + const TOKENIZED: &[&str] = &[ + "the", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can", "'", "t", + " ", "jump", " ", "32", ".", "3", " ", "feet", ", ", "right", "?", " ", "brr", ", ", "it", + "'", "s", " ", "29", ".", "3°f", "!", " ", "camelcase", " ", "kebab", "-", "case", " ", + "snake", "_", "case", + ]; + test_segmenter!(LatinSegmenter, TEXT, SEGMENTED, TOKENIZED, Script::Latin, Language::Eng); } diff --git 
a/charabia/src/segmenter/mod.rs b/charabia/src/segmenter/mod.rs index e3c61c0..628c7bc 100644 --- a/charabia/src/segmenter/mod.rs +++ b/charabia/src/segmenter/mod.rs @@ -63,6 +63,8 @@ pub static SEGMENTERS: Lazy<HashMap<(Script, Option<Language>), Box<dyn Segmenter>>> = Lazy::new(|| { // chinese segmenter #[cfg(feature = "chinese-segmentation")] ((Script::Cj, Some(Language::Cmn)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>), + #[cfg(feature = "chinese-segmentation")] + ((Script::Cj, Some(Language::Zho)), Box::new(ChineseSegmenter) as Box<dyn Segmenter>), // japanese segmenter #[cfg(feature = "japanese")] ((Script::Cj, Some(Language::Jpn)), Box::new(JapaneseSegmenter) as Box<dyn Segmenter>), @@ -395,7 +397,6 @@ mod test { ($segmenter:expr, $text:expr, $segmented:expr, $tokenized:expr, $script:expr, $language:expr) => { use crate::{Token, Language, Script}; use crate::segmenter::{Segment, AhoSegmentedStrIter, MatchType, DEFAULT_SEPARATOR_AHO}; - use crate::tokenizer::Tokenize; use super::*; #[test] @@ -425,7 +426,7 @@ Check if the expected Script/Language corresponds to the detected Script/Languag #[test] fn segment() { - let segmented_text: Vec<_> = $text.segment_str().collect(); + let segmented_text: Vec<_> = $text.segment_str_with_option(None, Some(&[$language])).collect(); assert_eq!(&segmented_text[..], $segmented, r#" Segmenter chosen by global segment() function, didn't segment the text as expected. @@ -436,7 +437,8 @@ Check if the tested segmenter is assigned to the good Script/Language in `SEGMEN #[test] fn tokenize() { - let tokens: Vec<_> = $text.tokenize().collect(); + let tokenizer = crate::TokenizerBuilder::default().into_tokenizer(); + let tokens: Vec<_> = tokenizer.tokenize_with_allow_list($text, Some(&[$language])).collect(); let tokenized_text: Vec<_> = tokens.iter().map(|t| t.lemma()).collect(); assert_eq!(&tokenized_text[..], $tokenized, r#"