55import sys
66from types import ModuleType
77
8- posseg : ModuleType | None = None
8+ from . morpheme import Morpheme
99
10+ posseg : ModuleType | None = None
1011successful_startup : bool = False
1112
1213################################################################################
13- # This section about cjk_ideographs is from zhon/hanzi.py
14- # zhon: https://github.com/tsroten/zhon
14+ # This section about cjk_ideographs is based on zhon/hanzi.py in:
15+ # https://github.com/tsroten/zhon
1516################################################################################
1617
1718#: Character code ranges for pertinent CJK ideograph Unicode blocks.
# Each entry is an inclusive (first, last) pair of Unicode code points.
#
# The supplementary-plane blocks (Extensions B-D and the Compatibility
# Supplement) are listed unconditionally: sys.maxunicode is always 0x10FFFF on
# Python 3.3+ (PEP 393), so a "narrow build" guard around them can never be
# false and only obscured the table.
cjk_ideograph_unicode_ranges: list[tuple[int, int]] = [
    (0x3007, 0x3007),  # Ideographic number zero
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
    (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
    (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
    (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
]
3232################################################################################
3333
3434
@@ -43,3 +43,34 @@ def import_jieba() -> None:
4343 return
4444
4545 successful_startup = True
46+
47+
def get_morphemes_jieba(expression: str) -> list[Morpheme]:
    """Segment *expression* with jieba's posseg and return its CJK-only words.

    Every word produced by ``posseg.cut`` that consists solely of characters
    from the CJK ideograph ranges becomes one ``Morpheme``; all other words
    (punctuation, latin text, digits, ...) are dropped.

    Args:
        expression: The text to segment.

    Returns:
        The morphemes in the order ``posseg.cut`` produced them.

    Raises:
        AssertionError: If the module-level ``posseg`` is still ``None``
            (presumably because ``import_jieba`` has not succeeded yet --
            verify against the caller). Only raised while assertions are
            enabled.
    """
    # Narrows "ModuleType | None" to "ModuleType" for type checkers.
    assert posseg is not None

    # The "posseg.cut" function returns "Pair" instances:
    #   Pair.word
    #   Pair.flag  # part of speech
    #
    # Chinese does not have inflections, so the word is used for both the
    # lemma and the inflection.
    return [
        Morpheme(lemma=pair.word, inflection=pair.word)
        for pair in posseg.cut(expression)
        if text_contains_only_cjk_ranges(_text=pair.word)
    ]
63+
64+
def char_found_in_cjk_ranges(_char: str) -> bool:
    """Return True if *_char* falls inside any CJK ideograph code point range.

    The character's code point is compared against every inclusive
    ``(start, end)`` pair in the module-level ``cjk_ideograph_unicode_ranges``.

    Args:
        _char: A string of exactly one character.

    Raises:
        TypeError: Propagated from ``ord`` if *_char* is not a single
            character.
    """
    # Look the code point up once instead of once per range.
    code_point = ord(_char)
    return any(
        start <= code_point <= end for start, end in cjk_ideograph_unicode_ranges
    )
70+
71+
def text_contains_only_cjk_ranges(_text: str) -> bool:
    """Return True if every character of *_text* is a CJK ideograph.

    Each character is checked with ``char_found_in_cjk_ranges``; evaluation
    stops at the first character that is outside the ranges.

    Note that an empty string yields True, since it contains no character
    that could fail the check.
    """
    return all(char_found_in_cjk_ranges(char) for char in _text)
0 commit comments