Skip to content

Commit 02b3572

Browse files
committed
fixed jieba bug (#219)
1 parent 031beb1 commit 02b3572

File tree

3 files changed

+66
-38
lines changed

3 files changed

+66
-38
lines changed

ankimorphs/jieba_wrapper.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,30 @@
55
import sys
66
from types import ModuleType
77

8-
posseg: ModuleType | None = None
8+
from .morpheme import Morpheme
99

10+
posseg: ModuleType | None = None
1011
successful_startup: bool = False
1112

1213
################################################################################
13-
# This section about cjk_ideographs is from zhon/hanzi.py
14-
# zhon: https://github.com/tsroten/zhon
14+
# This section about cjk_ideographs is based on zhon/hanzi.py in:
15+
# https://github.com/tsroten/zhon
1516
################################################################################
1617

1718
#: Character code ranges for pertinent CJK ideograph Unicode blocks.
18-
# cjk_ideographs = (
19-
CJK_IDEOGRAPHS: str = (
20-
"\u3007" # Ideographic number zero, see issue #17
21-
"\u4E00-\u9FFF" # CJK Unified Ideographs
22-
"\u3400-\u4DBF" # CJK Unified Ideographs Extension A
23-
"\uF900-\uFAFF" # CJK Compatibility Ideographs
24-
)
19+
# Inclusive (start, end) code-point ranges for the pertinent CJK ideograph
# Unicode blocks.  A character is considered "Chinese text" for segmentation
# purposes when its code point falls inside one of these ranges.
cjk_ideograph_unicode_ranges = [
    (0x3007, 0x3007),  # Ideographic number zero
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
]
# Supplementary-plane blocks are only reachable when the interpreter supports
# code points above the Basic Multilingual Plane.
if sys.maxunicode > 0xFFFF:
    cjk_ideograph_unicode_ranges.extend(
        [
            (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
            (0x2A700, 0x2B73F),  # CJK Unified Ideographs Extension C
            (0x2B740, 0x2B81F),  # CJK Unified Ideographs Extension D
            (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
        ]
    )
3232
################################################################################
3333

3434

@@ -43,3 +43,34 @@ def import_jieba() -> None:
4343
return
4444

4545
successful_startup = True
46+
47+
48+
def get_morphemes_jieba(expression: str) -> list[Morpheme]:
    """Segment *expression* with jieba and return one Morpheme per CJK word.

    Segments containing any non-CJK character (latin letters, digits,
    punctuation) are discarded.  ``import_jieba`` must have completed
    successfully before this is called, otherwise the assert fires.
    """
    assert posseg is not None
    _morphs: list[Morpheme] = []

    # The "posseg.cut" function returns "Pair" instances:
    #   Pair.word
    #   Pair.flag  # part of speech
    for posseg_pair in posseg.cut(expression):
        # PEP 8: test the boolean directly instead of comparing with "is False".
        if not text_contains_only_cjk_ranges(_text=posseg_pair.word):
            continue

        # Chinese does not have inflections, so we use the lemma for both.
        _morphs.append(Morpheme(lemma=posseg_pair.word, inflection=posseg_pair.word))

    return _morphs
63+
64+
65+
def char_found_in_cjk_ranges(_char: str) -> bool:
    """Return True when the single character *_char* lies inside any of the
    CJK ideograph code-point ranges."""
    code_point = ord(_char)
    return any(start <= code_point <= end for start, end in cjk_ideograph_unicode_ranges)
70+
71+
72+
def text_contains_only_cjk_ranges(_text: str) -> bool:
    """Return True when every character of *_text* is a CJK ideograph.

    An empty string vacuously returns True.
    """
    return all(char_found_in_cjk_ranges(char) for char in _text)

ankimorphs/morphemizer.py

Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
import re
55

66
from . import jieba_wrapper, mecab_wrapper, spacy_wrapper
7-
from .mecab_wrapper import get_morphemes_mecab
87
from .morpheme import Morpheme
98

9+
space_char_regex = re.compile(" ")
10+
11+
1012
####################################################################################################
1113
# Base Class
1214
####################################################################################################
@@ -80,8 +82,6 @@ def get_morphemizer_by_description(description: str) -> Morphemizer | None:
8082
# Mecab Morphemizer
8183
####################################################################################################
8284

83-
space_char_regex = re.compile(" ")
84-
8585

8686
class MecabMorphemizer(Morphemizer):
8787

@@ -93,7 +93,7 @@ def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
9393
# Remove simple spaces that could be added by other add-ons and break the parsing.
9494
if space_char_regex.search(expression):
9595
expression = space_char_regex.sub("", expression)
96-
return get_morphemes_mecab(expression)
96+
return mecab_wrapper.get_morphemes_mecab(expression)
9797

9898
def get_description(self) -> str:
9999
return "AnkiMorphs: Japanese"
@@ -166,23 +166,7 @@ def __init__(self) -> None:
166166
jieba_wrapper.import_jieba()
167167

168168
def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
169-
assert jieba_wrapper.posseg is not None
170-
expression_morphs: list[Morpheme] = []
171-
172-
# only retain the cjk ideographs
173-
expression = "".join(
174-
re.findall(
175-
f"[{jieba_wrapper.CJK_IDEOGRAPHS}]",
176-
expression,
177-
)
178-
)
179-
180-
for jieba_segment in jieba_wrapper.posseg.cut(expression):
181-
# chinese does not have inflections, so we use the lemma for both
182-
_morph = Morpheme(lemma=jieba_segment.word, inflection=jieba_segment.word)
183-
expression_morphs.append(_morph)
184-
185-
return expression_morphs
169+
return jieba_wrapper.get_morphemes_jieba(expression)
186170

187171
def get_description(self) -> str:
188172
return "AnkiMorphs: Chinese"

tests/mecab_and_jieba_test.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ def test_mecab_morpheme_generation(fake_environment): # pylint:disable=unused-a
5555
def test_jieba_morpheme_generation(fake_environment): # pylint:disable=unused-argument
5656
morphemizer = get_morphemizer_by_description("AnkiMorphs: Chinese")
5757

58-
# sentence = "本当に重要な任務の時しか 動かない"
5958
sentence = "请您说得慢些好吗?"
6059
correct_morphs: set[Morpheme] = {
6160
Morpheme("吗", "吗"),
@@ -72,3 +71,17 @@ def test_jieba_morpheme_generation(fake_environment): # pylint:disable=unused-a
7271

7372
for morph in extracted_morphs:
7473
assert morph in correct_morphs
74+
75+
sentence = "一,二,三,跳!"
76+
correct_morphs: set[Morpheme] = {
77+
Morpheme("一", "一"),
78+
Morpheme("二", "二"),
79+
Morpheme("三", "三"),
80+
Morpheme("跳", "跳"),
81+
}
82+
83+
extracted_morphs = morphemizer.get_morphemes_from_expr(sentence)
84+
assert len(extracted_morphs) == 4
85+
86+
for morph in extracted_morphs:
87+
assert morph in correct_morphs

0 commit comments

Comments
 (0)