
Commit a958f93

fixed highlighting bug on morphs with uppercase letters (#163)
1 parent: 912122f

File tree

3 files changed: +66, -9 lines

ankimorphs/text_highlighting.py

Lines changed: 13 additions & 5 deletions
@@ -9,9 +9,12 @@
 class SpanElement:
 
     def __init__(
-        self, morph: Morpheme, morph_status: str, start_index: int, end_index: int
+        self, morph_group: str, morph_status: str, start_index: int, end_index: int
     ):
-        self.morph: Morpheme = morph
+        # it's crucial that the morph_group parameter originates from Match[str].group()
+        # because that maintains the original letter casing, which we want to preserve
+        # in the highlighted version of the text.
+        self.morph_group: str = morph_group
         self.morph_status: str = morph_status
         self.start_index: int = start_index
         self.end_index: int = end_index
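
A minimal sketch of what the new constructor enables, using a simplified stand-in class rather than the add-on's actual SpanElement: the span stores the exact substring returned by Match.group(), so the casing found in the card text, not the lowercased inflection, is what later gets highlighted.

import re

class SpanElementSketch:
    # simplified stand-in for ankimorphs' SpanElement, reduced to the fields in this diff
    def __init__(
        self, morph_group: str, morph_status: str, start_index: int, end_index: int
    ):
        self.morph_group = morph_group  # exact substring taken from the card text
        self.morph_status = morph_status
        self.start_index = start_index
        self.end_index = end_index

match = re.search(re.escape("flüchtlingen"), "von den Flüchtlingen", flags=re.IGNORECASE)
assert match is not None
span = SpanElementSketch(match.group(), "unknown", match.start(), match.end())
print(span.morph_group)  # Flüchtlingen -- original casing preserved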
@@ -78,7 +81,7 @@ def get_highlighted_text(
             span_element is not None
             and span_element.start_index <= index < span_element.end_index
         ):
-            span_string = span_element.morph.inflection
+            span_string = span_element.morph_group
 
     if len(ruby_character_dict) > 0:
         # we need to do this in reverse order to preserve the indices
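
For context, span_string is what ends up wrapped in the highlight markup. A hedged sketch of that wrapping step, in the shape the tests below expect; wrap_span is a hypothetical helper, not the add-on's API, and the real get_highlighted_text also handles ruby characters:

def wrap_span(span_string: str, morph_status: str) -> str:
    # produce markup like the test expectations, e.g. <span morph-status="unknown">Das</span>
    return f'<span morph-status="{morph_status}">{span_string}</span>'

print(wrap_span("Das", "unknown"))  # <span morph-status="unknown">Das</span>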
@@ -188,15 +191,20 @@ def _extract_span_elements_and_filter_string(
         # escaping special regex characters is crucial because morphs from malformed text
         # sometimes can include them, e.g. "?몇"
         regex_pattern: str = f"{re.escape(morph.inflection)}"
-        morph_matches = re.finditer(regex_pattern, text_to_highlight)
+        morph_matches = re.finditer(
+            regex_pattern, text_to_highlight, flags=re.IGNORECASE
+        )
 
         for morph_match in morph_matches:
             start_index = morph_match.start()
             end_index = morph_match.end()
             morph_len = end_index - start_index
 
+            # the morph_match.group() maintains the original letter casing of the
+            # morph found in the text, which is crucial because we want everything
+            # to be identical to the original text.
             span_elements.append(
-                SpanElement(morph, morph_status, start_index, end_index)
+                SpanElement(morph_match.group(), morph_status, start_index, end_index)
             )
 
         # we need to preserve indices, so we replace the morphs with whitespaces
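
Putting the two changes together, a minimal sketch of the matching flow, simplified from _extract_span_elements_and_filter_string (the helper name and the tuple-based span list are assumptions for illustration): escape the inflection, match it case-insensitively, record match.group() so the original casing is kept, then blank out each match with whitespace of the same length so later indices stay valid.

import re

def extract_spans(inflection: str, text: str) -> tuple[list[tuple[str, int, int]], str]:
    # escape special regex characters (e.g. "?몇"), then match case-insensitively
    pattern = re.escape(inflection)
    spans: list[tuple[str, int, int]] = []
    for match in re.finditer(pattern, text, flags=re.IGNORECASE):
        # match.group() keeps the casing found in the text, not the inflection's casing
        spans.append((match.group(), match.start(), match.end()))
    # replace each match with whitespace of the same length so later indices stay valid
    for _, start, end in spans:
        text = text[:start] + " " * (end - start) + text[end:]
    return spans, text

spans, filtered = extract_spans("das", "Das sind doch die Schädel")
print(spans)     # [('Das', 0, 3)]
print(filtered)  # '    sind doch die Schädel'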

tests/data/collection.anki2

Binary file not shown (0 bytes changed).

tests/recalc_test.py

Lines changed: 53 additions & 4 deletions
@@ -283,14 +283,63 @@ def test_highlighting(fake_environment):  # pylint:disable=unused-argument
     # This second example the morphemizer finds the correct morph. However, the regex does
     # not match the morph because of the whitespace between 'す ね', which means that no
     # spans are made, potentially causing an 'index out of range' error immediately.
-    input_text: str = "そうです ね"
-    card_morphs: list[Morpheme] = [
+    input_text = "そうです ね"
+    card_morphs = [
         Morpheme(
             lemma="そうですね", inflection="そうですね", highest_learning_interval=0
         ),
     ]
-    correct_result: str = "そうです ね"
-    highlighted_text: str = text_highlighting.get_highlighted_text(
+    correct_result = "そうです ね"
+    highlighted_text = text_highlighting.get_highlighted_text(
+        am_config, card_morphs, input_text
+    )
+
+    assert highlighted_text == correct_result
+
+    # This third example checks if letter casing is preserved in the highlighted version
+    input_text = "Das sind doch die Schädel von den Flüchtlingen, die wir gefunden hatten! Keine Sorge, dein Kopf wird auch schon bald in meiner Sammlung sein."
+    card_morphs = [
+        Morpheme(
+            lemma="Flüchtling", inflection="flüchtlingen", highest_learning_interval=0
+        ),
+        Morpheme(lemma="Sammlung", inflection="sammlung", highest_learning_interval=0),
+        Morpheme(lemma="finden", inflection="gefunden", highest_learning_interval=0),
+        Morpheme(lemma="Schädel", inflection="schädel", highest_learning_interval=0),
+        Morpheme(lemma="haben", inflection="hatten", highest_learning_interval=0),
+        Morpheme(lemma="mein", inflection="meiner", highest_learning_interval=0),
+        Morpheme(lemma="Sorge", inflection="sorge", highest_learning_interval=0),
+        Morpheme(lemma="kein", inflection="keine", highest_learning_interval=0),
+        Morpheme(lemma="schon", inflection="schon", highest_learning_interval=0),
+        Morpheme(lemma="Kopf", inflection="kopf", highest_learning_interval=0),
+        Morpheme(lemma="auch", inflection="auch", highest_learning_interval=0),
+        Morpheme(lemma="bald", inflection="bald", highest_learning_interval=0),
+        Morpheme(lemma="dein", inflection="dein", highest_learning_interval=0),
+        Morpheme(lemma="doch", inflection="doch", highest_learning_interval=0),
+        Morpheme(lemma="sein", inflection="sein", highest_learning_interval=0),
+        Morpheme(lemma="sein", inflection="sind", highest_learning_interval=0),
+        Morpheme(lemma="werden", inflection="wird", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="das", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="den", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="die", highest_learning_interval=0),
+        Morpheme(lemma="von", inflection="von", highest_learning_interval=0),
+        Morpheme(lemma="wir", inflection="wir", highest_learning_interval=0),
+        Morpheme(lemma="in", inflection="in", highest_learning_interval=0),
+    ]
+    correct_result = '<span morph-status="unknown">Das</span> <span morph-status="unknown">sind</span> <span morph-status="unknown">doch</span> <span morph-status="unknown">die</span> <span morph-status="unknown">Schädel</span> <span morph-status="unknown">von</span> <span morph-status="unknown">den</span> <span morph-status="unknown">Flüchtlingen</span>, <span morph-status="unknown">die</span> <span morph-status="unknown">wir</span> <span morph-status="unknown">gefunden</span> <span morph-status="unknown">hatten</span>! <span morph-status="unknown">Keine</span> <span morph-status="unknown">Sorge</span>, <span morph-status="unknown">dein</span> <span morph-status="unknown">Kopf</span> <span morph-status="unknown">wird</span> <span morph-status="unknown">auch</span> <span morph-status="unknown">schon</span> <span morph-status="unknown">bald</span> <span morph-status="unknown">in</span> <span morph-status="unknown">meiner</span> <span morph-status="unknown">Sammlung</span> <span morph-status="unknown">sein</span>.'
+    highlighted_text = text_highlighting.get_highlighted_text(
+        am_config, card_morphs, input_text
+    )
+
+    assert highlighted_text == correct_result
+
+    # This fourth example checks if morphs with special regex characters are escaped properly
+    input_text = "몇...?<div><br></div><div>몇...</div>"
+    card_morphs = [
+        Morpheme(lemma="?몇", inflection="?몇", highest_learning_interval=0),
+        Morpheme(lemma="몇", inflection="몇", highest_learning_interval=0),
+    ]
+    correct_result = '<span morph-status="unknown">몇</span>...?<div><br></div><div><span morph-status="unknown">몇</span>...</div>'
+    highlighted_text = text_highlighting.get_highlighted_text(
         am_config, card_morphs, input_text
     )
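
The German example above is exactly the regression this commit guards against. A quick plain-re illustration of the difference, outside the add-on:

import re

text = "Das sind doch die Schädel von den Flüchtlingen"

# without IGNORECASE, a lowercase inflection misses the capitalized surface form
print([m.group() for m in re.finditer(re.escape("das"), text)])
# []

# with IGNORECASE (the new behavior), the match is found and .group() keeps the original casing
print([m.group() for m in re.finditer(re.escape("das"), text, flags=re.IGNORECASE)])
# ['Das']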
