
Commit a958f93

fixed highlighting bug on morphs with uppercase letters (#163)
1 parent: 912122f

File tree

3 files changed: +66, -9 lines

ankimorphs/text_highlighting.py

Lines changed: 13 additions & 5 deletions
@@ -9,9 +9,12 @@
 class SpanElement:
 
     def __init__(
-        self, morph: Morpheme, morph_status: str, start_index: int, end_index: int
+        self, morph_group: str, morph_status: str, start_index: int, end_index: int
     ):
-        self.morph: Morpheme = morph
+        # it's crucial that the morph_group parameter originates from Match[str].group()
+        # because that maintains the original letter casing, which we want to preserve
+        # in the highlighted version of the text.
+        self.morph_group: str = morph_group
         self.morph_status: str = morph_status
         self.start_index: int = start_index
         self.end_index: int = end_index
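
A minimal sketch of what the new constructor enables, using a simplified stand-in class rather than the add-on's actual SpanElement: the span stores the exact substring returned by Match.group(), so the casing found in the card text, not the lowercased inflection, is what later gets highlighted.

import re

class SpanElementSketch:
    # simplified stand-in for ankimorphs' SpanElement, reduced to the fields in this diff
    def __init__(
        self, morph_group: str, morph_status: str, start_index: int, end_index: int
    ):
        self.morph_group = morph_group  # exact substring taken from the card text
        self.morph_status = morph_status
        self.start_index = start_index
        self.end_index = end_index

match = re.search(re.escape("flüchtlingen"), "von den Flüchtlingen", flags=re.IGNORECASE)
assert match is not None
span = SpanElementSketch(match.group(), "unknown", match.start(), match.end())
print(span.morph_group)  # Flüchtlingen -- original casing preserved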
@@ -78,7 +81,7 @@ def get_highlighted_text(
             span_element is not None
             and span_element.start_index <= index < span_element.end_index
         ):
-            span_string = span_element.morph.inflection
+            span_string = span_element.morph_group
 
     if len(ruby_character_dict) > 0:
         # we need to do this in reverse order to preserve the indices
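
For context, span_string is what ends up wrapped in the highlight markup. A hedged sketch of that wrapping step, in the shape the tests below expect; wrap_span is a hypothetical helper, not the add-on's API, and the real get_highlighted_text also handles ruby characters:

def wrap_span(span_string: str, morph_status: str) -> str:
    # produce markup like the test expectations, e.g. <span morph-status="unknown">Das</span>
    return f'<span morph-status="{morph_status}">{span_string}</span>'

print(wrap_span("Das", "unknown"))  # <span morph-status="unknown">Das</span>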
@@ -188,15 +191,20 @@ def _extract_span_elements_and_filter_string(
         # escaping special regex characters is crucial because morphs from malformed text
         # sometimes can include them, e.g. "?몇"
         regex_pattern: str = f"{re.escape(morph.inflection)}"
-        morph_matches = re.finditer(regex_pattern, text_to_highlight)
+        morph_matches = re.finditer(
+            regex_pattern, text_to_highlight, flags=re.IGNORECASE
+        )
 
         for morph_match in morph_matches:
             start_index = morph_match.start()
             end_index = morph_match.end()
             morph_len = end_index - start_index
 
+            # the morph_match.group() maintains the original letter casing of the
+            # morph found in the text, which is crucial because we want everything
+            # to be identical to the original text.
             span_elements.append(
-                SpanElement(morph, morph_status, start_index, end_index)
+                SpanElement(morph_match.group(), morph_status, start_index, end_index)
             )
 
         # we need to preserve indices, so we replace the morphs with whitespaces
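
Putting the two changes together, a minimal sketch of the matching flow, simplified from _extract_span_elements_and_filter_string (the helper name and the tuple-based span list are assumptions for illustration): escape the inflection, match it case-insensitively, record match.group() so the original casing is kept, then blank out each match with whitespace of the same length so later indices stay valid.

import re

def extract_spans(inflection: str, text: str) -> tuple[list[tuple[str, int, int]], str]:
    # escape special regex characters (e.g. "?몇"), then match case-insensitively
    pattern = re.escape(inflection)
    spans: list[tuple[str, int, int]] = []
    for match in re.finditer(pattern, text, flags=re.IGNORECASE):
        # match.group() keeps the casing found in the text, not the inflection's casing
        spans.append((match.group(), match.start(), match.end()))
    # replace each match with whitespace of the same length so later indices stay valid
    for _, start, end in spans:
        text = text[:start] + " " * (end - start) + text[end:]
    return spans, text

spans, filtered = extract_spans("das", "Das sind doch die Schädel")
print(spans)     # [('Das', 0, 3)]
print(filtered)  # '    sind doch die Schädel'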

tests/data/collection.anki2

Binary file not shown (0 bytes changed).

tests/recalc_test.py

Lines changed: 53 additions & 4 deletions
@@ -283,14 +283,63 @@ def test_highlighting(fake_environment):  # pylint:disable=unused-argument
     # This second example the morphemizer finds the correct morph. However, the regex does
     # not match the morph because of the whitespace between 'す ね', which means that no
     # spans are made, potentially causing an 'index out of range' error immediately.
-    input_text: str = "そうです ね"
-    card_morphs: list[Morpheme] = [
+    input_text = "そうです ね"
+    card_morphs = [
         Morpheme(
             lemma="そうですね", inflection="そうですね", highest_learning_interval=0
         ),
     ]
-    correct_result: str = "そうです ね"
-    highlighted_text: str = text_highlighting.get_highlighted_text(
+    correct_result = "そうです ね"
+    highlighted_text = text_highlighting.get_highlighted_text(
+        am_config, card_morphs, input_text
+    )
+
+    assert highlighted_text == correct_result
+
+    # This third example checks if letter casing is preserved in the highlighted version
+    input_text = "Das sind doch die Schädel von den Flüchtlingen, die wir gefunden hatten! Keine Sorge, dein Kopf wird auch schon bald in meiner Sammlung sein."
+    card_morphs = [
+        Morpheme(
+            lemma="Flüchtling", inflection="flüchtlingen", highest_learning_interval=0
+        ),
+        Morpheme(lemma="Sammlung", inflection="sammlung", highest_learning_interval=0),
+        Morpheme(lemma="finden", inflection="gefunden", highest_learning_interval=0),
+        Morpheme(lemma="Schädel", inflection="schädel", highest_learning_interval=0),
+        Morpheme(lemma="haben", inflection="hatten", highest_learning_interval=0),
+        Morpheme(lemma="mein", inflection="meiner", highest_learning_interval=0),
+        Morpheme(lemma="Sorge", inflection="sorge", highest_learning_interval=0),
+        Morpheme(lemma="kein", inflection="keine", highest_learning_interval=0),
+        Morpheme(lemma="schon", inflection="schon", highest_learning_interval=0),
+        Morpheme(lemma="Kopf", inflection="kopf", highest_learning_interval=0),
+        Morpheme(lemma="auch", inflection="auch", highest_learning_interval=0),
+        Morpheme(lemma="bald", inflection="bald", highest_learning_interval=0),
+        Morpheme(lemma="dein", inflection="dein", highest_learning_interval=0),
+        Morpheme(lemma="doch", inflection="doch", highest_learning_interval=0),
+        Morpheme(lemma="sein", inflection="sein", highest_learning_interval=0),
+        Morpheme(lemma="sein", inflection="sind", highest_learning_interval=0),
+        Morpheme(lemma="werden", inflection="wird", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="das", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="den", highest_learning_interval=0),
+        Morpheme(lemma="der", inflection="die", highest_learning_interval=0),
+        Morpheme(lemma="von", inflection="von", highest_learning_interval=0),
+        Morpheme(lemma="wir", inflection="wir", highest_learning_interval=0),
+        Morpheme(lemma="in", inflection="in", highest_learning_interval=0),
+    ]
+    correct_result = '<span morph-status="unknown">Das</span> <span morph-status="unknown">sind</span> <span morph-status="unknown">doch</span> <span morph-status="unknown">die</span> <span morph-status="unknown">Schädel</span> <span morph-status="unknown">von</span> <span morph-status="unknown">den</span> <span morph-status="unknown">Flüchtlingen</span>, <span morph-status="unknown">die</span> <span morph-status="unknown">wir</span> <span morph-status="unknown">gefunden</span> <span morph-status="unknown">hatten</span>! <span morph-status="unknown">Keine</span> <span morph-status="unknown">Sorge</span>, <span morph-status="unknown">dein</span> <span morph-status="unknown">Kopf</span> <span morph-status="unknown">wird</span> <span morph-status="unknown">auch</span> <span morph-status="unknown">schon</span> <span morph-status="unknown">bald</span> <span morph-status="unknown">in</span> <span morph-status="unknown">meiner</span> <span morph-status="unknown">Sammlung</span> <span morph-status="unknown">sein</span>.'
+    highlighted_text = text_highlighting.get_highlighted_text(
+        am_config, card_morphs, input_text
+    )
+
+    assert highlighted_text == correct_result
+
+    # This fourth example checks if morphs with special regex characters are escaped properly
+    input_text = "몇...?<div><br></div><div>몇...</div>"
+    card_morphs = [
+        Morpheme(lemma="?몇", inflection="?몇", highest_learning_interval=0),
+        Morpheme(lemma="몇", inflection="몇", highest_learning_interval=0),
+    ]
+    correct_result = '<span morph-status="unknown">몇</span>...?<div><br></div><div><span morph-status="unknown">몇</span>...</div>'
+    highlighted_text = text_highlighting.get_highlighted_text(
         am_config, card_morphs, input_text
     )
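
The German example above is exactly the regression this commit guards against. A quick plain-re illustration of the difference, outside the add-on:

import re

text = "Das sind doch die Schädel von den Flüchtlingen"

# without IGNORECASE, a lowercase inflection misses the capitalized surface form
print([m.group() for m in re.finditer(re.escape("das"), text)])
# []

# with IGNORECASE (the new behavior), the match is found and .group() keeps the original casing
print([m.group() for m in re.finditer(re.escape("das"), text, flags=re.IGNORECASE)])
# ['Das']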
