fixed epub text splitting (#362)

mortii · mortii · commit 6253529b1f5f · 2025-09-24T18:37:36.000+02:00
diff --git a/ankimorphs/ankimorphs_globals.py b/ankimorphs/ankimorphs_globals.py
@@ -5,7 +5,7 @@
 """
 
 # Semantic Versioning https://semver.org/
-__version__ = "6.0.2"
+__version__ = "6.0.3"
 
 DEV_MODE: bool = False
 
diff --git a/ankimorphs/generators/text_extractors.py b/ankimorphs/generators/text_extractors.py
@@ -1,13 +1,13 @@
 from __future__ import annotations
 
 import os
+import re
 import tempfile
 import zipfile
 from collections.abc import Iterator
 from pathlib import Path
 
-import anki
-from aqt import mw
+from anki import utils as anki_utils
 
 
 def extract_ass_text(file_path: Path) -> list[str]:
@@ -71,23 +71,28 @@ def extract_vtt_text(file_path: Path) -> list[str]:
 
 
 def extract_epub_text(epub_path: Path) -> list[str]:
-    assert mw is not None
 
-    # Use a generator to yield text lines for better memory efficiency
-    def extract_text(_temp_dir: str) -> Iterator[list[str]]:
+    # use a generator to yield text lines for better memory efficiency
+    def extract_text_chunks(_temp_dir: str) -> Iterator[list[str]]:
         for _root, _, _files in os.walk(_temp_dir):
             for file in filter(lambda f: f.endswith((".xhtml", ".html")), _files):
                 file_path = Path(_root, file)
-                yield extract_html_text(file_path)
-
-    # Create an auto-cleaning temporary directory
+                file_text = extract_html_text(file_path)[0]
+                # Text found in epub files may lack clear paragraph boundaries,
+                # which can cause morphemizer input buffer overflows.
+                # To prevent this, we split after CJK sentence enders (。！？).
+                _chunk = re.split(r"(?<=[。！？])", file_text)
+                yield [s.strip() for s in _chunk if s.strip()]
+
+    # Create an auto-cleaning temp dir in a location the operating system
+    # permits writing to; otherwise we can get lack-of-privilege errors.
     with tempfile.TemporaryDirectory() as temp_dir:
         with zipfile.ZipFile(epub_path) as epub:
             epub.extractall(temp_dir)
             text_content: list[str] = []
 
-            for batch in extract_text(temp_dir):
-                text_content.extend(batch)
+            for chunk in extract_text_chunks(temp_dir):
+                text_content.extend(chunk)
 
             return text_content
 
@@ -98,7 +103,7 @@ def extract_html_text(file_path: Path) -> list[str]:
     """
     with open(file_path, encoding="utf-8") as file:
         content = file.read()
-    content = anki.utils.strip_html(content)
+    content = anki_utils.strip_html(content)
     return [content]
 
 
diff --git a/ankimorphs/morphemizers/mecab_morphemizer.py b/ankimorphs/morphemizers/mecab_morphemizer.py
@@ -18,7 +18,8 @@ def init_successful(self) -> bool:
 
     def get_morphemes(self, sentences: list[str]) -> Iterator[list[Morpheme]]:
         for sentence in sentences:
-            # Remove simple spaces that could be added by other add-ons and break the parsing.
+            # Remove simple spaces that could be added by other add-ons because
+            # they can lead to parsing errors.
             if space_char_regex.search(sentence):
                 sentence = space_char_regex.sub("", sentence)
             yield mecab_wrapper.get_morphemes_mecab(sentence)
diff --git a/test/fake_environment_module.py b/test/fake_environment_module.py
@@ -46,7 +46,6 @@
     priority_file_generator,
     readability_report_generator,
     study_plan_generator,
-    text_extractors,
 )
 from ankimorphs.morphemizers import spacy_wrapper
 from ankimorphs.progression import progression_utils, progression_window
@@ -218,7 +217,6 @@ def create_mw_patches(mock_mw: AnkiQt) -> list[Any]:
         mock.patch.object(known_morphs_exporter, "mw", mock_mw),
         mock.patch.object(ankimorphs_extra_settings, "mw", mock_mw),
         mock.patch.object(generators_output_dialog, "mw", mock_mw),
-        mock.patch.object(text_extractors, "mw", mock_mw),
     ]
 
 

Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,6 @@`
`46`	`46`	`priority_file_generator,`
`47`	`47`	`readability_report_generator,`
`48`	`48`	`study_plan_generator,`
`49`		`- text_extractors,`
`50`	`49`	`)`
`51`	`50`	`from ankimorphs.morphemizers import spacy_wrapper`
`52`	`51`	`from ankimorphs.progression import progression_utils, progression_window`
`@@ -218,7 +217,6 @@ def create_mw_patches(mock_mw: AnkiQt) -> list[Any]:`
`218`	`217`	`mock.patch.object(known_morphs_exporter, "mw", mock_mw),`
`219`	`218`	`mock.patch.object(ankimorphs_extra_settings, "mw", mock_mw),`
`220`	`219`	`mock.patch.object(generators_output_dialog, "mw", mock_mw),`
`221`		`- mock.patch.object(text_extractors, "mw", mock_mw),`
`222`	`220`	`]`
`223`	`221`
`224`	`222`