Skip to content

Commit 6253529

Browse files
committed
fixed epub text splitting (#362)
1 parent 93a00c0 commit 6253529

File tree

4 files changed

+19
-15
lines changed

4 files changed

+19
-15
lines changed

ankimorphs/ankimorphs_globals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
# Semantic Versioning https://semver.org/
8-
__version__ = "6.0.2"
8+
__version__ = "6.0.3"
99

1010
DEV_MODE: bool = False
1111

ankimorphs/generators/text_extractors.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from __future__ import annotations
22

33
import os
4+
import re
45
import tempfile
56
import zipfile
67
from collections.abc import Iterator
78
from pathlib import Path
89

9-
import anki
10-
from aqt import mw
10+
from anki import utils as anki_utils
1111

1212

1313
def extract_ass_text(file_path: Path) -> list[str]:
@@ -71,23 +71,28 @@ def extract_vtt_text(file_path: Path) -> list[str]:
7171

7272

7373
def extract_epub_text(epub_path: Path) -> list[str]:
74-
assert mw is not None
7574

76-
# Use a generator to yield text lines for better memory efficiency
77-
def extract_text(_temp_dir: str) -> Iterator[list[str]]:
75+
# use a generator to yield text lines for better memory efficiency
76+
def extract_text_chunks(_temp_dir: str) -> Iterator[list[str]]:
7877
for _root, _, _files in os.walk(_temp_dir):
7978
for file in filter(lambda f: f.endswith((".xhtml", ".html")), _files):
8079
file_path = Path(_root, file)
81-
yield extract_html_text(file_path)
82-
83-
# Create an auto-cleaning temporary directory
80+
file_text = extract_html_text(file_path)[0]
81+
# Text found in epub files may lack clear paragraph boundaries,
82+
# which can cause morphemizer input buffer overflows.
83+
# To prevent this, we split on CJK punctuation.
84+
_chunk = re.split(r"(?<=[。！？])", file_text)
85+
yield [s.strip() for s in _chunk if s.strip()]
86+
87+
# Creates an auto-cleaning temp dir in a location the operating system lets us write to;
88+
# extracting anywhere else can fail with lack-of-privilege (permission) errors.
8489
with tempfile.TemporaryDirectory() as temp_dir:
8590
with zipfile.ZipFile(epub_path) as epub:
8691
epub.extractall(temp_dir)
8792
text_content: list[str] = []
8893

89-
for batch in extract_text(temp_dir):
90-
text_content.extend(batch)
94+
for chunk in extract_text_chunks(temp_dir):
95+
text_content.extend(chunk)
9196

9297
return text_content
9398

@@ -98,7 +103,7 @@ def extract_html_text(file_path: Path) -> list[str]:
98103
"""
99104
with open(file_path, encoding="utf-8") as file:
100105
content = file.read()
101-
content = anki.utils.strip_html(content)
106+
content = anki_utils.strip_html(content)
102107
return [content]
103108

104109

ankimorphs/morphemizers/mecab_morphemizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ def init_successful(self) -> bool:
1818

1919
def get_morphemes(self, sentences: list[str]) -> Iterator[list[Morpheme]]:
2020
for sentence in sentences:
21-
# Remove simple spaces that could be added by other add-ons and break the parsing.
21+
# Remove simple spaces that could be added by other add-ons because
22+
# they can lead to parsing errors.
2223
if space_char_regex.search(sentence):
2324
sentence = space_char_regex.sub("", sentence)
2425
yield mecab_wrapper.get_morphemes_mecab(sentence)

test/fake_environment_module.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@
4646
priority_file_generator,
4747
readability_report_generator,
4848
study_plan_generator,
49-
text_extractors,
5049
)
5150
from ankimorphs.morphemizers import spacy_wrapper
5251
from ankimorphs.progression import progression_utils, progression_window
@@ -218,7 +217,6 @@ def create_mw_patches(mock_mw: AnkiQt) -> list[Any]:
218217
mock.patch.object(known_morphs_exporter, "mw", mock_mw),
219218
mock.patch.object(ankimorphs_extra_settings, "mw", mock_mw),
220219
mock.patch.object(generators_output_dialog, "mw", mock_mw),
221-
mock.patch.object(text_extractors, "mw", mock_mw),
222220
]
223221

224222

0 commit comments

Comments
 (0)