11from __future__ import annotations
22
33import os
4+ import re
45import tempfile
56import zipfile
67from collections .abc import Iterator
78from pathlib import Path
89
9- import anki
10- from aqt import mw
10+ from anki import utils as anki_utils
1111
1212
1313def extract_ass_text (file_path : Path ) -> list [str ]:
@@ -71,23 +71,28 @@ def extract_vtt_text(file_path: Path) -> list[str]:
7171
7272
def extract_epub_text(epub_path: Path) -> list[str]:
    """Return the text of an epub file as a list of sentence-sized strings.

    The epub archive is unpacked into a temporary directory, every
    ``.xhtml``/``.html`` file found in it is passed to ``extract_html_text``,
    and the resulting text is split after sentence-ending punctuation.

    :param epub_path: path to the epub (zip) archive.
    :return: the stripped, non-empty text pieces of all html files, in the
        order the files are encountered by ``os.walk``.
    """
    # Text found in epub files may lack clear paragraph boundaries, which can
    # cause morphemizer input buffer overflows. To prevent this we split after
    # sentence-ending punctuation: the ideographic full stop (U+3002), the
    # fullwidth exclamation/question marks (U+FF01, U+FF1F), and their ASCII
    # counterparts. The lookbehind keeps the punctuation attached to the
    # sentence it terminates. Compiled once rather than once per file.
    sentence_end = re.compile(r"(?<=[\u3002\uff01\uff1f!?])")

    # use a generator to yield text lines for better memory efficiency
    def extract_text_chunks(_temp_dir: str) -> Iterator[list[str]]:
        for _root, _, _files in os.walk(_temp_dir):
            for file in _files:
                if not file.endswith((".xhtml", ".html")):
                    continue
                # extract_html_text returns a single-element list
                file_text = extract_html_text(Path(_root, file))[0]
                stripped = (s.strip() for s in sentence_end.split(file_text))
                yield [s for s in stripped if s]

    # creates an auto-cleaning temp dir allowed by the operating system,
    # otherwise we can get lack of privilege errors.
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(epub_path) as epub:
            epub.extractall(temp_dir)

        text_content: list[str] = []
        for chunk in extract_text_chunks(temp_dir):
            text_content.extend(chunk)

    return text_content
9398
@@ -98,7 +103,7 @@ def extract_html_text(file_path: Path) -> list[str]:
98103 """
99104 with open (file_path , encoding = "utf-8" ) as file :
100105 content = file .read ()
101- content = anki . utils .strip_html (content )
106+ content = anki_utils .strip_html (content )
102107 return [content ]
103108
104109
0 commit comments