Skip to content

Commit 93a00c0

Browse files
committed
fixed epub extraction (#360)
1 parent 72ff2fe commit 93a00c0

File tree

3 files changed

+14
-20
lines changed

3 files changed

+14
-20
lines changed

ankimorphs/ankimorphs_globals.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
"""
66

77
# Semantic Versioning https://semver.org/
8-
__version__ = "6.0.1"
8+
__version__ = "6.0.2"
99

1010
DEV_MODE: bool = False
1111

ankimorphs/generators/text_extractors.py

Lines changed: 12 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from __future__ import annotations
22

33
import os
4-
import shutil
4+
import tempfile
55
import zipfile
66
from collections.abc import Iterator
77
from pathlib import Path
@@ -73,29 +73,23 @@ def extract_vtt_text(file_path: Path) -> list[str]:
7373
def extract_epub_text(epub_path: Path) -> list[str]:
7474
assert mw is not None
7575

76-
# Create a temporary directory to store unzipped contents
77-
temp_dir: str = "temp_unzipped_epub" # relative directory
78-
79-
if os.path.exists(temp_dir):
80-
shutil.rmtree(temp_dir)
81-
82-
with zipfile.ZipFile(epub_path) as epub:
83-
epub.extractall(temp_dir)
84-
85-
text_content: list[str] = []
86-
8776
# Use a generator to yield text lines for better memory efficiency
88-
def extract_text() -> Iterator[list[str]]:
89-
for _root, _, _files in os.walk(temp_dir):
77+
def extract_text(_temp_dir: str) -> Iterator[list[str]]:
78+
for _root, _, _files in os.walk(_temp_dir):
9079
for file in filter(lambda f: f.endswith((".xhtml", ".html")), _files):
9180
file_path = Path(_root, file)
9281
yield extract_html_text(file_path)
9382

94-
for batch in extract_text():
95-
text_content.extend(batch)
83+
# Create an auto-cleaning temporary directory
84+
with tempfile.TemporaryDirectory() as temp_dir:
85+
with zipfile.ZipFile(epub_path) as epub:
86+
epub.extractall(temp_dir)
87+
text_content: list[str] = []
88+
89+
for batch in extract_text(temp_dir):
90+
text_content.extend(batch)
9691

97-
shutil.rmtree(temp_dir)
98-
return text_content
92+
return text_content
9993

10094

10195
def extract_html_text(file_path: Path) -> list[str]:

docs/src/contributors.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ Vilhelm-Ian, CodeWithMa, ashprice, aleksejrs, HQYang1979, soliviantar, buster-bl
1919
cocowash, asayake-b5, quietmansoath, MichaelPetre, xofm31, knoebelja, xuiqzy, Jcuhfehl, fuquasteve, pallas42, syfgk,
2020
jahnke, jsteel44, iwouldrathernotusegithub, tanhoaian01, drkthomp, Kirchheim, zeroeightysix, Gardengul, wolearyc,
2121
Pedrubik2000, RyanMcEntire, BobvanSchendel, khanguyenwk, buqamura, Rct567, rwmpelstilzchen, bie-zheng, IncontinentCell,
22-
mdraves91, dae, AtilioA, BenjaminBrandtner.
22+
mdraves91, dae, AtilioA, BenjaminBrandtner, JSchoreels.
2323

2424
### MorphMan (v5.0-qt6-alpha.1)
2525

0 commit comments

Comments
 (0)