Fix EPUB extraction by unzipping into an auto-cleaning temporary directory (#360)

mortii · mortii · commit 93a00c00bb23 · 2025-09-20T17:56:11.000+02:00
diff --git a/ankimorphs/ankimorphs_globals.py b/ankimorphs/ankimorphs_globals.py
@@ -5,7 +5,7 @@
 """
 
 # Semantic Versioning https://semver.org/
-__version__ = "6.0.1"
+__version__ = "6.0.2"
 
 DEV_MODE: bool = False
 
diff --git a/ankimorphs/generators/text_extractors.py b/ankimorphs/generators/text_extractors.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import os
-import shutil
+import tempfile
 import zipfile
 from collections.abc import Iterator
 from pathlib import Path
@@ -73,29 +73,23 @@ def extract_vtt_text(file_path: Path) -> list[str]:
 def extract_epub_text(epub_path: Path) -> list[str]:
     assert mw is not None
 
-    # Create a temporary directory to store unzipped contents
-    temp_dir: str = "temp_unzipped_epub"  # relative directory
-
-    if os.path.exists(temp_dir):
-        shutil.rmtree(temp_dir)
-
-    with zipfile.ZipFile(epub_path) as epub:
-        epub.extractall(temp_dir)
-
-    text_content: list[str] = []
-
     # Use a generator to yield text lines for better memory efficiency
-    def extract_text() -> Iterator[list[str]]:
-        for _root, _, _files in os.walk(temp_dir):
+    def extract_text(_temp_dir: str) -> Iterator[list[str]]:
+        for _root, _, _files in os.walk(_temp_dir):
             for file in filter(lambda f: f.endswith((".xhtml", ".html")), _files):
                 file_path = Path(_root, file)
                 yield extract_html_text(file_path)
 
-    for batch in extract_text():
-        text_content.extend(batch)
+    # Create an auto-cleaning temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        with zipfile.ZipFile(epub_path) as epub:
+            epub.extractall(temp_dir)
+            text_content: list[str] = []
+
+            for batch in extract_text(temp_dir):
+                text_content.extend(batch)
 
-    shutil.rmtree(temp_dir)
-    return text_content
+            return text_content
 
 
 def extract_html_text(file_path: Path) -> list[str]:
diff --git a/docs/src/contributors.md b/docs/src/contributors.md
@@ -19,7 +19,7 @@ Vilhelm-Ian, CodeWithMa, ashprice, aleksejrs, HQYang1979, soliviantar, buster-bl
 cocowash, asayake-b5, quietmansoath, MichaelPetre, xofm31, knoebelja, xuiqzy, Jcuhfehl, fuquasteve, pallas42, syfgk,
 jahnke, jsteel44, iwouldrathernotusegithub, tanhoaian01, drkthomp, Kirchheim, zeroeightysix, Gardengul, wolearyc,
 Pedrubik2000, RyanMcEntire, BobvanSchendel, khanguyenwk, buqamura, Rct567, rwmpelstilzchen, bie-zheng, IncontinentCell,
-mdraves91, dae, AtilioA, BenjaminBrandtner.
+mdraves91, dae, AtilioA, BenjaminBrandtner, JSchoreels.
 
 ### MorphMan (v5.0-qt6-alpha.1)