|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | import os |
4 | | -import shutil |
| 4 | +import tempfile |
5 | 5 | import zipfile |
6 | 6 | from collections.abc import Iterator |
7 | 7 | from pathlib import Path |
@@ -73,29 +73,23 @@ def extract_vtt_text(file_path: Path) -> list[str]: |
73 | 73 | def extract_epub_text(epub_path: Path) -> list[str]: |
74 | 74 | assert mw is not None |
75 | 75 |
|
76 | | - # Create a temporary directory to store unzipped contents |
77 | | - temp_dir: str = "temp_unzipped_epub" # relative directory |
78 | | - |
79 | | - if os.path.exists(temp_dir): |
80 | | - shutil.rmtree(temp_dir) |
81 | | - |
82 | | - with zipfile.ZipFile(epub_path) as epub: |
83 | | - epub.extractall(temp_dir) |
84 | | - |
85 | | - text_content: list[str] = [] |
86 | | - |
87 | 76 | # Use a generator to yield text lines for better memory efficiency |
88 | | - def extract_text() -> Iterator[list[str]]: |
89 | | - for _root, _, _files in os.walk(temp_dir): |
| 77 | + def extract_text(_temp_dir: str) -> Iterator[list[str]]: |
| 78 | + for _root, _, _files in os.walk(_temp_dir): |
90 | 79 | for file in filter(lambda f: f.endswith((".xhtml", ".html")), _files): |
91 | 80 | file_path = Path(_root, file) |
92 | 81 | yield extract_html_text(file_path) |
93 | 82 |
|
94 | | - for batch in extract_text(): |
95 | | - text_content.extend(batch) |
| 83 | + # Create an auto-cleaning temporary directory |
| 84 | + with tempfile.TemporaryDirectory() as temp_dir: |
| 85 | + with zipfile.ZipFile(epub_path) as epub: |
| 86 | + epub.extractall(temp_dir) |
| 87 | + text_content: list[str] = [] |
| 88 | + |
| 89 | + for batch in extract_text(temp_dir): |
| 90 | + text_content.extend(batch) |
96 | 91 |
|
97 | | - shutil.rmtree(temp_dir) |
98 | | - return text_content |
| 92 | + return text_content |
99 | 93 |
|
100 | 94 |
|
101 | 95 | def extract_html_text(file_path: Path) -> list[str]: |
|
0 commit comments