Skip to content

Commit 027900a

Browse files
committed
added support for more file formats (#297)
- made existing parsing more sophisticated - fixed default output path in generator - fixed generators progress dialog bringing mw up to the background
1 parent ee74265 commit 027900a

File tree

46 files changed

+27551
-2696
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+27551
-2696
lines changed

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,3 @@ ground up with a focus on simplicity, performance, and a codebase with minimal t
1313
Download: https://ankiweb.net/shared/info/472573498
1414

1515
Guide and docs: https://mortii.github.io/anki-morphs/
16-
17-
Project roadmap: https://github.com/users/mortii/projects/1/views/1

ankimorphs/ankimorphs_globals.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""
66

77
# Semantic Versioning https://semver.org/
8-
__version__ = "3.2.2"
8+
__version__ = "3.3.0"
99

1010
DEV_MODE: bool = False
1111

ankimorphs/debugging_utils.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,12 @@ def print_thread_name() -> None:
2323

2424

2525
def save_to_json_file(file_path: Path, _dict: dict[tuple[str, str], int]) -> None:
26-
"""Changes the file extension to .json and outputs to that location"""
27-
json_file: Path = file_path.with_suffix(".json")
28-
2926
# the json module does not support dicts with tuple keys,
3027
# so we have to convert the keys to single strings and then
3128
# reverse the process when loading them later
3229
dict_with_str_keys = {f"{k[0]}|{k[1]}": v for k, v in _dict.items()}
3330

34-
with json_file.open("w", encoding="utf-8") as file:
31+
with file_path.open("w", encoding="utf-8") as file:
3532
json.dump(dict_with_str_keys, file, ensure_ascii=False, indent=4)
3633

3734

ankimorphs/extra_settings/ankimorphs_extra_settings.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,13 @@ def save_generators_window_settings(
4040
self.setValue(GeneratorsWindowKeys.INPUT_DIR, ui.inputDirLineEdit.text())
4141

4242
self.beginGroup(GeneratorsWindowKeys.FILE_FORMATS)
43-
self.setValue(FileFormatsKeys.TXT, ui.txtFilesCheckBox.isChecked())
43+
self.setValue(FileFormatsKeys.ASS, ui.assFilesCheckBox.isChecked())
44+
self.setValue(FileFormatsKeys.EPUB, ui.epubFilesCheckBox.isChecked())
45+
self.setValue(FileFormatsKeys.HTML, ui.htmlFilesCheckBox.isChecked())
46+
self.setValue(FileFormatsKeys.MD, ui.mdFilesCheckBox.isChecked())
4447
self.setValue(FileFormatsKeys.SRT, ui.srtFilesCheckBox.isChecked())
48+
self.setValue(FileFormatsKeys.TXT, ui.txtFilesCheckBox.isChecked())
4549
self.setValue(FileFormatsKeys.VTT, ui.vttFilesCheckBox.isChecked())
46-
self.setValue(FileFormatsKeys.MD, ui.mdFilesCheckBox.isChecked())
4750
self.endGroup() # file format group
4851

4952
self.beginGroup(GeneratorsWindowKeys.PREPROCESS)

ankimorphs/extra_settings/extra_settings_keys.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ class GeneratorsWindowKeys:
1919

2020

2121
class FileFormatsKeys:
22-
TXT = "txt"
22+
ASS = "ass"
23+
EPUB = "epub"
24+
HTML = "html"
25+
MD = "md"
2326
SRT = "srt"
27+
TXT = "txt"
2428
VTT = "vtt"
25-
MD = "md"
2629

2730

2831
class PreprocessKeys:

ankimorphs/generators/generators_output_dialog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def _setup_output_path(self) -> None:
7474
)
7575
if stored_output_file_path == "":
7676
self.ui.outputLineEdit.setText(
77-
self._default_output_dir + self._default_output_file
77+
os.path.join(self._default_output_dir, self._default_output_file)
7878
)
7979
else:
8080
self.ui.outputLineEdit.setText(stored_output_file_path)

ankimorphs/generators/generators_text_processing.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
# abstractions to combine the two, but this would be a classic mistake of
44
# over-abstraction--the use cases are sufficiently different that they
55
# should be kept separate.
6+
from __future__ import annotations
67

78
import re
89
from pathlib import Path
9-
from typing import Any, TextIO
10+
from typing import Any, Callable
1011

1112
from .. import text_preprocessing
1213
from ..exceptions import UnicodeException
@@ -19,6 +20,14 @@
1920
square_brackets_regex,
2021
)
2122
from ..ui.generators_window_ui import Ui_GeneratorsWindow
23+
from .text_extractors import (
24+
extract_ass_text,
25+
extract_basic_text,
26+
extract_epub_text,
27+
extract_html_text,
28+
extract_srt_text,
29+
extract_vtt_text,
30+
)
2231

2332

2433
class PreprocessOptions:
@@ -31,40 +40,57 @@ def __init__(self, ui: Ui_GeneratorsWindow):
3140
self.filter_names_from_file: bool = ui.namesFileCheckBox.isChecked()
3241

3342

43+
extractors: dict[str, Callable[[Path], list[str]]] = {
44+
".ass": extract_ass_text,
45+
".epub": extract_epub_text,
46+
".html": extract_html_text,
47+
".srt": extract_srt_text,
48+
".vtt": extract_vtt_text,
49+
".md": extract_basic_text,
50+
".txt": extract_basic_text,
51+
}
52+
53+
3454
def create_file_morph_occurrences(
3555
preprocess_options: PreprocessOptions,
3656
file_path: Path,
37-
file_io: TextIO,
3857
morphemizer: Morphemizer,
39-
nlp: Any,
58+
nlp: Any, # nlp: spacy.Language
4059
) -> dict[str, MorphOccurrence]:
41-
# nlp: spacy.Language
4260

43-
all_lines: list[str] = []
4461
morph_occurrences: dict[str, MorphOccurrence]
62+
raw_lines: list[str]
63+
filtered_lines: list[str] = []
64+
extension = file_path.suffix
65+
66+
if extension in extractors:
67+
raw_lines = extractors[extension](file_path)
68+
else:
69+
raise ValueError(f"Unsupported file format: {extension}")
4570

4671
try:
47-
for line in file_io:
72+
for line in raw_lines:
4873
# lower-case to avoid proper noun false-positives
49-
filtered_lines = filter_line(preprocess_options, line=line.lower())
50-
all_lines.append(filtered_lines)
74+
filtered_line = filter_line(preprocess_options, line=line.strip().lower())
75+
if filtered_line:
76+
filtered_lines.append(filtered_line)
77+
5178
except UnicodeDecodeError as exc:
5279
raise UnicodeException(path=file_path) from exc
5380

5481
if nlp is not None:
5582
morph_occurrences = get_morph_occurrences_by_spacy(
56-
preprocess_options, nlp, all_lines
83+
preprocess_options, nlp, filtered_lines
5784
)
5885
else:
5986
morph_occurrences = get_morph_occurrences_by_morphemizer(
60-
preprocess_options, morphemizer, all_lines
87+
preprocess_options, morphemizer, filtered_lines
6188
)
6289

6390
return morph_occurrences
6491

6592

6693
def filter_line(preprocess: PreprocessOptions, line: str) -> str:
67-
6894
if preprocess.filter_square_brackets:
6995
if square_brackets_regex.search(line):
7096
line = square_brackets_regex.sub("", line)

ankimorphs/generators/generators_utils.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -187,17 +187,15 @@ def generate_morph_occurrences_by_file(
187187
)
188188
)
189189

190-
with open(input_file, encoding="utf-8") as file_io:
191-
file_morph_occurrences: dict[str, MorphOccurrence] = (
192-
generators_text_processing.create_file_morph_occurrences(
193-
preprocess_options=preprocess_options,
194-
file_path=input_file,
195-
file_io=file_io,
196-
morphemizer=_morphemizer,
197-
nlp=_nlp,
198-
)
190+
file_morph_occurrences: dict[str, MorphOccurrence] = (
191+
generators_text_processing.create_file_morph_occurrences(
192+
preprocess_options=preprocess_options,
193+
file_path=input_file,
194+
morphemizer=_morphemizer,
195+
nlp=_nlp,
199196
)
200-
morph_occurrences_by_file[input_file] = file_morph_occurrences
197+
)
198+
morph_occurrences_by_file[input_file] = file_morph_occurrences
201199

202200
return morph_occurrences_by_file
203201

ankimorphs/generators/generators_window.py

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -149,35 +149,50 @@ def _setup_checkboxes(self) -> None:
149149

150150
def _setup_file_extension_checkboxes(self) -> None:
151151
checkboxes = [
152-
self.ui.txtFilesCheckBox,
152+
self.ui.assFilesCheckBox,
153+
self.ui.epubFilesCheckBox,
154+
self.ui.htmlFilesCheckBox,
155+
self.ui.mdFilesCheckBox,
153156
self.ui.srtFilesCheckBox,
157+
self.ui.txtFilesCheckBox,
154158
self.ui.vttFilesCheckBox,
155-
self.ui.mdFilesCheckBox,
156159
]
157160

158161
self.am_extra_settings.beginGroup(
159162
extra_settings_keys.GeneratorsWindowKeys.FILE_FORMATS
160163
)
161164

162-
stored_txt_checkbox: bool = self.am_extra_settings.value(
163-
extra_settings_keys.FileFormatsKeys.TXT, defaultValue=True, type=bool
165+
stored_ass_checkbox: bool = self.am_extra_settings.value(
166+
extra_settings_keys.FileFormatsKeys.ASS, defaultValue=True, type=bool
167+
)
168+
stored_epub_checkbox: bool = self.am_extra_settings.value(
169+
extra_settings_keys.FileFormatsKeys.EPUB, defaultValue=True, type=bool
170+
)
171+
stored_html_checkbox: bool = self.am_extra_settings.value(
172+
extra_settings_keys.FileFormatsKeys.HTML, defaultValue=True, type=bool
173+
)
174+
stored_md_checkbox: bool = self.am_extra_settings.value(
175+
extra_settings_keys.FileFormatsKeys.MD, defaultValue=True, type=bool
164176
)
165177
stored_srt_checkbox: bool = self.am_extra_settings.value(
166178
extra_settings_keys.FileFormatsKeys.SRT, defaultValue=True, type=bool
167179
)
180+
stored_txt_checkbox: bool = self.am_extra_settings.value(
181+
extra_settings_keys.FileFormatsKeys.TXT, defaultValue=True, type=bool
182+
)
168183
stored_vtt_checkbox: bool = self.am_extra_settings.value(
169184
extra_settings_keys.FileFormatsKeys.VTT, defaultValue=True, type=bool
170185
)
171-
stored_md_checkbox: bool = self.am_extra_settings.value(
172-
extra_settings_keys.FileFormatsKeys.MD, defaultValue=True, type=bool
173-
)
174186

175187
self.am_extra_settings.endGroup()
176188

177-
self.ui.txtFilesCheckBox.setChecked(stored_txt_checkbox)
189+
self.ui.assFilesCheckBox.setChecked(stored_ass_checkbox)
190+
self.ui.epubFilesCheckBox.setChecked(stored_epub_checkbox)
191+
self.ui.htmlFilesCheckBox.setChecked(stored_html_checkbox)
192+
self.ui.mdFilesCheckBox.setChecked(stored_md_checkbox)
178193
self.ui.srtFilesCheckBox.setChecked(stored_srt_checkbox)
194+
self.ui.txtFilesCheckBox.setChecked(stored_txt_checkbox)
179195
self.ui.vttFilesCheckBox.setChecked(stored_vtt_checkbox)
180-
self.ui.mdFilesCheckBox.setChecked(stored_md_checkbox)
181196

182197
for checkbox in checkboxes:
183198
checkbox.clicked.connect(
@@ -308,22 +323,27 @@ def _populate_files_column(self) -> None:
308323
def _get_checked_extensions(self) -> tuple[str, ...]:
309324
extensions = []
310325

311-
if self.ui.txtFilesCheckBox.isChecked():
312-
extensions.append(".txt")
326+
if self.ui.assFilesCheckBox.isChecked():
327+
extensions.append(".ass")
328+
if self.ui.epubFilesCheckBox.isChecked():
329+
extensions.append(".epub")
330+
if self.ui.htmlFilesCheckBox.isChecked():
331+
extensions.append(".html")
332+
if self.ui.mdFilesCheckBox.isChecked():
333+
extensions.append(".md")
313334
if self.ui.srtFilesCheckBox.isChecked():
314335
extensions.append(".srt")
336+
if self.ui.txtFilesCheckBox.isChecked():
337+
extensions.append(".txt")
315338
if self.ui.vttFilesCheckBox.isChecked():
316339
extensions.append(".vtt")
317-
if self.ui.mdFilesCheckBox.isChecked():
318-
extensions.append(".md")
319340

320341
# we return a tuple to make it compatible with .endswith()
321342
return tuple(extensions)
322343

323344
def _generate_readability_report(self) -> None:
324345
assert mw is not None
325346

326-
mw.progress.start(label="Generating readability report")
327347
operation = QueryOp(
328348
parent=self,
329349
op=lambda _: readability_report_generator.background_generate_report(
@@ -352,7 +372,6 @@ def _generate_priority_file(self) -> None:
352372

353373
selected_output_options: OutputOptions = selected_output.get_selected_options()
354374

355-
mw.progress.start(label="Generating priority file")
356375
operation = QueryOp(
357376
parent=self,
358377
op=lambda _: priority_file_generator.background_generate_priority_file(
@@ -382,7 +401,6 @@ def _generate_study_plan(self) -> None:
382401

383402
selected_output_options: OutputOptions = selected_output.get_selected_options()
384403

385-
mw.progress.start(label="Generating study plan")
386404
operation = QueryOp(
387405
parent=self,
388406
op=lambda _: study_plan_generator.background_generate_study_plan(

ankimorphs/generators/priority_file_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ def background_generate_priority_file(
2222
input_files: list[Path],
2323
) -> None:
2424
assert mw is not None
25-
assert mw.progress is not None
25+
26+
mw.progress.start(label="Generating priority file")
2627

2728
# pylint: disable=duplicate-code
2829
morph_occurrences_by_file: dict[Path, dict[str, MorphOccurrence]] = (

ankimorphs/generators/readability_report_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def background_generate_report(
2424
input_files: list[Path],
2525
) -> None:
2626
assert mw is not None
27-
assert mw.progress is not None
27+
28+
mw.progress.start(label="Generating readability report")
2829

2930
if len(input_files) == 0:
3031
raise EmptyFileSelectionException

ankimorphs/generators/study_plan_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ def background_generate_study_plan(
2323
input_files: list[Path],
2424
) -> None:
2525
assert mw is not None
26-
assert mw.progress is not None
26+
27+
mw.progress.start(label="Generating study plan")
2728

2829
morph_occurrences_by_file: dict[Path, dict[str, MorphOccurrence]] = (
2930
generators_utils.generate_morph_occurrences_by_file(

0 commit comments

Comments
 (0)