Skip to content

Commit

Permalink
added support for more file formats (#297)
Browse files Browse the repository at this point in the history
- made existing parsing more sophisticated
- fixed default output path in generator
- fixed generators progress dialog bringing mw up to the background
  • Loading branch information
mortii committed Oct 11, 2024
1 parent ee74265 commit 027900a
Show file tree
Hide file tree
Showing 46 changed files with 27,551 additions and 2,696 deletions.
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,3 @@ ground up with a focus on simplicity, performance, and a codebase with minimal t
Download: https://ankiweb.net/shared/info/472573498

Guide and docs: https://mortii.github.io/anki-morphs/

Project roadmap: https://github.com/users/mortii/projects/1/views/1
2 changes: 1 addition & 1 deletion ankimorphs/ankimorphs_globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

# Semantic Versioning https://semver.org/
__version__ = "3.2.2"
__version__ = "3.3.0"

DEV_MODE: bool = False

Expand Down
5 changes: 1 addition & 4 deletions ankimorphs/debugging_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,12 @@ def print_thread_name() -> None:


def save_to_json_file(file_path: Path, _dict: dict[tuple[str, str], int]) -> None:
"""Changes the file extension to .json and outputs to that location"""
json_file: Path = file_path.with_suffix(".json")

# the json module does not support dicts with tuple keys,
# so we have to convert the keys to single strings and then
# reverse the process when loading them later
dict_with_str_keys = {f"{k[0]}|{k[1]}": v for k, v in _dict.items()}

with json_file.open("w", encoding="utf-8") as file:
with file_path.open("w", encoding="utf-8") as file:
json.dump(dict_with_str_keys, file, ensure_ascii=False, indent=4)


Expand Down
7 changes: 5 additions & 2 deletions ankimorphs/extra_settings/ankimorphs_extra_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,13 @@ def save_generators_window_settings(
self.setValue(GeneratorsWindowKeys.INPUT_DIR, ui.inputDirLineEdit.text())

self.beginGroup(GeneratorsWindowKeys.FILE_FORMATS)
self.setValue(FileFormatsKeys.TXT, ui.txtFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.ASS, ui.assFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.EPUB, ui.epubFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.HTML, ui.htmlFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.MD, ui.mdFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.SRT, ui.srtFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.TXT, ui.txtFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.VTT, ui.vttFilesCheckBox.isChecked())
self.setValue(FileFormatsKeys.MD, ui.mdFilesCheckBox.isChecked())
self.endGroup() # file format group

self.beginGroup(GeneratorsWindowKeys.PREPROCESS)
Expand Down
7 changes: 5 additions & 2 deletions ankimorphs/extra_settings/extra_settings_keys.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ class GeneratorsWindowKeys:


class FileFormatsKeys:
TXT = "txt"
ASS = "ass"
EPUB = "epub"
HTML = "html"
MD = "md"
SRT = "srt"
TXT = "txt"
VTT = "vtt"
MD = "md"


class PreprocessKeys:
Expand Down
2 changes: 1 addition & 1 deletion ankimorphs/generators/generators_output_dialog.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _setup_output_path(self) -> None:
)
if stored_output_file_path == "":
self.ui.outputLineEdit.setText(
self._default_output_dir + self._default_output_file
os.path.join(self._default_output_dir, self._default_output_file)
)
else:
self.ui.outputLineEdit.setText(stored_output_file_path)
Expand Down
48 changes: 37 additions & 11 deletions ankimorphs/generators/generators_text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
# abstractions to combine the two, but this would be a classic mistake of
# over-abstraction--the use cases are sufficiently different that they
# should be kept separate.
from __future__ import annotations

import re
from pathlib import Path
from typing import Any, TextIO
from typing import Any, Callable

from .. import text_preprocessing
from ..exceptions import UnicodeException
Expand All @@ -19,6 +20,14 @@
square_brackets_regex,
)
from ..ui.generators_window_ui import Ui_GeneratorsWindow
from .text_extractors import (
extract_ass_text,
extract_basic_text,
extract_epub_text,
extract_html_text,
extract_srt_text,
extract_vtt_text,
)


class PreprocessOptions:
Expand All @@ -31,40 +40,57 @@ def __init__(self, ui: Ui_GeneratorsWindow):
self.filter_names_from_file: bool = ui.namesFileCheckBox.isChecked()


extractors: dict[str, Callable[[Path], list[str]]] = {
".ass": extract_ass_text,
".epub": extract_epub_text,
".html": extract_html_text,
".srt": extract_srt_text,
".vtt": extract_vtt_text,
".md": extract_basic_text,
".txt": extract_basic_text,
}


def create_file_morph_occurrences(
preprocess_options: PreprocessOptions,
file_path: Path,
file_io: TextIO,
morphemizer: Morphemizer,
nlp: Any,
nlp: Any, # nlp: spacy.Language
) -> dict[str, MorphOccurrence]:
# nlp: spacy.Language

all_lines: list[str] = []
morph_occurrences: dict[str, MorphOccurrence]
raw_lines: list[str]
filtered_lines: list[str] = []
extension = file_path.suffix

if extension in extractors:
raw_lines = extractors[extension](file_path)
else:
raise ValueError(f"Unsupported file format: {extension}")

try:
for line in file_io:
for line in raw_lines:
# lower-case to avoid proper noun false-positives
filtered_lines = filter_line(preprocess_options, line=line.lower())
all_lines.append(filtered_lines)
filtered_line = filter_line(preprocess_options, line=line.strip().lower())
if filtered_line:
filtered_lines.append(filtered_line)

except UnicodeDecodeError as exc:
raise UnicodeException(path=file_path) from exc

if nlp is not None:
morph_occurrences = get_morph_occurrences_by_spacy(
preprocess_options, nlp, all_lines
preprocess_options, nlp, filtered_lines
)
else:
morph_occurrences = get_morph_occurrences_by_morphemizer(
preprocess_options, morphemizer, all_lines
preprocess_options, morphemizer, filtered_lines
)

return morph_occurrences


def filter_line(preprocess: PreprocessOptions, line: str) -> str:

if preprocess.filter_square_brackets:
if square_brackets_regex.search(line):
line = square_brackets_regex.sub("", line)
Expand Down
18 changes: 8 additions & 10 deletions ankimorphs/generators/generators_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,17 +187,15 @@ def generate_morph_occurrences_by_file(
)
)

with open(input_file, encoding="utf-8") as file_io:
file_morph_occurrences: dict[str, MorphOccurrence] = (
generators_text_processing.create_file_morph_occurrences(
preprocess_options=preprocess_options,
file_path=input_file,
file_io=file_io,
morphemizer=_morphemizer,
nlp=_nlp,
)
file_morph_occurrences: dict[str, MorphOccurrence] = (
generators_text_processing.create_file_morph_occurrences(
preprocess_options=preprocess_options,
file_path=input_file,
morphemizer=_morphemizer,
nlp=_nlp,
)
morph_occurrences_by_file[input_file] = file_morph_occurrences
)
morph_occurrences_by_file[input_file] = file_morph_occurrences

return morph_occurrences_by_file

Expand Down
50 changes: 34 additions & 16 deletions ankimorphs/generators/generators_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,35 +149,50 @@ def _setup_checkboxes(self) -> None:

def _setup_file_extension_checkboxes(self) -> None:
checkboxes = [
self.ui.txtFilesCheckBox,
self.ui.assFilesCheckBox,
self.ui.epubFilesCheckBox,
self.ui.htmlFilesCheckBox,
self.ui.mdFilesCheckBox,
self.ui.srtFilesCheckBox,
self.ui.txtFilesCheckBox,
self.ui.vttFilesCheckBox,
self.ui.mdFilesCheckBox,
]

self.am_extra_settings.beginGroup(
extra_settings_keys.GeneratorsWindowKeys.FILE_FORMATS
)

stored_txt_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.TXT, defaultValue=True, type=bool
stored_ass_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.ASS, defaultValue=True, type=bool
)
stored_epub_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.EPUB, defaultValue=True, type=bool
)
stored_html_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.HTML, defaultValue=True, type=bool
)
stored_md_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.MD, defaultValue=True, type=bool
)
stored_srt_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.SRT, defaultValue=True, type=bool
)
stored_txt_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.TXT, defaultValue=True, type=bool
)
stored_vtt_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.VTT, defaultValue=True, type=bool
)
stored_md_checkbox: bool = self.am_extra_settings.value(
extra_settings_keys.FileFormatsKeys.MD, defaultValue=True, type=bool
)

self.am_extra_settings.endGroup()

self.ui.txtFilesCheckBox.setChecked(stored_txt_checkbox)
self.ui.assFilesCheckBox.setChecked(stored_ass_checkbox)
self.ui.epubFilesCheckBox.setChecked(stored_epub_checkbox)
self.ui.htmlFilesCheckBox.setChecked(stored_html_checkbox)
self.ui.mdFilesCheckBox.setChecked(stored_md_checkbox)
self.ui.srtFilesCheckBox.setChecked(stored_srt_checkbox)
self.ui.txtFilesCheckBox.setChecked(stored_txt_checkbox)
self.ui.vttFilesCheckBox.setChecked(stored_vtt_checkbox)
self.ui.mdFilesCheckBox.setChecked(stored_md_checkbox)

for checkbox in checkboxes:
checkbox.clicked.connect(
Expand Down Expand Up @@ -308,22 +323,27 @@ def _populate_files_column(self) -> None:
def _get_checked_extensions(self) -> tuple[str, ...]:
extensions = []

if self.ui.txtFilesCheckBox.isChecked():
extensions.append(".txt")
if self.ui.assFilesCheckBox.isChecked():
extensions.append(".ass")
if self.ui.epubFilesCheckBox.isChecked():
extensions.append(".epub")
if self.ui.htmlFilesCheckBox.isChecked():
extensions.append(".html")
if self.ui.mdFilesCheckBox.isChecked():
extensions.append(".md")
if self.ui.srtFilesCheckBox.isChecked():
extensions.append(".srt")
if self.ui.txtFilesCheckBox.isChecked():
extensions.append(".txt")
if self.ui.vttFilesCheckBox.isChecked():
extensions.append(".vtt")
if self.ui.mdFilesCheckBox.isChecked():
extensions.append(".md")

# we return a tuple to make it compatible with .endswith()
return tuple(extensions)

def _generate_readability_report(self) -> None:
assert mw is not None

mw.progress.start(label="Generating readability report")
operation = QueryOp(
parent=self,
op=lambda _: readability_report_generator.background_generate_report(
Expand Down Expand Up @@ -352,7 +372,6 @@ def _generate_priority_file(self) -> None:

selected_output_options: OutputOptions = selected_output.get_selected_options()

mw.progress.start(label="Generating priority file")
operation = QueryOp(
parent=self,
op=lambda _: priority_file_generator.background_generate_priority_file(
Expand Down Expand Up @@ -382,7 +401,6 @@ def _generate_study_plan(self) -> None:

selected_output_options: OutputOptions = selected_output.get_selected_options()

mw.progress.start(label="Generating study plan")
operation = QueryOp(
parent=self,
op=lambda _: study_plan_generator.background_generate_study_plan(
Expand Down
3 changes: 2 additions & 1 deletion ankimorphs/generators/priority_file_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def background_generate_priority_file(
input_files: list[Path],
) -> None:
assert mw is not None
assert mw.progress is not None

mw.progress.start(label="Generating priority file")

# pylint: disable=duplicate-code
morph_occurrences_by_file: dict[Path, dict[str, MorphOccurrence]] = (
Expand Down
3 changes: 2 additions & 1 deletion ankimorphs/generators/readability_report_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def background_generate_report(
input_files: list[Path],
) -> None:
assert mw is not None
assert mw.progress is not None

mw.progress.start(label="Generating readability report")

if len(input_files) == 0:
raise EmptyFileSelectionException
Expand Down
3 changes: 2 additions & 1 deletion ankimorphs/generators/study_plan_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def background_generate_study_plan(
input_files: list[Path],
) -> None:
assert mw is not None
assert mw.progress is not None

mw.progress.start(label="Generating study plan")

morph_occurrences_by_file: dict[Path, dict[str, MorphOccurrence]] = (
generators_utils.generate_morph_occurrences_by_file(
Expand Down
Loading

0 comments on commit 027900a

Please sign in to comment.