Skip to content

Commit f7d154e

Browse files
committed
added custom char filter to generators (#325)
1 parent ec61c18 commit f7d154e

File tree

9 files changed

+388
-330
lines changed

9 files changed

+388
-330
lines changed

ankimorphs/extra_settings/ankimorphs_extra_settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ def save_generators_window_settings(
5757
self.setValue(PreprocessKeys.IGNORE_NAMES_MORPHEMIZER, ui.namesMorphemizerCheckBox.isChecked())
5858
self.setValue(PreprocessKeys.IGNORE_NAMES_IN_FILE, ui.namesFileCheckBox.isChecked())
5959
self.setValue(PreprocessKeys.IGNORE_NUMBERS, ui.numbersCheckBox.isChecked())
60+
self.setValue(PreprocessKeys.IGNORE_CUSTOM_CHARS, ui.customCharactersCheckBox.isChecked())
61+
self.setValue(PreprocessKeys.CHARS_TO_IGNORE, ui.customCharactersLineEdit.text())
6062
self.endGroup() # preprocess group
6163
self.endGroup() # generators window group
6264
# fmt: on

ankimorphs/extra_settings/extra_settings_keys.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class PreprocessKeys:
3636
IGNORE_NAMES_MORPHEMIZER = "ignore_names_morphemizer"
3737
IGNORE_NAMES_IN_FILE = "ignore_names_in_file"
3838
IGNORE_NUMBERS = "ignore_numbers"
39+
IGNORE_CUSTOM_CHARS = "ignore_custom_chars"
40+
CHARS_TO_IGNORE = "chars_to_ignore"
3941

4042

4143
class KnownMorphsExporterKeys:

ankimorphs/generators/generators_utils.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,16 @@ def __add__(self, other: FileMorphsStats) -> FileMorphsStats:
104104
return self
105105

106106

107-
class PreprocessOptions:
107+
class PreprocessOptions: # pylint:disable=too-many-instance-attributes
108108
def __init__(self, ui: Ui_GeneratorsWindow):
109109
self.filter_square_brackets: bool = ui.squareBracketsCheckBox.isChecked()
110110
self.filter_round_brackets: bool = ui.roundBracketsCheckBox.isChecked()
111111
self.filter_slim_round_brackets: bool = ui.slimRoundBracketsCheckBox.isChecked()
112112
self.filter_numbers: bool = ui.numbersCheckBox.isChecked()
113113
self.filter_morphemizer_names: bool = ui.namesMorphemizerCheckBox.isChecked()
114114
self.filter_names_from_file: bool = ui.namesFileCheckBox.isChecked()
115+
self.filter_custom_chars: bool = ui.customCharactersCheckBox.isChecked()
116+
self.custom_chars_to_ignore: str = ui.customCharactersLineEdit.text()
115117

116118
def to_mock_am_config(self) -> AnkiMorphsConfig:
117119
return Mock(
@@ -122,7 +124,8 @@ def to_mock_am_config(self) -> AnkiMorphsConfig:
122124
preprocess_ignore_numbers=self.filter_numbers,
123125
preprocess_ignore_names_morphemizer=self.filter_morphemizer_names,
124126
preprocess_ignore_names_textfile=self.filter_names_from_file,
125-
preprocess_ignore_custom_characters="", # todo: add option in generators window?
127+
preprocess_ignore_custom_characters=self.filter_custom_chars,
128+
preprocess_custom_characters_to_ignore=self.custom_chars_to_ignore,
126129
)
127130

128131

@@ -206,7 +209,6 @@ def generate_morph_occurrences_by_file(
206209
)
207210
preprocess_options = PreprocessOptions(ui)
208211
morph_occurrences_by_file: dict[Path, dict[str, MorphOccurrence]] = {}
209-
210212
sorted_input_files: list[Path]
211213

212214
if sorted_by_table:
@@ -217,6 +219,8 @@ def generate_morph_occurrences_by_file(
217219
else:
218220
sorted_input_files = input_files
219221

222+
translation_table = str.maketrans("", "", preprocess_options.custom_chars_to_ignore)
223+
220224
for input_file in sorted_input_files:
221225
if mw.progress.want_cancel(): # user clicked 'x' button
222226
raise CancelledOperationException
@@ -233,6 +237,7 @@ def generate_morph_occurrences_by_file(
233237
preprocess_options=preprocess_options,
234238
file_path=input_file,
235239
morphemizer=_morphemizer,
240+
translation_table=translation_table,
236241
)
237242
)
238243
morph_occurrences_by_file[input_file] = file_morph_occurrences
@@ -244,9 +249,9 @@ def create_file_morph_occurrences(
244249
preprocess_options: PreprocessOptions,
245250
file_path: Path,
246251
morphemizer: Morphemizer,
252+
translation_table: dict[int, int | None],
247253
) -> dict[str, MorphOccurrence]:
248254

249-
morph_occurrences: dict[str, MorphOccurrence]
250255
raw_lines: list[str]
251256
filtered_lines: list[str] = []
252257
extension = file_path.suffix
@@ -261,22 +266,21 @@ def create_file_morph_occurrences(
261266
for line in raw_lines:
262267
# lower-case to avoid proper noun false-positives
263268
filtered_line = text_preprocessing.get_processed_text(
264-
am_config=mock_am_config, text=line.strip().lower()
269+
am_config=mock_am_config,
270+
text=line.strip().lower(),
271+
translation_table=translation_table,
265272
)
266273
if filtered_line:
267274
filtered_lines.append(filtered_line)
268-
269275
except UnicodeDecodeError as exc:
270276
raise UnicodeException(path=file_path) from exc
271277

272-
morph_occurrences = get_morph_occurrences(
278+
return get_morph_occurrences(
273279
mock_am_config=mock_am_config,
274280
morphemizer=morphemizer,
275281
all_lines=filtered_lines,
276282
)
277283

278-
return morph_occurrences
279-
280284

281285
def get_morph_occurrences(
282286
mock_am_config: AnkiMorphsConfig,

ankimorphs/generators/generators_window.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
QFileDialog,
1717
QHeaderView,
1818
QMainWindow,
19+
Qt,
1920
QTableWidget,
2021
QTableWidgetItem,
2122
)
@@ -57,15 +58,14 @@ def __init__(
5758

5859
self.checkboxes: list[QCheckBox] = []
5960
self._input_files: list[Path] = []
61+
self._input_dir_root: Path
6062
self._morphemizers: list[Morphemizer] = morphemizer_utils.get_all_morphemizers()
6163
self._setup_morphemizers()
64+
self._setup_line_edits()
6265
self._setup_checkboxes()
63-
self._input_dir_root: Path
64-
6566
self._setup_table(self.ui.numericalTableWidget)
6667
self._setup_table(self.ui.percentTableWidget)
6768
self._setup_buttons()
68-
self._setup_input_field()
6969
self._setup_geometry()
7070

7171
self.am_extra_settings.endGroup()
@@ -124,13 +124,27 @@ def _setup_buttons(self) -> None:
124124

125125
self.ui.loadFilesPushButton.setFocus() # quality of life
126126

127-
def _setup_input_field(self) -> None:
127+
def _setup_line_edits(self) -> None:
128128
stored_input_dir: str = self.am_extra_settings.value(
129129
extra_settings_keys.GeneratorsWindowKeys.INPUT_DIR, type=str
130130
)
131+
132+
self.am_extra_settings.beginGroup(
133+
extra_settings_keys.GeneratorsWindowKeys.PREPROCESS
134+
)
135+
stored_chars_to_ignore: str = self.am_extra_settings.value(
136+
extra_settings_keys.PreprocessKeys.CHARS_TO_IGNORE, type=str
137+
)
138+
self.am_extra_settings.endGroup()
139+
131140
if stored_input_dir is not None:
132141
self.ui.inputDirLineEdit.setText(stored_input_dir)
133142

143+
if stored_chars_to_ignore is not None:
144+
self.ui.customCharactersLineEdit.setText(stored_chars_to_ignore)
145+
146+
self.ui.customCharactersLineEdit.setDisabled(True)
147+
134148
self.ui.inputDirLineEdit.textEdited.connect(
135149
partial(self.ui.loadFilesPushButton.setEnabled, True)
136150
)
@@ -225,15 +239,29 @@ def _setup_preprocess_checkboxes(self) -> None:
225239
stored_ignore_numbers: bool = self.am_extra_settings.value(
226240
extra_settings_keys.PreprocessKeys.IGNORE_NUMBERS, type=bool
227241
)
242+
stored_ignore_custom_chars: bool = self.am_extra_settings.value(
243+
extra_settings_keys.PreprocessKeys.IGNORE_CUSTOM_CHARS, type=bool
244+
)
228245

229246
self.am_extra_settings.endGroup()
230247

248+
self.ui.customCharactersCheckBox.stateChanged.connect(
249+
self._toggle_disable_custom_characters_line_edit
250+
)
251+
231252
self.ui.squareBracketsCheckBox.setChecked(stored_ignore_square_brackets)
232253
self.ui.roundBracketsCheckBox.setChecked(stored_ignore_round_brackets)
233254
self.ui.slimRoundBracketsCheckBox.setChecked(stored_ignore_slim_round_brackets)
234255
self.ui.namesMorphemizerCheckBox.setChecked(stored_ignore_names_morphemizer)
235256
self.ui.namesFileCheckBox.setChecked(stored_ignore_names_in_file)
236257
self.ui.numbersCheckBox.setChecked(stored_ignore_numbers)
258+
self.ui.customCharactersCheckBox.setChecked(stored_ignore_custom_chars)
259+
260+
def _toggle_disable_custom_characters_line_edit(self) -> None:
    """Keep the custom-characters line edit in sync with its checkbox.

    The line edit is disabled while the checkbox is unchecked and enabled
    for every other check state.
    """
    current_state = self.ui.customCharactersCheckBox.checkState()
    should_enable = current_state != Qt.CheckState.Unchecked
    # A single setEnabled(bool) call covers both branches:
    # setEnabled(False) has the same effect as setDisabled(True).
    self.ui.customCharactersLineEdit.setEnabled(should_enable)
237265

238266
def _on_select_folder_clicked(self) -> None:
239267
input_dir: str = QFileDialog.getExistingDirectory(

ankimorphs/text_preprocessing.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import re
24
from typing import Any
35

@@ -9,7 +11,7 @@
911
# Matches a full-width (CJK) round-bracket pair and everything between them,
# e.g. "（ねこ）". The ASCII variant is handled separately by
# slim_round_brackets_regexp, so the brackets here must be the full-width
# code points U+FF08 / U+FF09; plain "(" and ")" would turn the pattern into
# a capture group that matches almost any text.
round_brackets_regex = re.compile(r"（[^）]*）")
1012
slim_round_brackets_regexp = re.compile(r"\([^)]*\)")
1113

12-
translation_table: dict[int, Any] = {}
14+
global_translation_table: dict[int, Any] = {}
1315

1416

1517
def update_translation_table() -> None:
@@ -19,13 +21,18 @@ def update_translation_table() -> None:
1921
2022
Note: this function is executed on startup and when settings are saved
2123
"""
22-
global translation_table
23-
translation_table = str.maketrans(
24+
global global_translation_table
25+
global_translation_table = str.maketrans(
2426
"", "", AnkiMorphsConfig().preprocess_custom_characters_to_ignore
2527
)
2628

2729

28-
def get_processed_text(am_config: AnkiMorphsConfig, text: str) -> str:
30+
def get_processed_text(
31+
am_config: AnkiMorphsConfig,
32+
text: str,
33+
translation_table: dict[int, int | None] | None = None,
34+
) -> str:
35+
2936
if am_config.preprocess_ignore_bracket_contents:
3037
text = square_brackets_regex.sub("", text)
3138

@@ -39,6 +46,8 @@ def get_processed_text(am_config: AnkiMorphsConfig, text: str) -> str:
3946
text = re.sub(r"\d", "", text)
4047

4148
if am_config.preprocess_ignore_custom_characters:
49+
if translation_table is None:
50+
translation_table = global_translation_table
4251
# str.translate() removes characters in a single pass, which is
4352
# much more efficient than str.replace()
4453
text = text.translate(translation_table)

0 commit comments

Comments
 (0)