Skip to content

Commit a43ea18

Browse files
mortii and Vilhelm-Ian committed
fixed readability report accuracy (#134)
Co-authored-by: Vilhelm-Ian <[email protected]>
1 parent 39e9f56 commit a43ea18

15 files changed

+215
-191
lines changed

ankimorphs/anki_data_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def __init__(
9191
self.note_id = anki_row_data.note_id
9292

9393
# this is set later when spacy is used
94-
self.morphs: Optional[set[Morpheme]] = None
94+
self.morphs: Optional[list[Morpheme]] = None
9595

9696

9797
class AnkiMorphsCardData:

ankimorphs/frequency_file_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def _background_generate_frequency_file( # pylint:disable=too-many-locals
109109
# NB! Never use readlines(), it loads the entire file to memory
110110
for counter, line in enumerate(file):
111111
print(f"line: {counter}")
112-
morphs: set[Morpheme] = self._get_morphs_from_line(
112+
morphs: list[Morpheme] = self._get_morphs_from_line(
113113
morphemizer, nlp, line
114114
)
115115
for morph in morphs:

ankimorphs/generator_dialog.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,16 @@ def _filter_expression(self, expression: str) -> str:
130130

131131
def _get_morphs_from_line( # type: ignore[no-untyped-def]
132132
self, _morphemizer: Morphemizer, nlp, line: str
133-
) -> set[Morpheme]:
133+
) -> list[Morpheme]:
134134
# todo: this is horrible, create a callback or something
135135
if nlp is None:
136136
return self._get_morphs_from_line_morphemizer(_morphemizer, line)
137137
return self._get_morphs_from_line_spacy(nlp, line)
138138

139-
def _get_morphs_from_line_spacy(self, nlp, line: str) -> set[Morpheme]: # type: ignore[no-untyped-def]
139+
def _get_morphs_from_line_spacy(self, nlp, line: str) -> list[Morpheme]: # type: ignore[no-untyped-def]
140140
# nlp: spacy.Language
141141

142-
morphs: set[Morpheme] = set()
142+
morphs: list[Morpheme] = []
143143
expression = self._filter_expression(line)
144144

145145
doc = nlp(expression)
@@ -152,7 +152,7 @@ def _get_morphs_from_line_spacy(self, nlp, line: str) -> set[Morpheme]: # type:
152152
if w.pos == 96: # PROPN
153153
continue
154154

155-
morphs.add(
155+
morphs.append(
156156
Morpheme(
157157
lemma=w.lemma_,
158158
inflection=w.text,
@@ -166,9 +166,9 @@ def _get_morphs_from_line_spacy(self, nlp, line: str) -> set[Morpheme]: # type:
166166

167167
def _get_morphs_from_line_morphemizer(
168168
self, _morphemizer: Morphemizer, line: str
169-
) -> set[Morpheme]:
169+
) -> list[Morpheme]:
170170
expression = self._filter_expression(line)
171-
morphs: set[Morpheme] = _morphemizer.get_morphemes_from_expr(expression)
171+
morphs: list[Morpheme] = _morphemizer.get_morphemes_from_expr(expression)
172172
if self.ui.namesMorphemizerCheckBox.isChecked():
173173
morphs = text_preprocessing.remove_names_morphemizer(morphs)
174174
if self.ui.namesFileCheckBox.isChecked():

ankimorphs/mecab_wrapper.py

Lines changed: 7 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_morpheme( # pylint:disable=too-many-return-statements
9999
control_chars_re = re.compile("[\x00-\x1f\x7f-\x9f]")
100100

101101

102-
def get_morphemes_mecab(expression) -> set[Morpheme]:
102+
def get_morphemes_mecab(expression) -> list[Morpheme]:
103103
# HACK: mecab sometimes does not produce the right morphs if there are no extra characters in the expression,
104104
# so we just add a whitespace and a japanese punctuation mark "。" the end to prevent the problem.
105105
expression += " 。"
@@ -109,7 +109,7 @@ def get_morphemes_mecab(expression) -> set[Morpheme]:
109109

110110
_morphs = [get_morpheme(m.split("\t")) for m in interact(expression).split("\r")]
111111

112-
_morphs = {_morph for _morph in _morphs if _morph is not None}
112+
_morphs = [_morph for _morph in _morphs if _morph is not None]
113113
return _morphs
114114

115115

@@ -203,68 +203,11 @@ def mecab(): # pylint: disable=too-many-branches,too-many-statements
203203
else:
204204
startup_info = None
205205

206-
# Search for mecab
207-
reading = None
208-
209-
# # 1st priority - MecabUnidic
210-
# if importlib.util.find_spec("MecabUnidic"):
211-
# try:
212-
# reading = importlib.import_module("MecabUnidic.reading")
213-
# mecab_source = "MecabUnidic from addon MecabUnidic"
214-
# except ModuleNotFoundError:
215-
# pass
216-
#
217-
# if importlib.util.find_spec("13462835"):
218-
# try:
219-
# reading = importlib.import_module("13462835.reading")
220-
# mecab_source = "MecabUnidic from addon 13462835"
221-
# except ModuleNotFoundError:
222-
# pass
223-
#
224-
# # 2nd priority - Japanese Support
225-
# if (not reading) and importlib.util.find_spec("3918629684"):
226-
# try:
227-
# reading = importlib.import_module("3918629684.reading")
228-
# mecab_source = "Japanese Support from addon 3918629684"
229-
# except ModuleNotFoundError:
230-
# pass
231-
#
232-
# # 3nd priority - MIAJapaneseSupport
233-
# if (not reading) and importlib.util.find_spec("MIAJapaneseSupport"):
234-
# try:
235-
# reading = importlib.import_module("MIAJapaneseSupport.reading")
236-
# mecab_source = "MIAJapaneseSupport from addon MIAJapaneseSupport"
237-
# except ModuleNotFoundError:
238-
# pass
239-
# # 4nd priority - MigakuJapaneseSupport via Anki code (278530045)
240-
# if (not reading) and importlib.util.find_spec("278530045"):
241-
# try:
242-
# reading = importlib.import_module("278530045.reading")
243-
# mecab_source = "Migaku Japanese support from addon 278530045"
244-
# except ModuleNotFoundError:
245-
# pass
246-
247-
# 5th priority - From Morphman
248-
if not reading:
249-
file_path = os.path.realpath(__file__)
250-
am_dir = file_path.split(os.sep)[-2]
251-
mecab_dir = am_dir + ".deps.mecab.reading"
252-
reading = importlib.import_module(mecab_dir)
253-
mecab_source = "AnkiMorphs"
254-
255-
# 6th priority - system mecab
256-
# if not reading:
257-
# try:
258-
# return spawn_mecab(["mecab"], startup_info), "System"
259-
# except Exception as error:
260-
# raise OSError(
261-
# """
262-
# Mecab Japanese analyzer could not be found.
263-
# Please install one of the following Anki add-ons:
264-
# https://ankiweb.net/shared/info/3918629684
265-
# https://ankiweb.net/shared/info/13462835
266-
# https://ankiweb.net/shared/info/278530045"""
267-
# ) from error
206+
file_path = os.path.realpath(__file__)
207+
am_dir = file_path.split(os.sep)[-2]
208+
mecab_dir = am_dir + ".deps.mecab.reading"
209+
reading = importlib.import_module(mecab_dir)
210+
mecab_source = "AnkiMorphs"
268211

269212
_mecab = reading.MecabController()
270213
_mecab.setup()

ankimorphs/morphemizer.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@ def __init__(self) -> None:
1818

1919
# the cache needs to have a max size to maintain garbage collection
2020
@functools.lru_cache(maxsize=131072)
21-
def get_morphemes_from_expr(self, expression: str) -> set[Morpheme]:
21+
def get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
2222
morphs = self._get_morphemes_from_expr(expression)
2323
return morphs
2424

2525
def _get_morphemes_from_expr( # pylint:disable=unused-argument
2626
self, expression: str
27-
) -> set[Morpheme]:
27+
) -> list[Morpheme]:
2828
"""
2929
The heart of this plugin: convert an expression to a list of its morphemes.
3030
"""
31-
return set()
31+
return []
3232

3333
def get_description(self) -> str:
3434
"""
@@ -80,7 +80,7 @@ class MecabMorphemizer(Morphemizer):
8080
a extra tool called 'mecab' has to be used.
8181
"""
8282

83-
def _get_morphemes_from_expr(self, expression: str) -> set[Morpheme]:
83+
def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
8484
# Remove simple spaces that could be added by other add-ons and break the parsing.
8585
if space_char_regex.search(expression):
8686
expression = space_char_regex.sub("", expression)
@@ -105,7 +105,7 @@ class SpaceMorphemizer(Morphemizer):
105105
a general-use-morphemizer, it can't generate the base form from inflection.
106106
"""
107107

108-
def _get_morphemes_from_expr(self, expression: str) -> set[Morpheme]:
108+
def _get_morphemes_from_expr(self, expression: str) -> list[Morpheme]:
109109
# We want the expression: "At 3 o'clock that god-forsaken-man shows up..."
110110
# to produce: ['at', '3', "o'clock", 'that', 'god-forsaken-man', 'shows', 'up']
111111
#
@@ -126,7 +126,7 @@ def _get_morphemes_from_expr(self, expression: str) -> set[Morpheme]:
126126
word.lower()
127127
for word in re.findall(r"\w+(?:[-']\w+)*", expression, re.UNICODE)
128128
]
129-
return {Morpheme(lemma=word, inflection=word) for word in word_list}
129+
return [Morpheme(lemma=word, inflection=word) for word in word_list]
130130

131131
def get_description(self) -> str:
132132
return "AnkiMoprhs: Language w/ Spaces"

ankimorphs/readability_report_generator.py

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from .generator_dialog import GeneratorDialog
2020
from .morpheme import Morpheme, MorphOccurrence
2121
from .morphemizer import Morphemizer, SpacyMorphemizer
22+
from .table_utils import QTableWidgetIntegerItem, QTableWidgetPercentItem
2223
from .ui.readability_report_generator_ui import Ui_ReadabilityReportGeneratorDialog
2324

2425

@@ -36,7 +37,7 @@ def __init__(self) -> None:
3637
self._unknowns_column = 4
3738
self._number_of_columns = 5
3839
self._setup_absolute_table()
39-
self._setup_perecentages_table()
40+
self._setup_percentages_table()
4041
self._setup_buttons()
4142
self.show()
4243

@@ -63,7 +64,7 @@ def _setup_absolute_table(self) -> None:
6364
QAbstractItemView.EditTrigger.NoEditTriggers
6465
)
6566

66-
def _setup_perecentages_table(self) -> None:
67+
def _setup_percentages_table(self) -> None:
6768
assert isinstance(self.ui, Ui_ReadabilityReportGeneratorDialog)
6869

6970
self.ui.percentTableWidget.setAlternatingRowColors(True)
@@ -169,17 +170,15 @@ def _background_generate_report( # pylint:disable=too-many-locals
169170
am_config, am_db, file_morphs
170171
)
171172

172-
self._populate_absolute_table(
173+
self._populate_numerical_table(
173174
_input_file,
174-
file_morphs,
175175
_row,
176176
known_morphs,
177177
learning_morphs,
178178
unknown_morphs,
179179
)
180180
self._populate_percentage_table(
181181
_input_file,
182-
file_morphs,
183182
_row,
184183
known_morphs,
185184
learning_morphs,
@@ -196,7 +195,7 @@ def _create_file_morphs_dict(self, file: TextIO, morphemizer, nlp) -> dict[str,
196195

197196
file_morphs: dict[str, MorphOccurrence] = {}
198197
for line in file:
199-
morphs: set[Morpheme] = self._get_morphs_from_line(morphemizer, nlp, line)
198+
morphs: list[Morpheme] = self._get_morphs_from_line(morphemizer, nlp, line)
200199
for morph in morphs:
201200
key = morph.lemma + morph.inflection
202201
if key in file_morphs:
@@ -217,28 +216,28 @@ def _get_morph_statuses(
217216

218217
for morph_occurrence_object in file_morphs.values():
219218
morph = morph_occurrence_object.morph
219+
occurrence = morph_occurrence_object.occurrence
220220

221221
highest_learning_interval: Optional[
222222
int
223223
] = am_db.get_highest_learning_interval(morph.lemma, morph.inflection)
224224

225225
if highest_learning_interval is None:
226-
unknown_morphs += 1
226+
unknown_morphs += occurrence
227227
continue
228228

229229
if highest_learning_interval == 0:
230-
unknown_morphs += 1
230+
unknown_morphs += occurrence
231231
elif highest_learning_interval < am_config.recalc_interval_for_known:
232-
learning_morphs += 1
232+
learning_morphs += occurrence
233233
else:
234-
known_morphs += 1
234+
known_morphs += occurrence
235235

236236
return known_morphs, learning_morphs, unknown_morphs
237237

238-
def _populate_absolute_table(
238+
def _populate_numerical_table(
239239
self,
240240
_input_file: Path,
241-
file_morphs: dict[str, MorphOccurrence],
242241
_row: int,
243242
known_morphs: int,
244243
learning_morphs: int,
@@ -248,11 +247,13 @@ def _populate_absolute_table(
248247

249248
relative_path = _input_file.relative_to(self._input_dir_root)
250249

250+
total_morphs: int = known_morphs + learning_morphs + unknown_morphs
251+
251252
file_name_item = QTableWidgetItem(str(relative_path))
252-
total_morphs_item = QTableWidgetItem(str(len(file_morphs)))
253-
known_item = QTableWidgetItem(str(known_morphs))
254-
learning_item = QTableWidgetItem(str(learning_morphs))
255-
unknowns_item = QTableWidgetItem(str(unknown_morphs))
253+
total_morphs_item = QTableWidgetIntegerItem(total_morphs)
254+
known_item = QTableWidgetIntegerItem(known_morphs)
255+
learning_item = QTableWidgetIntegerItem(learning_morphs)
256+
unknowns_item = QTableWidgetIntegerItem(unknown_morphs)
256257

257258
total_morphs_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
258259
known_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
@@ -272,26 +273,30 @@ def _populate_absolute_table(
272273
def _populate_percentage_table(
273274
self,
274275
_input_file: Path,
275-
file_morphs: dict[str, MorphOccurrence],
276276
_row: int,
277277
known_morphs: int,
278278
learning_morphs: int,
279279
unknown_morphs: int,
280280
) -> None:
281281
assert isinstance(self.ui, Ui_ReadabilityReportGeneratorDialog)
282282

283-
total_morphs = len(file_morphs)
284-
known_morphs_percent = (known_morphs / total_morphs) * 100
285-
learning_morphs_percent = (learning_morphs / total_morphs) * 100
286-
unknown_morphs_percent = (unknown_morphs / total_morphs) * 100
283+
total_morphs: int = known_morphs + learning_morphs + unknown_morphs
284+
known_morphs_percent: float = 0
285+
learning_morphs_percent: float = 0
286+
unknown_morphs_percent: float = 0
287+
288+
if total_morphs != 0:
289+
known_morphs_percent = (known_morphs / total_morphs) * 100
290+
learning_morphs_percent = (learning_morphs / total_morphs) * 100
291+
unknown_morphs_percent = (unknown_morphs / total_morphs) * 100
287292

288293
relative_path = _input_file.relative_to(self._input_dir_root)
289294

290295
file_name_item = QTableWidgetItem(str(relative_path))
291-
total_morphs_item = QTableWidgetItem(str(len(file_morphs)))
292-
known_item = QTableWidgetItem(f"{round(known_morphs_percent, 1)} %")
293-
learning_item = QTableWidgetItem(f"{round(learning_morphs_percent, 1)} %")
294-
unknowns_item = QTableWidgetItem(f"{round(unknown_morphs_percent, 1)} %")
296+
total_morphs_item = QTableWidgetIntegerItem(total_morphs)
297+
known_item = QTableWidgetPercentItem(round(known_morphs_percent, 1))
298+
learning_item = QTableWidgetPercentItem(round(learning_morphs_percent, 1))
299+
unknowns_item = QTableWidgetPercentItem(round(unknown_morphs_percent, 1))
295300

296301
total_morphs_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)
297302
known_item.setTextAlignment(Qt.AlignmentFlag.AlignCenter)

ankimorphs/settings_dialog.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def __init__(self) -> None:
109109
)
110110

111111
# Semantic Versioning https://semver.org/
112-
self.ui.ankimorphs_version_label.setText("AnkiMorphs version: 0.16.0-alpha")
112+
self.ui.ankimorphs_version_label.setText("AnkiMorphs version: 0.16.1-alpha")
113113

114114
self.show()
115115

0 commit comments

Comments
 (0)