Skip to content

Commit 8b717fe

Browse files
authored
Merge pull request #963 from PrimozGodec/language-preprocess
[ENH] Preprocess - Use language from Corpus
2 parents 15852b8 + 1509420 commit 8b717fe

File tree

6 files changed

+244
-17
lines changed

6 files changed

+244
-17
lines changed

orangecontrib/text/keywords/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from orangecontrib.text.vectorization import BowVectorizer
2121

2222
# all available languages for RAKE
23-
RAKE_LANGUAGES = StopwordsFilter.supported_languages()
23+
RAKE_LANGUAGES = StopwordsFilter.supported_languages
2424
# all available languages for YAKE!
2525
# fmt: off
2626
YAKE_LANGUAGES = [

orangecontrib/text/preprocess/filter.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,10 @@ def lang_to_iso(language: str) -> str:
117117
"""
118118
return LANG2ISO[StopwordsFilter.NLTK2LANG.get(language, language)]
119119

120-
@staticmethod
120+
@classmethod
121+
@property
121122
@wait_nltk_data
122-
def supported_languages() -> Set[str]:
123+
def supported_languages(_) -> Set[str]:
123124
"""
124125
List all languages supported by NLTK
125126

orangecontrib/text/preprocess/normalize.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ def __files_to_dict(self, files: List[Tuple[str]]) -> Dict[str, Tuple[str, str]]
122122
def supported_languages(self) -> List[Tuple[str, str]]:
123123
return [(name, iso) for iso, (name, _) in self.model_files.items()]
124124

125+
@property
126+
def supported_languages_iso(self) -> List[Tuple[str, str]]:
127+
return {iso for _, iso in self.supported_languages}
128+
125129
@property
126130
def online(self) -> bool:
127131
try:

orangecontrib/text/tests/test_preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ def test_stopwords_slovene(self):
486486
self.assertEqual(len(corpus.used_preprocessor.preprocessors), 2)
487487

488488
def test_supported_languages(self):
489-
langs = preprocess.StopwordsFilter.supported_languages()
489+
langs = preprocess.StopwordsFilter.supported_languages
490490
self.assertIsInstance(langs, set)
491491
# just testing a few of the most important languages since I want the test to be
492492
# resistant to any languages potentially newly introduced by NLTK

orangecontrib/text/widgets/owpreprocess.py

Lines changed: 76 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from AnyQt.QtGui import QBrush, QValidator
1212

1313
from Orange.util import wrap_callback
14+
from orangecanvas.gui.utils import disconnected
15+
from orangewidget.settings import SettingsHandler
1416
from orangewidget.utils.filedialogs import RecentPath
1517

1618
import Orange.widgets.data.owpreprocess
@@ -112,7 +114,8 @@ def set_current_language(self, iso_language: Optional[str]):
112114
The ISO language code of element to be selected.
113115
"""
114116
index = self.findData(iso_language)
115-
self.setCurrentIndex(index)
117+
if index >= 0:
118+
self.setCurrentIndex(index)
116119

117120

118121
class UDPipeComboBox(LanguageComboBox):
@@ -130,15 +133,9 @@ def items(self) -> List:
130133
def add_items(self, _, include_none: bool, language: str):
131134
self.__items = self.items
132135
super().add_items(self.__items, include_none, language)
133-
134-
def set_current_language(self, iso_language: Optional[str]):
135136
iso_items = {iso for _, iso in self.__items}
136-
if iso_language in iso_items:
137-
super().set_current_language(iso_language)
138-
elif self.__default_lang in iso_items:
137+
if language not in iso_items and self.__default_lang in iso_items:
139138
super().set_current_language(self.__default_lang)
140-
elif self.__items:
141-
self.setCurrentIndex(0)
142139

143140
def showPopup(self):
144141
if self.__items != self.items:
@@ -657,7 +654,7 @@ def __init__(self, parent=None, **kwargs):
657654

658655
self.__combo = LanguageComboBox(
659656
self,
660-
StopwordsFilter.supported_languages(),
657+
StopwordsFilter.supported_languages,
661658
self.__sw_lang,
662659
True,
663660
self.__set_language,
@@ -1044,6 +1041,21 @@ def createinstance(params: Dict) -> POSTagger:
10441041
return POSTaggingModule.Methods[method]()
10451042

10461043

1044+
class PreprocessSettingsHandler(SettingsHandler):
1045+
"""
1046+
Settings handler, that makes all language settings, which are
1047+
a part of common preprocess settings, schema_only. It removes them when
1048+
settings are not loaded from schema but from common settings.
1049+
"""
1050+
def _remove_schema_only(self, settings_dict):
1051+
super()._remove_schema_only(settings_dict)
1052+
for setting, data, _ in self.provider.traverse_settings(data=settings_dict):
1053+
for pp_name, settings in data["storedsettings"]["preprocessors"]:
1054+
for key in list(settings):
1055+
if "language" in key:
1056+
settings.pop(key)
1057+
1058+
10471059
PREPROCESS_ACTIONS = [
10481060
PreprocessAction(
10491061
"Transformation", "preprocess.transform", "",
@@ -1127,12 +1139,14 @@ class Warning(Orange.widgets.data.owpreprocess.OWPreprocess.Warning):
11271139
("preprocess.tokenize", {}),
11281140
("preprocess.filter", {})]
11291141
} # type: Dict[str, List[Tuple[str, Dict]]]
1142+
settingsHandler = PreprocessSettingsHandler()
11301143
storedsettings = Setting(DEFAULT_PP)
11311144
buttons_area_orientation = Qt.Vertical
11321145

11331146
def __init__(self):
11341147
ConcurrentWidgetMixin.__init__(self)
11351148
Orange.widgets.data.owpreprocess.OWPreprocess.__init__(self)
1149+
self.__store_pending_languages()
11361150

11371151
box = gui.vBox(self.controlArea, "Preview")
11381152
self.preview = ""
@@ -1150,6 +1164,16 @@ def load(self, saved: Dict) -> StandardItemModel:
11501164
saved["preprocessors"][i] = (name, params)
11511165
return super().load(saved)
11521166

1167+
def set_model(self, pmodel):
1168+
"""Connect signal which handle setting language from corpus"""
1169+
super().set_model(pmodel)
1170+
if pmodel:
1171+
pmodel.rowsInserted.connect(self.__on_item_inserted)
1172+
1173+
def __on_item_inserted(self, _, first: int, last: int):
1174+
assert first == last
1175+
self.__set_languages_single_editor(first)
1176+
11531177
def __update_filtering_params(self, params: Dict):
11541178
params["sw_path"] = self.__relocate_file(params.get("sw_path"))
11551179
params["sw_list"] = self.__relocate_files(params.get("sw_list", []))
@@ -1179,6 +1203,49 @@ def __relocate_file(self, path: RecentPath) -> RecentPath:
11791203
def set_data(self, data: Corpus):
11801204
self.cancel()
11811205
self.data = data
1206+
self.__set_languages()
1207+
1208+
LANG_PARAMS = {
1209+
"preprocess.normalize": [
1210+
("snowball_language", SnowballStemmer.supported_languages),
1211+
("udpipe_language", UDPipeModels().supported_languages_iso),
1212+
("lemmagen_language", LemmagenLemmatizer.supported_languages),
1213+
],
1214+
"preprocess.filter": [("language", StopwordsFilter.supported_languages)],
1215+
}
1216+
1217+
def __store_pending_languages(self):
1218+
settings = self.storedsettings["preprocessors"]
1219+
self.__pending_languages = {
1220+
pp_name: {p for p in par if "language" in p} for pp_name, par in settings
1221+
}
1222+
1223+
def __set_languages(self):
1224+
if self.data is not None:
1225+
for i in range(self.preprocessormodel.rowCount()):
1226+
self.__set_languages_single_editor(i)
1227+
self.__pending_languages = {}
1228+
1229+
def __set_languages_single_editor(self, item_index: int):
1230+
"""
1231+
Set language from corpus for single editor/module,
1232+
keep language unchanged if it comes from schema (pending).
1233+
"""
1234+
if self.data and self.data.language:
1235+
model = self.preprocessormodel
1236+
item = model.item(item_index)
1237+
pp_name = item.data(DescriptionRole).qualname
1238+
params = item.data(ParametersRole)
1239+
pending = self.__pending_languages.get(pp_name, set())
1240+
for param, available_langs in self.LANG_PARAMS.get(pp_name, []):
1241+
if param not in pending and self.data.language in available_langs:
1242+
# set language if not pending from schema - should not be changed
1243+
# and if available for the method
1244+
params[param] = self.data.language
1245+
with disconnected(model.dataChanged, self.__on_modelchanged):
1246+
# disconnecting prevents a double apply call; it is already called
1247+
# on new data and when a row is inserted - the two callers of this method
1248+
item.setData(params, ParametersRole)
11821249

11831250
def buildpreproc(self) -> PreprocessorList:
11841251
plist = []

orangecontrib/text/widgets/tests/test_owpreprocess.py

Lines changed: 159 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
from unittest.mock import patch, PropertyMock, MagicMock, Mock
33

44
import numpy as np
5+
from AnyQt.QtGui import QStandardItem, QIcon
56
from Orange.data import Domain, StringVariable
7+
from Orange.widgets.data.utils.preprocess import DescriptionRole, ParametersRole
68
from orangewidget.utils.filedialogs import RecentPath
79
from Orange.widgets.tests.base import WidgetTest
810
from Orange.widgets.tests.utils import simulate
@@ -180,6 +182,153 @@ def test_no_tokens_left(self):
180182
self.wait_until_finished()
181183
self.assertFalse(self.widget.Warning.no_token_left.is_shown())
182184

185+
def test_language_from_corpus(self):
186+
"""Test language from corpus is set correctly"""
187+
initial = {
188+
"name": "",
189+
"preprocessors": [("preprocess.normalize", {}), ("preprocess.filter", {})],
190+
}
191+
self.widget.storedsettings = initial
192+
self.widget._initialize()
193+
self.assertDictEqual(initial, self.widget.storedsettings)
194+
combos = self.widget.mainArea.findChildren(LanguageComboBox)
195+
self.assertEqual(
196+
["English", "English", "English", "English"],
197+
[c.currentText() for c in combos]
198+
)
199+
200+
# test with Slovenian - language should be set for all preprocessors except
201+
# Snowball that doesn't support Slovenian
202+
self.corpus.attributes["language"] = "sl"
203+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
204+
self.assertEqual(
205+
["English", "Slovenian", "Slovenian", "Slovenian"],
206+
[c.currentText() for c in combos]
207+
)
208+
settings = self.widget.storedsettings["preprocessors"]
209+
self.assertEqual("sl", settings[0][1]["udpipe_language"])
210+
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
211+
self.assertEqual("sl", settings[1][1]["language"])
212+
213+
# test with Lithuanian, which is supported by only one preprocessor
214+
self.corpus.attributes["language"] = "lt"
215+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
216+
self.assertEqual(
217+
["English", "Lithuanian", "Slovenian", "Slovenian"],
218+
[c.currentText() for c in combos]
219+
)
220+
settings = self.widget.storedsettings["preprocessors"]
221+
self.assertEqual("lt", settings[0][1]["udpipe_language"])
222+
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
223+
self.assertEqual("sl", settings[1][1]["language"])
224+
225+
self.corpus.attributes["language"] = "pt"
226+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
227+
self.assertEqual(
228+
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
229+
[c.currentText() for c in combos]
230+
)
231+
settings = self.widget.storedsettings["preprocessors"]
232+
self.assertEqual("pt", settings[0][1]["snowball_language"])
233+
self.assertEqual("pt", settings[0][1]["udpipe_language"])
234+
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
235+
self.assertEqual("pt", settings[1][1]["language"])
236+
237+
# language not supported by any preprocessor - language shouldn't change
238+
self.corpus.attributes["language"] = "bo"
239+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
240+
self.assertEqual(
241+
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
242+
[c.currentText() for c in combos]
243+
)
244+
settings = self.widget.storedsettings["preprocessors"]
245+
self.assertEqual("pt", settings[0][1]["snowball_language"])
246+
self.assertEqual("pt", settings[0][1]["udpipe_language"])
247+
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
248+
self.assertEqual("pt", settings[1][1]["language"])
249+
250+
# test with missing language - language shouldn't change
251+
self.corpus.attributes["language"] = None
252+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
253+
self.assertEqual(
254+
["Portuguese", "Portuguese", "Slovenian", "Portuguese"],
255+
[c.currentText() for c in combos]
256+
)
257+
settings = self.widget.storedsettings["preprocessors"]
258+
self.assertEqual("pt", settings[0][1]["snowball_language"])
259+
self.assertEqual("pt", settings[0][1]["udpipe_language"])
260+
self.assertEqual("sl", settings[0][1]["lemmagen_language"])
261+
self.assertEqual("pt", settings[1][1]["language"])
262+
263+
def test_language_from_schema(self):
264+
"""Test language from schema/workflow is retained"""
265+
initial = {
266+
"name": "",
267+
"preprocessors": [
268+
(
269+
"preprocess.normalize",
270+
{
271+
"lemmagen_language": "sl",
272+
"snowball_language": "nl",
273+
"udpipe_language": "lt",
274+
},
275+
),
276+
("preprocess.filter", {"language": "nl"}),
277+
],
278+
}
279+
self.widget.storedsettings = initial
280+
281+
settings = self.widget.settingsHandler.pack_data(self.widget)
282+
widget = self.create_widget(OWPreprocess, stored_settings=settings)
283+
self.send_signal(widget.Inputs.corpus, self.corpus, widget=widget)
284+
self.assertDictEqual(initial, widget.storedsettings)
285+
combos = widget.mainArea.findChildren(LanguageComboBox)
286+
self.assertEqual(
287+
["Dutch", "Lithuanian", "Slovenian", "Dutch"],
288+
[c.currentText() for c in combos]
289+
)
290+
291+
def test_language_from_corpus_editor_inserted(self):
292+
"""Test language from corpus is set to new editor too"""
293+
initial = {
294+
"name": "",
295+
"preprocessors": [("preprocess.filter", {})],
296+
}
297+
self.widget.storedsettings = initial
298+
self.widget._initialize()
299+
self.assertDictEqual(initial, self.widget.storedsettings)
300+
combos = self.widget.mainArea.findChildren(LanguageComboBox)
301+
self.assertEqual(
302+
["English"],
303+
[c.currentText() for c in combos]
304+
)
305+
306+
# insert data - language of stopwords combo should change to Slovenian
307+
self.corpus.attributes["language"] = "sl"
308+
self.send_signal(self.widget.Inputs.corpus, self.corpus)
309+
self.assertEqual(
310+
["Slovenian"],
311+
[c.currentText() for c in combos]
312+
)
313+
314+
# insert new editor - all languages except snowball should be set to Slovenian
315+
pp_def = self.widget._qname2ppdef["preprocess.normalize"]
316+
description = pp_def.description
317+
item = QStandardItem(description.title)
318+
icon = QIcon(description.icon)
319+
item.setIcon(icon)
320+
item.setToolTip(description.summary)
321+
item.setData(pp_def, DescriptionRole)
322+
item.setData({}, ParametersRole)
323+
self.widget.preprocessormodel.insertRow(0, [item])
324+
self.wait_until_finished()
325+
326+
combos = self.widget.mainArea.findChildren(LanguageComboBox)
327+
self.assertEqual(
328+
['Slovenian', 'English', 'Slovenian', 'Slovenian'],
329+
[c.currentText() for c in combos]
330+
)
331+
183332

184333
@patch(SF_LIST, new=Mock(return_value=SERVER_FILES))
185334
class TestOWPreprocessMigrateSettings(WidgetTest):
@@ -983,14 +1132,20 @@ def test_set_current_language(self):
9831132
self.assertEqual("Portuguese", cb.currentText())
9841133
cb.set_current_language("sl")
9851134
self.assertEqual("Slovenian", cb.currentText())
986-
cb.set_current_language("abc") # should set to default
987-
self.assertEqual("English", cb.currentText())
1135+
cb.set_current_language("abc") # language not in list - keep current selection
1136+
self.assertEqual("Slovenian", cb.currentText())
1137+
1138+
def test_set_language_to_default(self):
1139+
"""In case current item not in dropdown anymore set language to default"""
1140+
mock = Mock()
1141+
cb = UDPipeComboBox(None, "pt", "en", mock)
1142+
self.assertEqual("Portuguese", cb.currentText())
9881143
# when no default language in the dropdown set to first
9891144
cb.removeItem(0)
9901145
x = cb._UDPipeComboBox__items
9911146
cb._UDPipeComboBox__items = x[:3] + x[4:]
992-
cb.set_current_language("abc")
993-
self.assertEqual("English (lines)", cb.currentText())
1147+
cb.showPopup()
1148+
self.assertEqual("English", cb.currentText())
9941149

9951150
def test_change_item(self):
9961151
mock = Mock()

0 commit comments

Comments
 (0)