import functools
import os.path
+import shutil
+import subprocess
import sys
from types import ModuleType
from typing import Any

from anki.utils import is_win
from aqt import mw
+from aqt.package import venv_binary

updated_python_path: bool = False
testing_environment: bool = False
_SpacyTokenizer: Any = None  # spacy.tokenizer
_SpacyDoc: Any = None  # spacy.tokens.doc
+
+# spaCy does not have a CLI to query available languages or models, so we hardcode them.
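+# The names follow spaCy's model naming convention: <lang>_core_<genre>_<size>.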
+available_langs_and_models: dict[str, list[str]] = {
+    # fmt: off
+    "Catalan": ["ca_core_news_sm", "ca_core_news_md", "ca_core_news_lg"],
+    "Chinese": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
+    "Croatian": ["hr_core_news_sm", "hr_core_news_md", "hr_core_news_lg"],
+    "Danish": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"],
+    "Dutch": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
+    "English": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
+    "Finnish": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"],
+    "French": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
+    "German": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
+    "Greek": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
+    "Italian": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
+    "Japanese": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
+    "Korean": ["ko_core_news_sm", "ko_core_news_md", "ko_core_news_lg"],
+    "Lithuanian": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"],
+    "Macedonian": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"],
+    "Norwegian Bokmål": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"],
+    "Polish": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"],
+    "Portuguese": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
+    "Romanian": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"],
+    "Russian": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"],
+    "Slovenian": ["sl_core_news_sm", "sl_core_news_md", "sl_core_news_lg"],
+    "Spanish": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
+    "Swedish": ["sv_core_news_sm", "sv_core_news_md", "sv_core_news_lg"],
+    "Ukrainian": ["uk_core_news_sm", "uk_core_news_md", "uk_core_news_lg"],
+    # fmt: on
+}
+
+
LANGUAGE_PIPE_CONFIGS: dict[str, set[str]] = {
-    "ja": {""},  # Japanese uses SudachiPy
-    "nb": {"tok2vec", "attribute_ruler", "lemmatizer", "morphologizer", "ner"},
+    "ca": {"tok2vec", "morphologizer", "lemmatizer"},
+    "zh": {"tok2vec", "tagger", "attribute_ruler"},
+    "hr": {"tok2vec", "lemmatizer", "morphologizer"},
    "da": {"tok2vec", "morphologizer", "lemmatizer"},
-    "de": {"tok2vec", "lemmatizer", "morphologizer"},
-    "fr": {"tok2vec", "lemmatizer", "morphologizer"},
-    "en": {"tok2vec", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"},
-    "es": {"tok2vec", "lemmatizer", "morphologizer"},
-    "sv": {"tok2vec", "lemmatizer", "morphologizer"},
    "nl": {"tok2vec", "lemmatizer", "morphologizer"},
-    "hr": {"tok2vec", "lemmatizer", "morphologizer"},
+    "en": {"tok2vec", "tagger", "attribute_ruler", "lemmatizer", "morphologizer"},
    "fi": {"tok2vec", "lemmatizer", "morphologizer"},
+    "fr": {"tok2vec", "lemmatizer", "morphologizer"},
+    "de": {"tok2vec", "lemmatizer", "morphologizer"},
    "el": {"tok2vec", "lemmatizer", "morphologizer"},
    "it": {"tok2vec", "lemmatizer", "morphologizer"},
+    "ja": {""},  # Japanese uses SudachiPy
+    "ko": {"tok2vec", "morphologizer", "lemmatizer"},
    "lt": {"tok2vec", "lemmatizer", "morphologizer"},
    "mk": {"tok2vec", "lemmatizer", "morphologizer"},
+    "nb": {"tok2vec", "attribute_ruler", "lemmatizer", "morphologizer", "ner"},
    "pl": {"tok2vec", "lemmatizer", "morphologizer"},
    "pt": {"tok2vec", "lemmatizer", "morphologizer"},
    "ro": {"tok2vec", "tagger", "morphologizer", "lemmatizer", "attribute_ruler"},
-    "sl": {"tok2vec", "lemmatizer", "morphologizer"},
-    "ca": {"tok2vec", "morphologizer", "lemmatizer"},
    "ru": {"tok2vec", "morphologizer", "lemmatizer"},
+    "sl": {"tok2vec", "lemmatizer", "morphologizer"},
+    "es": {"tok2vec", "lemmatizer", "morphologizer"},
+    "sv": {"tok2vec", "lemmatizer", "morphologizer"},
    "uk": {"tok2vec", "morphologizer", "lemmatizer"},
-    "ko": {"tok2vec", "morphologizer", "lemmatizer"},
-    "zh": {"tok2vec", "tagger", "attribute_ruler"},
}


def load_spacy_modules() -> None:
+    # We load the spaCy modules in this roundabout way to preserve at least
+    # some static type checking and to minimize error checking and
+    # exception handling.
+
    global updated_python_path
    global successful_import
    global _spacy
@@ -59,14 +98,9 @@ def load_spacy_modules() -> None:

    # dev environments should already have spaCy, so this can be skipped
    if not updated_python_path and not testing_environment:
-        # Anki only looks into its own directories for python packages,
-        # to add other lookup folders we have to change the sys path.
-        # In the guide we instruct the users to install the spacy
-        # virtual environment into the addons21 folder as 'spacyenv',
-        # that way we can get the path based on the anki mw.pm.
-
        assert mw is not None
-        spacy_path = os.path.join(mw.pm.addonFolder(), "spacyenv")
+
+        spacy_path = _get_am_spacy_venv_path()

        if is_win is True:
            spacy_bin_path = os.path.join(spacy_path, "Scripts")
@@ -80,6 +114,7 @@ def load_spacy_modules() -> None:
            "site-packages",
        )

+        # appending to the path is less disruptive than prepending
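+        # (prepending could shadow Anki's bundled packages with the venv's own)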
        sys.path.append(spacy_bin_path)
        sys.path.append(spacy_site_packages_path)
        updated_python_path = True
@@ -116,6 +151,79 @@ def get_installed_models() -> list[str]:
    return [f"{model_name}" for model_name in _spacy_utils.get_installed_models()]


+def _get_am_spacy_venv_python() -> str:
+    if is_win:
+        return os.path.join(_get_am_spacy_venv_path(), "Scripts", "python.exe")
+    return os.path.join(_get_am_spacy_venv_path(), "bin", "python")
+
+
+def _get_am_spacy_venv_path() -> str:
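+    # the folder name is tied to the Python version, so a launcher Python
+    # upgrade gets a fresh venv instead of reusing an incompatible one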
+    assert mw is not None
+    python_version = f"{sys.version_info.major}_{sys.version_info.minor}"
+    return os.path.join(mw.pm.addonFolder(), f"spacy-venv-python-{python_version}")
+
+
+def create_spacy_venv() -> None:
+    """
+    We create a dedicated venv to avoid polluting the Anki launcher's environment.
+    """
+
+    spacy_venv_path = _get_am_spacy_venv_path()
+    python_path: str | None = venv_binary("python")
+    assert python_path is not None
+
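+    # create the venv with the Anki launcher's own Python (resolved via venv_binary)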
+    subprocess.run([python_path, "-m", "venv", spacy_venv_path], check=True)
+
+    if is_win:
+        spacy_venv_python = os.path.join(spacy_venv_path, "Scripts", "python.exe")
+    else:
+        spacy_venv_python = os.path.join(spacy_venv_path, "bin", "python")
+
+    # make sure pip, setuptools, and wheel are up to date
+    subprocess.run(
+        [
+            spacy_venv_python,
+            "-m",
+            "pip",
+            "install",
+            "--upgrade",
+            "pip",
+            "setuptools",
+            "wheel",
+        ],
+        check=True,
+    )
+
+    # six is necessary for some models
+    subprocess.run(
+        [spacy_venv_python, "-m", "pip", "install", "--upgrade", "spacy", "six"],
+        check=True,
+    )
+
+
+def delete_spacy_venv() -> None:
+    shutil.rmtree(_get_am_spacy_venv_path())
+
+
+def install_model(model_name: str) -> None:
+    assert successful_import
+    assert _spacy is not None
+
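+    # `spacy download` resolves a model version compatible with the installed
+    # spaCy and pip-installs it into the dedicated venv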
+    subprocess.run(
+        [_get_am_spacy_venv_python(), "-m", "spacy", "download", model_name], check=True
+    )
+
+
+def uninstall_model(model_name: str) -> None:
+    assert successful_import
+    assert _spacy is not None
+
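+    # downloaded models are plain pip packages, so pip can uninstall them by name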
+    # the -y flag prevents a confirmation prompt
+    subprocess.run(
+        [_get_am_spacy_venv_python(), "-m", "pip", "uninstall", "-y", model_name],
+        check=True,
+    )
+
+
# the cache needs to have a max size to maintain garbage collection
@functools.lru_cache(maxsize=131072)
def get_nlp(spacy_model_name: str):  # type: ignore[no-untyped-def]  # pylint:disable=too-many-branches, too-many-statements