Commit eb3cb3f

resource functions
1 parent 7785d04 commit eb3cb3f

File tree

  pyproject.toml
  src/udar/misc.py

2 files changed: +167 -1 lines changed


pyproject.toml (+3 -1)

@@ -1,7 +1,9 @@
 [build-system]
 requires = [
     "setuptools>=61",
-    "setuptools_scm"
+    "setuptools_scm[toml]>=3.4",
+    "toml",
+    "wheel"
 ]

 build-backend = "setuptools.build_meta"
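The `setuptools_scm[toml]` extra (new in setuptools_scm 3.4, hence the pin) lets setuptools_scm read its configuration from pyproject.toml. A minimal sketch of the table that enables; the `write_to` path is illustrative, not part of this commit:

[tool.setuptools_scm]
# Assumption: write the scm-derived version where the package can import it;
# this path is hypothetical, not taken from this commit.
write_to = "src/udar/version.py"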

src/udar/misc.py (+164)

@@ -145,3 +145,167 @@ def combine_stress(stresses: Union[List[str], Set[str]]) -> str:
 def unspace_punct(in_str: str):
     """Attempt to remove spaces before punctuation."""
     return re.sub(r' +([.?!;:])', r'\1', in_str)
+
+
+# NOTE: these functions assume the module already imports re, pickle,
+# defaultdict (collections), mean (statistics), and stderr (sys), and
+# defines the FST_DIR and RSRC_DIR path constants.
+def tixonov(from_cache=True):
+    cache_path = f'{FST_DIR}/Tixonov_dict.pkl'
+    if from_cache:
+        # Load the previously pickled dict (assumes the cache file exists).
+        return uncache_rsrc(cache_path)
+
+    tix_dict = defaultdict(list)
+    with open(f'{RSRC_DIR}/src/Tixonov.txt') as f:
+        for line in f:
+            parse = line.strip().replace('`', '').split('/')
+            parse = tuple(e for e in parse if e)
+            lemma = ''.join(parse)
+            noncyr = re.sub(r'[а-яё\-]', '', lemma, flags=re.I)
+            if noncyr:
+                print('Non-cyrillic characters:', lemma, noncyr, file=stderr)
+            # TODO verify and remove duplicates
+            # if lemma in tix_dict:
+            #     print(f'\t{lemma} already in tix_dict:',
+            #           f'old: "{tix_dict[lemma]}"',
+            #           f'new: "{parse}"', file=stderr)
+            if parse not in tix_dict[lemma]:
+                tix_dict[lemma].append(parse)
+
+    for lemma, parses in tix_dict.items():
+        tix_dict[lemma] = sorted(parses)
+
+    return tix_dict
+
+
+def tixonov_morph_count():
+    cache_path = f'{FST_DIR}/Tix_morph_count_dict.pkl'
+    tix_dict = tixonov()
+
+    # Average number of morphemes across all of a lemma's parses.
+    morph_count_dict = {}
+    for lemma, parses in tix_dict.items():
+        morph_count_dict[lemma] = mean(len(p) for p in parses)
+    return morph_count_dict
+
+
+def lexmin():
+    cache_path = f'{FST_DIR}/lexmin_dict.pkl'
+    lexmin_dict = {}
+    for level in ['A1', 'A2', 'B1', 'B2']:
+        with open(f'{RSRC_DIR}/src/lexmin_{level}.txt') as f:
+            for lemma in f:
+                lemma = lemma.strip()
+                if lemma:
+                    # TODO verify and remove duplicates
+                    # if lemma in lexmin_dict:
+                    #     print(f'\t{lemma} ({level}) already in lexmin',
+                    #           lexmin_dict[lemma], file=stderr)
+                    lexmin_dict[lemma] = level
+    return lexmin_dict
+
+
+def kelly():
+    cache_path = f'{FST_DIR}/kelly_dict.pkl'
+    kelly_dict = {}
+    with open(f'{RSRC_DIR}/src/KellyProject_Russian_M3.txt') as f:
+        for line in f:
+            level, freq, lemma = line.strip().split('\t')
+            # TODO verify and remove duplicates
+            # if lemma in kelly_dict:
+            #     print(f'{lemma} ({level}) already in kelly_dict',
+            #           kelly_dict[lemma], file=stderr)
+            kelly_dict[lemma] = level
+    return kelly_dict
+
+
+def rnc_freq():
+    """Token frequency data from Russian National Corpus 1-gram data.
+
+    Taken from: http://ruscorpora.ru/corpora-freq.html
+    """
+    cache_path = f'{FST_DIR}/RNC_tok_freq_dict.pkl'
+    RNC_tok_freq_dict = {}
+    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
+        for line in f:
+            tok_freq, tok = line.split()
+            if tok in RNC_tok_freq_dict:
+                print(f'\t{tok} already in RNC_tok_freq_dict '
+                      f'({tok_freq} vs {RNC_tok_freq_dict[tok]})', file=stderr)
+                continue
+            RNC_tok_freq_dict[tok] = float(tok_freq)
+    return RNC_tok_freq_dict
+
+
+def rnc_freq_rank():
+    """Token frequency ranks from Russian National Corpus 1-gram data.
+
+    Taken from: http://ruscorpora.ru/corpora-freq.html
+    """
+    cache_path = f'{FST_DIR}/RNC_tok_freq_rank_dict.pkl'
+    RNC_tok_freq_rank_dict = {}
+    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
+        rank = 0
+        last_freq = None
+        for i, line in enumerate(f, start=1):
+            tok_freq, tok = line.split()
+            if tok_freq != last_freq:
+                rank = i
+                last_freq = tok_freq  # track freq so that ties share a rank
+            if tok in RNC_tok_freq_rank_dict:
+                print(f'\t{tok} already in RNC_tok_freq_rank_dict '
+                      f'({rank} vs {RNC_tok_freq_rank_dict[tok]})', file=stderr)
+                continue
+            RNC_tok_freq_rank_dict[tok] = rank
+    return RNC_tok_freq_rank_dict
+
+
+def sharoff():
+    # Lemma freq data from Serge Sharoff.
+    # Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php
+
+    # TODO what about http://dict.ruslang.ru/freq.php ?
+
+    cache_path = f'{FST_DIR}/Sharoff_lem_freq_dict.pkl'
+
+    Sharoff_lem_freq_dict = {}
+    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
+        for line in f:
+            line_num, freq, lemma, pos = line.split()
+            if lemma in Sharoff_lem_freq_dict:
+                print(f'{lemma} already in Sharoff_lem_freq_dict. '
+                      f'old: {Sharoff_lem_freq_dict[lemma]} '
+                      f'new: {(freq, line_num, pos)}', file=stderr)
+                continue
+            Sharoff_lem_freq_dict[lemma] = float(freq)
+    return Sharoff_lem_freq_dict
+
+
+def sharoff_rank():
+    # Lemma freq data from Serge Sharoff.
+    # Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php
+
+    # TODO what about http://dict.ruslang.ru/freq.php ?
+
+    cache_path = f'{FST_DIR}/Sharoff_lem_freq_rank_dict.pkl'
+
+    Sharoff_lem_freq_rank_dict = {}
+    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
+        rank = None
+        last_freq = None
+        for i, line in enumerate(f, start=1):
+            line_num, freq, lemma, pos = line.split()
+            if freq != last_freq:
+                rank = i
+                last_freq = freq  # track freq so that ties share a rank
+            if lemma in Sharoff_lem_freq_rank_dict:
+                print(f'{lemma} already in Sharoff_lem_freq_rank_dict. '
+                      f'old: {Sharoff_lem_freq_rank_dict[lemma]} '
+                      f'new: {(rank, line_num, pos)}', file=stderr)
+                continue
+            Sharoff_lem_freq_rank_dict[lemma] = rank
+    return Sharoff_lem_freq_rank_dict
+
+
+def cache_rsrc(resource, fname) -> bool:
+    """Attempt to cache (pickle) resource to `fname`."""
+    with open(fname, 'wb') as f:  # pickle requires a binary-mode file
+        pickle.dump(resource, f)
+    return True
+
+
+def uncache_rsrc(fname):
+    """Attempt to uncache (unpickle) resource from `fname`."""
+    with open(fname, 'rb') as f:  # pickle requires a binary-mode file
+        resource = pickle.load(f)
+    return resource
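Both rank functions use the same tie-handling scheme: standard competition ranking, where items with equal frequency share a rank and the next distinct frequency takes its 1-based line number. A minimal standalone sketch of that logic on hypothetical data:

freqs = [900, 900, 750, 600, 600, 600, 500]  # hypothetical, sorted descending

rank = 0
last_freq = None
ranks = []
for i, freq in enumerate(freqs, start=1):
    if freq != last_freq:  # a new frequency value starts a new rank
        rank = i
        last_freq = freq
    ranks.append(rank)

print(ranks)  # [1, 1, 3, 4, 4, 4, 7] -- "1224"-style competition ranking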

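The `cache_path` variables above are assigned but (except in `tixonov`) not yet used, so pickle caching looks like work in progress. A minimal sketch of how the helpers could wire up to a resource builder, assuming `FST_DIR` is a writable directory; `kelly_cached` is a hypothetical wrapper, not part of this commit:

import os

def kelly_cached():
    """Build the kelly dict, round-tripping it through the pickle cache."""
    cache_path = f'{FST_DIR}/kelly_dict.pkl'
    if os.path.exists(cache_path):
        return uncache_rsrc(cache_path)  # reuse the cached dict
    resource = kelly()                   # build from the source file
    cache_rsrc(resource, cache_path)     # pickle for next time
    return resource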