@@ -145,3 +145,167 @@ def combine_stress(stresses: Union[List[str], Set[str]]) -> str:
145
145
def unspace_punct(in_str):
    """Attempt to remove spaces before punctuation.

    E.g. 'Hello !' -> 'Hello!'. Only .?!;: are affected.
    """
    pattern = re.compile(r' +([.?!;:])')
    return pattern.sub(r'\1', in_str)
148
+
149
+
150
def tixonov(from_cache=True):
    """Parse the Tixonov derivational dictionary into {lemma: [parses]}.

    Each line of Tixonov.txt is a morpheme segmentation like 'при/каз/ать';
    a parse is the tuple of non-empty morphemes and the lemma is their
    concatenation. Returns a dict mapping lemma -> sorted list of parse
    tuples (a lemma may have several distinct segmentations).
    """
    # NOTE(review): `cache_path` and `from_cache` are currently dead —
    # pickle caching (presumably via cache_rsrc/uncache_rsrc) is not
    # implemented in this snippet. TODO wire it up or drop the parameter.
    cache_path = f'{FST_DIR}/Tixonov_dict.pkl'
    tix_dict = defaultdict(list)
    with open(f'{RSRC_DIR}/src/Tixonov.txt') as f:
        for line in f:
            parse = line.strip().replace('`', '').split('/')
            parse = tuple(e for e in parse if e)
            lemma = ''.join(parse)
            # Warn about anything left after stripping Cyrillic letters.
            # NOTE(review): the leading 'a' in the class may be Latin; the
            # intent is presumably Cyrillic 'а' (range а-яё) — TODO confirm.
            noncyr = re.sub(r'[a-яё\-]', '', lemma, flags=re.I)
            if noncyr:
                print('Non-cyrillic characters:', lemma, noncyr, file=stderr)
            # TODO verify and remove duplicates
            if parse not in tix_dict[lemma]:
                tix_dict[lemma].append(parse)

    # Deterministic ordering of alternative parses per lemma.
    for lemma, parses in tix_dict.items():
        tix_dict[lemma] = sorted(parses)

    return tix_dict
175
+
176
+
177
def tixonov_morph_count():
    """Return {lemma: mean morpheme count} over each lemma's Tixonov parses."""
    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet. TODO wire it up or remove.
    cache_path = f'{FST_DIR}/Tix_morph_count_dict.pkl'
    tix_dict = tixonov()
    # A lemma may have several parses; average their lengths.
    return {lemma: mean(len(p) for p in parses)
            for lemma, parses in tix_dict.items()}
185
+
186
+
187
def lexmin():
    """Return {lemma: CEFR level} from the "lexical minimum" word lists.

    Levels are processed A1 -> B2, so if a lemma appears at more than one
    level the LAST (highest) level wins.
    """
    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet. (Original f-string was garbled: 'FST_DIR}/ lexmin...')
    cache_path = f'{FST_DIR}/lexmin_dict.pkl'
    lexmin_dict = {}
    for level in ['A1', 'A2', 'B1', 'B2']:
        with open(f'{RSRC_DIR}/src/lexmin_{level}.txt') as f:
            for lemma in f:
                lemma = lemma.strip()
                if lemma:
                    # TODO verify and remove duplicates across levels
                    lexmin_dict[lemma] = level
    return lexmin_dict
201
+
202
+
203
def kelly():
    """Return {lemma: CEFR level} from the Kelly Project Russian word list.

    Source file is tab-separated: level, frequency, lemma. On duplicate
    lemmas the last line wins.
    """
    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet.
    cache_path = f'{FST_DIR}/kelly_dict.pkl'
    kelly_dict = {}
    with open(f'{RSRC_DIR}/src/KellyProject_Russian_M3.txt') as f:
        for line in f:
            # Source had split('\t ') — a mangled tab delimiter.
            level, freq, lemma = line.strip().split('\t')
            # TODO verify and remove duplicates
            kelly_dict[lemma] = level
    return kelly_dict
215
+
216
+
217
def rnc_freq():
    """Token frequency data from Russian National Corpus 1-gram data.

    Taken from: http://ruscorpora.ru/corpora-freq.html

    Returns {token: frequency (float)}. Duplicate tokens keep the FIRST
    frequency seen; later occurrences are reported to stderr and skipped.
    """
    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet.
    cache_path = f'{FST_DIR}/RNC_tok_freq_dict.pkl'
    RNC_tok_freq_dict = {}
    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
        for line in f:
            tok_freq, tok = line.split()
            if tok in RNC_tok_freq_dict:
                print(f'\t{tok} already in RNC_tok_freq_dict '
                      f'({tok_freq} vs {RNC_tok_freq_dict[tok]})',
                      file=stderr)
                continue
            RNC_tok_freq_dict[tok] = float(tok_freq)
    return RNC_tok_freq_dict
232
+
233
+
234
def rnc_freq_rank():
    """Token frequency RANK from Russian National Corpus 1-gram data.

    Taken from: http://ruscorpora.ru/corpora-freq.html

    Returns {token: rank (int)} using "competition" ranking: tokens with
    equal frequency share the rank of the first of them (1, 2, 2, 4, ...).
    Duplicate tokens keep their first rank; later occurrences are reported
    to stderr and skipped.
    """
    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet.
    cache_path = f'{FST_DIR}/RNC_tok_freq_rank_dict.pkl'
    RNC_tok_freq_rank_dict = {}
    with open(f'{RSRC_DIR}/src/RNC_1grams-3.txt') as f:
        rank = 0
        last_freq = None
        for i, line in enumerate(f, start=1):
            tok_freq, tok = line.split()
            if tok_freq != last_freq:
                rank = i
                # BUG FIX: last_freq was never updated, so the comparison
                # above was always true and ties never shared a rank.
                last_freq = tok_freq
            if tok in RNC_tok_freq_rank_dict:
                print(f'\t{tok} already in RNC_tok_freq_rank_dict '
                      f'({rank} vs {RNC_tok_freq_rank_dict[tok]})',
                      file=stderr)
                continue
            RNC_tok_freq_rank_dict[tok] = rank
    return RNC_tok_freq_rank_dict
253
+
254
+
255
def sharoff():
    """Lemma frequency data from Serge Sharoff.

    Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php

    Returns {lemma: frequency (float)}. Duplicate lemmas keep the FIRST
    frequency seen; later occurrences are reported to stderr and skipped.
    """
    # TODO what about http://dict.ruslang.ru/freq.php ?

    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet.
    cache_path = f'{FST_DIR}/Sharoff_lem_freq_dict.pkl'

    Sharoff_lem_freq_dict = {}
    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
        for line in f:
            line_num, freq, lemma, pos = line.split()
            if lemma in Sharoff_lem_freq_dict:
                print(f'{lemma} already in Sharoff_lem_freq_dict. '
                      f'old: {Sharoff_lem_freq_dict[lemma]} '
                      f'new: {(freq, line_num, pos)}', file=stderr)
                continue
            Sharoff_lem_freq_dict[lemma] = float(freq)
    return Sharoff_lem_freq_dict
274
+
275
+
276
def sharoff_rank():
    """Lemma frequency RANK from Serge Sharoff's frequency list.

    Taken from: http://www.artint.ru/projects/frqlist/frqlist-en.php

    Returns {lemma: rank (int)} using "competition" ranking: lemmas with
    equal frequency share the rank of the first of them (1, 2, 2, 4, ...).
    Duplicate lemmas keep their first rank; later occurrences are reported
    to stderr and skipped.
    """
    # TODO what about http://dict.ruslang.ru/freq.php ?

    # NOTE(review): `cache_path` is dead — caching is not implemented in
    # this snippet.
    cache_path = f'{FST_DIR}/Sharoff_lem_freq_rank_dict.pkl'

    Sharoff_lem_freq_rank_dict = {}
    with open(f'{RSRC_DIR}/src/Sharoff_lemmaFreq.txt') as f:
        rank = None
        last_freq = None
        for i, line in enumerate(f, start=1):
            line_num, freq, lemma, pos = line.split()
            if freq != last_freq:
                rank = i
                # BUG FIX: last_freq was never updated, so the comparison
                # above was always true and ties never shared a rank.
                last_freq = freq
            if lemma in Sharoff_lem_freq_rank_dict:
                print(f'{lemma} already in Sharoff_lem_freq_rank_dict. '
                      f'old: {Sharoff_lem_freq_rank_dict[lemma]} '
                      f'new: {(rank, line_num, pos)}', file=stderr)
                continue
            Sharoff_lem_freq_rank_dict[lemma] = rank
    return Sharoff_lem_freq_rank_dict
299
+
300
+
301
def cache_rsrc(resource, fname) -> bool:
    """Attempt to cache (pickle) `resource` to `fname`.

    Returns True on success, False on failure (the declared -> bool was
    previously never honored). Fixes two runtime bugs: pickle requires a
    binary-mode file ('wb', not 'w'), and pickle.dump() requires the file
    object as its second argument.
    """
    try:
        with open(fname, 'wb') as f:
            pickle.dump(resource, f)
    except (OSError, pickle.PicklingError) as e:
        print(f'Failed to cache resource to {fname}: {e}', file=stderr)
        return False
    return True
305
+
306
+
307
def uncache_rsrc(fname):
    """Attempt to uncache (unpickle) a resource from `fname`.

    BUG FIX: pickle files must be opened in binary mode ('rb'); text mode
    raises UnicodeDecodeError/TypeError in pickle.load.
    NOTE: only unpickle files this program wrote itself — pickle.load on
    untrusted data can execute arbitrary code.
    """
    with open(fname, 'rb') as f:
        resource = pickle.load(f)
    return resource
0 commit comments