|
1 | | -#!/usr/bin/python |
2 | | -# -*- coding: utf-8 -*- |
3 | | -from __future__ import print_function, unicode_literals |
4 | | -import codecs |
5 | | -import os, sys |
6 | | -import pprint |
7 | | -import regex as re |
| 1 | +#!/usr/bin/env python3 |
| 2 | +import os |
| 3 | +import sys |
| 4 | +import re |
| 5 | +import bz2 |
8 | 6 |
|
9 | 7 | #-------------------------------------------------------------------------# |
10 | 8 | # |
11 | | -# ▗▀▖▗ ▐ |
| 9 | +# ▗▀▖▗ ▐ |
12 | 10 | # ▌ ▌▞▀▖▐ ▄ ▞▀▖▝▀▖▜▀ ▞▀▖▙▀▖ ▛▀▖▌ ▌ |
13 | 11 | # ▚▄▌▌ ▌▜▀ ▐ ▌ ▖▞▀▌▐ ▖▌ ▌▌ ▗▖▙▄▘▚▄▌ |
14 | 12 | # ▗▄▘▝▀ ▐ ▀▘▝▀ ▝▀▘ ▀ ▝▀ ▘ ▝▘▌ ▗▄▘ |
15 | 13 | # |
16 | 14 | # Description: |
17 | 15 | # This is a Russian text yoficator (ёфикатор). |
18 | 16 | # |
19 | | -# It conservatively replaces every "е" with "ё" when it's unambiguously
| 17 | +# It conservatively replaces every "е" with "ё" when it's unambiguously
20 | 18 | # a case of the latter. No context is used; it relies entirely on a lack
21 | | -# of dictionary entries for a corresponding "truly е" homograph.
| 19 | +# of dictionary entries for a corresponding "truly е" homograph.
22 | 20 | # |
23 | 21 | # Yoficating Russian texts removes some unnecessary ambiguity.
24 | 22 | # https://en.wikipedia.org/wiki/Yoficator |
25 | 23 | # https://ru.wikipedia.org/wiki/Ёфикатор |
26 | 24 | # |
27 | 25 | # Syntax: yoficator.py [text-file-in-Russian | string-in-Russian] |
28 | | -# |
| 26 | +# |
29 | 27 | # Depends on yoficator.dic, which is used for the lookup. |
30 | 28 | # |
31 | | -# Limitations: |
32 | | -# * The code being conservative and not looking for context, it won't correct |
33 | | -# when a "truly е" homograph exists. Thus a "все" will never be corrected, |
| 29 | +# Limitations: |
| 30 | +# * The code being conservative and not looking for context, it won't correct |
| 31 | +# when a "truly е" homograph exists. Thus a "все" will never be corrected, |
34 | 32 | # because both все and всё exist as different words. |
35 | | -# * Prone to wrongly yoficate other Cyrillic-based languages, such as |
| 33 | +# * Prone to wrongly yoficate other Cyrillic-based languages, such as |
36 | 34 | # Bulgarian, Ukrainian, Belarusian.
37 | 35 | # * It's not the fastest thing in the world, mind you, but it does the job.
38 | 36 | # |
39 | 37 | #------------------------------------------------------------------------- |
40 | 38 | # |
41 | 39 | # Found this useful? Appalling? Appealing? Please let me know. |
42 | | -# The Unabashed welcomes your impressions. |
| 40 | +# The Unabashed welcomes your impressions. |
43 | 41 | # |
44 | 42 | # You will find the |
45 | 43 | # unabashed |
|
64 | 62 | # |
65 | 63 | #--------------------------------------------------------------------------# |
66 | 64 |
|
67 | | -# TODO Better handle lowercase, uppercase |
68 | | - |
69 | | -pp = pprint.PrettyPrinter(4) |
70 | 65 |
|
71 | | -# Variables initialization; tests a file if no argument is supplied. |
72 | | -# Save the yoficator as a subfolder of your Desktop |
73 | | -# TODO: Make it compatible with other OSs. |
74 | | -workingDir = os.getenv('HOME') + "/Desktop/yoficator/" |
75 | | -textFile = workingDir + "tests/yoficator.txt" |
76 | | -dictionaryFile = workingDir + "yoficator.dic" |
| 66 | +if __name__ == '__main__': |
| 67 | + # TODO Better handle lowercase, uppercase |
| 68 | + dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/dictionary.ru_RU.txt.bz2' |
77 | 69 |
|
78 | | -if len(sys.argv) > 1: |
79 | | - # Is the input a filename? |
80 | | - if os.path.isfile(sys.argv[1]): |
81 | | - text = codecs.open(sys.argv[1].decode("utf-8"), "r", "utf-8").read() |
82 | | - # Else we will assume it's a string |
| 70 | + if len(sys.argv) > 1: |
| 71 | + # Is the input a filename? |
| 72 | + if os.path.isfile(sys.argv[1]): |
| 73 | + text = open(sys.argv[1]).read() |
| 74 | + # Else we will assume it's a string |
| 75 | + else: |
| 76 | + text = sys.argv[1] |
83 | 77 | else: |
84 | | - text = sys.argv[1].decode("utf-8") |
85 | | -else: |
86 | | - # We will assume using textFile as input filename above |
87 | | - text = codecs.open(textFile, "r", "utf-8").read() |
| 78 | + print('Error: No file specified', file=sys.stderr) |
| 79 | + exit(1) |
88 | 80 |
|
89 | | -dictionary = {} |
| 81 | + dictionary = {} |
90 | 82 |
|
| 83 | + # Splitter / tokenizer |
| 84 | + splitter = re.compile(r'(?P<word>[а-я]*е[а-я]*)|(?P<unknown>[^е]+\b)', re.IGNORECASE) |
91 | 85 |
|
92 | | -# Splitter / tokenizer |
93 | | -splitter = re.compile(r'(\s+|\w+|\W+|\S+)', re.UNICODE) |
94 | | -tokens = splitter.findall(text) |
95 | | - |
96 | | -with codecs.open(dictionaryFile, "r", "utf-8") as f: |
97 | | - for line in f: |
98 | | - if ":" in line: |
99 | | - key,value = line.split(":") |
| 86 | + with bz2.open(dictionary_file_path) as stream: |
| 87 | + for line in iter(stream): |
| 88 | + key, value = line.decode('utf-8').split(':') |
100 | 89 | dictionary[key] = value.rstrip('\n') |
101 | | - else: |
102 | | - pass |
103 | | - |
104 | | -for token in tokens: |
105 | | - if token in dictionary: |
106 | | - print(dictionary[token], end='') |
107 | | - else: |
108 | | - print(token, end='') |
109 | | - |
110 | | - |
111 | | -sys.exit(0) |
112 | 90 |
|
113 | | -# -------------------- END ----------------------- |
| 91 | + for token in splitter.finditer(text): |
| 92 | + word = token.group(0) |
| 93 | + if token.lastgroup == 'word': |
| 94 | + print(dictionary.get(word, word), end='') |
| 95 | + continue |
114 | 96 |
|
| 97 | + print(word, end='') |
0 commit comments