| 
1 |  | -#!/usr/bin/python  | 
2 |  | -# -*- coding: utf-8 -*-  | 
3 |  | -from __future__ import print_function, unicode_literals  | 
4 |  | -import codecs  | 
5 |  | -import os, sys  | 
 | 1 | +#!/usr/bin/env python3  | 
 | 2 | +import os  | 
 | 3 | +import sys  | 
6 | 4 | import re  | 
7 | 5 | 
 
  | 
8 | 6 | #-------------------------------------------------------------------------#  | 
9 | 7 | #  | 
10 |  | -#                         ▗▀▖▗       ▐                  | 
 | 8 | +#                         ▗▀▖▗       ▐  | 
11 | 9 | #                   ▌ ▌▞▀▖▐  ▄ ▞▀▖▝▀▖▜▀ ▞▀▖▙▀▖  ▛▀▖▌ ▌  | 
12 | 10 | #                   ▚▄▌▌ ▌▜▀ ▐ ▌ ▖▞▀▌▐ ▖▌ ▌▌  ▗▖▙▄▘▚▄▌  | 
13 | 11 | #                   ▗▄▘▝▀ ▐  ▀▘▝▀ ▝▀▘ ▀ ▝▀ ▘  ▝▘▌  ▗▄▘  | 
14 | 12 | #  | 
15 | 13 | # Description:  | 
16 | 14 | #    This is a Russian text yoficator (ёфикатор).  | 
17 | 15 | #  | 
18 |  | -#    It conservatively replaces every "е" to "ё" when it's unambiguously   | 
 | 16 | +#    It conservatively replaces every "е" with "ё" when it's unambiguously  | 
19 | 17 | #    a case of the latter. No context is used; it relies entirely on a lack  | 
20 |  | -#    of dictionary entries for a correspondent "truly е" homograph.   | 
 | 18 | +#    of dictionary entries for a corresponding "truly е" homograph.  | 
21 | 19 | #  | 
22 | 20 | #    Yoficating Russian texts removes some unnecessary ambiguity.  | 
23 | 21 | #    https://en.wikipedia.org/wiki/Yoficator  | 
24 | 22 | #    https://ru.wikipedia.org/wiki/Ёфикатор  | 
25 | 23 | #  | 
26 | 24 | #    Syntax: yoficator.py [text-file-in-Russian | string-in-Russian]  | 
27 |  | -#   | 
 | 25 | +#  | 
28 | 26 | #    Depends on yoficator.dic, which is used for the lookup.  | 
29 | 27 | #  | 
30 |  | -#    Limitations:   | 
31 |  | -#    * The code being conservative and not looking for context, it won't correct   | 
32 |  | -#      when a "truly е" homograph exists. Thus a "все" will never be corrected,   | 
 | 28 | +#    Limitations:  | 
 | 29 | +#    * The code being conservative and not looking for context, it won't correct  | 
 | 30 | +#      when a "truly е" homograph exists. Thus a "все" will never be corrected,  | 
33 | 31 | #      because both все and всё exist as different words.  | 
34 |  | -#    * Prone to wrongly yoficate other Cyrillic-based languages, such as   | 
 | 32 | +#    * Prone to wrongly yoficate other Cyrillic-based languages, such as  | 
35 | 33 | #      Bulgarian, Ukrainian, Belarusian.  | 
36 | 34 | #    * It's not the fastest thing in the world, mind you. But does the job.  | 
37 | 35 | #  | 
38 | 36 | #-------------------------------------------------------------------------  | 
39 | 37 | #  | 
40 | 38 | # Found this useful? Appalling? Appealing? Please let me know.  | 
41 |  | -# The Unabashed welcomes your impressions.   | 
 | 39 | +# The Unabashed welcomes your impressions.  | 
42 | 40 | #  | 
43 | 41 | # You will find the  | 
44 | 42 | #   unabashed  | 
 | 
63 | 61 | #  | 
64 | 62 | #--------------------------------------------------------------------------#  | 
65 | 63 | 
 
  | 
66 |  | -# TODO Better handle lowercase, uppercase  | 
67 |  | - | 
68 |  | -workingDir = os.path.abspath(os.path.dirname(__file__)) + '/_data'  | 
69 |  | -dictionaryFile = workingDir + "/yoficator.dic"  | 
70 |  | - | 
71 |  | -if len(sys.argv) > 1:  | 
72 |  | -    # Is the input a filename?  | 
73 |  | -    if os.path.isfile(sys.argv[1]):  | 
74 |  | -        text = codecs.open(sys.argv[1].decode("utf-8"), "r", "utf-8").read()  | 
75 |  | -    # Else we will assume it's a string  | 
76 |  | -    else:  | 
77 |  | -        text = sys.argv[1].decode("utf-8")  | 
78 |  | -else:  | 
79 |  | -    print('Error: No file specified', file=sys.stderr)  | 
80 |  | -    sys.exit(1)  | 
81 |  | - | 
82 |  | -dictionary = {}  | 
83 |  | - | 
84 | 64 | 
 
  | 
85 |  | -# Splitter / tokenizer  | 
86 |  | -splitter = re.compile(r'(\s+|\w+|\W+|\S+)', re.UNICODE)  | 
87 |  | -tokens = splitter.findall(text)  | 
 | 65 | +if __name__ == '__main__':  | 
 | 66 | +    # TODO Better handle lowercase, uppercase  | 
 | 67 | +    dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/yoficator.dic'  | 
88 | 68 | 
 
  | 
89 |  | -with codecs.open(dictionaryFile, "r", "utf-8") as f:  | 
90 |  | -    for line in f:  | 
91 |  | -        if ":" in line:  | 
92 |  | -            key,value = line.split(":")  | 
93 |  | -            dictionary[key] = value.rstrip('\n')  | 
 | 69 | +    if len(sys.argv) > 1:  | 
 | 70 | +        # Is the input a filename?  | 
 | 71 | +        if os.path.isfile(sys.argv[1]):  | 
 | 72 | +            text = open(sys.argv[1], encoding='utf-8').read()  # decode as UTF-8 regardless of the locale default  | 
 | 73 | +        # Else we will assume it's a string  | 
94 | 74 |         else:  | 
95 |  | -            pass  | 
96 |  | - | 
97 |  | -for token in tokens:  | 
98 |  | -    if token in dictionary:  | 
99 |  | -        print(dictionary[token], end='')  | 
 | 75 | +            text = sys.argv[1]  | 
100 | 76 |     else:  | 
101 |  | -        print(token, end='')  | 
 | 77 | +        print('Error: No file specified', file=sys.stderr)  | 
 | 78 | +        sys.exit(1)  # the bare exit() helper comes from the site module and may be absent  | 
102 | 79 | 
 
  | 
 | 80 | +    dictionary = {}  | 
103 | 81 | 
 
  | 
104 |  | -sys.exit(0)  | 
 | 82 | +    # Splitter / tokenizer  | 
 | 83 | +    splitter = re.compile(r'(\s+|\w+|\W+|\S+)')  | 
105 | 84 | 
 
  | 
106 |  | -# -------------------- END -----------------------  | 
 | 85 | +    with open(dictionary_file_path, encoding='utf-8') as stream:  # decode as UTF-8 regardless of the locale default  | 
 | 86 | +        for line in stream:  | 
 | 87 | +            if ':' in line:  | 
 | 88 | +                key, value = line.split(':', 1)  # split on the first colon only so unpacking cannot fail  | 
 | 89 | +                dictionary[key] = value.rstrip('\n')  | 
107 | 90 | 
 
  | 
 | 91 | +    # findall() returns the matched strings; finditer() would yield Match objects,  | 
 | 92 | +    # which never equal a dictionary key and would print as their repr.  | 
 | 93 | +    for token in splitter.findall(text):  | 
 | 94 | +        print(dictionary.get(token, token), end='')  | 
0 commit comments