|
1 | | -#!/usr/bin/python |
2 | | -# -*- coding: utf-8 -*- |
3 | | -from __future__ import print_function, unicode_literals |
4 | | -import codecs |
5 | | -import os, sys |
| 1 | +#!/usr/bin/env python3 |
| 2 | +import os |
| 3 | +import sys |
6 | 4 | import re |
7 | 5 |
|
8 | 6 | #-------------------------------------------------------------------------# |
9 | 7 | # |
10 | | -# ▗▀▖▗ ▐ |
| 8 | +# ▗▀▖▗ ▐ |
11 | 9 | # ▌ ▌▞▀▖▐ ▄ ▞▀▖▝▀▖▜▀ ▞▀▖▙▀▖ ▛▀▖▌ ▌ |
12 | 10 | # ▚▄▌▌ ▌▜▀ ▐ ▌ ▖▞▀▌▐ ▖▌ ▌▌ ▗▖▙▄▘▚▄▌ |
13 | 11 | # ▗▄▘▝▀ ▐ ▀▘▝▀ ▝▀▘ ▀ ▝▀ ▘ ▝▘▌ ▗▄▘ |
14 | 12 | # |
15 | 13 | # Description: |
16 | 14 | # This is a Russian text yoficator (ёфикатор). |
17 | 15 | # |
18 | | -# It conservatively replaces every "е" to "ё" when it's unambiguously |
| 16 | +# It conservatively replaces every "е" with "ё" when it's unambiguously
19 | 17 | # a case of the latter. No context is used; it relies entirely on a lack |
20 | | -# of dictionary entries for a correspondent "truly е" homograph. |
| 18 | +# of dictionary entries for a corresponding "truly е" homograph.
21 | 19 | # |
22 | 20 | # Yoficating Russian texts removes some unnecessary ambiguity.
23 | 21 | # https://en.wikipedia.org/wiki/Yoficator |
24 | 22 | # https://ru.wikipedia.org/wiki/Ёфикатор |
25 | 23 | # |
26 | 24 | # Syntax: yoficator.py [text-file-in-Russian | string-in-Russian] |
27 | | -# |
| 25 | +# |
28 | 26 | # Depends on yoficator.dic, which is used for the lookup. |
29 | 27 | # |
30 | | -# Limitations: |
31 | | -# * The code being conservative and not looking for context, it won't correct |
32 | | -# when a "truly е" homograph exists. Thus a "все" will never be corrected, |
| 28 | +# Limitations: |
| 29 | +# * The code being conservative and not looking for context, it won't correct |
| 30 | +# when a "truly е" homograph exists. Thus a "все" will never be corrected, |
33 | 31 | # because both все and всё exist as different words. |
34 | | -# * Prone to wrongly yoficate other Cyrillic-based languages, such as |
| 32 | +# * Prone to wrongly yoficate other Cyrillic-based languages, such as |
35 | 33 | # Bulgarian, Ukrainian, Belarusian.
36 | 34 | # * It's not the fastest thing in the world, mind you. But does the job. |
37 | 35 | # |
38 | 36 | #------------------------------------------------------------------------- |
39 | 37 | # |
40 | 38 | # Found this useful? Appalling? Appealing? Please let me know. |
41 | | -# The Unabashed welcomes your impressions. |
| 39 | +# The Unabashed welcomes your impressions. |
42 | 40 | # |
43 | 41 | # You will find the |
44 | 42 | # unabashed |
|
63 | 61 | # |
64 | 62 | #--------------------------------------------------------------------------# |
65 | 63 |
|
66 | | -# TODO Better handle lowercase, uppercase |
67 | | - |
68 | | -workingDir = os.path.abspath(os.path.dirname(__file__)) + '/_data' |
69 | | -dictionaryFile = workingDir + "/yoficator.dic" |
70 | | - |
71 | | -if len(sys.argv) > 1: |
72 | | - # Is the input a filename? |
73 | | - if os.path.isfile(sys.argv[1]): |
74 | | - text = codecs.open(sys.argv[1].decode("utf-8"), "r", "utf-8").read() |
75 | | - # Else we will assume it's a string |
76 | | - else: |
77 | | - text = sys.argv[1].decode("utf-8") |
78 | | -else: |
79 | | - print('Error: No file specified', file=sys.stderr) |
80 | | - sys.exit(1) |
81 | | - |
82 | | -dictionary = {} |
83 | | - |
84 | 64 |
|
85 | | -# Splitter / tokenizer |
86 | | -splitter = re.compile(r'(\s+|\w+|\W+|\S+)', re.UNICODE) |
87 | | -tokens = splitter.findall(text) |
| 65 | +if __name__ == '__main__': |
| 66 | + # TODO Better handle lowercase, uppercase |
| 67 | + dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/yoficator.dic' |
88 | 68 |
|
89 | | -with codecs.open(dictionaryFile, "r", "utf-8") as f: |
90 | | - for line in f: |
91 | | - if ":" in line: |
92 | | - key,value = line.split(":") |
93 | | - dictionary[key] = value.rstrip('\n') |
| 69 | + if len(sys.argv) > 1: |
| 70 | + # Is the input a filename? |
| 71 | + if os.path.isfile(sys.argv[1]): |
| 72 | + text = open(sys.argv[1]).read() |
| 73 | + # Else we will assume it's a string |
94 | 74 | else: |
95 | | - pass |
96 | | - |
97 | | -for token in tokens: |
98 | | - if token in dictionary: |
99 | | - print(dictionary[token], end='') |
| 75 | + text = sys.argv[1] |
100 | 76 | else: |
101 | | - print(token, end='') |
| 77 | + print('Error: No file specified', file=sys.stderr) |
| 78 | + exit(1) |
102 | 79 |
|
| 80 | + dictionary = {} |
103 | 81 |
|
104 | | -sys.exit(0) |
| 82 | + # Splitter / tokenizer |
| 83 | + splitter = re.compile(r'(\s+|\w+|\W+|\S+)') |
105 | 84 |
|
106 | | -# -------------------- END ----------------------- |
| 85 | + with open(dictionary_file_path) as stream: |
| 86 | + for line in iter(stream): |
| 87 | + if ':' in line: |
| 88 | + key, value = line.split(':') |
| 89 | + dictionary[key] = value.rstrip('\n') |
107 | 90 |
|
| 91 | + for token in splitter.finditer(text): |
| 92 | + if token in dictionary: |
| 93 | + print(dictionary[token], end='') |
| 94 | + else: |
| 95 | + print(token, end='') |
0 commit comments