Skip to content

Commit 17c81fd

Browse files
committed
Merge remastering into master
2 parents 2841cbf + e79e8eb commit 17c81fd

File tree

9 files changed

+201
-92
lines changed

9 files changed

+201
-92
lines changed

README.md

Lines changed: 0 additions & 33 deletions
This file was deleted.

changelog.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## 0.1.7 — 2025-02-15
9+
### Changed
10+
11+
- [x] replace dictionary file with a bz2-compressed version (reduces file size by a factor of 8.1)
12+
13+
## 0.1.6 — 2025-02-15
14+
### Changed
15+
16+
- [x] optimize regexp pattern
17+
18+
## 0.1.5 — 2025-02-15
19+
### Changed
20+
21+
- [x] refactoring: dictionary file renamed from `yoficator/_data/yoficator.dic` to `yoficator/_data/dictionary.ru_RU.txt`
22+
23+
## 0.1.4 — 2025-02-15
24+
### Changed
25+
26+
- [x] upgrade to python3
27+
28+
## 0.1.3 — 2025-02-15
29+
### Removed
30+
31+
- [x] drop the `regex` dependency in favor of the standard `re` module
32+
33+
## 0.1.2 — 2025-02-15
34+
### Removed
35+
36+
- [x] remove unused `pprint` statement
37+
38+
## 0.1.1 — 2025-02-15
39+
### Removed
40+
41+
- [x] remove "tests" (demo) functionality
42+
43+
## 0.1.0 — 2025-02-15
44+
### Added
45+
46+
- [x] added `changelog`, `license` files
47+
48+
### Changed
49+
50+
- [x] refactoring: project directory structure changed, added `setup.py` file for packaging
51+
52+
## Unversioned — 2015-11-17
53+
54+
Initial release

license.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# License (GPL 3)
2+
3+
This program is free software: you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License as published by
5+
the Free Software Foundation; either version 3 of the License, or
6+
(at your option) any later version.
7+
8+
This program is distributed in the hope that it will be useful,
9+
but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
GNU General Public License for more details.
12+
13+
You should have received a copy of the GNU General Public License
14+
along with this program. If not, see <https://www.gnu.org/licenses/>.

readme.md

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Yoficator
2+
3+
> A Russian text yoficator (ёфикатор).
4+
5+
## What does it do?
6+
7+
It conservatively replaces every `е` with `ё` when it's unambiguously a case of the latter.
8+
No context is used; it relies entirely on the lack of a dictionary entry for a corresponding "truly `е`" homograph.
9+
10+
Yoficating Russian texts removes some unnecessary ambiguities.
11+
12+
To learn more, check Wikipedia in [English](https://en.wikipedia.org/wiki/Yoficator)
13+
or [Russian](https://ru.wikipedia.org/wiki/Ёфикатор).
14+
15+
## Usage
16+
17+
1. Build wheel:
18+
```sh
19+
python setup.py bdist_wheel -d '/tmp'
20+
```
21+
2. Install wheel:
22+
```sh
23+
pip install /tmp/yoficator-0.1.7-py3-none-any.whl
24+
```
25+
3. Use:
26+
```sh
27+
python -m yoficator # [text-file-in-Russian | string-in-Russian]
28+
```
29+
30+
## Examples
31+
32+
Running the command without arguments prints an error (`Error: No file specified`) and exits with status 1:
33+
34+
```sh
35+
python -m yoficator
36+
```
37+
38+
Or just use it with a file or string:
39+
40+
```sh
41+
python -m yoficator russianfile.txt # prints to STDOUT
42+
python -m yoficator russianfile.txt > russianfile-yoficated.txt
43+
python -m yoficator "Где ее книга?"
44+
```
45+
46+
## Limitations
47+
48+
- Because the code is conservative and does not look at context, it won't make a correction when a "truly `е`" homograph exists.
49+
Thus a "`все`" will never be corrected, because both `все` and `всё` exist as different words.
50+
- Prone to wrongly yoficate other Cyrillic-based languages, such as Bulgarian, Ukrainian, Belarusian.
51+
- It's not the fastest thing in the world, mind you, but it does the job.
52+
53+
## [Changelog](changelog.md)
54+
## [License (GPL 3)](license.md)

setup.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import setuptools
2+
import os
3+
import bz2
4+
5+
6+
with open('readme.md') as fh:
7+
long_description = fh.read()
8+
9+
10+
def create_zip():
11+
with open('yoficator/_data/dictionary.ru_RU.txt', 'rb') as source:
12+
with bz2.BZ2File('yoficator/_data/dictionary.ru_RU.txt.bz2', 'w') as stream:
13+
stream.write(source.read())
14+
15+
16+
try:
17+
create_zip()
18+
setuptools.setup(
19+
name='yoficator',
20+
version='0.1.7',
21+
description='A Russian text yoficator (ёфикатор)',
22+
long_description=long_description,
23+
long_description_content_type='text/markdown',
24+
license='License :: OSI Approved :: MIT License',
25+
packages=['yoficator'],
26+
package_data={
27+
'yoficator': ['_data/dictionary.ru_RU.txt.bz2'],
28+
},
29+
include_package_data=True,
30+
classifiers=[
31+
'Programming Language :: Python :: 3',
32+
'License :: OSI Approved :: MIT License',
33+
'Operating System :: OS Independent',
34+
],
35+
python_requires='>=3',
36+
)
37+
finally:
38+
try:
39+
os.remove('yoficator/_data/dictionary.ru_RU.txt.bz2')
40+
except FileNotFoundError:
41+
pass

tests/yoficator.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.
File renamed without changes.
Lines changed: 38 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,43 @@
1-
#!/usr/bin/python
2-
# -*- coding: utf-8 -*-
3-
from __future__ import print_function, unicode_literals
4-
import codecs
5-
import os, sys
6-
import pprint
7-
import regex as re
1+
#!/usr/bin/env python3
2+
import os
3+
import sys
4+
import re
5+
import bz2
86

97
#-------------------------------------------------------------------------#
108
#
11-
# ▗▀▖▗ ▐
9+
# ▗▀▖▗ ▐
1210
# ▌ ▌▞▀▖▐ ▄ ▞▀▖▝▀▖▜▀ ▞▀▖▙▀▖ ▛▀▖▌ ▌
1311
# ▚▄▌▌ ▌▜▀ ▐ ▌ ▖▞▀▌▐ ▖▌ ▌▌ ▗▖▙▄▘▚▄▌
1412
# ▗▄▘▝▀ ▐ ▀▘▝▀ ▝▀▘ ▀ ▝▀ ▘ ▝▘▌ ▗▄▘
1513
#
1614
# Description:
1715
# This is a Russian text yoficator (ёфикатор).
1816
#
19-
# It conservatively replaces every "е" to "ё" when it's unambiguously
17+
# It conservatively replaces every "е" to "ё" when it's unambiguously
2018
# a case of the latter. No context is used; it relies entirely on a lack
21-
# of dictionary entries for a correspondent "truly е" homograph.
19+
# of dictionary entries for a correspondent "truly е" homograph.
2220
#
2321
# Yoficating Russian texts removes some unnecessary ambiguity.
2422
# https://en.wikipedia.org/wiki/Yoficator
2523
# https://ru.wikipedia.org/wiki/Ёфикатор
2624
#
2725
# Syntax: yoficator.py [text-file-in-Russian | string-in-Russian]
28-
#
26+
#
2927
# Depends on _data/dictionary.ru_RU.txt.bz2, which is used for the lookup.
3028
#
31-
# Limitations:
32-
# * The code being conservative and not looking for context, it won't correct
33-
# when a "truly е" homograph exists. Thus a "все" will never be corrected,
29+
# Limitations:
30+
# * The code being conservative and not looking for context, it won't correct
31+
# when a "truly е" homograph exists. Thus a "все" will never be corrected,
3432
# because both все and всё exist as different words.
35-
# * Prone to wrongly yoficate other Cyrillic-based languages, such as
33+
# * Prone to wrongly yoficate other Cyrillic-based languages, such as
3634
# Bulgarian, Ukrainian, Belarussian.
3735
# * It's not the fastest thing in the world, mind you. But does the job.
3836
#
3937
#-------------------------------------------------------------------------
4038
#
4139
# Found this useful? Appalling? Appealing? Please let me know.
42-
# The Unabashed welcomes your impressions.
40+
# The Unabashed welcomes your impressions.
4341
#
4442
# You will find the
4543
# unabashed
@@ -64,51 +62,36 @@
6462
#
6563
#--------------------------------------------------------------------------#
6664

67-
# TODO Better handle lowercase, uppercase
68-
69-
pp = pprint.PrettyPrinter(4)
7065

71-
# Variables initialization; tests a file if no argument is supplied.
72-
# Save the yoficator as a subfolder of your Desktop
73-
# TODO: Make it compatible with other OSs.
74-
workingDir = os.getenv('HOME') + "/Desktop/yoficator/"
75-
textFile = workingDir + "tests/yoficator.txt"
76-
dictionaryFile = workingDir + "yoficator.dic"
66+
if __name__ == '__main__':
67+
# TODO Better handle lowercase, uppercase
68+
dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/dictionary.ru_RU.txt.bz2'
7769

78-
if len(sys.argv) > 1:
79-
# Is the input a filename?
80-
if os.path.isfile(sys.argv[1]):
81-
text = codecs.open(sys.argv[1].decode("utf-8"), "r", "utf-8").read()
82-
# Else we will assume it's a string
70+
if len(sys.argv) > 1:
71+
# Is the input a filename?
72+
if os.path.isfile(sys.argv[1]):
73+
text = open(sys.argv[1]).read()
74+
# Else we will assume it's a string
75+
else:
76+
text = sys.argv[1]
8377
else:
84-
text = sys.argv[1].decode("utf-8")
85-
else:
86-
# We will assume using textFile as input filename above
87-
text = codecs.open(textFile, "r", "utf-8").read()
78+
print('Error: No file specified', file=sys.stderr)
79+
exit(1)
8880

89-
dictionary = {}
81+
dictionary = {}
9082

83+
# Splitter / tokenizer
84+
splitter = re.compile(r'(?P<word>[а-я]*е[а-я]*)|(?P<unknown>[^е]+\b)', re.IGNORECASE)
9185

92-
# Splitter / tokenizer
93-
splitter = re.compile(r'(\s+|\w+|\W+|\S+)', re.UNICODE)
94-
tokens = splitter.findall(text)
95-
96-
with codecs.open(dictionaryFile, "r", "utf-8") as f:
97-
for line in f:
98-
if ":" in line:
99-
key,value = line.split(":")
86+
with bz2.open(dictionary_file_path) as stream:
87+
for line in iter(stream):
88+
key, value = line.decode('utf-8').split(':')
10089
dictionary[key] = value.rstrip('\n')
101-
else:
102-
pass
103-
104-
for token in tokens:
105-
if token in dictionary:
106-
print(dictionary[token], end='')
107-
else:
108-
print(token, end='')
109-
110-
111-
sys.exit(0)
11290

113-
# -------------------- END -----------------------
91+
for token in splitter.finditer(text):
92+
word = token.group(0)
93+
if token.lastgroup == 'word':
94+
print(dictionary.get(word, word), end='')
95+
continue
11496

97+
print(word, end='')

0 commit comments

Comments
 (0)