md-py
diff --git a/‎README.md‎
Lines changed: 0 additions & 33 deletions b/‎README.md‎
Lines changed: 0 additions & 33 deletions
diff --git a/‎changelog.md‎
Lines changed: 54 additions & 0 deletions b/‎changelog.md‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎license.md‎
Lines changed: 14 additions & 0 deletions b/‎license.md‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎readme.md‎
Lines changed: 54 additions & 0 deletions b/‎readme.md‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎setup.py‎
Lines changed: 41 additions & 0 deletions b/‎setup.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎tests/yoficator.txt‎
Lines changed: 0 additions & 4 deletions b/‎tests/yoficator.txt‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎.gitignore‎ ‎yoficator/__init__.py‎.gitignore renamed to yoficator/__init__.py b/‎.gitignore‎ ‎yoficator/__init__.py‎.gitignore renamed to yoficator/__init__.py
diff --git a/‎yoficator.py‎ ‎yoficator/__main__.py‎yoficator.py renamed to yoficator/__main__.py
Lines changed: 38 additions & 55 deletions b/‎yoficator.py‎ ‎yoficator/__main__.py‎yoficator.py renamed to yoficator/__main__.py
Lines changed: 38 additions & 55 deletions
diff --git a/‎yoficator.dic‎ ‎yoficator/_data/dictionary.ru_RU.txt‎yoficator.dic renamed to yoficator/_data/dictionary.ru_RU.txt b/‎yoficator.dic‎ ‎yoficator/_data/dictionary.ru_RU.txt‎yoficator.dic renamed to yoficator/_data/dictionary.ru_RU.txt
@@ -0,0 +1,54 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## 0.1.7 — 2025-02-15
+### Changed
+
+- [x] replace the dictionary file with a bz2-compressed version (reduces the file size by a factor of 8.1)
+
+## 0.1.6 — 2025-02-15
+### Changed
+
+- [x] optimize regexp pattern
+
+## 0.1.5 — 2025-02-15
+### Changed
+
+- [x] refactoring: dictionary file renamed: `yoficator/_data/yoficator.dic` to `yoficator/_data/dictionary.ru_RU.txt`
+
+## 0.1.4 — 2025-02-15
+### Changed
+
+- [x] upgrade to python3
+
+## 0.1.3 — 2025-02-15
+### Removed
+
+- [x] drop the `regex` dependency in favor of the standard `re` module
+
+## 0.1.2 — 2025-02-15
+### Removed
+
+- [x] remove unused `pprint` statement
+
+## 0.1.1 — 2025-02-15
+### Removed
+
+- [x] remove "tests" (demo) functionality
+
+## 0.1.0 — 2025-02-15
+### Added
+
+- [x] added `changelog`, `license` files
+
+### Changed
+
+- [x] refactoring: project directory structure changed, added `setup.py` file for packaging
+
+## Unversioned — 2015-11-17
+
+Initial release
@@ -0,0 +1,14 @@
+# License (GPL 3)
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
@@ -0,0 +1,54 @@
+# Yoficator
+
+> A Russian text yoficator (ёфикатор).
+
+## What does it do?
+
+It conservatively replaces `е` with `ё` wherever the word is unambiguously spelled with the latter.
+No context is used; it relies entirely on the lack of a dictionary entry for a corresponding "truly `е`" homograph.
+
+Yoficating Russian texts removes some unnecessary ambiguities.
+
+To learn more, check Wikipedia in [English](https://en.wikipedia.org/wiki/Yoficator) 
+or [Russian](https://ru.wikipedia.org/wiki/Ёфикатор).
+
+## Usage
+
+1. Build wheel:
+   ```sh
+   python setup.py bdist_wheel -d '/tmp'
+   ```
+2. Install wheel:
+   ```sh
+   pip install /tmp/yoficator-0.1.7-py3-none-any.whl
+   ```
+3. Use:
+   ```sh
+   python -m yoficator  # [text-file-in-Russian | string-in-Russian]
+   ```
+
+## Examples
+
+Running the command without arguments prints an error to STDERR and exits with status 1:
+
+```sh
+python -m yoficator
+```
+
+Or just use it with a file or string:
+
+```sh
+python -m yoficator russianfile.txt    # prints to STDOUT
+python -m yoficator russianfile.txt > russianfile-yoficated.txt
+python -m yoficator "Где ее книга?"
+```
+
+## Limitations
+
+- Because the code is conservative and does not look at context, it won't make a correction when a "truly `е`" homograph exists.
+  Thus "`все`" will never be corrected, because both `все` and `всё` exist as different words.
+- Prone to wrongly yoficate other Cyrillic-based languages, such as Bulgarian, Ukrainian, Belarusian.
+- It's not the fastest thing in the world, mind you. But it does the job.
+
+## [Changelog](changelog.md)
+## [License (GPL 3)](license.md)
@@ -0,0 +1,41 @@
+import setuptools
+import os
+import bz2
+
+
+# The readme contains Cyrillic text, so decode it explicitly as UTF-8 instead of
+# relying on the platform's default encoding.
+with open('readme.md', encoding='utf-8') as fh:
+    long_description = fh.read()
+
+
+def create_zip():
+    """Compress the plain-text dictionary into a sibling ``.bz2`` archive.
+
+    Reads ``yoficator/_data/dictionary.ru_RU.txt`` in full and writes its
+    bz2-compressed bytes to ``yoficator/_data/dictionary.ru_RU.txt.bz2``.
+    """
+    plain_path = 'yoficator/_data/dictionary.ru_RU.txt'
+    packed_path = 'yoficator/_data/dictionary.ru_RU.txt.bz2'
+    with open(plain_path, 'rb') as plain, bz2.BZ2File(packed_path, 'w') as packed:
+        packed.write(plain.read())
+
+
+# Build the compressed dictionary only for the duration of the setup run, then
+# delete it again so that just the plain-text source stays in the work tree.
+try:
+    create_zip()
+    setuptools.setup(
+        name='yoficator',
+        version='0.1.7',
+        description='A Russian text yoficator (ёфикатор)',
+        long_description=long_description,
+        long_description_content_type='text/markdown',
+        # The project is distributed under GPL 3 or later (see license.md), so
+        # the package metadata must not advertise MIT.
+        license='GPL-3.0-or-later',
+        packages=['yoficator'],
+        package_data={
+            'yoficator': ['_data/dictionary.ru_RU.txt.bz2'],
+        },
+        include_package_data=True,
+        classifiers=[
+            'Programming Language :: Python :: 3',
+            'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)',
+            'Operating System :: OS Independent',
+        ],
+        python_requires='>=3',
+    )
+finally:
+    # Best-effort cleanup: the archive may be missing if create_zip() failed.
+    try:
+        os.remove('yoficator/_data/dictionary.ru_RU.txt.bz2')
+    except FileNotFoundError:
+        pass
@@ -1,45 +1,43 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-from __future__ import print_function, unicode_literals
-import codecs
-import os, sys
-import pprint
-import regex as re
+#!/usr/bin/env python3
+import os
+import sys
+import re
+import bz2
 
 #-------------------------------------------------------------------------#
 #
-#                         ▗▀▖▗       ▐                
+#                         ▗▀▖▗       ▐
 #                   ▌ ▌▞▀▖▐  ▄ ▞▀▖▝▀▖▜▀ ▞▀▖▙▀▖  ▛▀▖▌ ▌
 #                   ▚▄▌▌ ▌▜▀ ▐ ▌ ▖▞▀▌▐ ▖▌ ▌▌  ▗▖▙▄▘▚▄▌
 #                   ▗▄▘▝▀ ▐  ▀▘▝▀ ▝▀▘ ▀ ▝▀ ▘  ▝▘▌  ▗▄▘
 #
 # Description:
 #    This is a Russian text yoficator (ёфикатор).
 #
-#    It conservatively replaces every "е" to "ё" when it's unambiguously 
+#    It conservatively replaces every "е" to "ё" when it's unambiguously
 #    a case of the latter. No context is used; it relies entirely on a lack
-#    of dictionary entries for a correspondent "truly е" homograph. 
+#    of dictionary entries for a correspondent "truly е" homograph.
 #
 #    Yoficating Russian texts remove some unnecessary ambiguity.
 #    https://en.wikipedia.org/wiki/Yoficator
 #    https://ru.wikipedia.org/wiki/Ёфикатор
 #
 #    Syntax: yoficator.py [text-file-in-Russian | string-in-Russian]
-# 
+#
 #    Depends on yoficator.dic, which is used for the lookup.
 #
-#    Limitations: 
-#    * The code being conservative and not looking for context, it won't correct 
-#      when a "truly е" homograph exists. Thus a "все" will never be corrected, 
+#    Limitations:
+#    * The code being conservative and not looking for context, it won't correct
+#      when a "truly е" homograph exists. Thus a "все" will never be corrected,
 #      because both все and всё exist as different words.
-#    * Prone to wrongly yoficate other Cyrillic-based languages, such as 
+#    * Prone to wrongly yoficate other Cyrillic-based languages, such as
 #      Bulgarian, Ukrainian, Belarussian.
 #    * It's not the fastest thing in the world, mind you. But does the job.
 #
 #-------------------------------------------------------------------------
 #
 # Found this useful? Appalling? Appealing? Please let me know.
-# The Unabashed welcomes your impressions. 
+# The Unabashed welcomes your impressions.
 #
 # You will find the
 #   unabashed
@@ -64,51 +62,36 @@
 #
 #--------------------------------------------------------------------------#
 
-# TODO Better handle lowercase, uppercase
-
-pp = pprint.PrettyPrinter(4) 
 
-# Variables initialization; tests a file if no argument is supplied.
-# Save the yoficator as a subfolder of your Desktop
-# TODO: Make it compatible with other OSs.
-workingDir = os.getenv('HOME') + "/Desktop/yoficator/"
-textFile = workingDir + "tests/yoficator.txt"
-dictionaryFile = workingDir + "yoficator.dic"
+if __name__ == '__main__':
+    # Entry point: take a file name or a literal string as the first argument,
+    # replace every "е"-spelled word that has a dictionary entry with its
+    # "ё" spelling, and print the result to STDOUT.
+    # TODO Better handle lowercase, uppercase
+    dictionary_file_path = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), '_data', 'dictionary.ru_RU.txt.bz2')
 
-if len(sys.argv) > 1:
-    # Is the input a filename?
-    if os.path.isfile(sys.argv[1]):
-        text = codecs.open(sys.argv[1].decode("utf-8"), "r", "utf-8").read()
-    # Else we will assume it's a string
+    if len(sys.argv) > 1:
+        # Is the input a filename?
+        if os.path.isfile(sys.argv[1]):
+            # Decode explicitly as UTF-8 and close the file deterministically.
+            with open(sys.argv[1], encoding='utf-8') as source:
+                text = source.read()
+        # Else we will assume it's a string
+        else:
+            text = sys.argv[1]
     else:
-        text = sys.argv[1].decode("utf-8")
-else:
-    # We will assume using textFile as input filename above
-    text = codecs.open(textFile, "r", "utf-8").read()
+        print('Error: No file specified', file=sys.stderr)
+        # sys.exit is always available; the bare exit() helper comes from `site`.
+        sys.exit(1)
 
-dictionary = {}
+    dictionary = {}
 
+    # Matches every Cyrillic word containing at least one "е". "ё" is listed
+    # explicitly because it lies outside the "а-я" range.
+    yo_candidate = re.compile(r'[а-яё]*е[а-яё]*', re.IGNORECASE)
 
-# Splitter / tokenizer
-splitter = re.compile(r'(\s+|\w+|\W+|\S+)', re.UNICODE)
-tokens = splitter.findall(text)
-
-with codecs.open(dictionaryFile, "r", "utf-8") as f:
-    for line in f:
-        if ":" in line:
-            key,value = line.split(":")
+    with bz2.open(dictionary_file_path, 'rt', encoding='utf-8') as stream:
+        for line in stream:
+            # Skip blank or malformed lines instead of crashing on the unpack.
+            if ':' not in line:
+                continue
+            key, value = line.split(':', 1)
             dictionary[key] = value.rstrip('\n')
-        else:
-            pass
-
-for token in tokens:
-    if token in dictionary:
-        print(dictionary[token], end='')
-    else:
-        print(token, end='')
-
-
-sys.exit(0)
 
-# -------------------- END -----------------------
+    def yoficate(match):
+        """Return the dictionary spelling of the matched word, or the word itself."""
+        word = match.group(0)
+        return dictionary.get(word, word)
 
+    # Substitute in place so that everything between the matched words (spaces,
+    # punctuation, Latin text, a trailing newline) is emitted untouched.
+    print(yo_candidate.sub(yoficate, text), end='')