Skip to content

Commit e79e8eb

Browse files
committed
feature: replace dictionary file with bz2-archived version
1 parent cd11eb2 commit e79e8eb

File tree

3 files changed

+45
-24
lines changed

3 files changed

+45
-24
lines changed

changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## 0.1.7 — 2025-02-15
9+
### Changed
10+
11+
- [x] replace dictionary file with bz2-archived version (reduces file size by a factor of 8.1)
12+
813
## 0.1.6 — 2025-02-15
914
### Changed
1015

setup.py

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,41 @@
11
import setuptools
2+
import os
3+
import bz2
24

35

46
with open('readme.md') as fh:
57
long_description = fh.read()
68

7-
setuptools.setup(
8-
name='yoficator',
9-
version='0.1.6',
10-
description='A Russian text yoficator (ёфикатор)',
11-
long_description=long_description,
12-
long_description_content_type='text/markdown',
13-
license='License :: OSI Approved :: MIT License',
14-
packages=['yoficator'],
15-
package_data={
16-
'yoficator': ['_data/dictionary.ru_RU.txt'],
17-
},
18-
include_package_data=True,
19-
classifiers=[
20-
'Programming Language :: Python :: 3',
21-
'License :: OSI Approved :: MIT License',
22-
'Operating System :: OS Independent',
23-
],
24-
python_requires='>=3',
25-
)
9+
10+
def create_zip():
11+
with open('yoficator/_data/dictionary.ru_RU.txt', 'rb') as source:
12+
with bz2.BZ2File('yoficator/_data/dictionary.ru_RU.txt.bz2', 'w') as stream:
13+
stream.write(source.read())
14+
15+
16+
try:
17+
create_zip()
18+
setuptools.setup(
19+
name='yoficator',
20+
version='0.1.7',
21+
description='A Russian text yoficator (ёфикатор)',
22+
long_description=long_description,
23+
long_description_content_type='text/markdown',
24+
license='License :: OSI Approved :: MIT License',
25+
packages=['yoficator'],
26+
package_data={
27+
'yoficator': ['_data/dictionary.ru_RU.txt.bz2'],
28+
},
29+
include_package_data=True,
30+
classifiers=[
31+
'Programming Language :: Python :: 3',
32+
'License :: OSI Approved :: MIT License',
33+
'Operating System :: OS Independent',
34+
],
35+
python_requires='>=3',
36+
)
37+
finally:
38+
try:
39+
os.remove('yoficator/_data/dictionary.ru_RU.txt.bz2')
40+
except FileNotFoundError:
41+
pass

yoficator/__main__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import sys
44
import re
5+
import bz2
56

67
#-------------------------------------------------------------------------#
78
#
@@ -64,7 +65,7 @@
6465

6566
if __name__ == '__main__':
6667
# TODO Better handle lowercase, uppercase
67-
dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/dictionary.ru_RU.txt'
68+
dictionary_file_path = os.path.abspath(os.path.dirname(__file__)) + '/_data/dictionary.ru_RU.txt.bz2'
6869

6970
if len(sys.argv) > 1:
7071
# Is the input a filename?
@@ -82,11 +83,10 @@
8283
# Splitter / tokenizer
8384
splitter = re.compile(r'(?P<word>[а-я]*е[а-я]*)|(?P<unknown>[^е]+\b)', re.IGNORECASE)
8485

85-
with open(dictionary_file_path) as stream:
86+
with bz2.open(dictionary_file_path) as stream:
8687
for line in iter(stream):
87-
if ':' in line:
88-
key, value = line.split(':')
89-
dictionary[key] = value.rstrip('\n')
88+
key, value = line.decode('utf-8').split(':')
89+
dictionary[key] = value.rstrip('\n')
9090

9191
for token in splitter.finditer(text):
9292
word = token.group(0)

0 commit comments

Comments
 (0)