feature: optimize regexp pattern

unsektor · unsektor · commit cd11eb2b893b · 2025-02-15T22:40:40.000+03:00
diff --git a/changelog.md b/changelog.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 0.1.6 — 2025-02-15
+### Changed
+
+- [x] optimize regexp pattern
+
 ## 0.1.5 — 2025-02-15
 ### Changed
 
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setuptools.setup(
     name='yoficator',
-    version='0.1.5',
+    version='0.1.6',
     description='A Russian text yoficator (ёфикатор)',
     long_description=long_description,
     long_description_content_type='text/markdown',
diff --git a/yoficator/__main__.py b/yoficator/__main__.py
@@ -80,7 +80,7 @@
     dictionary = {}
 
     # Splitter / tokenizer
-    splitter = re.compile(r'(\s+|\w+|\W+|\S+)')
+    splitter = re.compile(r'(?P<word>[а-я]*е[а-я]*)|(?P<unknown>[^е]+\b)', re.IGNORECASE)
 
     with open(dictionary_file_path) as stream:
         for line in iter(stream):
@@ -89,7 +89,9 @@
                 dictionary[key] = value.rstrip('\n')
 
     for token in splitter.finditer(text):
-        if token in dictionary:
-            print(dictionary[token], end='')
-        else:
-            print(token, end='')
+        word = token.group(0)
+        if token.lastgroup == 'word':
+            print(dictionary.get(word, word), end='')
+            continue
+
+        print(word, end='')