Skip to content

Commit 06bce51

Browse files
authored
feat: added ukrainian vocab (#1700)
1 parent d7f4533 commit 06bce51

File tree

2 files changed

+8
-0
lines changed

2 files changed

+8
-0
lines changed

docs/source/modules/datasets.rst

+6
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ of vocabs.
9494
* - arabic_letters
9595
- 37
9696
- ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي
97+
* - generic_cyrillic_letters
98+
- 58
99+
- абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ
97100
* - persian_letters
98101
- 5
99102
- پچڢڤگ
@@ -151,6 +154,9 @@ of vocabs.
151154
* - swedish
152155
- 106
153156
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ
157+
* - ukrainian
158+
- 114
159+
- абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~£€¥¢฿ґіїєҐІЇЄ₴
154160
* - vietnamese
155161
- 236
156162
- 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ

doctr/datasets/vocabs.py

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
2626
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
2727
"bangla_digits": "০১২৩৪৫৬৭৮৯",
28+
"generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ",
2829
}
2930

3031
VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
@@ -59,6 +60,7 @@
5960
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
6061
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
6162
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
63+
VOCABS["ukrainian"] = VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"
6264
VOCABS["multilingual"] = "".join(
6365
dict.fromkeys(
6466
VOCABS["french"]

0 commit comments

Comments
 (0)