Skip to content

Commit 6cad455

Browse files
authored Nov 22, 2023
Add KOI8-U encoding (ruby#1906)
* Add test for KOI8-U
* Rename koi8 char_width function
  - Rename function for use with any KOI8-based encoding
* Add KOI8-U encoding
* Add encoding to encoding.md
1 parent 3c7c47b commit 6cad455

File tree

6 files changed

+42
-2
lines changed

6 files changed

+42
-2
lines changed
 

‎bin/encodings

+1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ when "iso-8859-14" then table(Encoding::ISO8859_14)
8686
when "iso-8859-15" then table(Encoding::ISO8859_15)
8787
when "iso-8859-16" then table(Encoding::ISO8859_16)
8888
when "koi8-r" then table(Encoding::KOI8_R)
89+
when "koi8-u" then table(Encoding::KOI8_U)
8990
when "maccenteuro" then table(Encoding::MACCENTEURO)
9091
when "maccroatian" then table(Encoding::MACCROATIAN)
9192
when "maccyrillic" then table(Encoding::MACCYRILLIC)

‎docs/encoding.md

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ The key of the comment can be either "encoding" or "coding". The value of the co
5353
* `ISO-8859-15`
5454
* `ISO-8859-16`
5555
* `KOI8-R`
56+
* `KOI8-U`
5657
* `macCentEuro`
5758
* `macCroatian`
5859
* `macCyrillic`

‎include/prism/enc/pm_encoding.h

+1
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ extern pm_encoding_t pm_encoding_iso_8859_14;
198198
extern pm_encoding_t pm_encoding_iso_8859_15;
199199
extern pm_encoding_t pm_encoding_iso_8859_16;
200200
extern pm_encoding_t pm_encoding_koi8_r;
201+
extern pm_encoding_t pm_encoding_koi8_u;
201202
extern pm_encoding_t pm_encoding_mac_cent_euro;
202203
extern pm_encoding_t pm_encoding_mac_croatian;
203204
extern pm_encoding_t pm_encoding_mac_cyrillic;

‎src/enc/pm_tables.c

+37-2
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
864864
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
865865
};
866866

867+
/**
868+
* Each element of the following table contains a bitfield that indicates a
869+
* piece of information about the corresponding KOI8-U character.
870+
*/
871+
static uint8_t pm_encoding_koi8_u_table[256] = {
872+
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
873+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
874+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
875+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
876+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
877+
0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
878+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
879+
0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
880+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
881+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
882+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
883+
0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 3, 0, 0, // Ax
884+
0, 0, 0, 7, 7, 0, 7, 7, 0, 0, 0, 0, 0, 7, 0, 0, // Bx
885+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Cx
886+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Dx
887+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Ex
888+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
889+
};
890+
867891
/**
868892
* Each element of the following table contains a bitfield that indicates a
869893
* piece of information about the corresponding macCentEuro character.
@@ -1418,7 +1442,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT
14181442
* checking if it's a valid codepoint in a KOI8 encoding and, if it is, returning 1.
14191443
*/
14201444
static size_t
1421-
pm_encoding_koi8_r_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
1445+
pm_encoding_koi8_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
14221446
return ((*b >= 0x20 && *b <= 0x7E) || (*b >= 0x80)) ? 1 : 0;
14231447
}
14241448

@@ -1468,6 +1492,7 @@ PRISM_ENCODING_TABLE(iso_8859_14)
14681492
PRISM_ENCODING_TABLE(iso_8859_15)
14691493
PRISM_ENCODING_TABLE(iso_8859_16)
14701494
PRISM_ENCODING_TABLE(koi8_r)
1495+
PRISM_ENCODING_TABLE(koi8_u)
14711496
PRISM_ENCODING_TABLE(mac_cent_euro)
14721497
PRISM_ENCODING_TABLE(mac_croatian)
14731498
PRISM_ENCODING_TABLE(mac_cyrillic)
@@ -1855,13 +1880,23 @@ pm_encoding_t pm_encoding_iso_8859_16 = {
18551880
/** KOI8-R */
18561881
pm_encoding_t pm_encoding_koi8_r = {
18571882
.name = "KOI8-R",
1858-
.char_width = pm_encoding_koi8_r_char_width,
1883+
.char_width = pm_encoding_koi8_char_width,
18591884
.alnum_char = pm_encoding_koi8_r_alnum_char,
18601885
.alpha_char = pm_encoding_koi8_r_alpha_char,
18611886
.isupper_char = pm_encoding_koi8_r_isupper_char,
18621887
.multibyte = false
18631888
};
18641889

1890+
/** KOI8-U */
1891+
pm_encoding_t pm_encoding_koi8_u = {
1892+
.name = "KOI8-U",
1893+
.char_width = pm_encoding_koi8_char_width,
1894+
.alnum_char = pm_encoding_koi8_u_alnum_char,
1895+
.alpha_char = pm_encoding_koi8_u_alpha_char,
1896+
.isupper_char = pm_encoding_koi8_u_isupper_char,
1897+
.multibyte = false
1898+
};
1899+
18651900
/** macCentEuro */
18661901
pm_encoding_t pm_encoding_mac_cent_euro = {
18671902
.name = "macCentEuro",

‎src/prism.c

+1
Original file line numberDiff line numberDiff line change
@@ -6223,6 +6223,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
62236223
break;
62246224
case 'K': case 'k':
62256225
ENCODING1("KOI8-R", pm_encoding_koi8_r);
6226+
ENCODING1("KOI8-U", pm_encoding_koi8_u);
62266227
break;
62276228
case 'L': case 'l':
62286229
ENCODING1("locale", pm_encoding_utf_8);

‎test/prism/encoding_test.rb

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class EncodingTest < TestCase
4444
Encoding::ISO_8859_15 => 0x00...0x100,
4545
Encoding::ISO_8859_16 => 0x00...0x100,
4646
Encoding::KOI8_R => 0x00...0x100,
47+
Encoding::KOI8_U => 0x00...0x100,
4748
Encoding::MACCENTEURO => 0x00...0x100,
4849
Encoding::MACCROATIAN => 0x00...0x100,
4950
Encoding::MACCYRILLIC => 0x00...0x100,

0 commit comments

Comments
 (0)
Please sign in to comment.