Skip to content

Commit 5105181

Browse files
committed
Add new optimized function unicode_utf8_decode
Use it as a drop-in replacement instead of bitstring_utf8_decode. Signed-off-by: Davide Bettio <[email protected]>
1 parent 9117765 commit 5105181

File tree

6 files changed

+51
-88
lines changed

6 files changed

+51
-88
lines changed

src/libAtomVM/bitstring.c

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
141141
return true;
142142
}
143143

144-
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
145-
{
146-
if (len == 0) {
147-
return UnicodeTransformDecodeFail;
148-
} else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
149-
uint32_t v = 0;
150-
v |= (buf[0] & 0x07) << 18;
151-
v |= (buf[1] & 0x3F) << 12;
152-
v |= (buf[2] & 0x3F) << 6;
153-
v |= (buf[3] & 0x3F);
154-
// overlong encoding or invalid codepoint
155-
if (v <= 0x10000 || v > 0x10FFFF) {
156-
return UnicodeTransformDecodeFail;
157-
}
158-
*c = v;
159-
*out_size = 4;
160-
return UnicodeTransformDecodeSuccess;
161-
} else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
162-
uint32_t v = 0;
163-
v |= (buf[0] & 0x0F) << 12;
164-
v |= (buf[1] & 0x3F) << 6;
165-
v |= (buf[2] & 0x3F);
166-
// overlong encoding or surrogate
167-
if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
168-
return UnicodeTransformDecodeFail;
169-
}
170-
*c = v;
171-
*out_size = 3;
172-
return UnicodeTransformDecodeSuccess;
173-
} else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
174-
uint32_t v = 0;
175-
v |= (buf[0] & 0x1F) << 6;
176-
v |= (buf[1] & 0x3F);
177-
// overlong encoding
178-
if (v < 0x80) {
179-
return UnicodeTransformDecodeFail;
180-
}
181-
*c = v;
182-
*out_size = 2;
183-
return UnicodeTransformDecodeSuccess;
184-
} else if ((*buf & 0x80) == 0) {
185-
uint32_t v = 0;
186-
v |= (buf[0] & 0x7F);
187-
*c = v;
188-
*out_size = 1;
189-
return UnicodeTransformDecodeSuccess;
190-
} else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
191-
return UnicodeTransformDecodeIncomplete;
192-
} else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
193-
return UnicodeTransformDecodeIncomplete;
194-
} else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
195-
return UnicodeTransformDecodeIncomplete;
196-
} else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
197-
return UnicodeTransformDecodeIncomplete;
198-
} else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
199-
return UnicodeTransformDecodeIncomplete;
200-
} else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
201-
return UnicodeTransformDecodeIncomplete;
202-
}
203-
204-
return UnicodeTransformDecodeFail;
205-
}
206-
207144
// UTF-16 encoding, when U in U+010000 to U+10FFFF:
208145
//
209146
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000

src/libAtomVM/bitstring.h

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define _BITSTRING_H_
2424

2525
#include "term.h"
26+
#include "unicode.h"
2627

2728
#include <stdbool.h>
2829
#include <stdint.h>
@@ -99,13 +100,6 @@ enum BitstringFlags
99100
#endif
100101
};
101102

102-
/**
 * Outcome of decoding a single character from a Unicode transformation
 * format buffer.
 */
enum UnicodeTransformDecodeResult
{
    /** decoding was successful */
    UnicodeTransformDecodeSuccess = 0,
    /** the character starting at the buffer is not a valid unicode character */
    UnicodeTransformDecodeFail = 1,
    /** the character starting at the buffer is valid so far but incomplete */
    UnicodeTransformDecodeIncomplete = 2
};
108-
109103
union maybe_unsigned_int8
110104
{
111105
uint8_t u;
@@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
320314
*/
321315
bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);
322316

323-
/**
324-
* @brief Decode a character from UTF-8.
325-
*
326-
* @param buf the buffer from which to decode the string
327-
* @param len the length (in bytes) of the bytes in buf
328-
* @param c int value to decode to or NULL to only compute the size.
329-
* @param out_size the size in bytes, on output (if not NULL)
330-
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
331-
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
332-
* unicode character or \c UnicodeTransformDecodeIncomplete if character
333-
* starting at buf is a valid but incomplete transformation
334-
*/
335-
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
336-
337317
/**
338318
* @brief Encode a character to UTF-16.
339319
*
@@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
441421
{
442422
size_t byte_offset = offset >> 3; // divide by 8
443423
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
444-
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
424+
return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
445425
}
446426

447427
/**

src/libAtomVM/interop.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
385385
while (input_index < len) {
386386
size_t char_size;
387387
uint32_t c;
388-
enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
388+
enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
389389
if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
390390
*rest_crsr = input_index;
391391
*output_len = result;

src/libAtomVM/nifs.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2197,7 +2197,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
21972197
for (size_t i = 0; i < encoded_len; i++) {
21982198
size_t codepoint_size;
21992199
uint32_t codepoint;
2200-
if (UNLIKELY(bitstring_utf8_decode(
2200+
if (UNLIKELY(unicode_utf8_decode(
22012201
&utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
22022202
!= UnicodeTransformDecodeSuccess
22032203
|| (codepoint > 255))) {
@@ -2238,7 +2238,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
22382238
for (size_t i = 0; i < u8len; i++) {
22392239
size_t codepoint_size;
22402240
enum UnicodeTransformDecodeResult result
2241-
= bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
2241+
= unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
22422242
if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
22432243
|| !unicode_is_valid_codepoint(codepoints[i]))) {
22442244
AVM_ABORT();

src/libAtomVM/unicode.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#include <stdbool.h>
2222
#include <stddef.h>
2323

24+
#include "utils.h"
25+
2426
#include "unicode.h"
2527

2628
// Following utf8d table and decode function are covered by MIT license
@@ -63,6 +65,28 @@ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
6365
return *state;
6466
}
6567

68+
enum UnicodeTransformDecodeResult unicode_utf8_decode(
69+
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
70+
{
71+
uint32_t codepoint = 0;
72+
uint32_t state = 0;
73+
size_t i = 0;
74+
while (i < len) {
75+
state = decode(&state, &codepoint, buf[i]);
76+
i++;
77+
78+
if (state == UTF8_ACCEPT) {
79+
*c = codepoint;
80+
*out_size = i;
81+
return UnicodeTransformDecodeSuccess;
82+
} else if (UNLIKELY(state == UTF8_REJECT)) {
83+
return UnicodeTransformDecodeFail;
84+
}
85+
}
86+
87+
return UnicodeTransformDecodeIncomplete;
88+
}
89+
6690
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
6791
{
6892
uint32_t codepoint = 0;

src/libAtomVM/unicode.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@
2929
extern "C" {
3030
#endif
3131

32+
/**
 * Outcome of decoding a single character from a Unicode transformation
 * format buffer.
 */
enum UnicodeTransformDecodeResult
{
    /** decoding was successful */
    UnicodeTransformDecodeSuccess = 0,
    /** the character starting at the buffer is not a valid unicode character */
    UnicodeTransformDecodeFail = 1,
    /** the character starting at the buffer is valid so far but incomplete */
    UnicodeTransformDecodeIncomplete = 2
};
38+
3239
size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
3340
bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
3441
size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
@@ -40,6 +47,21 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
4047
return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
4148
}
4249

50+
/**
51+
* @brief Decode a character from UTF-8.
52+
*
53+
* @param buf the buffer from which to decode the string
54+
* @param len the length (in bytes) of the bytes in buf
55+
* @param c int value to decode to
56+
* @param out_size the size in bytes, on output (if not NULL)
57+
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
58+
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
59+
* unicode character or \c UnicodeTransformDecodeIncomplete if character
60+
* starting at buf is a valid but incomplete transformation
61+
*/
62+
enum UnicodeTransformDecodeResult unicode_utf8_decode(
63+
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);
64+
4365
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);
4466

4567
#ifdef __cplusplus

0 commit comments

Comments
 (0)