Skip to content

Commit

Permalink
Add new optimized function unicode_utf8_decode
Browse files Browse the repository at this point in the history
Use it as a drop-in replacement instead of bitstring_utf8_decode.

Signed-off-by: Davide Bettio <[email protected]>
  • Loading branch information
bettio committed Jan 22, 2025
1 parent 9117765 commit 5105181
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 88 deletions.
63 changes: 0 additions & 63 deletions src/libAtomVM/bitstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
return true;
}

/**
 * Decode a single UTF-8 encoded character from the start of buf.
 *
 * Overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above
 * U+10FFFF are rejected.
 *
 * @param buf the bytes to decode
 * @param len the number of bytes available in buf
 * @param c on success receives the decoded code point; may be NULL to only
 * compute the size
 * @param out_size on success receives the number of bytes used by the
 * character (1 to 4); may be NULL
 * @return UnicodeTransformDecodeSuccess if a character was decoded,
 * UnicodeTransformDecodeIncomplete if buf holds only the beginning of a
 * multi-byte sequence, UnicodeTransformDecodeFail otherwise (including when
 * len is 0)
 */
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
{
    uint32_t v = 0;
    size_t size = 0;

    if (len == 0) {
        return UnicodeTransformDecodeFail;
    } else if (len >= 4 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80) && ((buf[3] & 0xC0) == 0x80)) {
        v |= (uint32_t) (buf[0] & 0x07) << 18;
        v |= (uint32_t) (buf[1] & 0x3F) << 12;
        v |= (uint32_t) (buf[2] & 0x3F) << 6;
        v |= (uint32_t) (buf[3] & 0x3F);
        // overlong encoding or invalid codepoint; U+10000 itself is the
        // smallest value that needs 4 bytes, so it must be accepted
        if (v < 0x10000 || v > 0x10FFFF) {
            return UnicodeTransformDecodeFail;
        }
        size = 4;
    } else if (len >= 3 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
        v |= (uint32_t) (buf[0] & 0x0F) << 12;
        v |= (uint32_t) (buf[1] & 0x3F) << 6;
        v |= (uint32_t) (buf[2] & 0x3F);
        // overlong encoding or surrogate
        if (v < 0x800 || (v >= 0xD800 && v <= 0xDFFF)) {
            return UnicodeTransformDecodeFail;
        }
        size = 3;
    } else if (len >= 2 && (buf[0] & 0xE0) == 0xC0 && ((buf[1] & 0xC0) == 0x80)) {
        v |= (uint32_t) (buf[0] & 0x1F) << 6;
        v |= (uint32_t) (buf[1] & 0x3F);
        // overlong encoding
        if (v < 0x80) {
            return UnicodeTransformDecodeFail;
        }
        size = 2;
    } else if ((*buf & 0x80) == 0) {
        v = buf[0] & 0x7F;
        size = 1;
    } else if (len == 3 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80) && ((buf[2] & 0xC0) == 0x80)) {
        // from here on: a well-formed prefix of a longer sequence that is cut
        // short by the end of the buffer
        return UnicodeTransformDecodeIncomplete;
    } else if (len == 2 && (buf[0] & 0xF8) == 0xF0 && ((buf[1] & 0xC0) == 0x80)) {
        return UnicodeTransformDecodeIncomplete;
    } else if (len == 1 && (buf[0] & 0xF8) == 0xF0) {
        return UnicodeTransformDecodeIncomplete;
    } else if (len == 2 && (buf[0] & 0xF0) == 0xE0 && ((buf[1] & 0xC0) == 0x80)) {
        return UnicodeTransformDecodeIncomplete;
    } else if (len == 1 && (buf[0] & 0xF0) == 0xE0) {
        return UnicodeTransformDecodeIncomplete;
    } else if (len == 1 && (buf[0] & 0xE0) == 0xC0) {
        return UnicodeTransformDecodeIncomplete;
    } else {
        return UnicodeTransformDecodeFail;
    }

    // both outputs are optional, as documented in the header
    if (c != NULL) {
        *c = v;
    }
    if (out_size != NULL) {
        *out_size = size;
    }
    return UnicodeTransformDecodeSuccess;
}

// UTF-16 encoding, when U in U+010000 to U+10FFFF:
//
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
Expand Down
24 changes: 2 additions & 22 deletions src/libAtomVM/bitstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define _BITSTRING_H_

#include "term.h"
#include "unicode.h"

#include <stdbool.h>
#include <stdint.h>
Expand Down Expand Up @@ -99,13 +100,6 @@ enum BitstringFlags
#endif
};

/**
* Outcome of decoding one encoded character, as returned by
* bitstring_utf8_decode().
*/
enum UnicodeTransformDecodeResult
{
// A complete, valid character was decoded.
UnicodeTransformDecodeSuccess,
// The bytes at the start of the buffer are not a valid encoded character.
UnicodeTransformDecodeFail,
// The available bytes are the beginning of a character, but the buffer
// ends before it is complete.
UnicodeTransformDecodeIncomplete
};

union maybe_unsigned_int8
{
uint8_t u;
Expand Down Expand Up @@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
*/
bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);

/**
* @brief Decode a character from UTF-8.
*
* @details Overlong encodings, surrogates (U+D800..U+DFFF) and values above
* U+10FFFF are reported as \c UnicodeTransformDecodeFail, and so is an empty
* buffer (len == 0).
*
* @param buf the buffer from which to decode the string
* @param len the length (in bytes) of the bytes in buf
* @param c on success, receives the decoded code point
* @param out_size on success, receives the size in bytes of the encoded
* character (1 to 4)
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
* unicode character or \c UnicodeTransformDecodeIncomplete if character
* starting at buf is a valid but incomplete transformation
*/
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);

/**
* @brief Encode a character to UTF-16.
*
Expand Down Expand Up @@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
{
size_t byte_offset = offset >> 3; // divide by 8
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/libAtomVM/interop.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
while (input_index < len) {
size_t char_size;
uint32_t c;
enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
*rest_crsr = input_index;
*output_len = result;
Expand Down
4 changes: 2 additions & 2 deletions src/libAtomVM/nifs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2197,7 +2197,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
for (size_t i = 0; i < encoded_len; i++) {
size_t codepoint_size;
uint32_t codepoint;
if (UNLIKELY(bitstring_utf8_decode(
if (UNLIKELY(unicode_utf8_decode(
&utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
!= UnicodeTransformDecodeSuccess
|| (codepoint > 255))) {
Expand Down Expand Up @@ -2238,7 +2238,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
for (size_t i = 0; i < u8len; i++) {
size_t codepoint_size;
enum UnicodeTransformDecodeResult result
= bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
= unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
|| !unicode_is_valid_codepoint(codepoints[i]))) {
AVM_ABORT();
Expand Down
24 changes: 24 additions & 0 deletions src/libAtomVM/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#include <stdbool.h>
#include <stddef.h>

#include "utils.h"

#include "unicode.h"

// Following utf8d table and decode function are covered by MIT license
Expand Down Expand Up @@ -63,6 +65,28 @@ static inline uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte)
return *state;
}

/**
 * Decode a single UTF-8 encoded character from the start of buf by feeding
 * its bytes, one at a time, to the table-driven decode() state machine.
 *
 * @param buf the bytes to decode
 * @param len the number of bytes available in buf
 * @param c on success receives the decoded code point
 * @param out_size on success receives the number of bytes consumed; may be
 * NULL when the caller does not need it
 * @return UnicodeTransformDecodeSuccess as soon as a full character has been
 * read, UnicodeTransformDecodeFail as soon as the state machine rejects a
 * byte, UnicodeTransformDecodeIncomplete if the buffer ends first (this
 * includes len == 0)
 */
enum UnicodeTransformDecodeResult unicode_utf8_decode(
    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
{
    uint32_t codepoint = 0;
    uint32_t state = 0;

    for (size_t i = 0; i < len; i++) {
        state = decode(&state, &codepoint, buf[i]);

        if (state == UTF8_ACCEPT) {
            *c = codepoint;
            // out_size is documented as optional in unicode.h
            if (out_size != NULL) {
                // i is the index of the last byte consumed
                *out_size = i + 1;
            }
            return UnicodeTransformDecodeSuccess;
        }
        if (UNLIKELY(state == UTF8_REJECT)) {
            return UnicodeTransformDecodeFail;
        }
    }

    // ran out of input in the middle of a sequence (or len was 0)
    return UnicodeTransformDecodeIncomplete;
}

bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
{
uint32_t codepoint = 0;
Expand Down
22 changes: 22 additions & 0 deletions src/libAtomVM/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@
extern "C" {
#endif

/**
* Outcome of decoding one encoded character, as returned by
* unicode_utf8_decode().
*/
enum UnicodeTransformDecodeResult
{
// A complete, valid character was decoded.
UnicodeTransformDecodeSuccess,
// The bytes at the start of the buffer are not a valid encoded character.
UnicodeTransformDecodeFail,
// The buffer ends before the encoded character is complete.
UnicodeTransformDecodeIncomplete
};

size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
Expand All @@ -40,6 +47,21 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
}

/**
* @brief Decode a character from UTF-8.
*
* @details \p c and \p out_size are written only when the result is
* \c UnicodeTransformDecodeSuccess. An empty buffer (len == 0) yields
* \c UnicodeTransformDecodeIncomplete.
*
* @param buf the buffer from which to decode the string
* @param len the length (in bytes) of the bytes in buf
* @param c on success, receives the decoded code point
* @param out_size on success, receives the number of bytes consumed from buf
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
* unicode character or \c UnicodeTransformDecodeIncomplete if character
* starting at buf is a valid but incomplete transformation
*/
enum UnicodeTransformDecodeResult unicode_utf8_decode(
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);

bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);

#ifdef __cplusplus
Expand Down

0 comments on commit 5105181

Please sign in to comment.