Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized UTF-8 #1051

Merged
merged 4 commits into from
Feb 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions LICENSES/MIT.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) [year] [fullname]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1 change: 0 additions & 1 deletion doc/src/apidocs/libatomvm/functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ Functions
.. doxygenfunction:: bitstring_utf16_size
.. doxygenfunction:: bitstring_utf32_decode
.. doxygenfunction:: bitstring_utf32_encode
.. doxygenfunction:: bitstring_utf8_decode
.. doxygenfunction:: bitstring_utf8_encode
.. doxygenfunction:: bitstring_utf8_size
.. doxygenfunction:: context_avail_free_memory
Expand Down
15 changes: 3 additions & 12 deletions src/libAtomVM/bif.c
Original file line number Diff line number Diff line change
Expand Up @@ -1638,19 +1638,10 @@ term binary_to_atom(Context *ctx, term a_binary, term encoding, bool create_new,

AtomString atom;
if (LIKELY(!encode_latin1_to_utf8)) {
size_t i = 0;
while (i < atom_string_len) {
uint32_t codepoint;
size_t codepoint_size;
if (UNLIKELY(bitstring_utf8_decode(
(uint8_t *) atom_string + i, atom_string_len, &codepoint, &codepoint_size))
!= UnicodeTransformDecodeSuccess) {
*error_reason = BADARG_ATOM;
return term_invalid_term();
}
i += codepoint_size;
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_string, atom_string_len))) {
*error_reason = BADARG_ATOM;
return term_invalid_term();
}

atom = malloc(atom_string_len + 1);
if (IS_NULL_PTR(atom)) {
*error_reason = OUT_OF_MEMORY_ATOM;
Expand Down
63 changes: 0 additions & 63 deletions src/libAtomVM/bitstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,69 +141,6 @@ bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size)
return true;
}

/**
 * Decode a single character from the start of a UTF-8 encoded buffer.
 *
 * The lead byte selects the expected sequence length (1 to 4 bytes). Every
 * continuation byte that is actually present is checked before the sequence
 * is classified, so a truncated sequence is only reported as incomplete when
 * the bytes seen so far are well-formed.
 *
 * @param buf the buffer from which to decode the character
 * @param len the number of bytes available in buf
 * @param c on success, receives the decoded code point; may be NULL to only
 * compute the size
 * @param out_size on success, receives the number of bytes consumed; may be
 * NULL
 * @return UnicodeTransformDecodeSuccess if a character was decoded,
 * UnicodeTransformDecodeIncomplete if buf holds a well-formed but truncated
 * sequence, UnicodeTransformDecodeFail otherwise (empty buffer, invalid lead
 * or continuation byte, overlong encoding, surrogate, or value above
 * U+10FFFF)
 */
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
{
    if (len == 0) {
        return UnicodeTransformDecodeFail;
    }

    const uint8_t lead = buf[0];
    size_t expected_size;
    uint32_t v;
    // Smallest code point that legitimately needs expected_size bytes;
    // anything below it is an overlong encoding.
    uint32_t min_codepoint;

    if ((lead & 0x80) == 0x00) {
        expected_size = 1;
        v = lead;
        min_codepoint = 0;
    } else if ((lead & 0xE0) == 0xC0) {
        expected_size = 2;
        v = lead & 0x1F;
        min_codepoint = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        expected_size = 3;
        v = lead & 0x0F;
        min_codepoint = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        expected_size = 4;
        v = lead & 0x07;
        // U+10000 itself is the first valid 4-byte code point, so the
        // overlong test below must be a strict "less than".
        min_codepoint = 0x10000;
    } else {
        // Stray continuation byte or a lead byte announcing more than 4 bytes
        return UnicodeTransformDecodeFail;
    }

    // Never read past len: validate only the continuation bytes we have.
    const size_t available = (len < expected_size) ? len : expected_size;
    for (size_t i = 1; i < available; i++) {
        if ((buf[i] & 0xC0) != 0x80) {
            return UnicodeTransformDecodeFail;
        }
        v = (v << 6) | (uint32_t) (buf[i] & 0x3F);
    }

    if (available < expected_size) {
        return UnicodeTransformDecodeIncomplete;
    }

    // overlong encoding, surrogate or value beyond the Unicode range
    if (v < min_codepoint || v > 0x10FFFF || (v >= 0xD800 && v <= 0xDFFF)) {
        return UnicodeTransformDecodeFail;
    }

    if (c != NULL) {
        *c = v;
    }
    if (out_size != NULL) {
        *out_size = expected_size;
    }
    return UnicodeTransformDecodeSuccess;
}

// UTF-16 encoding, when U in U+010000 to U+10FFFF:
//
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
Expand Down
24 changes: 2 additions & 22 deletions src/libAtomVM/bitstring.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define _BITSTRING_H_

#include "term.h"
#include "unicode.h"

#include <stdbool.h>
#include <stdint.h>
Expand Down Expand Up @@ -99,13 +100,6 @@ enum BitstringFlags
#endif
};

/**
 * Outcome of decoding one character from a Unicode transformation format.
 */
enum UnicodeTransformDecodeResult
{
// decoding was successful
UnicodeTransformDecodeSuccess,
// the character starting at the buffer is not a valid unicode character
UnicodeTransformDecodeFail,
// the character starting at the buffer is valid so far but incomplete
UnicodeTransformDecodeIncomplete
};

union maybe_unsigned_int8
{
uint8_t u;
Expand Down Expand Up @@ -320,20 +314,6 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
*/
bool bitstring_utf8_encode(uint32_t c, uint8_t *buf, size_t *out_size);

/**
* @brief Decode a character from UTF-8.
*
* @param buf the buffer from which to decode the string
* @param len the length (in bytes) of the bytes in buf
* @param c int value to decode to or NULL to only compute the size.
* @param out_size the size in bytes, on output (if not NULL)
* @return \c UnicodeTransformDecodeSuccess if decoding was successful,
* \c UnicodeTransformDecodeFail if character starting at buf is not a valid
* unicode character or \c UnicodeTransformDecodeIncomplete if character
* starting at buf is a valid but incomplete transformation
*/
enum UnicodeTransformDecodeResult bitstring_utf8_decode(const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);

/**
* @brief Encode a character to UTF-16.
*
Expand Down Expand Up @@ -441,7 +421,7 @@ static inline bool bitstring_match_utf8(term src_bin, size_t offset, uint32_t *c
{
size_t byte_offset = offset >> 3; // divide by 8
const uint8_t *src = (const uint8_t *) term_binary_data(src_bin) + byte_offset;
return bitstring_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
return unicode_utf8_decode(src, term_binary_size(src_bin) - byte_offset, c, out_size) == UnicodeTransformDecodeSuccess;
}

/**
Expand Down
13 changes: 2 additions & 11 deletions src/libAtomVM/externalterm.c
Original file line number Diff line number Diff line change
Expand Up @@ -636,17 +636,8 @@ static term parse_external_terms(const uint8_t *external_term_buf, size_t *eterm
uint8_t atom_len = *(external_term_buf + 1);
const uint8_t *atom_chars = external_term_buf + 2;

size_t remaining_length = atom_len;
const uint8_t *curr_buf = atom_chars;
while (remaining_length) {
uint32_t out_c;
size_t codepoint_size;
enum UnicodeTransformDecodeResult result = bitstring_utf8_decode(curr_buf, remaining_length, &out_c, &codepoint_size);
if (UNLIKELY(result != UnicodeTransformDecodeSuccess)) {
return term_invalid_term();
}
remaining_length -= codepoint_size;
curr_buf += codepoint_size;
if (UNLIKELY(!unicode_is_valid_utf8_buf((const uint8_t *) atom_chars, atom_len))) {
return term_invalid_term();
}

// AtomString first byte is the atom length
Expand Down
2 changes: 1 addition & 1 deletion src/libAtomVM/interop.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static enum UnicodeConversionResult interop_binary_conversion(term t, uint8_t *o
while (input_index < len) {
size_t char_size;
uint32_t c;
enum UnicodeTransformDecodeResult decode_result = bitstring_utf8_decode(input + input_index, len - input_index, &c, &char_size);
enum UnicodeTransformDecodeResult decode_result = unicode_utf8_decode(input + input_index, len - input_index, &c, &char_size);
if (UNLIKELY(decode_result != UnicodeTransformDecodeSuccess)) {
*rest_crsr = input_index;
*output_len = result;
Expand Down
4 changes: 2 additions & 2 deletions src/libAtomVM/nifs.c
Original file line number Diff line number Diff line change
Expand Up @@ -2081,7 +2081,7 @@ static term nif_erlang_atom_to_binary(Context *ctx, int argc, term argv[])
for (size_t i = 0; i < encoded_len; i++) {
size_t codepoint_size;
uint32_t codepoint;
if (UNLIKELY(bitstring_utf8_decode(
if (UNLIKELY(unicode_utf8_decode(
&utf8_tmp_buf[in_pos], 2, &codepoint, &codepoint_size)
!= UnicodeTransformDecodeSuccess
|| (codepoint > 255))) {
Expand Down Expand Up @@ -2122,7 +2122,7 @@ static term make_list_from_utf8_buf(const uint8_t *buf, size_t buf_len, Context
for (size_t i = 0; i < u8len; i++) {
size_t codepoint_size;
enum UnicodeTransformDecodeResult result
= bitstring_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
= unicode_utf8_decode(u_in, buf_len, &codepoints[i], &codepoint_size);
if (UNLIKELY((result != UnicodeTransformDecodeSuccess)
|| !unicode_is_valid_codepoint(codepoints[i]))) {
AVM_ABORT();
Expand Down
82 changes: 81 additions & 1 deletion src/libAtomVM/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,94 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
* SPDX-License-Identifier: (Apache-2.0 OR LGPL-2.1-or-later) AND MIT
*/

#include <stdbool.h>
#include <stddef.h>

#include "utils.h"

#include "unicode.h"

// clang-format off

// Following utf8d table and decode function are covered by MIT license
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

// DFA state reached once a complete, valid UTF-8 sequence has been consumed;
// it is also the state decoding starts from.
#define UTF8_ACCEPT 0
// DFA state reached on invalid input; every transition from it leads back
// to it.
#define UTF8_REJECT 12

// Lookup table driving the decoder: 256 byte-class entries followed by the
// state transition table. State values are used directly as offsets into the
// second part (index = 256 + state + class).
static const uint8_t utf8d[] = {
// The first part of the table maps bytes to character classes in order
// to reduce the size of the transition table and create bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feed one byte to the UTF-8 DFA.
//
// state: in/out automaton state (UTF8_ACCEPT between characters)
// codep: in/out code point accumulator, complete once UTF8_ACCEPT is reached
// byte:  next input byte; used as an index into the first 256 table entries
//
// Returns the new state, which is also stored in *state.
static inline uint32_t decode(uint32_t *state, uint32_t *codep, uint32_t byte)
{
    const uint32_t type = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        // First byte of a sequence: the class doubles as the shift that
        // masks off the length marker bits.
        *codep = (0xff >> type) & (byte);
    } else {
        // Continuation byte: append its six payload bits.
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    const uint32_t next_state = utf8d[256 + *state + type];
    *state = next_state;
    return next_state;
}

// clang-format on

// Decode the first UTF-8 character of buf by running the DFA byte by byte.
// *c and *out_size are written only when a complete character is decoded;
// running out of input (including len == 0) yields
// UnicodeTransformDecodeIncomplete.
enum UnicodeTransformDecodeResult unicode_utf8_decode(
    const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size)
{
    uint32_t dfa_state = UTF8_ACCEPT;
    uint32_t value = 0;

    for (size_t pos = 0; pos < len; pos++) {
        const uint32_t next = decode(&dfa_state, &value, buf[pos]);

        if (UNLIKELY(next == UTF8_REJECT)) {
            return UnicodeTransformDecodeFail;
        }
        if (next == UTF8_ACCEPT) {
            *c = value;
            // pos is zero-based, the consumed size is one more
            *out_size = pos + 1;
            return UnicodeTransformDecodeSuccess;
        }
    }

    return UnicodeTransformDecodeIncomplete;
}

// Return true when the len bytes at buf form only complete, valid UTF-8
// sequences, i.e. the DFA ends in UTF8_ACCEPT after consuming every byte.
// An empty buffer is valid. The loop has no early exit: once the DFA rejects
// it stays in the reject state.
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len)
{
    uint32_t dfa_state = UTF8_ACCEPT;
    uint32_t scratch_codepoint = 0;

    const uint8_t *cursor = buf;
    size_t remaining = len;
    while (remaining > 0) {
        decode(&dfa_state, &scratch_codepoint, *cursor);
        cursor++;
        remaining--;
    }

    return dfa_state == UTF8_ACCEPT;
}

size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len)
{
size_t count = 0;
Expand Down
24 changes: 24 additions & 0 deletions src/libAtomVM/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@
extern "C" {
#endif

/**
 * @brief Outcome of decoding one character from a Unicode transformation
 * format.
 */
enum UnicodeTransformDecodeResult
{
// decoding was successful
UnicodeTransformDecodeSuccess,
// the character starting at the buffer is not a valid unicode character
UnicodeTransformDecodeFail,
// the character starting at the buffer is valid so far but incomplete
UnicodeTransformDecodeIncomplete
};

size_t unicode_buf_utf8_len(const uint8_t *buf, size_t buf_len);
bool unicode_buf_is_ascii(const uint8_t *buf, size_t buf_len);
size_t unicode_latin1_buf_size_as_utf8(const uint8_t *buf, size_t len);
Expand All @@ -40,6 +47,23 @@ static inline bool unicode_is_valid_codepoint(uint32_t codepoint)
return (codepoint < 0x110000) && !((codepoint > 0xD800) && (codepoint < 0xDFFF));
}

/**
 * @brief Decode a character from UTF-8.
 *
 * @details \c c and \c out_size are only written when decoding succeeds.
 * An empty buffer (\c len of 0) is reported as
 * \c UnicodeTransformDecodeIncomplete.
 *
 * @param buf the buffer from which to decode the string
 * @param len the length (in bytes) of the bytes in buf
 * @param c int value to decode to, must not be NULL
 * @param out_size the size in bytes, on output, must not be NULL
 * @return \c UnicodeTransformDecodeSuccess if decoding was successful,
 * \c UnicodeTransformDecodeFail if character starting at buf is not a valid
 * unicode character or \c UnicodeTransformDecodeIncomplete if character
 * starting at buf is a valid but incomplete transformation
 */
enum UnicodeTransformDecodeResult unicode_utf8_decode(
const uint8_t *buf, size_t len, uint32_t *c, size_t *out_size);

/**
 * @brief Check whether a buffer holds valid UTF-8.
 *
 * @param buf the buffer to validate
 * @param len the length (in bytes) of the bytes in buf
 * @return \c true if the whole buffer consists of complete, valid UTF-8
 * sequences (an empty buffer is valid), \c false otherwise
 */
bool unicode_is_valid_utf8_buf(const uint8_t *buf, size_t len);

#ifdef __cplusplus
}
#endif
Expand Down
Loading