Skip to content

Unicode: fix reading buffer overflow in valid_utf8() #5531

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 25, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions src/Makefile.in
Original file line number Diff line number Diff line change
@@ -803,7 +803,10 @@ testfiles:
###############################################################################

UNIT_TEST_OBJS = \
tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o
tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o tests/unicode.o

UNIT_TEST_INCLUDED_PIECES = \
tests/test_valid_utf8.c

tests/unit-tests.o: tests/unit-tests.c common.h memory.h misc.h
$(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c
@@ -820,10 +823,13 @@ tests/common.o: common.c arch.h common.h memory.h os.h os-autoconf.h autoconfig.
tests/memory.o: memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h johnswap.h os.h os-autoconf.h
$(CC) -o tests/memory.o $(CFLAGS) -D_JOHN_MISC_NO_LOG memory.c

tests/unicode.o: unicode.o # just to have all the same deps
$(CC) -o tests/unicode.o $(CFLAGS) -DUNICODE_NO_OPTIONS -DNOT_JOHN unicode.c

# keep the 'easy name' build target of unit-tests The 'real' target is ../run/unit-tests[.exe]
unit-tests: ../run/unit-tests@EXE_EXT@

../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS)
../run/unit-tests@EXE_EXT@: $(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES)
$(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@
@ echo "Now Running the Unit Tests"
@ ${POSSIBLE_WINE_MSG}
2 changes: 1 addition & 1 deletion src/Makefile.legacy
Original file line number Diff line number Diff line change
@@ -338,7 +338,7 @@ default:
@echo "generic Any other Unix-like system with gcc"

unit-tests:
$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c
$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG -DUNICODE_NO_OPTIONS -DNOT_JOHN tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.c
../run/unit-tests

linux-x86-64-avx512:
172 changes: 172 additions & 0 deletions src/tests/test_valid_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Copyright (c) 2024 Aleksey Cherepanov
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*/

/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences
*
* It tests only a sequence of bytes for single character. Plus a few
* cases of additional byte after valid 2-bytes long sequence are
* tested. Higher-level logic is not checked. Test is sparse: checks
* of continuous blocks are applied skipping parts. Dense checks are
* applied close to borders to catch off-by-one mistakes. Valid
* sequences are limited to single character (1-4 bytes). Invalid
* sequences go up to 5 bytes and use even bigger steps for skipping.
* ASCII bytes at trailing positions are tested lightly.
*
* Description in PR#5531 contains a script to test valid_utf8()
* against Python3. https://github.com/openwall/john/pull/5531
*
* See also info in unicode.h
*/

/* This file has up to 6 levels of nesting, so tab-width 4 might be
* helpful. Deep nesting is the price for simple regular structure. */

#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0)

#define valid_utf8(a) (inc_test(), valid_utf8((a)))
#define expect(cond) \
do { \
if (!(cond)) { \
printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \
Results.test_name, #cond, hex(buf, strlen((void *)buf))); \
inc_failed_test(); \
return; /* early exit for the whole test */ \
} \
} while (0)

void _test_valid_utf8()
{
UTF8 buf[6] = {};

/* Empty string is valid. */
expect(valid_utf8(buf) == 1);

/* 1 byte: ASCII is valid, non-ascii alone is invalid. */
for (int c = 0; c < 256; c++) {
buf[0] = c;
buf[1] = '\0';
expect(valid_utf8(buf) == (c < 128));
}

/* Setup dense check around borders of 80..BF range for trailing bytes. */
unsigned char trailing_sparse_check[256] = {};
for (int c = 0x79; c < 256; c += 16)
trailing_sparse_check[c] = 1;
for (int c = 0x80 - 8; c < 0x80 + 8; c++)
trailing_sparse_check[c] = 1;
for (int c = 0xBF - 8; c < 0xBF + 8; c++)
trailing_sparse_check[c] = 1;

/* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */
for (int c1 = 128; c1 < 256; c1++) {
buf[0] = c1;
buf[1] = '\0';

int step = 1;

/* Invalid starting byte would be checked with all endings. So
* checks are sparse for invalid starting bytes. */
if (buf[0] < 0xC2 || 0xF4 < buf[0])
step = 15; /* sparse checks */
else
step = 1;

for (int c2 = 0x70, r2; c2 < 256; c2 += step) {
/* The second byte is checked sparsely only for invalid starts. */
buf[1] = c2;
buf[2] = '\0';
r2 = valid_utf8(buf);

if (0xC2 <= buf[0] && buf[0] < 0xE0 &&
is_trailing(buf[1])) {

expect(r2 == 2);

/* Additional test with 41 and F5 after valid 2-bytes sequence */
buf[3] = '\0';
buf[2] = 'A';
expect(valid_utf8(buf) == 2);
buf[2] = 0xF5;
expect(valid_utf8(buf) == 0);

continue;
}

expect(r2 == 0);
for (int c3 = 0x79, r3; c3 < 256; c3++) {
if (0 == trailing_sparse_check[c3])
continue; /* run code below sparsely */
buf[2] = c3;
buf[3] = '\0';
r3 = valid_utf8(buf);

if ((buf[0] == 0xE0 &&
0xA0 <= buf[1] && buf[1] < 0xC0 &&
is_trailing(buf[2])) ||

(buf[0] == 0xED &&
0x80 <= buf[1] && buf[1] < 0xA0 &&
is_trailing(buf[2])) ||

(0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED &&
is_trailing(buf[1]) &&
is_trailing(buf[2]))) {

expect(r3 == 2);
continue;
}

expect(r3 == 0);
for (int c4 = 0x79, r4; c4 < 256; c4++) {
if (0 == trailing_sparse_check[c4])
continue; /* run code below sparsely */
buf[3] = c4;
buf[4] = '\0';
r4 = valid_utf8(buf);

if ((buf[0] == 0xF0 &&
0x90 <= buf[1] && buf[1] < 0xC0 &&
is_trailing(buf[2]) &&
is_trailing(buf[3])) ||

(buf[0] == 0xF4 &&
0x80 <= buf[1] && buf[1] < 0x90 &&
is_trailing(buf[2]) &&
is_trailing(buf[3])) ||

((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) &&
is_trailing(buf[1]) &&
is_trailing(buf[2]) &&
is_trailing(buf[3]))) {

expect(r4 == 2);
continue;
}

expect(r4 == 0);
for (int c5 = 0x79; c5 < 256; c5 += 32) {
/* We test only a few values for the fifth byte. */
buf[4] = c5;
buf[5] = '\0';
expect(valid_utf8(buf) == 0);
}
}
}
}
}
}

void test_valid_utf8()
{
start_test(__FUNCTION__);
failed = 0;
_test_valid_utf8();
end_test();
}

#undef expect
#undef is_trailing
#undef valid_utf8
7 changes: 7 additions & 0 deletions src/tests/unit-tests.c
Original file line number Diff line number Diff line change
@@ -41,6 +41,7 @@
#include "../misc.h"
#include "../memory.h"
#include "../common.h"
#include "../unicode.h"

#include "../sha2.h"

@@ -2439,6 +2440,9 @@ void test_sha2_c() {
end_test();
}

/* Tests for unicode.c */
#include "test_valid_utf8.c"

int main() {
start_of_run = clock();

@@ -2493,6 +2497,9 @@ int main() {
set_unit_test_source("sha2.c");
test_sha2_c();

set_unit_test_source("unicode.c");
test_valid_utf8();

// perform dump listing of all processed functions.
dump_stats();

48 changes: 25 additions & 23 deletions src/unicode.c
Original file line number Diff line number Diff line change
@@ -534,7 +534,7 @@ inline size_t strlen_any(const void *source)
int valid_utf8(const UTF8 *source)
{
UTF8 a;
int length, ret = 1;
int ret = 1;
const UTF8 *srcptr;

while (*source) {
@@ -543,35 +543,37 @@ int valid_utf8(const UTF8 *source)
continue;
}

length = opt_trailingBytesUTF8[*source & 0x3f] + 1;
srcptr = source + length;

switch (length) {
default:
if (*source < 0xC2)
return 0;
/* Everything else falls through when valid */
case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 2:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;

srcptr = source;

if (*source >= 0xE0) { /* 3+ bytes */
if (*source >= 0xF0) { /* 4+ bytes */

if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;

if (*source > 0xF4) return 0;

switch (*source) {
case 0xF0: if (a < 0x90) return 0; break;
case 0xF4: if (a > 0x8F) return 0;
}

} /* end of specific handling for 4+ bytes */

if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;

switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return 0; break;
case 0xED: if (a > 0x9F) return 0; break;
case 0xF0: if (a < 0x90) return 0; break;
case 0xF4: if (a > 0x8F) return 0;
case 0xED: if (a > 0x9F) return 0;
}

case 1:
if (*source >= 0x80 && *source < 0xC2) return 0;
}
if (*source > 0xF4)
return 0;
/* 2 bytes or "fall-through" with handled beginning of 3-4 bytes */

if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;

source += length;
source = srcptr + 1;
ret++;
}
return ret;
59 changes: 57 additions & 2 deletions src/unicode.h
Original file line number Diff line number Diff line change
@@ -34,7 +34,10 @@
#include <stdint.h>
#include <stdlib.h>

#ifndef UNICODE_NO_OPTIONS
#include "options.h"
#endif

#include "common.h"
#include "jumbo.h"

@@ -229,8 +232,54 @@ extern void truncate_utf8(UTF8 *string, int len);
* returns > 1 if data is valid and in fact contains UTF-8 sequences
*
* Actually in the last case, the return is the number of proper UTF-8
* sequences, so it can be used as a quality measure. A low number might be
* a false positive, a high number most probably isn't.
* sequences (plus one and not counting ASCII), so it can be used as a
* quality measure. A low number might be a false positive, a high
* number most probably isn't.
*
*
* Related info about UTF-8
*
* Valid UTF-8 sequences of bytes (1-4 bytes long):
*
* 00..7F
*
* C2..DF 80..BF
*
* E0 A0..BF 80..BF
* ED 80..9F 80..BF
* Ex 80..BF 80..BF where Ex does not include E0 and ED
*
* F0 90..BF 80..BF 80..BF
* F4 80..8F 80..BF 80..BF notice 8F as upper bound in the second byte
* F1..F3 80..BF 80..BF 80..BF
*
* (X..Y denotes range from X to Y inclusive. X and Y are byte
* values written in hex.)
*
* Incomplete sequences are invalid.
*
* Range 80..BF is for trailing bytes (also called continuation
* bytes). It is not a valid starting byte. Adjacent values C0 and C1
* could be considered starting 2-bytes sequences but they are not
* valid in UTF-8.
*
* Each of E0,ED,F0,F4 starting bytes use a sub-range for trailing
* byte at the second position. E0/ED use different halves of the
* range for the second byte. F0/F4 allow the second byte in other
* proportions (48:16), not overlapping too.
*
* Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
* 80..BF are invalid at starting position.
* 00..7F,C2..F4 are invalid at any trailing position (actually they
* invalidate previous char while new starting byte itself could be a
* part of a valid char, but even then the whole string would be
* invalid for purposes of valid_utf8()).
*
* See also https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
*
* Sequences for unallocated, unassigned, reserved (including
* noncharacters) code points are considered valid. See here:
* https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Noncharacters
*/
extern int valid_utf8(const UTF8 *source);

@@ -318,10 +367,16 @@ extern UTF8 CP_isUpper[0x100];
extern UTF8 CP_isSeparator[0x100];
extern UTF8 CP_isDigit[0x100];

#ifndef UNICODE_NO_OPTIONS
/* These are encoding-aware but not LC_CTYPE */
#define enc_islower(c) (options.internal_cp == ENC_RAW ? (c >= 'a' && c <= 'z') : CP_isLower[ARCH_INDEX(c)])
#define enc_isupper(c) (options.internal_cp == ENC_RAW ? (c >= 'A' && c <= 'Z') : CP_isUpper[ARCH_INDEX(c)])
#define enc_isdigit(c) (options.internal_cp == ENC_RAW ? (c >= '0' && c <= '9') : CP_isDigit[ARCH_INDEX(c)])
#else
#define enc_islower(c) (c >= 'a' && c <= 'z')
#define enc_isupper(c) (c >= 'A' && c <= 'Z')
#define enc_isdigit(c) (c >= '0' && c <= '9')
#endif
#define enc_tolower(c) (char)CP_down[ARCH_INDEX(c)]
#define enc_toupper(c) (char)CP_up[ARCH_INDEX(c)]