openwall · solardiz · Sep 25, 2024 · Sep 8, 2024 · Sep 25, 2024 · Sep 25, 2024
diff --git a/src/Makefile.in b/src/Makefile.in
@@ -803,7 +803,10 @@ testfiles:
 ###############################################################################
 
 UNIT_TEST_OBJS = \
-	tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o
+	tests/unit-tests.o tests/misc.o tests/common.o tests/memory.o tests/sha2.o tests/unicode.o
+
+UNIT_TEST_INCLUDED_PIECES = \
+	tests/test_valid_utf8.c
 
 tests/unit-tests.o:	tests/unit-tests.c common.h memory.h misc.h
 	$(CC) -o tests/unit-tests.o $(CFLAGS) -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG  tests/unit-tests.c
@@ -820,10 +823,13 @@ tests/common.o:	common.c arch.h common.h memory.h os.h os-autoconf.h autoconfig.
 tests/memory.o:	memory.c arch.h misc.h jumbo.h autoconfig.h memory.h common.h johnswap.h os.h os-autoconf.h
 	$(CC) -o tests/memory.o $(CFLAGS) -D_JOHN_MISC_NO_LOG  memory.c
 
+tests/unicode.o:	unicode.o # just to have all the same deps
+	$(CC) -o tests/unicode.o $(CFLAGS) -DUNICODE_NO_OPTIONS -DNOT_JOHN  unicode.c
+
 # keep the 'easy name' build target of unit-tests   The 'real' target is ../run/unit-tests[.exe]
 unit-tests:	../run/unit-tests@EXE_EXT@
 
-../run/unit-tests@EXE_EXT@:	$(UNIT_TEST_OBJS)
+../run/unit-tests@EXE_EXT@:	$(UNIT_TEST_OBJS) $(UNIT_TEST_INCLUDED_PIECES)
 	$(LD) $(UNIT_TEST_OBJS) $(LDFLAGS) @OPENSSL_LIBS@ -o $@
 	@ echo "Now Running the Unit Tests"
 	@ ${POSSIBLE_WINE_MSG}

diff --git a/src/Makefile.legacy b/src/Makefile.legacy
@@ -338,7 +338,7 @@ default:
 	@echo "generic                  Any other Unix-like system with gcc"
 
 unit-tests:
-	$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG tests/unit-tests.c misc.c common.c memory.c sha2.c
+	$(CC) -o ../run/unit-tests -Wall -O2 -fomit-frame-pointer -DFORCE_GENERIC_SHA2 -D_JOHN_MISC_NO_LOG -DUNICODE_NO_OPTIONS -DNOT_JOHN tests/unit-tests.c misc.c common.c memory.c sha2.c unicode.c
 	../run/unit-tests
 
 linux-x86-64-avx512:

diff --git a/src/tests/test_valid_utf8.c b/src/tests/test_valid_utf8.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2024 Aleksey Cherepanov
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ */
+
+/* Test code for valid_utf8() from unicode.c vs single char UTF-8 sequences
+ *
+ * It tests only a sequence of bytes for single character. Plus a few
+ * cases of additional byte after valid 2-bytes long sequence are
+ * tested. Higher-level logic is not checked. Test is sparse: checks
+ * of continuous blocks are applied skipping parts. Dense checks are
+ * applied close to borders to catch off-by-one mistakes. Valid
+ * sequences are limited to single character (1-4 bytes). Invalid
+ * sequences go up to 5 bytes and use even bigger steps for skipping.
+ * ASCII bytes at trailing positions are tested lightly.
+ *
+ * Description in PR#5531 contains a script to test valid_utf8()
+ * against Python3.  https://github.com/openwall/john/pull/5531
+ *
+ * See also info in  unicode.h
+ */
+
+/* This file has up to 6 levels of nesting, so tab-width 4 might be
+ * helpful. Deep nesting is the price for simple regular structure. */
+
+#define is_trailing(c) (0x80 <= (c) && (c) < 0xC0)
+
+#define valid_utf8(a) (inc_test(), valid_utf8((a)))
+#define expect(cond) \
+	do { \
+		if (!(cond)) { \
+			printf("Failed %s(): check '%s' fails for this byte sequence: %s\n", \
+			       Results.test_name, #cond, hex(buf, strlen((void *)buf))); \
+			inc_failed_test(); \
+			return; /* early exit for the whole test */ \
+		} \
+	} while (0)
+
+void _test_valid_utf8()
+{
+	UTF8 buf[6] = {};
+
+	/* Empty string is valid. */
+	expect(valid_utf8(buf) == 1);
+
+	/* 1 byte: ASCII is valid, non-ascii alone is invalid. */
+	for (int c = 0; c < 256; c++) {
+		buf[0] = c;
+		buf[1] = '\0';
+		expect(valid_utf8(buf) == (c < 128));
+	}
+
+	/* Setup dense check around borders of 80..BF range for trailing bytes. */
+	unsigned char trailing_sparse_check[256] = {};
+	for (int c = 0x79; c < 256; c += 16)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0x80 - 8; c < 0x80 + 8; c++)
+		trailing_sparse_check[c] = 1;
+	for (int c = 0xBF - 8; c < 0xBF + 8; c++)
+		trailing_sparse_check[c] = 1;
+
+	/* Multi-byte test: either start is valid or we grow sequence (up to 5 bytes). */
+	for (int c1 = 128; c1 < 256; c1++) {
+		buf[0] = c1;
+		buf[1] = '\0';
+
+		int step = 1;
+
+		/* Invalid starting byte would be checked with all endings. So
+		 * checks are sparse for invalid starting bytes. */
+		if (buf[0] < 0xC2 || 0xF4 < buf[0])
+			step = 15; /* sparse checks */
+		else
+			step = 1;
+
+		for (int c2 = 0x70, r2; c2 < 256; c2 += step) {
+			/* The second byte is checked sparsely only for invalid starts. */
+			buf[1] = c2;
+			buf[2] = '\0';
+			r2 = valid_utf8(buf);
+
+			if (0xC2 <= buf[0] && buf[0] < 0xE0 &&
+			    is_trailing(buf[1])) {
+
+				expect(r2 == 2);
+
+				/* Additional test with 41 and F5 after valid 2-bytes sequence */
+				buf[3] = '\0';
+				buf[2] = 'A';
+				expect(valid_utf8(buf) == 2);
+				buf[2] = 0xF5;
+				expect(valid_utf8(buf) == 0);
+
+				continue;
+			}
+
+			expect(r2 == 0);
+			for (int c3 = 0x79, r3; c3 < 256; c3++) {
+				if (0 == trailing_sparse_check[c3])
+					continue; /* run code below sparsely */
+				buf[2] = c3;
+				buf[3] = '\0';
+				r3 = valid_utf8(buf);
+
+				if ((buf[0] == 0xE0 &&
+				     0xA0 <= buf[1] && buf[1] < 0xC0 &&
+				     is_trailing(buf[2])) ||
+
+				    (buf[0] == 0xED &&
+				     0x80 <= buf[1] && buf[1] < 0xA0 &&
+				     is_trailing(buf[2])) ||
+
+				    (0xE1 <= buf[0] && buf[0] < 0xF0 && buf[0] != 0xED &&
+				     is_trailing(buf[1]) &&
+				     is_trailing(buf[2]))) {
+
+					expect(r3 == 2);
+					continue;
+				}
+
+				expect(r3 == 0);
+				for (int c4 = 0x79, r4; c4 < 256; c4++) {
+					if (0 == trailing_sparse_check[c4])
+						continue; /* run code below sparsely */
+					buf[3] = c4;
+					buf[4] = '\0';
+					r4 = valid_utf8(buf);
+
+					if ((buf[0] == 0xF0 &&
+					     0x90 <= buf[1] && buf[1] < 0xC0 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    (buf[0] == 0xF4 &&
+					     0x80 <= buf[1] && buf[1] < 0x90 &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3])) ||
+
+					    ((buf[0] == 0xF1 || buf[0] == 0xF2 || buf[0] == 0xF3) &&
+					     is_trailing(buf[1]) &&
+					     is_trailing(buf[2]) &&
+					     is_trailing(buf[3]))) {
+
+						expect(r4 == 2);
+						continue;
+					}
+
+					expect(r4 == 0);
+					for (int c5 = 0x79; c5 < 256; c5 += 32) {
+						/* We test only a few values for the fifth byte. */
+						buf[4] = c5;
+						buf[5] = '\0';
+						expect(valid_utf8(buf) == 0);
+					}
+				}
+			}
+		}
+	}
+}
+
+void test_valid_utf8()
+{
+	start_test(__FUNCTION__);
+	failed = 0;
+	_test_valid_utf8();
+	end_test();
+}
+
+#undef expect
+#undef is_trailing
+#undef valid_utf8
diff --git a/src/tests/unit-tests.c b/src/tests/unit-tests.c
@@ -41,6 +41,7 @@
 #include "../misc.h"
 #include "../memory.h"
 #include "../common.h"
+#include "../unicode.h"
 
 #include "../sha2.h"
 
@@ -2439,6 +2440,9 @@ void test_sha2_c() {
 	end_test();
 }
 
+/* Tests for unicode.c */
+#include "test_valid_utf8.c"
+
 int main() {
 	start_of_run = clock();
 
@@ -2493,6 +2497,9 @@ int main() {
 	set_unit_test_source("sha2.c");
 	test_sha2_c();
 
+	set_unit_test_source("unicode.c");
+	test_valid_utf8();
+
 	// perform dump listing of all processed functions.
 	dump_stats();
 

diff --git a/src/unicode.c b/src/unicode.c
@@ -534,7 +534,7 @@ inline size_t strlen_any(const void *source)
 int valid_utf8(const UTF8 *source)
 {
 	UTF8 a;
-	int length, ret = 1;
+	int ret = 1;
 	const UTF8 *srcptr;
 
 	while (*source) {
@@ -543,35 +543,37 @@ int valid_utf8(const UTF8 *source)
 			continue;
 		}
 
-		length = opt_trailingBytesUTF8[*source & 0x3f] + 1;
-		srcptr = source + length;
-
-		switch (length) {
-		default:
+		if (*source < 0xC2)
 			return 0;
-			/* Everything else falls through when valid */
-		case 4:
-			if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-		case 3:
-			if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-		case 2:
-			if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
+
+		srcptr = source;
+
+		if (*source >= 0xE0) { /* 3+ bytes */
+			if (*source >= 0xF0) { /* 4+ bytes */
+
+				if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;
+
+				if (*source > 0xF4) return 0;
+
+				switch (*source) {
+				case 0xF0: if (a < 0x90) return 0; break;
+				case 0xF4: if (a > 0x8F) return 0;
+				}
+
+			} /* end of specific handling for 4+ bytes */
+
+			if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;
 
 			switch (*source) {
-				/* no fall-through in this inner switch */
 			case 0xE0: if (a < 0xA0) return 0; break;
-			case 0xED: if (a > 0x9F) return 0; break;
-			case 0xF0: if (a < 0x90) return 0; break;
-			case 0xF4: if (a > 0x8F) return 0;
+			case 0xED: if (a > 0x9F) return 0;
 			}
-
-		case 1:
-			if (*source >= 0x80 && *source < 0xC2) return 0;
 		}
-		if (*source > 0xF4)
-			return 0;
+		/* 2 bytes or "fall-through" with handled beginning of 3-4 bytes */
+
+		if ((a = (*++srcptr)) < 0x80 || a > 0xBF) return 0;
 
-		source += length;
+		source = srcptr + 1;
 		ret++;
 	}
 	return ret;

diff --git a/src/unicode.h b/src/unicode.h
@@ -34,7 +34,10 @@
 #include <stdint.h>
 #include <stdlib.h>
 
+#ifndef UNICODE_NO_OPTIONS
 #include "options.h"
+#endif
+
 #include "common.h"
 #include "jumbo.h"
 
@@ -229,8 +232,54 @@ extern void truncate_utf8(UTF8 *string, int len);
  * returns > 1 if data is valid and in fact contains UTF-8 sequences
  *
  * Actually in the last case, the return is the number of proper UTF-8
- * sequences, so it can be used as a quality measure. A low number might be
- * a false positive, a high number most probably isn't.
+ * sequences (plus one and not counting ASCII), so it can be used as a
+ * quality measure. A low number might be a false positive, a high
+ * number most probably isn't.
+ *
+ *
+ * Related info about UTF-8
+ *
+ * Valid UTF-8 sequences of bytes (1-4 bytes long):
+ *
+ * 00..7F
+ *
+ * C2..DF  80..BF
+ *
+ *   E0    A0..BF  80..BF
+ *   ED    80..9F  80..BF
+ *   Ex    80..BF  80..BF  where Ex does not include E0 and ED
+ *
+ *   F0    90..BF  80..BF  80..BF
+ *   F4    80..8F  80..BF  80..BF  notice 8F as upper bound in the second byte
+ * F1..F3  80..BF  80..BF  80..BF
+ *
+ * (X..Y denotes range from X to Y inclusive. X and Y are byte
+ * values written in hex.)
+ *
+ * Incomplete sequences are invalid.
+ *
+ * Range 80..BF is for trailing bytes (also called continuation
+ * bytes). It is not a valid starting byte. Adjacent values C0 and C1
+ * could be considered starting 2-bytes sequences but they are not
+ * valid in UTF-8.
+ *
+ * Each of E0,ED,F0,F4 starting bytes use a sub-range for trailing
+ * byte at the second position. E0/ED use different halves of the
+ * range for the second byte. F0/F4 allow the second byte in other
+ * proportions (48:16), not overlapping too.
+ *
+ * Valid UTF-8 text cannot include C0,C1,F5..FF bytes at any position.
+ * 80..BF are invalid at starting position.
+ * 00..7F,C2..F4 are invalid at any trailing position (actually they
+ * invalidate previous char while new starting byte itself could be a
+ * part of a valid char, but even then the whole string would be
+ * invalid for purposes of valid_utf8()).
+ *
+ * See also  https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
+ *
+ * Sequences for unallocated, unassigned, reserved (including
+ * noncharacters) code points are considered valid. See here:
+ * https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Noncharacters
  */
 extern int valid_utf8(const UTF8 *source);
 
@@ -318,10 +367,16 @@ extern UTF8 CP_isUpper[0x100];
 extern UTF8 CP_isSeparator[0x100];
 extern UTF8 CP_isDigit[0x100];
 
+#ifndef UNICODE_NO_OPTIONS
 /* These are encoding-aware but not LC_CTYPE */
 #define enc_islower(c) (options.internal_cp == ENC_RAW ? (c >= 'a' && c <= 'z') : CP_isLower[ARCH_INDEX(c)])
 #define enc_isupper(c) (options.internal_cp == ENC_RAW ? (c >= 'A' && c <= 'Z') : CP_isUpper[ARCH_INDEX(c)])
 #define enc_isdigit(c) (options.internal_cp == ENC_RAW ? (c >= '0' && c <= '9') : CP_isDigit[ARCH_INDEX(c)])
+#else
+#define enc_islower(c) (c >= 'a' && c <= 'z')
+#define enc_isupper(c) (c >= 'A' && c <= 'Z')
+#define enc_isdigit(c) (c >= '0' && c <= '9')
+#endif
 #define enc_tolower(c) (char)CP_down[ARCH_INDEX(c)]
 #define enc_toupper(c) (char)CP_up[ARCH_INDEX(c)]