Implemented valid UTF-8 character checks

wazuh · Sep 26, 2024 · 31c4c73 · 31c4c73
1 parent 22600e0
commit 31c4c73
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 9 deletions.
diff --git a/src/common/utf8_op/src/utf8_op.c b/src/common/utf8_op/src/utf8_op.c
@@ -15,21 +15,21 @@
 #define REPLACEMENT_INC 4096
 
 /* Single byte: 0xxxxxxx (high bit of the first byte is clear) */
-#define valid_1(x) (x[0] & 0x80) == 0
+#define valid_1(x) (((x)[0] & 0x80) == 0)
 
 /* Two bytes: 110xxxxx 10xxxxxx */
 /* Starting bytes 0xC0 and 0xC1 are forbidden (overlong encodings of code points below U+0080) */
-#define valid_2(x) (x[0] & 0xE0) == 0xC0 && (x[0] & 0x1E) != 0 && (x[1] & 0xC0) == 0x80
+#define valid_2(x) (((x)[0] & 0xE0) == 0xC0 && (x)[0] >= (char)0xC2 && ((x)[1] & 0xC0) == 0x80)
 
 /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
 /* 0xE0 followed by a second byte in 0x80-0x9F is an overlong encoding */
 /* 0xED followed by a second byte in 0xA0-0xBF (range U+D800–U+DFFF) is reserved for UTF-16 surrogate halves */
-#define valid_3(x) (x[0] & 0xF0) == 0xE0 && x[0] != (char)0xE0 && x[0] != (char)0xED && (x[1] & 0xC0) == 0x80 && (x[2] & 0xC0) == 0x80
+#define valid_3(x) (((x)[0] & 0xF0) == 0xE0 && (((x)[0] != (char)0xE0 || ((x)[1] & 0xE0) != 0x80) && ((x)[0] != (char)0xED || ((x)[1] & 0xE0) != 0xA0)) && ((x)[1] & 0xC0) == 0x80 && ((x)[2] & 0xC0) == 0x80)
 
 /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 /* 0xF0 followed by a second byte in 0x80-0x8F is an overlong encoding */
 /* Starting bytes 0xF5-0xF7 are forbidden, and 0xF4 is valid only with a second byte in 0x80-0x8F (Unicode limit U+10FFFF) */
-#define valid_4(x) (x[0] & 0xF8) == 0xF0 && x[0] != (char)0xF0 && (x[0] & 0x04) == 0 && (x[1] & 0xC0) == 0x80 && (x[2] & 0xC0) == 0x80 && (x[3] & 0xC0) == 0x80
+#define valid_4(x) (((x)[0] & 0xF8) == 0xF0 && ((x)[0] & 0x07) <= 0x04 && (((x)[0] != (char)0xF0 || ((x)[1] & 0xF0) != 0x80) && ((x)[0] != (char)0xF4 || ((x)[1] & 0xF0) == 0x80)) && ((x)[1] & 0xC0) == 0x80 && ((x)[2] & 0xC0) == 0x80 && ((x)[3] & 0xC0) == 0x80)
 
 /* Return whether a string is UTF-8 */
 bool w_utf8_valid(const char * string) {

diff --git a/src/common/utf8_op/tests/unit/tests/test_utf8_op.c b/src/common/utf8_op/tests/unit/tests/test_utf8_op.c
@@ -18,7 +18,57 @@
 #include "../../headers/shared.h"
 #include "../wrappers/common.h"
 
-// Tests
+/*
+ * Test helper: passes `input` through w_utf8_filter() with the given
+ * `replacement` flag, then asserts that w_utf8_valid() on the filtered copy
+ * returns 1 when `expect_valid` is true and 0 otherwise.
+ * The filtered copy is freed before returning.
+ */
+void assert_valid_utf8(const char *input, bool replacement, bool expect_valid) {
+    char *output = w_utf8_filter(input, replacement);
+    const int is_valid = w_utf8_valid(output);
+    const int expected = expect_valid ? 1 : 0;
+
+    assert_int_equal(is_valid, expected);
+    free(output);
+}
+
+/*
+ * Well-formed sequences: each entry is expected to validate after filtering,
+ * both with and without replacement.
+ */
+void test_valid_utf8_sequences(void **state)
+{
+    /* NULL-terminated table of well-formed inputs */
+    const char * valid_sequences[] = {
+        "Hello, World!",      // ASCII only (1 byte per character)
+        "\xC3\x9C",           // Ü (U+00DC), 2 bytes
+        "\xC3\xBC",           // ü (U+00FC), 2 bytes
+        "\xE2\x98\x83",       // ☃ (U+2603), 3 bytes
+        "\xF0\x9F\x98\x81",   // 😁 (U+1F601), 4 bytes
+        "Σὲ γνωρίζω",         // Greek text (mixed multi-byte sequences)
+        "中文字符",            // Chinese characters (3 bytes each)
+        NULL                  // End-of-table marker
+    };
+    const char ** seq = valid_sequences;
+
+    while (*seq != NULL) {
+        assert_valid_utf8(*seq, false, true);
+        assert_valid_utf8(*seq, true, true);
+        ++seq;
+    }
+}
+
+/*
+ * Malformed sequences: each entry is expected to fail validation when
+ * filtered without replacement and to pass when filtered with replacement.
+ */
+void test_invalid_utf8_sequences(void **state)
+{
+    /* NULL-terminated table of malformed inputs */
+    const char * invalid_sequences[] = {
+        "\xC0\xAF",           // Overlong 2-byte encoding of '/' (U+002F)
+        "\xE0\x80\xAF",       // Overlong 3-byte encoding of '/' (U+002F)
+        "\xED\xA0\x80",       // UTF-16 surrogate half U+D800 (not allowed in UTF-8)
+        "\xF8\x88\x80\x80\x80", // 5-byte sequence (UTF-8 stops at 4 bytes)
+        "\xFF",               // Byte that never appears in UTF-8
+        "\x80",               // Continuation byte with no lead byte
+        "\xC3\x28",           // 2-byte lead followed by a non-continuation byte
+        NULL                  // End-of-table marker
+    };
+    const char ** seq = invalid_sequences;
+
+    while (*seq != NULL) {
+        assert_valid_utf8(*seq, false, false);
+        assert_valid_utf8(*seq, true, true); // With replacement the filtered output is expected to be valid
+        ++seq;
+    }
+}
 
 void test_utf8_random_replace(void **state)
 {
@@ -38,6 +88,10 @@ void test_utf8_random_replace(void **state)
 
     char * copy = w_utf8_filter(buffer, true);
     int r = w_utf8_valid(copy);
+
+    /* Check if the output is valid */
+    assert_int_equal(r, 1);
+
     free(copy);
 }
 
@@ -50,7 +104,6 @@ void test_utf8_random_not_replace(void **state)
     randombytes(buffer, LENGTH - 1);
 
     /* Avoid zeroes */
-
     for (i = 0; i < LENGTH - 1; i++) {
         buffer[i] = buffer[i] ? buffer[i] : '0';
     }
@@ -59,13 +112,33 @@ void test_utf8_random_not_replace(void **state)
 
     char * copy = w_utf8_filter(buffer, false);
     int r = w_utf8_valid(copy);
+
+    /* The result could be either valid or invalid */
+    (void)r; // Use (void) to avoid unused variable warning in case you don't assert
+
     free(copy);
 }
 
+/*
+ * Upper boundary of the Unicode range: the encoding of U+10FFFF is expected
+ * to be valid, the next 4-byte encoding (U+110000) is expected to be invalid.
+ */
+void test_utf8_edge_cases(void **state)
+{
+    const char * edge_cases[] = {
+        "\xF4\x8F\xBF\xBF", // U+10FFFF, the highest valid code point
+        "\xF4\x90\x80\x80", // First encoding past U+10FFFF (invalid)
+        NULL
+    };
+    const char * highest_valid = edge_cases[0];
+    const char * beyond_limit = edge_cases[1];
+
+    assert_valid_utf8(highest_valid, false, true);
+    assert_valid_utf8(beyond_limit, false, false);
+}
+
 int main(void) { // Test program entry point
     const struct CMUnitTest tests[] = { // Table of test cases passed to cmocka_run_group_tests() below
-            cmocka_unit_test(test_utf8_random_replace),
-            cmocka_unit_test(test_utf8_random_not_replace),
+        cmocka_unit_test(test_valid_utf8_sequences),
+        cmocka_unit_test(test_invalid_utf8_sequences),
+        cmocka_unit_test(test_utf8_random_replace),
+        cmocka_unit_test(test_utf8_random_not_replace),
+        cmocka_unit_test(test_utf8_edge_cases),
     };
     return cmocka_run_group_tests(tests, NULL, NULL); // The runner's result becomes the process exit status
-}
+}