nlohmann · nlohmann · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
@@ -25,10 +25,11 @@ and `ensure_ascii` parameters.
     result consists of ASCII characters only.
 
 `error_handler` (in)
-:   how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md):
-    `strict` (throws and exception in case a decoding error occurs; default), `replace` (replace invalid UTF-8 sequences
-    with U+FFFD), and `ignore` (ignore invalid UTF-8 sequences during serialization; all bytes are copied to the output
-    unchanged)).
+:   how to react on decoding errors; there are three possible values (see [`error_handler_t`](error_handler_t.md)):
+:   - `strict`: throws a [`type_error`](../../home/exceptions.md#type-errors) exception in case a decoding error occurs (this is the default),
+    - `replace`: replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER),
+    - `ignore`: ignore invalid UTF-8 sequences during serialization (i.e., these bytes are skipped and not copied to the output), and
+    - `keep`: keep invalid UTF-8 sequences during serialization (i.e., all bytes are copied to the output unchanged)
 
 ## Return value
 
@@ -77,3 +78,4 @@ Binary values are serialized as object containing two keys:
 - Indentation character `indent_char`, option `ensure_ascii` and exceptions added in version 3.0.0.
 - Error handlers added in version 3.4.0.
 - Serialization of binary values added in version 3.8.0.
+- Added support for error handler value `keep` in version ???.
@@ -12,13 +12,16 @@ This enumeration is used in the [`dump`](dump.md) function to choose how to trea
 `basic_json` value. Three values are differentiated:
 
 strict
-:   throw a `type_error` exception in case of invalid UTF-8
+:   throw a [`type_error`](../../home/exceptions.md#type-errors) exception in case of invalid UTF-8
 
 replace
 :   replace invalid UTF-8 sequences with U+FFFD (� REPLACEMENT CHARACTER)
 
 ignore
-:   ignore invalid UTF-8 sequences; all bytes are copied to the output unchanged
+:   ignore invalid UTF-8 sequences; these bytes are skipped and not copied to the output
+
+keep
+:   keep invalid UTF-8 sequences; all bytes are copied to the output unchanged
 
 ## Examples
 
@@ -40,3 +43,4 @@ ignore
 ## Version history
 
 - Added in version 3.4.0.
+- Added value `keep` in version ???.
@@ -44,7 +44,8 @@ enum class error_handler_t
 {
     strict,  ///< throw a type_error exception in case of invalid UTF-8
     replace, ///< replace invalid UTF-8 sequences with U+FFFD
-    ignore   ///< ignore invalid UTF-8 sequences
+    ignore,  ///< ignore invalid UTF-8 sequences
+    keep     ///< keep invalid UTF-8 sequences
 };
 
 template<typename BasicJsonType>
@@ -398,6 +399,13 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
+        {
+            o->write_characters(s.data(), s.size());
+            return;
+        }
+
         for (std::size_t i = 0; i < s.size(); ++i)
         {
             const auto byte = static_cast<std::uint8_t>(s[i]);
@@ -567,7 +575,23 @@ class serializer
                             break;
                         }
 
-                        default:            // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // copy undumped chars to string buffer
+                            for (std::size_t j = 0; j < undumped_chars; ++j)
+                            {
+                                string_buffer[bytes++] = s[bytes_after_last_accept + j];
+                            }
+
+                            // add erroneous byte to string buffer
+                            string_buffer[bytes++] = s[i];
+
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
+                        default:                // LCOV_EXCL_LINE
                             JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
                     break;
@@ -605,6 +629,20 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // copy undumped chars to string buffer
+                    for (std::size_t j = 0; j < undumped_chars; ++j)
+                    {
+                        string_buffer[bytes++] = s[bytes_after_last_accept + j];
+                    }
+                    undumped_chars = 0;
+
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes);
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -628,8 +666,8 @@ class serializer
                     break;
                 }
 
-                default:            // LCOV_EXCL_LINE
-                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+                default:                    // LCOV_EXCL_LINE
+                    JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
         }
     }

@@ -18606,7 +18606,8 @@ enum class error_handler_t
 {
     strict,  ///< throw a type_error exception in case of invalid UTF-8
     replace, ///< replace invalid UTF-8 sequences with U+FFFD
-    ignore   ///< ignore invalid UTF-8 sequences
+    ignore,  ///< ignore invalid UTF-8 sequences
+    keep     ///< keep invalid UTF-8 sequences
 };
 
 template<typename BasicJsonType>
@@ -18960,6 +18961,13 @@ class serializer
         std::size_t bytes_after_last_accept = 0;
         std::size_t undumped_chars = 0;
 
+        // copy string as-is if error handler is set to keep, and we don't want to ensure ASCII
+        if (error_handler == error_handler_t::keep && !ensure_ascii)
+        {
+            o->write_characters(s.data(), s.size());
+            return;
+        }
+
         for (std::size_t i = 0; i < s.size(); ++i)
         {
             const auto byte = static_cast<std::uint8_t>(s[i]);
@@ -19129,7 +19137,23 @@ class serializer
                             break;
                         }
 
-                        default:            // LCOV_EXCL_LINE
+                        case error_handler_t::keep:
+                        {
+                            // copy undumped chars to string buffer
+                            for (std::size_t j = 0; j < undumped_chars; ++j)
+                            {
+                                string_buffer[bytes++] = s[bytes_after_last_accept + j];
+                            }
+
+                            // add erroneous byte to string buffer
+                            string_buffer[bytes++] = s[i];
+
+                            // continue processing the string
+                            state = UTF8_ACCEPT;
+                            break;
+                        }
+
+                        default:                // LCOV_EXCL_LINE
                             JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
                     }
                     break;
@@ -19167,6 +19191,20 @@ class serializer
                     JSON_THROW(type_error::create(316, concat("incomplete UTF-8 string; last byte: 0x", hex_bytes(static_cast<std::uint8_t>(s.back() | 0))), nullptr));
                 }
 
+                case error_handler_t::keep:
+                {
+                    // copy undumped chars to string buffer
+                    for (std::size_t j = 0; j < undumped_chars; ++j)
+                    {
+                        string_buffer[bytes++] = s[bytes_after_last_accept + j];
+                    }
+                    undumped_chars = 0;
+
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes);
+                    break;
+                }
+
                 case error_handler_t::ignore:
                 {
                     // write all accepted bytes
@@ -19190,8 +19228,8 @@ class serializer
                     break;
                 }
 
-                default:            // LCOV_EXCL_LINE
-                    JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+                default:                    // LCOV_EXCL_LINE
+                    JSON_ASSERT(false);     // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
             }
         }
     }

@@ -995,6 +995,15 @@ TEST_CASE("regression tests 2")
         CHECK(p.x == 1);
         CHECK(p.y == 2);
     }
+
+    SECTION("issue #4552 - UTF-8 invalid characters are not always ignored when dumping with error_handler_t::ignore")
+    {
+        nlohmann::json node;
+        node["test"] = "test\334\005";
+        CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::ignore) == "{\"test\":\"test\\u0005\"}");
+        CHECK(node.dump(-1, ' ', false, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}");
+        CHECK(node.dump(-1, ' ', true, nlohmann::json::error_handler_t::keep) == "{\"test\":\"test\334\005\"}");
+    }
 }
 
 DOCTEST_CLANG_SUPPRESS_WARNING_POP
@@ -86,8 +86,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK_THROWS_WITH_AS(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9", json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"\\u00e4\\u00fc\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"ä\xA9ü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"\\u00e4\xA9\\u00fc\"");
         }
 
         SECTION("ending with incomplete character")
@@ -97,8 +100,11 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2", json::type_error&);
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xC2\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xC2\"");
         }
 
         SECTION("unexpected character")
@@ -107,9 +113,14 @@ TEST_CASE("serialization")
 
             CHECK_THROWS_WITH_AS(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34", json::type_error&);
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
+
+            // see pending discussion at #4452
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::ignore) == "\"123456\"");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::keep) == "\"123\xF1\xB0\x34\x35\x36\"");
         }
 
         SECTION("U+FFFD Substitution of Maximal Subparts")

@@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     static std::string s_replaced2;
     static std::string s_replaced_ascii;
     static std::string s_replaced2_ascii;
+    static std::string s_kept;
 
     // dumping with ignore/replace must not throw in any case
     s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
     s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
     s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+    s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
 
     if (success_expected)
     {
@@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
     }
 
+    // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
+    CHECK(json_string == s_kept.substr(1, json_string.size()));
+
     // check that prefix and suffix are preserved
     CHECK(s_ignored2.substr(1, 3) == "abc");
     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

@@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     static std::string s_replaced2;
     static std::string s_replaced_ascii;
     static std::string s_replaced2_ascii;
+    static std::string s_kept;
 
     // dumping with ignore/replace must not throw in any case
     s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
     s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
     s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+    s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
 
     if (success_expected)
     {
@@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
     }
 
+    // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
+    CHECK(json_string == s_kept.substr(1, json_string.size()));
+
     // check that prefix and suffix are preserved
     CHECK(s_ignored2.substr(1, 3) == "abc");
     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

@@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     static std::string s_replaced2;
     static std::string s_replaced_ascii;
     static std::string s_replaced2_ascii;
+    static std::string s_kept;
 
     // dumping with ignore/replace must not throw in any case
     s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
     s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
     s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+    s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
 
     if (success_expected)
     {
@@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
     }
 
+    // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
+    CHECK(json_string == s_kept.substr(1, json_string.size()));
+
     // check that prefix and suffix are preserved
     CHECK(s_ignored2.substr(1, 3) == "abc");
     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");

@@ -74,6 +74,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     static std::string s_replaced2;
     static std::string s_replaced_ascii;
     static std::string s_replaced2_ascii;
+    static std::string s_kept;
 
     // dumping with ignore/replace must not throw in any case
     s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
@@ -84,6 +85,7 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
     s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
     s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
+    s_kept = j.dump(-1, ' ', false, json::error_handler_t::keep);
 
     if (success_expected)
     {
@@ -105,6 +107,9 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
     }
 
+    // check if the string is unchanged (ignoring the quotes) if error_handler_t::keep is used
+    CHECK(json_string == s_kept.substr(1, json_string.size()));
+
     // check that prefix and suffix are preserved
     CHECK(s_ignored2.substr(1, 3) == "abc");
     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");