From 0671e92ced71187690c81db645279d39ecf92e16 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 16 Oct 2018 20:38:50 +0200 Subject: [PATCH 1/8] :construction: proposal for different error handlers #1198 Proof of concept; currently only as parameter to the internal dump_escaped function; that is, not yet exposed to the dump function. --- include/nlohmann/detail/output/serializer.hpp | 67 +++++++++++++++++-- single_include/nlohmann/json.hpp | 67 +++++++++++++++++-- 2 files changed, 120 insertions(+), 14 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index bb74a86e..7adf0c2f 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -39,6 +39,14 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: + /// how to treat decoding errors + enum class error_handler_t + { + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences + }; + /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use @@ -278,10 +286,12 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences + @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. */ - void dump_escaped(const string_t& s, const bool ensure_ascii) + void dump_escaped(const string_t& s, const bool ensure_ascii, + const error_handler_t error_handler = error_handler_t::strict) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; @@ -389,9 +399,33 @@ class serializer case UTF8_REJECT: // decode found invalid UTF-8 byte { - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + } + + case error_handler_t::ignore: + { + state = UTF8_ACCEPT; + continue; + } + + case error_handler_t::replace: + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + state = UTF8_ACCEPT; + continue; + } + } } default: // decode found yet incomplete multi-byte code point @@ -417,9 +451,28 @@ class serializer else { // we finish reading, but do not accept: string was incomplete - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", static_cast(s.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + } + + case error_handler_t::ignore: + { + break; + } + + case error_handler_t::replace: + { + // write buffer, but replace last byte + o->write_characters(string_buffer.data(), bytes - 1); + o->write_characters("\\ufffd", 6); + break; + } + } } } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index dc206d30..c4681d7b 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9991,6 +9991,14 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: + /// how to treat decoding errors + enum class error_handler_t + { + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences + }; + /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use @@ -10230,10 +10238,12 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences + @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. */ - void dump_escaped(const string_t& s, const bool ensure_ascii) + void dump_escaped(const string_t& s, const bool ensure_ascii, + const error_handler_t error_handler = error_handler_t::strict) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; @@ -10341,9 +10351,33 @@ class serializer case UTF8_REJECT: // decode found invalid UTF-8 byte { - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn)); + } + + case error_handler_t::ignore: + { + state = UTF8_ACCEPT; + continue; + } + + case error_handler_t::replace: + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + state = UTF8_ACCEPT; + continue; + } + } } default: // decode found yet incomplete multi-byte code point @@ -10369,9 +10403,28 @@ class serializer else { // we finish reading, but do not accept: string was incomplete - std::string sn(3, '\0'); - snprintf(&sn[0], sn.size(), "%.2X", static_cast(s.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + switch (error_handler) + { + case error_handler_t::strict: + { + std::string sn(3, '\0'); + snprintf(&sn[0], sn.size(), "%.2X", static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn)); + } + + case error_handler_t::ignore: + { + break; + } + + case error_handler_t::replace: + { + // write buffer, but replace last byte + o->write_characters(string_buffer.data(), bytes - 1); + o->write_characters("\\ufffd", 6); + break; + } + } } } From c5821d91e51f9f2b233689e3f7d3f274cc8d27fc Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 21 Oct 2018 11:49:37 +0200 Subject: [PATCH 2/8] :construction: overworked error handlers #1198 --- include/nlohmann/detail/output/serializer.hpp | 88 ++++++++++----- include/nlohmann/json.hpp | 17 ++- single_include/nlohmann/json.hpp | 105 ++++++++++++------ test/src/unit-serialization.cpp | 37 ++++++ 4 files changed, 181 insertions(+), 66 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index 7adf0c2f..3ccca482 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -28,6 +28,14 @@ namespace detail // serialization // /////////////////// +/// how to treat decoding errors +enum class error_handler_t +{ + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences +}; + template class serializer { @@ -39,23 +47,20 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: - /// how to treat decoding errors - enum class error_handler_t - { - strict, ///< throw a type_error exception in case of invalid UTF-8 - replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences - }; - /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use + @param[in] error_handler_ how to react on decoding errors */ - serializer(output_adapter_t s, const char ichar) - : o(std::move(s)), loc(std::localeconv()), - thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), - decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), - indent_char(ichar), indent_string(512, indent_char) + serializer(output_adapter_t s, const char ichar, + error_handler_t error_handler_ = error_handler_t::strict) + : o(std::move(s)) + , loc(std::localeconv()) + , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)) + , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)) + , indent_char(ichar) + , indent_string(512, indent_char) + , error_handler(error_handler_) {} // delete because of pointer members @@ -286,17 +291,18 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences - @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. */ - void dump_escaped(const string_t& s, const bool ensure_ascii, - const error_handler_t error_handler = error_handler_t::strict) + void dump_escaped(const string_t& s, const bool ensure_ascii) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; std::size_t bytes = 0; // number of bytes written to string_buffer + // number of bytes written at the point of the last valid byte + std::size_t bytes_after_last_accept = 0; + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -394,6 +400,9 @@ class serializer o->write_characters(string_buffer.data(), bytes); bytes = 0; } + + // remember the byte position of this accept + bytes_after_last_accept = bytes; break; } @@ -409,19 +418,33 @@ class serializer } case error_handler_t::ignore: - { - state = UTF8_ACCEPT; - continue; - } - case error_handler_t::replace: { - string_buffer[bytes++] = '\\'; - string_buffer[bytes++] = 'u'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'd'; + // in case we saw this chatacter the first time, we + // would like to read it again, because the byte + // may be OK for itself, but just not OK for the + // previous sequence + if (bytes_after_last_accept != bytes) + { + --i; + } + + // reset length buffer to the last accepted index; + // thus removing/ignoring the invalid characters + bytes = bytes_after_last_accept; + + if (error_handler == error_handler_t::replace) + { + // add a replacement character + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + + // continue processing the string state = UTF8_ACCEPT; continue; } @@ -440,6 +463,7 @@ class serializer } } + // we finished processing the string if (JSON_LIKELY(state == UTF8_ACCEPT)) { // write buffer @@ -462,13 +486,16 @@ class serializer case error_handler_t::ignore: { + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); break; } case error_handler_t::replace: { - // write buffer, but replace last byte - o->write_characters(string_buffer.data(), bytes - 1); + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + // add a replacement character o->write_characters("\\ufffd", 6); break; } @@ -682,6 +709,9 @@ class serializer const char indent_char; /// the indentation string string_t indent_string; + + /// error_handler how to react on decoding errors + const error_handler_t error_handler; }; } // namespace detail } // namespace nlohmann diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp index 619fd9e6..a5f7d47d 100644 --- a/include/nlohmann/json.hpp +++ b/include/nlohmann/json.hpp @@ -208,6 +208,8 @@ class basic_json using json_pointer = ::nlohmann::json_pointer; template using json_serializer = JSONSerializer; + /// how to treat decoding errors + using error_handler_t = detail::error_handler_t; /// helper type for initializer lists of basic_json values using initializer_list_t = std::initializer_list>; @@ -1932,6 +1934,10 @@ class basic_json @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters in the output are escaped with `\uXXXX` sequences, and the result consists of ASCII characters only. + @param[in] error_handler how to react on decoding errors; there are three + possible values: `strict` (throws and exception in case a decoding error + occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), + and `ignore` (ignore invalid UTF-8 sequences during serialization). @return string containing the serialization of the JSON value @@ -1950,13 +1956,16 @@ class basic_json @see https://docs.python.org/2/library/json.html#json.dump @since version 1.0.0; indentation character @a indent_char, option - @a ensure_ascii and exceptions added in version 3.0.0 + @a ensure_ascii and exceptions added in version 3.0.0; error + handlers added in version 3.4.0. */ - string_t dump(const int indent = -1, const char indent_char = ' ', - const bool ensure_ascii = false) const + string_t dump(const int indent = -1, + const char indent_char = ' ', + const bool ensure_ascii = false, + const error_handler_t error_handler = error_handler_t::strict) const { string_t result; - serializer s(detail::output_adapter(result), indent_char); + serializer s(detail::output_adapter(result), indent_char, error_handler); if (indent >= 0) { diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index c4681d7b..b3a4487e 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9980,6 +9980,14 @@ namespace detail // serialization // /////////////////// +/// how to treat decoding errors +enum class error_handler_t +{ + strict, ///< throw a type_error exception in case of invalid UTF-8 + replace, ///< replace invalid UTF-8 sequences with U+FFFD + ignore ///< ignore invalid UTF-8 sequences +}; + template class serializer { @@ -9991,23 +9999,20 @@ class serializer static constexpr uint8_t UTF8_REJECT = 1; public: - /// how to treat decoding errors - enum class error_handler_t - { - strict, ///< throw a type_error exception in case of invalid UTF-8 - replace, ///< replace invalid UTF-8 sequences with U+FFFD - ignore ///< ignore invalid UTF-8 sequences - }; - /*! @param[in] s output stream to serialize to @param[in] ichar indentation character to use + @param[in] error_handler_ how to react on decoding errors */ - serializer(output_adapter_t s, const char ichar) - : o(std::move(s)), loc(std::localeconv()), - thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), - decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), - indent_char(ichar), indent_string(512, indent_char) + serializer(output_adapter_t s, const char ichar, + error_handler_t error_handler_ = error_handler_t::strict) + : o(std::move(s)) + , loc(std::localeconv()) + , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)) + , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)) + , indent_char(ichar) + , indent_string(512, indent_char) + , error_handler(error_handler_) {} // delete because of pointer members @@ -10238,17 +10243,18 @@ class serializer @param[in] s the string to escape @param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences - @param[in] error_handler how to react on decoding errors @complexity Linear in the length of string @a s. */ - void dump_escaped(const string_t& s, const bool ensure_ascii, - const error_handler_t error_handler = error_handler_t::strict) + void dump_escaped(const string_t& s, const bool ensure_ascii) { uint32_t codepoint; uint8_t state = UTF8_ACCEPT; std::size_t bytes = 0; // number of bytes written to string_buffer + // number of bytes written at the point of the last valid byte + std::size_t bytes_after_last_accept = 0; + for (std::size_t i = 0; i < s.size(); ++i) { const auto byte = static_cast(s[i]); @@ -10346,6 +10352,9 @@ class serializer o->write_characters(string_buffer.data(), bytes); bytes = 0; } + + // remember the byte position of this accept + bytes_after_last_accept = bytes; break; } @@ -10361,19 +10370,33 @@ class serializer } case error_handler_t::ignore: - { - state = UTF8_ACCEPT; - continue; - } - case error_handler_t::replace: { - string_buffer[bytes++] = '\\'; - string_buffer[bytes++] = 'u'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'd'; + // in case we saw this chatacter the first time, we + // would like to read it again, because the byte + // may be OK for itself, but just not OK for the + // previous sequence + if (bytes_after_last_accept != bytes) + { + --i; + } + + // reset length buffer to the last accepted index; + // thus removing/ignoring the invalid characters + bytes = bytes_after_last_accept; + + if (error_handler == error_handler_t::replace) + { + // add a replacement character + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + + // continue processing the string state = UTF8_ACCEPT; continue; } @@ -10392,6 +10415,7 @@ class serializer } } + // we finished processing the string if (JSON_LIKELY(state == UTF8_ACCEPT)) { // write buffer @@ -10414,13 +10438,16 @@ class serializer case error_handler_t::ignore: { + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); break; } case error_handler_t::replace: { - // write buffer, but replace last byte - o->write_characters(string_buffer.data(), bytes - 1); + // write all accepted bytes + o->write_characters(string_buffer.data(), bytes_after_last_accept); + // add a replacement character o->write_characters("\\ufffd", 6); break; } @@ -10634,6 +10661,9 @@ class serializer const char indent_char; /// the indentation string string_t indent_string; + + /// error_handler how to react on decoding errors + const error_handler_t error_handler; }; } // namespace detail } // namespace nlohmann @@ -11603,6 +11633,8 @@ class basic_json using json_pointer = ::nlohmann::json_pointer; template using json_serializer = JSONSerializer; + /// how to treat decoding errors + using error_handler_t = detail::error_handler_t; /// helper type for initializer lists of basic_json values using initializer_list_t = std::initializer_list>; @@ -13327,6 +13359,10 @@ class basic_json @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters in the output are escaped with `\uXXXX` sequences, and the result consists of ASCII characters only. + @param[in] error_handler how to react on decoding errors; there are three + possible values: `strict` (throws and exception in case a decoding error + occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), + and `ignore` (ignore invalid UTF-8 sequences during serialization). @return string containing the serialization of the JSON value @@ -13345,13 +13381,16 @@ class basic_json @see https://docs.python.org/2/library/json.html#json.dump @since version 1.0.0; indentation character @a indent_char, option - @a ensure_ascii and exceptions added in version 3.0.0 + @a ensure_ascii and exceptions added in version 3.0.0; error + handlers added in version 3.4.0. */ - string_t dump(const int indent = -1, const char indent_char = ' ', - const bool ensure_ascii = false) const + string_t dump(const int indent = -1, + const char indent_char = ' ', + const bool ensure_ascii = false, + const error_handler_t error_handler = error_handler_t::strict) const { string_t result; - serializer s(detail::output_adapter(result), indent_char); + serializer s(detail::output_adapter(result), indent_char, error_handler); if (indent >= 0) { diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp index 0eed7246..e4069f73 100644 --- a/test/src/unit-serialization.cpp +++ b/test/src/unit-serialization.cpp @@ -94,4 +94,41 @@ TEST_CASE("serialization") "[\n\t\"foo\",\n\t1,\n\t2,\n\t3,\n\tfalse,\n\t{\n\t\t\"one\": 1\n\t}\n]"); } } + + SECTION("dump") + { + SECTION("invalid character") + { + json j = "ä\xA9ü"; + + CHECK_THROWS_AS(j.dump(), json::type_error&); + CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); + CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); + CHECK_THROWS_WITH(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\\ufffdü\""); + } + + SECTION("ending with incomplete character") + { + json j = "123\xC2"; + + CHECK_THROWS_AS(j.dump(), json::type_error&); + CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); + CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd\""); + } + + SECTION("unexpected character") + { + json j = "123\xF1\xB0\x34\x35\x36"; + + CHECK_THROWS_AS(j.dump(), json::type_error&); + CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34"); + CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd456\""); + } + } } From e5dce641159874b9485d4c8a310b82e34b7aca18 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 21 Oct 2018 23:26:25 +0200 Subject: [PATCH 3/8] :green_heart: added tests #1198 Test every prefix of Unicode sequences against the different dump functions. --- include/nlohmann/detail/output/serializer.hpp | 1 + single_include/nlohmann/json.hpp | 1 + test/src/unit-unicode.cpp | 120 ++++++++++++++++++ 3 files changed, 122 insertions(+) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index 3ccca482..c022c307 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -442,6 +442,7 @@ class serializer string_buffer[bytes++] = 'f'; string_buffer[bytes++] = 'f'; string_buffer[bytes++] = 'd'; + bytes_after_last_accept = bytes; } // continue processing the string diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index b3a4487e..4e86fafc 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -10394,6 +10394,7 @@ class serializer string_buffer[bytes++] = 'f'; string_buffer[bytes++] = 'f'; string_buffer[bytes++] = 'd'; + bytes_after_last_accept = bytes; } // continue processing the string diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp index 4def1a3e..c71f6f1d 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode.cpp @@ -39,6 +39,79 @@ using nlohmann::json; extern size_t calls; size_t calls = 0; +void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4); + +void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1) +{ + std::string json_string; + + CAPTURE(byte1); + CAPTURE(byte2); + CAPTURE(byte3); + CAPTURE(byte4); + + json_string += std::string(1, static_cast(byte1)); + + if (byte2 != -1) + { + json_string += std::string(1, static_cast(byte2)); + } + + if (byte3 != -1) + { + json_string += std::string(1, static_cast(byte3)); + } + + if (byte4 != -1) + { + json_string += std::string(1, static_cast(byte4)); + } + + CAPTURE(json_string); + + // store the string in a JSON value + json j = json_string; + json j2 = "abc" + json_string + "xyz"; + + // dumping with ignore/replace must not throw in any case + auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); + auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore); + auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace); + auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); + + if (success_expected) + { + // strict mode must not throw if success is expected + auto s_strict = j.dump(); + // all dumps should agree on the string + CHECK(s_strict == s_ignored); + CHECK(s_strict == s_replaced); + + // check that ignore/replace string does not contain a replacement character + CHECK(s_ignored.find("\\ufffd") == std::string::npos); + CHECK(s_replaced.find("\\ufffd") == std::string::npos); + } + else + { + // strict mode must throw if success is not expected + CHECK_THROWS_AS(j.dump(), json::type_error&); + // ignore and replace must create different dumps + CHECK(s_ignored != s_replaced); + + // check that ignore string does not contain a replacement character + CHECK(s_ignored.find("\\ufffd") == std::string::npos); + // check that replace string contains a replacement character + CHECK(s_replaced.find("\\ufffd") != std::string::npos); + + } + + // check that prefix and suffix are preserved + CHECK(s_ignored2.substr(1, 3) == "abc"); + CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); + CHECK(s_replaced2.substr(1, 3) == "abc"); + CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz"); +} + void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4); // create and check a JSON string with up to four UTF-8 bytes @@ -115,11 +188,13 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -152,6 +227,7 @@ TEST_CASE("Unicode", "[hide]") // all other characters are OK check_utf8string(true, byte1); + check_utf8dump(true, byte1); } } } @@ -165,6 +241,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) { check_utf8string(true, byte1, byte2); + check_utf8dump(true, byte1, byte2); } } } @@ -174,6 +251,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -190,6 +268,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -206,6 +285,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(true, byte1, byte2, byte3); + check_utf8dump(true, byte1, byte2, byte3); } } } @@ -216,6 +296,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -226,6 +307,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -245,6 +327,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -265,6 +348,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -282,6 +366,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(true, byte1, byte2, byte3); + check_utf8dump(true, byte1, byte2, byte3); } } } @@ -292,6 +377,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -302,6 +388,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -321,6 +408,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -341,6 +429,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -358,6 +447,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(true, byte1, byte2, byte3); + check_utf8dump(true, byte1, byte2, byte3); } } } @@ -368,6 +458,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xED; byte1 <= 0xED; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -378,6 +469,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -397,6 +489,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -417,6 +510,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -434,6 +528,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(true, byte1, byte2, byte3); + check_utf8dump(true, byte1, byte2, byte3); } } } @@ -444,6 +539,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -454,6 +550,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -473,6 +570,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -493,6 +591,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -512,6 +611,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(true, byte1, byte2, byte3, byte4); + check_utf8dump(true, byte1, byte2, byte3, byte4); } } } @@ -523,6 +623,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -533,6 +634,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -546,6 +648,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -568,6 +671,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -591,6 +695,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -614,6 +719,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -634,6 +740,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(true, byte1, byte2, byte3, byte4); + check_utf8dump(true, byte1, byte2, byte3, byte4); } } } @@ -645,6 +752,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -655,6 +763,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -668,6 +777,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -690,6 +800,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -713,6 +824,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -736,6 +848,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -756,6 +869,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(true, byte1, byte2, byte3, byte4); + check_utf8dump(true, byte1, byte2, byte3, byte4); } } } @@ -767,6 +881,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1) { check_utf8string(false, byte1); + check_utf8dump(false, byte1); } } @@ -777,6 +892,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2) { check_utf8string(false, byte1, byte2); + check_utf8dump(false, byte1, byte2); } } } @@ -790,6 +906,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3) { check_utf8string(false, byte1, byte2, byte3); + check_utf8dump(false, byte1, byte2, byte3); } } } @@ -812,6 +929,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -835,6 +953,7 @@ TEST_CASE("Unicode", "[hide]") for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4) { check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } @@ -858,6 +977,7 @@ TEST_CASE("Unicode", "[hide]") } check_utf8string(false, byte1, byte2, byte3, byte4); + check_utf8dump(false, byte1, byte2, byte3, byte4); } } } From c7af027cbb2edc39bb35ed5881adfac3ac45f14d Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 22 Oct 2018 09:18:16 +0200 Subject: [PATCH 4/8] :construction: respect ensure_ascii parameter #1198 --- include/nlohmann/detail/output/serializer.hpp | 30 ++++++++++++++----- single_include/nlohmann/json.hpp | 30 ++++++++++++++----- test/src/unit-unicode.cpp | 8 +---- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index c022c307..d21cd8fe 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -436,12 +436,21 @@ class serializer if (error_handler == error_handler_t::replace) { // add a replacement character - string_buffer[bytes++] = '\\'; - string_buffer[bytes++] = 'u'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'd'; + if (ensure_ascii) + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + else + { + string_buffer[bytes++] = '\xEF'; + string_buffer[bytes++] = '\xBF'; + string_buffer[bytes++] = '\xBD'; + } bytes_after_last_accept = bytes; } @@ -497,7 +506,14 @@ class serializer // write all accepted bytes o->write_characters(string_buffer.data(), bytes_after_last_accept); // add a replacement character - o->write_characters("\\ufffd", 6); + if (ensure_ascii) + { + o->write_characters("\\ufffd", 6); + } + else + { + o->write_characters("\xEF\xBF\xBD", 3); + } break; } } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 4e86fafc..f1335cd4 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -10388,12 +10388,21 @@ class serializer if (error_handler == error_handler_t::replace) { // add a replacement character - string_buffer[bytes++] = '\\'; - string_buffer[bytes++] = 'u'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'f'; - string_buffer[bytes++] = 'd'; + if (ensure_ascii) + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'u'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'f'; + string_buffer[bytes++] = 'd'; + } + else + { + string_buffer[bytes++] = '\xEF'; + string_buffer[bytes++] = '\xBF'; + string_buffer[bytes++] = '\xBD'; + } bytes_after_last_accept = bytes; } @@ -10449,7 +10458,14 @@ class serializer // write all accepted bytes o->write_characters(string_buffer.data(), bytes_after_last_accept); // add a replacement character - o->write_characters("\\ufffd", 6); + if (ensure_ascii) + { + o->write_characters("\\ufffd", 6); + } + else + { + o->write_characters("\xEF\xBF\xBD", 3); + } break; } } diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp index c71f6f1d..acc384bd 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode.cpp @@ -86,10 +86,6 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 // all dumps should agree on the string CHECK(s_strict == s_ignored); CHECK(s_strict == s_replaced); - - // check that ignore/replace string does not contain a replacement character - CHECK(s_ignored.find("\\ufffd") == std::string::npos); - CHECK(s_replaced.find("\\ufffd") == std::string::npos); } else { @@ -98,10 +94,8 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 // ignore and replace must create different dumps CHECK(s_ignored != s_replaced); - // check that ignore string does not contain a replacement character - CHECK(s_ignored.find("\\ufffd") == std::string::npos); // check that replace string contains a replacement character - CHECK(s_replaced.find("\\ufffd") != std::string::npos); + CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); } From c51b1e6fabd676dcd84c327a99586778855b7bda Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 22 Oct 2018 15:53:36 +0200 Subject: [PATCH 5/8] :construction: fixed an issue with ensure_ascii #1198 --- include/nlohmann/detail/output/serializer.hpp | 7 ++++++- single_include/nlohmann/json.hpp | 7 ++++++- test/src/unit-unicode.cpp | 8 ++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index d21cd8fe..f1c0b051 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -302,6 +302,7 @@ class serializer // number of bytes written at the point of the last valid byte std::size_t bytes_after_last_accept = 0; + std::size_t undumped_chars = 0; for (std::size_t i = 0; i < s.size(); ++i) { @@ -403,6 +404,7 @@ class serializer // remember the byte position of this accept bytes_after_last_accept = bytes; + undumped_chars = 0; break; } @@ -424,7 +426,7 @@ class serializer // would like to read it again, because the byte // may be OK for itself, but just not OK for the // previous sequence - if (bytes_after_last_accept != bytes) + if (undumped_chars > 0) { --i; } @@ -454,6 +456,8 @@ class serializer bytes_after_last_accept = bytes; } + undumped_chars = 0; + // continue processing the string state = UTF8_ACCEPT; continue; @@ -468,6 +472,7 @@ class serializer // code point will not be escaped - copy byte to buffer string_buffer[bytes++] = s[i]; } + ++undumped_chars; break; } } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index f1335cd4..025dfeaf 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -10254,6 +10254,7 @@ class serializer // number of bytes written at the point of the last valid byte std::size_t bytes_after_last_accept = 0; + std::size_t undumped_chars = 0; for (std::size_t i = 0; i < s.size(); ++i) { @@ -10355,6 +10356,7 @@ class serializer // remember the byte position of this accept bytes_after_last_accept = bytes; + undumped_chars = 0; break; } @@ -10376,7 +10378,7 @@ class serializer // would like to read it again, because the byte // may be OK for itself, but just not OK for the // previous sequence - if (bytes_after_last_accept != bytes) + if (undumped_chars > 0) { --i; } @@ -10406,6 +10408,8 @@ class serializer bytes_after_last_accept = bytes; } + undumped_chars = 0; + // continue processing the string state = UTF8_ACCEPT; continue; @@ -10420,6 +10424,7 @@ class serializer // code point will not be escaped - copy byte to buffer string_buffer[bytes++] = s[i]; } + ++undumped_chars; break; } } diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp index acc384bd..fe16eb0d 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode.cpp @@ -76,8 +76,12 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 // dumping with ignore/replace must not throw in any case auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore); auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore); + auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore); + auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore); auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace); auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace); + auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace); + auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace); if (success_expected) { @@ -102,8 +106,12 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 // check that prefix and suffix are preserved CHECK(s_ignored2.substr(1, 3) == "abc"); CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz"); + CHECK(s_ignored2_ascii.substr(1, 3) == "abc"); + CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz"); CHECK(s_replaced2.substr(1, 3) == "abc"); CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz"); + CHECK(s_replaced2_ascii.substr(1, 3) == "abc"); + CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz"); } void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4); From 951a7a64559a746e4cc6e99c0e59a7a00bd72394 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 22 Oct 2018 18:20:45 +0200 Subject: [PATCH 6/8] :construction: fixed test cases #1198 --- test/src/unit-serialization.cpp | 9 ++++++--- test/src/unit-unicode.cpp | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp index e4069f73..0255edc9 100644 --- a/test/src/unit-serialization.cpp +++ b/test/src/unit-serialization.cpp @@ -106,7 +106,8 @@ TEST_CASE("serialization") CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); CHECK_THROWS_WITH(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\""); - CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\\ufffdü\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\""); } SECTION("ending with incomplete character") @@ -117,7 +118,8 @@ TEST_CASE("serialization") CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\""); - CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\""); } SECTION("unexpected character") @@ -128,7 +130,8 @@ TEST_CASE("serialization") CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34"); CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); - CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd456\""); + CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\""); + CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\""); } } } diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp index fe16eb0d..a8a0a938 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode.cpp @@ -100,7 +100,6 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 // check that replace string contains a replacement character CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos); - } // check that prefix and suffix are preserved From 2343d9caeb12cb0047762c38f22cf428eea61d7f Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 23 Oct 2018 17:22:13 +0200 Subject: [PATCH 7/8] :green_heart: additional tests from the Unicode spec #1198 Thanks @abolz! --- test/src/unit-serialization.cpp | 36 +++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp index 0255edc9..1fe796e5 100644 --- a/test/src/unit-serialization.cpp +++ b/test/src/unit-serialization.cpp @@ -133,5 +133,41 @@ TEST_CASE("serialization") CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\""); CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\""); } + + SECTION("U+FFFD Substitution of Maximal Subparts") + { + // Some tests (mostly) from + // https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf + // Section 3.9 -- U+FFFD Substitution of Maximal Subparts + + auto test = [&](std::string const & input, std::string const & expected) + { + json j = input; + CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"" + expected + "\""); + }; + + test("\xC2", "\\ufffd"); + test("\xC2\x41\x42", "\\ufffd" "\x41" "\x42"); + test("\xC2\xF4", "\\ufffd" "\\ufffd"); + + test("\xF0\x80\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + test("\xF1\x80\x80\x41", "\\ufffd" "\x41"); + test("\xF2\x80\x80\x41", "\\ufffd" "\x41"); + test("\xF3\x80\x80\x41", "\\ufffd" "\x41"); + test("\xF4\x80\x80\x41", "\\ufffd" "\x41"); + test("\xF5\x80\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + + test("\xF0\x90\x80\x41", "\\ufffd" "\x41"); + test("\xF1\x90\x80\x41", "\\ufffd" "\x41"); + test("\xF2\x90\x80\x41", "\\ufffd" "\x41"); + test("\xF3\x90\x80\x41", "\\ufffd" "\x41"); + test("\xF4\x90\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + test("\xF5\x90\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + + test("\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + test("\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + test("\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41" "\\ufffd""\\ufffd" "\x42"); + test("\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41"); + } } } From 87ef3f25f2bb325e96db0e4b167d63c45f682af5 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 23 Oct 2018 22:56:10 +0200 Subject: [PATCH 8/8] :pencil2: fixed a typo #1314 --- include/nlohmann/detail/output/serializer.hpp | 2 +- single_include/nlohmann/json.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp index f1c0b051..1d107ce0 100644 --- a/include/nlohmann/detail/output/serializer.hpp +++ b/include/nlohmann/detail/output/serializer.hpp @@ -422,7 +422,7 @@ class serializer case error_handler_t::ignore: case error_handler_t::replace: { - // in case we saw this chatacter the first time, we + // in case we saw this character the first time, we // would like to read it again, because the byte // may be OK for itself, but just not OK for the // previous sequence diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 025dfeaf..be3f8417 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -10374,7 +10374,7 @@ class serializer case error_handler_t::ignore: case error_handler_t::replace: { - // in case we saw this chatacter the first time, we + // in case we saw this character the first time, we // would like to read it again, because the byte // may be OK for itself, but just not OK for the // previous sequence