🚧 overworked error handlers #1198
This commit is contained in:
		
							parent
							
								
									0671e92ced
								
							
						
					
					
						commit
						c5821d91e5
					
				
					 4 changed files with 181 additions and 66 deletions
				
			
		|  | @ -28,6 +28,14 @@ namespace detail | |||
| // serialization //
 | ||||
| ///////////////////
 | ||||
| 
 | ||||
| /// how to treat decoding errors
 | ||||
| enum class error_handler_t | ||||
| { | ||||
|     strict,  ///< throw a type_error exception in case of invalid UTF-8
 | ||||
|     replace, ///< replace invalid UTF-8 sequences with U+FFFD
 | ||||
|     ignore   ///< ignore invalid UTF-8 sequences
 | ||||
| }; | ||||
| 
 | ||||
| template<typename BasicJsonType> | ||||
| class serializer | ||||
| { | ||||
|  | @ -39,23 +47,20 @@ class serializer | |||
|     static constexpr uint8_t UTF8_REJECT = 1; | ||||
| 
 | ||||
|   public: | ||||
|     /// how to treat decoding errors
 | ||||
|     enum class error_handler_t | ||||
|     { | ||||
|         strict,  ///< throw a type_error exception in case of invalid UTF-8
 | ||||
|         replace, ///< replace invalid UTF-8 sequences with U+FFFD
 | ||||
|         ignore   ///< ignore invalid UTF-8 sequences
 | ||||
|     }; | ||||
| 
 | ||||
|     /*!
 | ||||
|     @param[in] s  output stream to serialize to | ||||
|     @param[in] ichar  indentation character to use | ||||
|     @param[in] error_handler_  how to react on decoding errors | ||||
|     */ | ||||
|     serializer(output_adapter_t<char> s, const char ichar) | ||||
|         : o(std::move(s)), loc(std::localeconv()), | ||||
|           thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), | ||||
|           decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), | ||||
|           indent_char(ichar), indent_string(512, indent_char) | ||||
|     serializer(output_adapter_t<char> s, const char ichar, | ||||
|                error_handler_t error_handler_ = error_handler_t::strict) | ||||
|         : o(std::move(s)) | ||||
|         , loc(std::localeconv()) | ||||
|         , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)) | ||||
|         , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)) | ||||
|         , indent_char(ichar) | ||||
|         , indent_string(512, indent_char) | ||||
|         , error_handler(error_handler_) | ||||
|     {} | ||||
| 
 | ||||
|     // delete because of pointer members
 | ||||
|  | @ -286,17 +291,18 @@ class serializer | |||
|     @param[in] s  the string to escape | ||||
|     @param[in] ensure_ascii  whether to escape non-ASCII characters with | ||||
|                              \uXXXX sequences | ||||
|     @param[in] error_handler how to react on decoding errors | ||||
| 
 | ||||
|     @complexity Linear in the length of string @a s. | ||||
|     */ | ||||
|     void dump_escaped(const string_t& s, const bool ensure_ascii, | ||||
|                       const error_handler_t error_handler = error_handler_t::strict) | ||||
|     void dump_escaped(const string_t& s, const bool ensure_ascii) | ||||
|     { | ||||
|         uint32_t codepoint; | ||||
|         uint8_t state = UTF8_ACCEPT; | ||||
|         std::size_t bytes = 0;  // number of bytes written to string_buffer
 | ||||
| 
 | ||||
|         // number of bytes written at the point of the last valid byte
 | ||||
|         std::size_t bytes_after_last_accept = 0; | ||||
| 
 | ||||
|         for (std::size_t i = 0; i < s.size(); ++i) | ||||
|         { | ||||
|             const auto byte = static_cast<uint8_t>(s[i]); | ||||
|  | @ -394,6 +400,9 @@ class serializer | |||
|                         o->write_characters(string_buffer.data(), bytes); | ||||
|                         bytes = 0; | ||||
|                     } | ||||
| 
 | ||||
|                     // remember the byte position of this accept
 | ||||
|                     bytes_after_last_accept = bytes; | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|  | @ -409,19 +418,33 @@ class serializer | |||
|                         } | ||||
| 
 | ||||
|                         case error_handler_t::ignore: | ||||
|                         { | ||||
|                             state = UTF8_ACCEPT; | ||||
|                             continue; | ||||
|                         } | ||||
| 
 | ||||
|                         case error_handler_t::replace: | ||||
|                         { | ||||
|                             string_buffer[bytes++] = '\\'; | ||||
|                             string_buffer[bytes++] = 'u'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'd'; | ||||
|                             // in case we saw this chatacter the first time, we
 | ||||
|                             // would like to read it again, because the byte
 | ||||
|                             // may be OK for itself, but just not OK for the
 | ||||
|                             // previous sequence
 | ||||
|                             if (bytes_after_last_accept != bytes) | ||||
|                             { | ||||
|                                 --i; | ||||
|                             } | ||||
| 
 | ||||
|                             // reset length buffer to the last accepted index;
 | ||||
|                             // thus removing/ignoring the invalid characters
 | ||||
|                             bytes = bytes_after_last_accept; | ||||
| 
 | ||||
|                             if (error_handler == error_handler_t::replace) | ||||
|                             { | ||||
|                                 // add a replacement character
 | ||||
|                                 string_buffer[bytes++] = '\\'; | ||||
|                                 string_buffer[bytes++] = 'u'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'd'; | ||||
|                             } | ||||
| 
 | ||||
|                             // continue processing the string
 | ||||
|                             state = UTF8_ACCEPT; | ||||
|                             continue; | ||||
|                         } | ||||
|  | @ -440,6 +463,7 @@ class serializer | |||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // we finished processing the string
 | ||||
|         if (JSON_LIKELY(state == UTF8_ACCEPT)) | ||||
|         { | ||||
|             // write buffer
 | ||||
|  | @ -462,13 +486,16 @@ class serializer | |||
| 
 | ||||
|                 case error_handler_t::ignore: | ||||
|                 { | ||||
|                     // write all accepted bytes
 | ||||
|                     o->write_characters(string_buffer.data(), bytes_after_last_accept); | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|                 case error_handler_t::replace: | ||||
|                 { | ||||
|                     // write buffer, but replace last byte
 | ||||
|                     o->write_characters(string_buffer.data(), bytes - 1); | ||||
|                     // write all accepted bytes
 | ||||
|                     o->write_characters(string_buffer.data(), bytes_after_last_accept); | ||||
|                     // add a replacement character
 | ||||
|                     o->write_characters("\\ufffd", 6); | ||||
|                     break; | ||||
|                 } | ||||
|  | @ -682,6 +709,9 @@ class serializer | |||
|     const char indent_char; | ||||
|     /// the indentation string
 | ||||
|     string_t indent_string; | ||||
| 
 | ||||
|     /// error_handler how to react on decoding errors
 | ||||
|     const error_handler_t error_handler; | ||||
| }; | ||||
| }  // namespace detail
 | ||||
| }  // namespace nlohmann
 | ||||
|  |  | |||
|  | @ -208,6 +208,8 @@ class basic_json | |||
|     using json_pointer = ::nlohmann::json_pointer<basic_json>; | ||||
|     template<typename T, typename SFINAE> | ||||
|     using json_serializer = JSONSerializer<T, SFINAE>; | ||||
|     /// how to treat decoding errors
 | ||||
|     using error_handler_t = detail::error_handler_t; | ||||
|     /// helper type for initializer lists of basic_json values
 | ||||
|     using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>; | ||||
| 
 | ||||
|  | @ -1932,6 +1934,10 @@ class basic_json | |||
|     @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters | ||||
|     in the output are escaped with `\uXXXX` sequences, and the result consists | ||||
|     of ASCII characters only. | ||||
|     @param[in] error_handler  how to react on decoding errors; there are three | ||||
|     possible values: `strict` (throws and exception in case a decoding error | ||||
|     occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), | ||||
|     and `ignore` (ignore invalid UTF-8 sequences during serialization). | ||||
| 
 | ||||
|     @return string containing the serialization of the JSON value | ||||
| 
 | ||||
|  | @ -1950,13 +1956,16 @@ class basic_json | |||
|     @see https://docs.python.org/2/library/json.html#json.dump
 | ||||
| 
 | ||||
|     @since version 1.0.0; indentation character @a indent_char, option | ||||
|            @a ensure_ascii and exceptions added in version 3.0.0 | ||||
|            @a ensure_ascii and exceptions added in version 3.0.0; error | ||||
|            handlers added in version 3.4.0. | ||||
|     */ | ||||
|     string_t dump(const int indent = -1, const char indent_char = ' ', | ||||
|                   const bool ensure_ascii = false) const | ||||
|     string_t dump(const int indent = -1, | ||||
|                   const char indent_char = ' ', | ||||
|                   const bool ensure_ascii = false, | ||||
|                   const error_handler_t error_handler = error_handler_t::strict) const | ||||
|     { | ||||
|         string_t result; | ||||
|         serializer s(detail::output_adapter<char, string_t>(result), indent_char); | ||||
|         serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler); | ||||
| 
 | ||||
|         if (indent >= 0) | ||||
|         { | ||||
|  |  | |||
|  | @ -9980,6 +9980,14 @@ namespace detail | |||
| // serialization //
 | ||||
| ///////////////////
 | ||||
| 
 | ||||
| /// how to treat decoding errors
 | ||||
| enum class error_handler_t | ||||
| { | ||||
|     strict,  ///< throw a type_error exception in case of invalid UTF-8
 | ||||
|     replace, ///< replace invalid UTF-8 sequences with U+FFFD
 | ||||
|     ignore   ///< ignore invalid UTF-8 sequences
 | ||||
| }; | ||||
| 
 | ||||
| template<typename BasicJsonType> | ||||
| class serializer | ||||
| { | ||||
|  | @ -9991,23 +9999,20 @@ class serializer | |||
|     static constexpr uint8_t UTF8_REJECT = 1; | ||||
| 
 | ||||
|   public: | ||||
|     /// how to treat decoding errors
 | ||||
|     enum class error_handler_t | ||||
|     { | ||||
|         strict,  ///< throw a type_error exception in case of invalid UTF-8
 | ||||
|         replace, ///< replace invalid UTF-8 sequences with U+FFFD
 | ||||
|         ignore   ///< ignore invalid UTF-8 sequences
 | ||||
|     }; | ||||
| 
 | ||||
|     /*!
 | ||||
|     @param[in] s  output stream to serialize to | ||||
|     @param[in] ichar  indentation character to use | ||||
|     @param[in] error_handler_  how to react on decoding errors | ||||
|     */ | ||||
|     serializer(output_adapter_t<char> s, const char ichar) | ||||
|         : o(std::move(s)), loc(std::localeconv()), | ||||
|           thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), | ||||
|           decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), | ||||
|           indent_char(ichar), indent_string(512, indent_char) | ||||
|     serializer(output_adapter_t<char> s, const char ichar, | ||||
|                error_handler_t error_handler_ = error_handler_t::strict) | ||||
|         : o(std::move(s)) | ||||
|         , loc(std::localeconv()) | ||||
|         , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)) | ||||
|         , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)) | ||||
|         , indent_char(ichar) | ||||
|         , indent_string(512, indent_char) | ||||
|         , error_handler(error_handler_) | ||||
|     {} | ||||
| 
 | ||||
|     // delete because of pointer members
 | ||||
|  | @ -10238,17 +10243,18 @@ class serializer | |||
|     @param[in] s  the string to escape | ||||
|     @param[in] ensure_ascii  whether to escape non-ASCII characters with | ||||
|                              \uXXXX sequences | ||||
|     @param[in] error_handler how to react on decoding errors | ||||
| 
 | ||||
|     @complexity Linear in the length of string @a s. | ||||
|     */ | ||||
|     void dump_escaped(const string_t& s, const bool ensure_ascii, | ||||
|                       const error_handler_t error_handler = error_handler_t::strict) | ||||
|     void dump_escaped(const string_t& s, const bool ensure_ascii) | ||||
|     { | ||||
|         uint32_t codepoint; | ||||
|         uint8_t state = UTF8_ACCEPT; | ||||
|         std::size_t bytes = 0;  // number of bytes written to string_buffer
 | ||||
| 
 | ||||
|         // number of bytes written at the point of the last valid byte
 | ||||
|         std::size_t bytes_after_last_accept = 0; | ||||
| 
 | ||||
|         for (std::size_t i = 0; i < s.size(); ++i) | ||||
|         { | ||||
|             const auto byte = static_cast<uint8_t>(s[i]); | ||||
|  | @ -10346,6 +10352,9 @@ class serializer | |||
|                         o->write_characters(string_buffer.data(), bytes); | ||||
|                         bytes = 0; | ||||
|                     } | ||||
| 
 | ||||
|                     // remember the byte position of this accept
 | ||||
|                     bytes_after_last_accept = bytes; | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|  | @ -10361,19 +10370,33 @@ class serializer | |||
|                         } | ||||
| 
 | ||||
|                         case error_handler_t::ignore: | ||||
|                         { | ||||
|                             state = UTF8_ACCEPT; | ||||
|                             continue; | ||||
|                         } | ||||
| 
 | ||||
|                         case error_handler_t::replace: | ||||
|                         { | ||||
|                             string_buffer[bytes++] = '\\'; | ||||
|                             string_buffer[bytes++] = 'u'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'f'; | ||||
|                             string_buffer[bytes++] = 'd'; | ||||
|                             // in case we saw this chatacter the first time, we
 | ||||
|                             // would like to read it again, because the byte
 | ||||
|                             // may be OK for itself, but just not OK for the
 | ||||
|                             // previous sequence
 | ||||
|                             if (bytes_after_last_accept != bytes) | ||||
|                             { | ||||
|                                 --i; | ||||
|                             } | ||||
| 
 | ||||
|                             // reset length buffer to the last accepted index;
 | ||||
|                             // thus removing/ignoring the invalid characters
 | ||||
|                             bytes = bytes_after_last_accept; | ||||
| 
 | ||||
|                             if (error_handler == error_handler_t::replace) | ||||
|                             { | ||||
|                                 // add a replacement character
 | ||||
|                                 string_buffer[bytes++] = '\\'; | ||||
|                                 string_buffer[bytes++] = 'u'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'f'; | ||||
|                                 string_buffer[bytes++] = 'd'; | ||||
|                             } | ||||
| 
 | ||||
|                             // continue processing the string
 | ||||
|                             state = UTF8_ACCEPT; | ||||
|                             continue; | ||||
|                         } | ||||
|  | @ -10392,6 +10415,7 @@ class serializer | |||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // we finished processing the string
 | ||||
|         if (JSON_LIKELY(state == UTF8_ACCEPT)) | ||||
|         { | ||||
|             // write buffer
 | ||||
|  | @ -10414,13 +10438,16 @@ class serializer | |||
| 
 | ||||
|                 case error_handler_t::ignore: | ||||
|                 { | ||||
|                     // write all accepted bytes
 | ||||
|                     o->write_characters(string_buffer.data(), bytes_after_last_accept); | ||||
|                     break; | ||||
|                 } | ||||
| 
 | ||||
|                 case error_handler_t::replace: | ||||
|                 { | ||||
|                     // write buffer, but replace last byte
 | ||||
|                     o->write_characters(string_buffer.data(), bytes - 1); | ||||
|                     // write all accepted bytes
 | ||||
|                     o->write_characters(string_buffer.data(), bytes_after_last_accept); | ||||
|                     // add a replacement character
 | ||||
|                     o->write_characters("\\ufffd", 6); | ||||
|                     break; | ||||
|                 } | ||||
|  | @ -10634,6 +10661,9 @@ class serializer | |||
|     const char indent_char; | ||||
|     /// the indentation string
 | ||||
|     string_t indent_string; | ||||
| 
 | ||||
|     /// error_handler how to react on decoding errors
 | ||||
|     const error_handler_t error_handler; | ||||
| }; | ||||
| }  // namespace detail
 | ||||
| }  // namespace nlohmann
 | ||||
|  | @ -11603,6 +11633,8 @@ class basic_json | |||
|     using json_pointer = ::nlohmann::json_pointer<basic_json>; | ||||
|     template<typename T, typename SFINAE> | ||||
|     using json_serializer = JSONSerializer<T, SFINAE>; | ||||
|     /// how to treat decoding errors
 | ||||
|     using error_handler_t = detail::error_handler_t; | ||||
|     /// helper type for initializer lists of basic_json values
 | ||||
|     using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>; | ||||
| 
 | ||||
|  | @ -13327,6 +13359,10 @@ class basic_json | |||
|     @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters | ||||
|     in the output are escaped with `\uXXXX` sequences, and the result consists | ||||
|     of ASCII characters only. | ||||
|     @param[in] error_handler  how to react on decoding errors; there are three | ||||
|     possible values: `strict` (throws and exception in case a decoding error | ||||
|     occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD), | ||||
|     and `ignore` (ignore invalid UTF-8 sequences during serialization). | ||||
| 
 | ||||
|     @return string containing the serialization of the JSON value | ||||
| 
 | ||||
|  | @ -13345,13 +13381,16 @@ class basic_json | |||
|     @see https://docs.python.org/2/library/json.html#json.dump
 | ||||
| 
 | ||||
|     @since version 1.0.0; indentation character @a indent_char, option | ||||
|            @a ensure_ascii and exceptions added in version 3.0.0 | ||||
|            @a ensure_ascii and exceptions added in version 3.0.0; error | ||||
|            handlers added in version 3.4.0. | ||||
|     */ | ||||
|     string_t dump(const int indent = -1, const char indent_char = ' ', | ||||
|                   const bool ensure_ascii = false) const | ||||
|     string_t dump(const int indent = -1, | ||||
|                   const char indent_char = ' ', | ||||
|                   const bool ensure_ascii = false, | ||||
|                   const error_handler_t error_handler = error_handler_t::strict) const | ||||
|     { | ||||
|         string_t result; | ||||
|         serializer s(detail::output_adapter<char, string_t>(result), indent_char); | ||||
|         serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler); | ||||
| 
 | ||||
|         if (indent >= 0) | ||||
|         { | ||||
|  |  | |||
|  | @ -94,4 +94,41 @@ TEST_CASE("serialization") | |||
|                   "[\n\t\"foo\",\n\t1,\n\t2,\n\t3,\n\tfalse,\n\t{\n\t\t\"one\": 1\n\t}\n]"); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     SECTION("dump") | ||||
|     { | ||||
|         SECTION("invalid character") | ||||
|         { | ||||
|             json j = "ä\xA9ü"; | ||||
| 
 | ||||
|             CHECK_THROWS_AS(j.dump(), json::type_error&); | ||||
|             CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); | ||||
|             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); | ||||
|             CHECK_THROWS_WITH(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9"); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\""); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\\ufffdü\""); | ||||
|         } | ||||
| 
 | ||||
|         SECTION("ending with incomplete character") | ||||
|         { | ||||
|             json j = "123\xC2"; | ||||
| 
 | ||||
|             CHECK_THROWS_AS(j.dump(), json::type_error&); | ||||
|             CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2"); | ||||
|             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\""); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd\""); | ||||
|         } | ||||
| 
 | ||||
|         SECTION("unexpected character") | ||||
|         { | ||||
|             json j = "123\xF1\xB0\x34\x35\x36"; | ||||
| 
 | ||||
|             CHECK_THROWS_AS(j.dump(), json::type_error&); | ||||
|             CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34"); | ||||
|             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\""); | ||||
|             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd456\""); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue