🚧 overworked error handlers #1198

This commit is contained in:
Niels Lohmann 2018-10-21 11:49:37 +02:00
parent 0671e92ced
commit c5821d91e5
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
4 changed files with 181 additions and 66 deletions

View file

@ -28,6 +28,14 @@ namespace detail
// serialization //
///////////////////
/// how to treat decoding errors
enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
};
template<typename BasicJsonType>
class serializer
{
@ -39,23 +47,20 @@ class serializer
static constexpr uint8_t UTF8_REJECT = 1;
public:
/// how to treat decoding errors
enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
};
/*!
@param[in] s output stream to serialize to
@param[in] ichar indentation character to use
@param[in] error_handler_ how to react on decoding errors
*/
serializer(output_adapter_t<char> s, const char ichar)
: o(std::move(s)), loc(std::localeconv()),
thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
indent_char(ichar), indent_string(512, indent_char)
serializer(output_adapter_t<char> s, const char ichar,
error_handler_t error_handler_ = error_handler_t::strict)
: o(std::move(s))
, loc(std::localeconv())
, thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
, decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
, indent_char(ichar)
, indent_string(512, indent_char)
, error_handler(error_handler_)
{}
// delete because of pointer members
@ -286,17 +291,18 @@ class serializer
@param[in] s the string to escape
@param[in] ensure_ascii whether to escape non-ASCII characters with
\uXXXX sequences
@param[in] error_handler how to react on decoding errors
@complexity Linear in the length of string @a s.
*/
void dump_escaped(const string_t& s, const bool ensure_ascii,
const error_handler_t error_handler = error_handler_t::strict)
void dump_escaped(const string_t& s, const bool ensure_ascii)
{
uint32_t codepoint;
uint8_t state = UTF8_ACCEPT;
std::size_t bytes = 0; // number of bytes written to string_buffer
// number of bytes written at the point of the last valid byte
std::size_t bytes_after_last_accept = 0;
for (std::size_t i = 0; i < s.size(); ++i)
{
const auto byte = static_cast<uint8_t>(s[i]);
@ -394,6 +400,9 @@ class serializer
o->write_characters(string_buffer.data(), bytes);
bytes = 0;
}
// remember the byte position of this accept
bytes_after_last_accept = bytes;
break;
}
@ -409,19 +418,33 @@ class serializer
}
case error_handler_t::ignore:
{
state = UTF8_ACCEPT;
continue;
}
case error_handler_t::replace:
{
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
// in case we saw this chatacter the first time, we
// would like to read it again, because the byte
// may be OK for itself, but just not OK for the
// previous sequence
if (bytes_after_last_accept != bytes)
{
--i;
}
// reset length buffer to the last accepted index;
// thus removing/ignoring the invalid characters
bytes = bytes_after_last_accept;
if (error_handler == error_handler_t::replace)
{
// add a replacement character
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
}
// continue processing the string
state = UTF8_ACCEPT;
continue;
}
@ -440,6 +463,7 @@ class serializer
}
}
// we finished processing the string
if (JSON_LIKELY(state == UTF8_ACCEPT))
{
// write buffer
@ -462,13 +486,16 @@ class serializer
case error_handler_t::ignore:
{
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
break;
}
case error_handler_t::replace:
{
// write buffer, but replace last byte
o->write_characters(string_buffer.data(), bytes - 1);
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
// add a replacement character
o->write_characters("\\ufffd", 6);
break;
}
@ -682,6 +709,9 @@ class serializer
const char indent_char;
/// the indentation string
string_t indent_string;
/// error_handler how to react on decoding errors
const error_handler_t error_handler;
};
} // namespace detail
} // namespace nlohmann