Merge pull request #1314 from nlohmann/feature/codec_errors
Allow setting an error handler for decoding errors
This commit is contained in:
commit
7b501de054
5 changed files with 455 additions and 30 deletions
|
@ -28,6 +28,14 @@ namespace detail
|
|||
// serialization //
|
||||
///////////////////
|
||||
|
||||
/// how to treat decoding errors
|
||||
enum class error_handler_t
|
||||
{
|
||||
strict, ///< throw a type_error exception in case of invalid UTF-8
|
||||
replace, ///< replace invalid UTF-8 sequences with U+FFFD
|
||||
ignore ///< ignore invalid UTF-8 sequences
|
||||
};
|
||||
|
||||
template<typename BasicJsonType>
|
||||
class serializer
|
||||
{
|
||||
|
@ -42,12 +50,17 @@ class serializer
|
|||
/*!
|
||||
@param[in] s output stream to serialize to
|
||||
@param[in] ichar indentation character to use
|
||||
@param[in] error_handler_ how to react to decoding errors
|
||||
*/
|
||||
serializer(output_adapter_t<char> s, const char ichar)
|
||||
: o(std::move(s)), loc(std::localeconv()),
|
||||
thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
|
||||
decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
|
||||
indent_char(ichar), indent_string(512, indent_char)
|
||||
serializer(output_adapter_t<char> s, const char ichar,
|
||||
error_handler_t error_handler_ = error_handler_t::strict)
|
||||
: o(std::move(s))
|
||||
, loc(std::localeconv())
|
||||
, thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
|
||||
, decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
|
||||
, indent_char(ichar)
|
||||
, indent_string(512, indent_char)
|
||||
, error_handler(error_handler_)
|
||||
{}
|
||||
|
||||
// delete because of pointer members
|
||||
|
@ -287,6 +300,10 @@ class serializer
|
|||
uint8_t state = UTF8_ACCEPT;
|
||||
std::size_t bytes = 0; // number of bytes written to string_buffer
|
||||
|
||||
// number of bytes written at the point of the last valid byte
|
||||
std::size_t bytes_after_last_accept = 0;
|
||||
std::size_t undumped_chars = 0;
|
||||
|
||||
for (std::size_t i = 0; i < s.size(); ++i)
|
||||
{
|
||||
const auto byte = static_cast<uint8_t>(s[i]);
|
||||
|
@ -384,14 +401,68 @@ class serializer
|
|||
o->write_characters(string_buffer.data(), bytes);
|
||||
bytes = 0;
|
||||
}
|
||||
|
||||
// remember the byte position of this accept
|
||||
bytes_after_last_accept = bytes;
|
||||
undumped_chars = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
case UTF8_REJECT: // decode found invalid UTF-8 byte
|
||||
{
|
||||
std::string sn(3, '\0');
|
||||
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
||||
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
||||
switch (error_handler)
|
||||
{
|
||||
case error_handler_t::strict:
|
||||
{
|
||||
std::string sn(3, '\0');
|
||||
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
||||
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
||||
}
|
||||
|
||||
case error_handler_t::ignore:
|
||||
case error_handler_t::replace:
|
||||
{
|
||||
// if the rejected byte followed an incomplete multi-byte
|
||||
// sequence, read it again: the byte may be valid on its
|
||||
// own (e.g. as the start of a new sequence) and only
|
||||
// invalid as a continuation of the previous sequence
|
||||
if (undumped_chars > 0)
|
||||
{
|
||||
--i;
|
||||
}
|
||||
|
||||
// reset the buffer length to the position of the last accept,
|
||||
// thus dropping the bytes of the invalid sequence
|
||||
bytes = bytes_after_last_accept;
|
||||
|
||||
if (error_handler == error_handler_t::replace)
|
||||
{
|
||||
// add a replacement character
|
||||
if (ensure_ascii)
|
||||
{
|
||||
string_buffer[bytes++] = '\\';
|
||||
string_buffer[bytes++] = 'u';
|
||||
string_buffer[bytes++] = 'f';
|
||||
string_buffer[bytes++] = 'f';
|
||||
string_buffer[bytes++] = 'f';
|
||||
string_buffer[bytes++] = 'd';
|
||||
}
|
||||
else
|
||||
{
|
||||
string_buffer[bytes++] = '\xEF';
|
||||
string_buffer[bytes++] = '\xBF';
|
||||
string_buffer[bytes++] = '\xBD';
|
||||
}
|
||||
bytes_after_last_accept = bytes;
|
||||
}
|
||||
|
||||
undumped_chars = 0;
|
||||
|
||||
// continue processing the string
|
||||
state = UTF8_ACCEPT;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
default: // decode found yet incomplete multi-byte code point
|
||||
|
@ -401,11 +472,13 @@ class serializer
|
|||
// code point will not be escaped - copy byte to buffer
|
||||
string_buffer[bytes++] = s[i];
|
||||
}
|
||||
++undumped_chars;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we finished processing the string
|
||||
if (JSON_LIKELY(state == UTF8_ACCEPT))
|
||||
{
|
||||
// write buffer
|
||||
|
@ -417,9 +490,38 @@ class serializer
|
|||
else
|
||||
{
|
||||
// we finish reading, but do not accept: string was incomplete
|
||||
std::string sn(3, '\0');
|
||||
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
||||
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
||||
switch (error_handler)
|
||||
{
|
||||
case error_handler_t::strict:
|
||||
{
|
||||
std::string sn(3, '\0');
|
||||
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
||||
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
||||
}
|
||||
|
||||
case error_handler_t::ignore:
|
||||
{
|
||||
// write all accepted bytes
|
||||
o->write_characters(string_buffer.data(), bytes_after_last_accept);
|
||||
break;
|
||||
}
|
||||
|
||||
case error_handler_t::replace:
|
||||
{
|
||||
// write all accepted bytes
|
||||
o->write_characters(string_buffer.data(), bytes_after_last_accept);
|
||||
// add a replacement character
|
||||
if (ensure_ascii)
|
||||
{
|
||||
o->write_characters("\\ufffd", 6);
|
||||
}
|
||||
else
|
||||
{
|
||||
o->write_characters("\xEF\xBF\xBD", 3);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -629,6 +731,9 @@ class serializer
|
|||
const char indent_char;
|
||||
/// the indentation string
|
||||
string_t indent_string;
|
||||
|
||||
/// how to react to decoding errors
|
||||
const error_handler_t error_handler;
|
||||
};
|
||||
} // namespace detail
|
||||
} // namespace nlohmann
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue