🚧 proposal for different error handlers #1198

Proof of concept: the error handler is currently only a parameter of the internal dump_escaped function; that is, it is not yet exposed through the dump function.
This commit is contained in:
Niels Lohmann 2018-10-16 20:38:50 +02:00
parent dd672939a0
commit 0671e92ced
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
2 changed files with 120 additions and 14 deletions

View file

@ -39,6 +39,14 @@ class serializer
static constexpr uint8_t UTF8_REJECT = 1;
public:
/// how to treat decoding errors (invalid UTF-8 in a string to serialize)
enum class error_handler_t
{
    /// throw a type_error exception in case of invalid UTF-8
    strict = 0,
    /// replace invalid UTF-8 sequences with U+FFFD
    replace = 1,
    /// ignore invalid UTF-8 sequences
    ignore = 2
};
/*!
@param[in] s output stream to serialize to
@param[in] ichar indentation character to use
@ -278,10 +286,12 @@ class serializer
@param[in] s the string to escape
@param[in] ensure_ascii whether to escape non-ASCII characters with
\uXXXX sequences
@param[in] error_handler how to react to decoding errors
@complexity Linear in the length of string @a s.
*/
void dump_escaped(const string_t& s, const bool ensure_ascii)
void dump_escaped(const string_t& s, const bool ensure_ascii,
const error_handler_t error_handler = error_handler_t::strict)
{
uint32_t codepoint;
uint8_t state = UTF8_ACCEPT;
@ -389,9 +399,33 @@ class serializer
case UTF8_REJECT: // decode found invalid UTF-8 byte
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
}
case error_handler_t::ignore:
{
state = UTF8_ACCEPT;
continue;
}
case error_handler_t::replace:
{
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
state = UTF8_ACCEPT;
continue;
}
}
}
default: // decode found yet incomplete multi-byte code point
@ -417,9 +451,28 @@ class serializer
else
{
// we finish reading, but do not accept: string was incomplete
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
}
case error_handler_t::ignore:
{
break;
}
case error_handler_t::replace:
{
// write buffer, but replace last byte
o->write_characters(string_buffer.data(), bytes - 1);
o->write_characters("\\ufffd", 6);
break;
}
}
}
}

View file

@ -9991,6 +9991,14 @@ class serializer
static constexpr uint8_t UTF8_REJECT = 1;
public:
/// how to treat decoding errors (invalid UTF-8 in a string to serialize)
enum class error_handler_t
{
    /// throw a type_error exception in case of invalid UTF-8
    strict = 0,
    /// replace invalid UTF-8 sequences with U+FFFD
    replace = 1,
    /// ignore invalid UTF-8 sequences
    ignore = 2
};
/*!
@param[in] s output stream to serialize to
@param[in] ichar indentation character to use
@ -10230,10 +10238,12 @@ class serializer
@param[in] s the string to escape
@param[in] ensure_ascii whether to escape non-ASCII characters with
\uXXXX sequences
@param[in] error_handler how to react to decoding errors
@complexity Linear in the length of string @a s.
*/
void dump_escaped(const string_t& s, const bool ensure_ascii)
void dump_escaped(const string_t& s, const bool ensure_ascii,
const error_handler_t error_handler = error_handler_t::strict)
{
uint32_t codepoint;
uint8_t state = UTF8_ACCEPT;
@ -10341,9 +10351,33 @@ class serializer
case UTF8_REJECT: // decode found invalid UTF-8 byte
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
}
case error_handler_t::ignore:
{
state = UTF8_ACCEPT;
continue;
}
case error_handler_t::replace:
{
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
state = UTF8_ACCEPT;
continue;
}
}
}
default: // decode found yet incomplete multi-byte code point
@ -10369,9 +10403,28 @@ class serializer
else
{
// we finish reading, but do not accept: string was incomplete
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
}
case error_handler_t::ignore:
{
break;
}
case error_handler_t::replace:
{
// write buffer, but replace last byte
o->write_characters(string_buffer.data(), bytes - 1);
o->write_characters("\\ufffd", 6);
break;
}
}
}
}