🚧 proposal for different error handlers #1198
Proof of concept: the error handler is currently only a parameter of the internal dump_escaped function; it is not yet exposed through the dump function.
This commit is contained in:
parent
dd672939a0
commit
0671e92ced
2 changed files with 120 additions and 14 deletions
|
@ -39,6 +39,14 @@ class serializer
|
||||||
static constexpr uint8_t UTF8_REJECT = 1;
|
static constexpr uint8_t UTF8_REJECT = 1;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
/// how to treat decoding errors
|
||||||
|
enum class error_handler_t
|
||||||
|
{
|
||||||
|
strict, ///< throw a type_error exception in case of invalid UTF-8
|
||||||
|
replace, ///< replace invalid UTF-8 sequences with U+FFFD
|
||||||
|
ignore ///< ignore invalid UTF-8 sequences
|
||||||
|
};
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@param[in] s output stream to serialize to
|
@param[in] s output stream to serialize to
|
||||||
@param[in] ichar indentation character to use
|
@param[in] ichar indentation character to use
|
||||||
|
@ -278,10 +286,12 @@ class serializer
|
||||||
@param[in] s the string to escape
|
@param[in] s the string to escape
|
||||||
@param[in] ensure_ascii whether to escape non-ASCII characters with
|
@param[in] ensure_ascii whether to escape non-ASCII characters with
|
||||||
\uXXXX sequences
|
\uXXXX sequences
|
||||||
|
@param[in] error_handler how to react to decoding errors
|
||||||
|
|
||||||
@complexity Linear in the length of string @a s.
|
@complexity Linear in the length of string @a s.
|
||||||
*/
|
*/
|
||||||
void dump_escaped(const string_t& s, const bool ensure_ascii)
|
void dump_escaped(const string_t& s, const bool ensure_ascii,
|
||||||
|
const error_handler_t error_handler = error_handler_t::strict)
|
||||||
{
|
{
|
||||||
uint32_t codepoint;
|
uint32_t codepoint;
|
||||||
uint8_t state = UTF8_ACCEPT;
|
uint8_t state = UTF8_ACCEPT;
|
||||||
|
@ -388,12 +398,36 @@ class serializer
|
||||||
}
|
}
|
||||||
|
|
||||||
case UTF8_REJECT: // decode found invalid UTF-8 byte
|
case UTF8_REJECT: // decode found invalid UTF-8 byte
|
||||||
|
{
|
||||||
|
switch (error_handler)
|
||||||
|
{
|
||||||
|
case error_handler_t::strict:
|
||||||
{
|
{
|
||||||
std::string sn(3, '\0');
|
std::string sn(3, '\0');
|
||||||
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
||||||
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case error_handler_t::ignore:
|
||||||
|
{
|
||||||
|
state = UTF8_ACCEPT;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
case error_handler_t::replace:
|
||||||
|
{
|
||||||
|
string_buffer[bytes++] = '\\';
|
||||||
|
string_buffer[bytes++] = 'u';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'd';
|
||||||
|
state = UTF8_ACCEPT;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
default: // decode found yet incomplete multi-byte code point
|
default: // decode found yet incomplete multi-byte code point
|
||||||
{
|
{
|
||||||
if (not ensure_ascii)
|
if (not ensure_ascii)
|
||||||
|
@ -417,10 +451,29 @@ class serializer
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// we finish reading, but do not accept: string was incomplete
|
// we finish reading, but do not accept: string was incomplete
|
||||||
|
switch (error_handler)
|
||||||
|
{
|
||||||
|
case error_handler_t::strict:
|
||||||
|
{
|
||||||
std::string sn(3, '\0');
|
std::string sn(3, '\0');
|
||||||
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
||||||
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case error_handler_t::ignore:
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case error_handler_t::replace:
|
||||||
|
{
|
||||||
|
// write buffer, but replace last byte
|
||||||
|
o->write_characters(string_buffer.data(), bytes - 1);
|
||||||
|
o->write_characters("\\ufffd", 6);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
|
|
@ -9991,6 +9991,14 @@ class serializer
|
||||||
static constexpr uint8_t UTF8_REJECT = 1;
|
static constexpr uint8_t UTF8_REJECT = 1;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
/// how to treat decoding errors
|
||||||
|
enum class error_handler_t
|
||||||
|
{
|
||||||
|
strict, ///< throw a type_error exception in case of invalid UTF-8
|
||||||
|
replace, ///< replace invalid UTF-8 sequences with U+FFFD
|
||||||
|
ignore ///< ignore invalid UTF-8 sequences
|
||||||
|
};
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
@param[in] s output stream to serialize to
|
@param[in] s output stream to serialize to
|
||||||
@param[in] ichar indentation character to use
|
@param[in] ichar indentation character to use
|
||||||
|
@ -10230,10 +10238,12 @@ class serializer
|
||||||
@param[in] s the string to escape
|
@param[in] s the string to escape
|
||||||
@param[in] ensure_ascii whether to escape non-ASCII characters with
|
@param[in] ensure_ascii whether to escape non-ASCII characters with
|
||||||
\uXXXX sequences
|
\uXXXX sequences
|
||||||
|
@param[in] error_handler how to react to decoding errors
|
||||||
|
|
||||||
@complexity Linear in the length of string @a s.
|
@complexity Linear in the length of string @a s.
|
||||||
*/
|
*/
|
||||||
void dump_escaped(const string_t& s, const bool ensure_ascii)
|
void dump_escaped(const string_t& s, const bool ensure_ascii,
|
||||||
|
const error_handler_t error_handler = error_handler_t::strict)
|
||||||
{
|
{
|
||||||
uint32_t codepoint;
|
uint32_t codepoint;
|
||||||
uint8_t state = UTF8_ACCEPT;
|
uint8_t state = UTF8_ACCEPT;
|
||||||
|
@ -10340,12 +10350,36 @@ class serializer
|
||||||
}
|
}
|
||||||
|
|
||||||
case UTF8_REJECT: // decode found invalid UTF-8 byte
|
case UTF8_REJECT: // decode found invalid UTF-8 byte
|
||||||
|
{
|
||||||
|
switch (error_handler)
|
||||||
|
{
|
||||||
|
case error_handler_t::strict:
|
||||||
{
|
{
|
||||||
std::string sn(3, '\0');
|
std::string sn(3, '\0');
|
||||||
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
snprintf(&sn[0], sn.size(), "%.2X", byte);
|
||||||
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case error_handler_t::ignore:
|
||||||
|
{
|
||||||
|
state = UTF8_ACCEPT;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
case error_handler_t::replace:
|
||||||
|
{
|
||||||
|
string_buffer[bytes++] = '\\';
|
||||||
|
string_buffer[bytes++] = 'u';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'f';
|
||||||
|
string_buffer[bytes++] = 'd';
|
||||||
|
state = UTF8_ACCEPT;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
default: // decode found yet incomplete multi-byte code point
|
default: // decode found yet incomplete multi-byte code point
|
||||||
{
|
{
|
||||||
if (not ensure_ascii)
|
if (not ensure_ascii)
|
||||||
|
@ -10369,10 +10403,29 @@ class serializer
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// we finish reading, but do not accept: string was incomplete
|
// we finish reading, but do not accept: string was incomplete
|
||||||
|
switch (error_handler)
|
||||||
|
{
|
||||||
|
case error_handler_t::strict:
|
||||||
|
{
|
||||||
std::string sn(3, '\0');
|
std::string sn(3, '\0');
|
||||||
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
|
||||||
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case error_handler_t::ignore:
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case error_handler_t::replace:
|
||||||
|
{
|
||||||
|
// write buffer, but replace last byte
|
||||||
|
o->write_characters(string_buffer.data(), bytes - 1);
|
||||||
|
o->write_characters("\\ufffd", 6);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
|
Loading…
Reference in a new issue