🐛 fix for #656

A complete rewrite of the string escape function. It now provides codepoint-to-\uxxxx escaping. Invalid UTF-8 byte sequences are not escaped, but copied as-is. I haven’t spent much time optimizing the code - but the library now agrees with Python on every single Unicode character’s escaping (see file test/data/json_nlohmann_tests/all_unicode_ascii.json).

Other minor changes: replaced "size_t" by "std::size_t"
This commit is contained in:
Niels Lohmann 2017-07-17 07:53:02 +02:00
parent 8a9133c6b2
commit 21d23982ca
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
4 changed files with 1112330 additions and 131 deletions

View file

@ -247,7 +247,7 @@ class parse_error : public exception
@param[in] what_arg the explanatory string @param[in] what_arg the explanatory string
@return parse_error object @return parse_error object
*/ */
static parse_error create(int id, size_t byte_, const std::string& what_arg) static parse_error create(int id, std::size_t byte_, const std::string& what_arg)
{ {
std::string w = exception::name("parse_error", id) + "parse error" + std::string w = exception::name("parse_error", id) + "parse error" +
(byte_ != 0 ? (" at " + std::to_string(byte_)) : "") + (byte_ != 0 ? (" at " + std::to_string(byte_)) : "") +
@ -265,10 +265,10 @@ class parse_error : public exception
file. This also holds true when reading a byte vector (CBOR or file. This also holds true when reading a byte vector (CBOR or
MessagePack). MessagePack).
*/ */
const size_t byte; const std::size_t byte;
private: private:
parse_error(int id_, size_t byte_, const char* what_arg) parse_error(int id_, std::size_t byte_, const char* what_arg)
: exception(id_, what_arg), byte(byte_) : exception(id_, what_arg), byte(byte_)
{} {}
}; };
@ -1278,7 +1278,7 @@ constexpr T static_const<T>::value;
struct input_adapter struct input_adapter
{ {
virtual int get_character() = 0; virtual int get_character() = 0;
virtual std::string read(size_t offset, size_t length) = 0; virtual std::string read(std::size_t offset, std::size_t length) = 0;
virtual ~input_adapter() {} virtual ~input_adapter() {}
}; };
@ -1339,7 +1339,7 @@ class cached_input_stream_adapter : public input_adapter
return buffer[buffer_pos++] & 0xFF; return buffer[buffer_pos++] & 0xFF;
} }
std::string read(size_t offset, size_t length) override std::string read(std::size_t offset, std::size_t length) override
{ {
// create buffer // create buffer
std::string result(length, '\0'); std::string result(length, '\0');
@ -1377,14 +1377,14 @@ class cached_input_stream_adapter : public input_adapter
std::istream& is; std::istream& is;
/// chars returned via get_character() /// chars returned via get_character()
size_t processed_chars = 0; std::size_t processed_chars = 0;
/// chars processed in the current buffer /// chars processed in the current buffer
size_t buffer_pos = 0; std::size_t buffer_pos = 0;
/// whether stream reached eof /// whether stream reached eof
bool eof = false; bool eof = false;
/// how many chars have been copied to the buffer by last (re)fill /// how many chars have been copied to the buffer by last (re)fill
size_t fill_size = 0; std::size_t fill_size = 0;
/// position of the stream when we started /// position of the stream when we started
const std::streampos start_position; const std::streampos start_position;
@ -1397,7 +1397,7 @@ class cached_input_stream_adapter : public input_adapter
class input_buffer_adapter : public input_adapter class input_buffer_adapter : public input_adapter
{ {
public: public:
input_buffer_adapter(const char* b, size_t l) input_buffer_adapter(const char* b, std::size_t l)
: input_adapter(), cursor(b), limit(b + l), start(b) : input_adapter(), cursor(b), limit(b + l), start(b)
{ {
// skip byte order mark // skip byte order mark
@ -1423,10 +1423,10 @@ class input_buffer_adapter : public input_adapter
} }
} }
std::string read(size_t offset, size_t length) override std::string read(std::size_t offset, std::size_t length) override
{ {
// avoid reading too many characters // avoid reading too many characters
const size_t max_length = static_cast<size_t>(limit - start); const std::size_t max_length = static_cast<size_t>(limit - start);
return std::string(start + offset, (std::min)(length, max_length - offset)); return std::string(start + offset, (std::min)(length, max_length - offset));
} }
@ -1456,7 +1456,7 @@ struct input_adapter_factory
} }
/// input adapter for buffer /// input adapter for buffer
static std::shared_ptr<input_adapter> create(const char* b, size_t l) static std::shared_ptr<input_adapter> create(const char* b, std::size_t l)
{ {
return std::make_shared<input_buffer_adapter>(b, l); return std::make_shared<input_buffer_adapter>(b, l);
} }
@ -1504,8 +1504,17 @@ struct input_adapter_factory
static_assert( static_assert(
sizeof(typename std::iterator_traits<IteratorType>::value_type) == 1, "each element in the iterator range must have the size of 1 byte"); sizeof(typename std::iterator_traits<IteratorType>::value_type) == 1, "each element in the iterator range must have the size of 1 byte");
return create(reinterpret_cast<const char*>(&(*first)), const auto len = static_cast<size_t>(std::distance(first, last));
static_cast<size_t>(std::distance(first, last))); if (JSON_LIKELY(len > 0))
{
// there is at least one element: use the address of first
return create(reinterpret_cast<const char*>(&(*first)), len);
}
else
{
// the address of first cannot be used - use nullptr
return create(nullptr, len);
}
} }
/// input adapter for array /// input adapter for array
@ -2843,11 +2852,11 @@ scan_number_done:
@param[in] length the length of the passed literal text @param[in] length the length of the passed literal text
@param[in] return_type the token type to return on success @param[in] return_type the token type to return on success
*/ */
token_type scan_literal(const char* literal_text, const size_t length, token_type scan_literal(const char* literal_text, const std::size_t length,
token_type return_type) token_type return_type)
{ {
assert(current == literal_text[0]); assert(current == literal_text[0]);
for (size_t i = 1; i < length; ++i) for (std::size_t i = 1; i < length; ++i)
{ {
if (JSON_UNLIKELY(get() != literal_text[i])) if (JSON_UNLIKELY(get() != literal_text[i]))
{ {
@ -2926,7 +2935,7 @@ scan_number_done:
///////////////////// /////////////////////
/// return position of last read token /// return position of last read token
constexpr size_t get_position() const noexcept constexpr std::size_t get_position() const noexcept
{ {
return chars_read; return chars_read;
} }
@ -3050,14 +3059,14 @@ scan_number_done:
bool next_unget = false; bool next_unget = false;
/// the number of characters read /// the number of characters read
size_t chars_read = 0; std::size_t chars_read = 0;
/// the start position of the current token /// the start position of the current token
size_t start_pos = 0; std::size_t start_pos = 0;
/// buffer for variable-length tokens (numbers, strings) /// buffer for variable-length tokens (numbers, strings)
std::vector<char> yytext = std::vector<char>(1024, '\0'); std::vector<char> yytext = std::vector<char>(1024, '\0');
/// current index in yytext /// current index in yytext
size_t yylen = 0; std::size_t yylen = 0;
/// a description of occurred lexer errors /// a description of occurred lexer errors
const char* error_message = ""; const char* error_message = "";
@ -4379,7 +4388,7 @@ template <typename IteratorType> class iteration_proxy
/// the iterator /// the iterator
IteratorType anchor; IteratorType anchor;
/// an index for arrays (used to create key names) /// an index for arrays (used to create key names)
size_t array_index = 0; std::size_t array_index = 0;
public: public:
explicit iteration_proxy_internal(IteratorType it) noexcept : anchor(it) {} explicit iteration_proxy_internal(IteratorType it) noexcept : anchor(it) {}
@ -4574,7 +4583,7 @@ template <typename CharType> class output_adapter
{ {
public: public:
virtual void write_character(CharType c) = 0; virtual void write_character(CharType c) = 0;
virtual void write_characters(const CharType* s, size_t length) = 0; virtual void write_characters(const CharType* s, std::size_t length) = 0;
virtual ~output_adapter() {} virtual ~output_adapter() {}
}; };
@ -4594,7 +4603,7 @@ class output_vector_adapter : public output_adapter<CharType>
v.push_back(c); v.push_back(c);
} }
void write_characters(const CharType* s, size_t length) override void write_characters(const CharType* s, std::size_t length) override
{ {
std::copy(s, s + length, std::back_inserter(v)); std::copy(s, s + length, std::back_inserter(v));
} }
@ -4615,7 +4624,7 @@ class output_stream_adapter : public output_adapter<CharType>
stream.put(c); stream.put(c);
} }
void write_characters(const CharType* s, size_t length) override void write_characters(const CharType* s, std::size_t length) override
{ {
stream.write(s, static_cast<std::streamsize>(length)); stream.write(s, static_cast<std::streamsize>(length));
} }
@ -4636,7 +4645,7 @@ class output_string_adapter : public output_adapter<CharType>
str.push_back(c); str.push_back(c);
} }
void write_characters(const CharType* s, size_t length) override void write_characters(const CharType* s, std::size_t length) override
{ {
str.append(s, length); str.append(s, length);
} }
@ -4874,7 +4883,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(current & 0x1f); const auto len = static_cast<size_t>(current & 0x1f);
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_cbor()); result.push_back(parse_cbor());
} }
@ -4885,7 +4894,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint8_t>()); const auto len = static_cast<size_t>(get_number<uint8_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_cbor()); result.push_back(parse_cbor());
} }
@ -4896,7 +4905,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint16_t>()); const auto len = static_cast<size_t>(get_number<uint16_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_cbor()); result.push_back(parse_cbor());
} }
@ -4907,7 +4916,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint32_t>()); const auto len = static_cast<size_t>(get_number<uint32_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_cbor()); result.push_back(parse_cbor());
} }
@ -4918,7 +4927,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint64_t>()); const auto len = static_cast<size_t>(get_number<uint64_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_cbor()); result.push_back(parse_cbor());
} }
@ -4963,7 +4972,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(current & 0x1f); const auto len = static_cast<size_t>(current & 0x1f);
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_cbor_string(); auto key = get_cbor_string();
@ -4976,7 +4985,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint8_t>()); const auto len = static_cast<size_t>(get_number<uint8_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_cbor_string(); auto key = get_cbor_string();
@ -4989,7 +4998,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint16_t>()); const auto len = static_cast<size_t>(get_number<uint16_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_cbor_string(); auto key = get_cbor_string();
@ -5002,7 +5011,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint32_t>()); const auto len = static_cast<size_t>(get_number<uint32_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_cbor_string(); auto key = get_cbor_string();
@ -5015,7 +5024,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint64_t>()); const auto len = static_cast<size_t>(get_number<uint64_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_cbor_string(); auto key = get_cbor_string();
@ -5277,7 +5286,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(current & 0x0f); const auto len = static_cast<size_t>(current & 0x0f);
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_msgpack_string(); auto key = get_msgpack_string();
@ -5306,7 +5315,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(current & 0x0f); const auto len = static_cast<size_t>(current & 0x0f);
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_msgpack()); result.push_back(parse_msgpack());
} }
@ -5426,7 +5435,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint16_t>()); const auto len = static_cast<size_t>(get_number<uint16_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_msgpack()); result.push_back(parse_msgpack());
} }
@ -5437,7 +5446,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::array; BasicJsonType result = value_t::array;
const auto len = static_cast<size_t>(get_number<uint32_t>()); const auto len = static_cast<size_t>(get_number<uint32_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
result.push_back(parse_msgpack()); result.push_back(parse_msgpack());
} }
@ -5448,7 +5457,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint16_t>()); const auto len = static_cast<size_t>(get_number<uint16_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_msgpack_string(); auto key = get_msgpack_string();
@ -5461,7 +5470,7 @@ class binary_reader
{ {
BasicJsonType result = value_t::object; BasicJsonType result = value_t::object;
const auto len = static_cast<size_t>(get_number<uint32_t>()); const auto len = static_cast<size_t>(get_number<uint32_t>());
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
auto key = get_msgpack_string(); auto key = get_msgpack_string();
@ -5565,7 +5574,7 @@ class binary_reader
{ {
// step 1: read input into array with system's byte order // step 1: read input into array with system's byte order
std::array<uint8_t, sizeof(NumberType)> vec; std::array<uint8_t, sizeof(NumberType)> vec;
for (size_t i = 0; i < sizeof(NumberType); ++i) for (std::size_t i = 0; i < sizeof(NumberType); ++i)
{ {
get(); get();
check_eof(); check_eof();
@ -5600,10 +5609,10 @@ class binary_reader
@throw parse_error.110 if input has less than @a len bytes @throw parse_error.110 if input has less than @a len bytes
*/ */
std::string get_string(const size_t len) std::string get_string(const std::size_t len)
{ {
std::string result; std::string result;
for (size_t i = 0; i < len; ++i) for (std::size_t i = 0; i < len; ++i)
{ {
get(); get();
check_eof(); check_eof();
@ -5810,7 +5819,7 @@ class binary_reader
int current = std::char_traits<char>::eof(); int current = std::char_traits<char>::eof();
/// the number of characters read /// the number of characters read
size_t chars_read = 0; std::size_t chars_read = 0;
/// whether we can assume little endianess /// whether we can assume little endianess
const bool is_little_endian = true; const bool is_little_endian = true;
@ -6450,7 +6459,7 @@ class serializer
// first n-1 elements // first n-1 elements
auto i = val.m_value.object->cbegin(); auto i = val.m_value.object->cbegin();
for (size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
{ {
o->write_characters(indent_string.c_str(), new_indent); o->write_characters(indent_string.c_str(), new_indent);
o->write_character('\"'); o->write_character('\"');
@ -6478,7 +6487,7 @@ class serializer
// first n-1 elements // first n-1 elements
auto i = val.m_value.object->cbegin(); auto i = val.m_value.object->cbegin();
for (size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
{ {
o->write_character('\"'); o->write_character('\"');
dump_escaped(i->first, ensure_ascii); dump_escaped(i->first, ensure_ascii);
@ -6613,22 +6622,40 @@ class serializer
} }
private: private:
/*!
@brief returns the number of expected bytes following in UTF-8 string
@param[in] u the first byte of a UTF-8 string
@return the number of expected bytes following
*/
static constexpr std::size_t bytes_following(const uint8_t u)
{
return ((0 <= u and u <= 127) ? 0
: ((192 <= u and u <= 223) ? 1
: ((224 <= u and u <= 239) ? 2
: ((240 <= u and u <= 247) ? 3 : std::string::npos))));
}
/*! /*!
@brief calculates the extra space to escape a JSON string @brief calculates the extra space to escape a JSON string
@param[in] s the string to escape @param[in] s the string to escape
@param[in] ensure_ascii whether to escape non-ASCII characters with \uXXXX sequences @param[in] ensure_ascii whether to escape non-ASCII characters with
\uXXXX sequences
@return the number of characters required to escape string @a s @return the number of characters required to escape string @a s
@complexity Linear in the length of string @a s. @complexity Linear in the length of string @a s.
*/ */
static std::size_t extra_space(const string_t& s, const bool ensure_ascii) noexcept static std::size_t extra_space(const string_t& s,
const bool ensure_ascii) noexcept
{ {
return std::accumulate(s.begin(), s.end(), size_t{}, std::size_t res = 0;
[ensure_ascii](size_t res, typename string_t::value_type c)
for (std::size_t i = 0; i < s.size(); ++i)
{ {
switch (c) switch (s[i])
{ {
// control characters that can be escaped with a backslash
case '"': case '"':
case '\\': case '\\':
case '\b': case '\b':
@ -6638,9 +6665,11 @@ class serializer
case '\t': case '\t':
{ {
// from c (1 byte) to \x (2 bytes) // from c (1 byte) to \x (2 bytes)
return res + 1; res += 1;
break;
} }
// control characters that need \uxxxx escaping
case 0x00: case 0x00:
case 0x01: case 0x01:
case 0x02: case 0x02:
@ -6670,20 +6699,96 @@ class serializer
case 0x1f: case 0x1f:
{ {
// from c (1 byte) to \uxxxx (6 bytes) // from c (1 byte) to \uxxxx (6 bytes)
return res + 5; res += 5;
break;
} }
default: default:
{ {
if (c & 0x80 and ensure_ascii) if (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F))
{ {
// from c (1 byte) to \uxxxx (6 bytes) const std::size_t bytes = bytes_following(static_cast<uint8_t>(s[i]));
return res + 5; if (bytes == std::string::npos)
{
// invalid characters are treated as is, so no
// additional space will be used
break;
}
if (bytes == 3)
{
// codepoints that need 4 bytes (i.e., 3
// additional bytes) in UTF-8 needs a surrogate
// pair when \u escaping is used:
// from 4 bytes to \uxxxx\uxxxx (12 bytes)
res += (12 - bytes - 1);
}
else
{
// from x bytes to \uxxxx (6 bytes)
res += (6 - bytes - 1);
}
// skip the additional bytes
i += bytes;
} }
return res; break;
} }
} }
}); }
return res;
}
static void escape_codepoint(const uint32_t codepoint,
string_t& result, size_t& pos)
{
// expecting a proper codepoint
assert(0x00 <= codepoint and codepoint <= 0x10FFFF);
// the last written character was the backslash before the 'u'
assert(result[pos] == '\\');
// write the 'u'
result[++pos] = 'u';
// convert a number 0..15 to its hex representation (0..f)
static const std::array<char, 16> hexify =
{
{
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
}
};
if (codepoint < 0x10000)
{
// codepoints U+0000..U+FFFF can be represented as \uxxxx.
result[++pos] = hexify[(codepoint >> 12) & 0x0F];
result[++pos] = hexify[(codepoint >> 8) & 0x0F];
result[++pos] = hexify[(codepoint >> 4) & 0x0F];
result[++pos] = hexify[codepoint & 0x0F];
}
else
{
// codepoints U+10000..U+10FFFF need a surrogate pair to be
// represented as \uxxxx\uxxxx.
// http://www.unicode.org/faq/utf_bom.html#utf16-4
const uint32_t high_surrogate = 0xD800 - (0x10000 >> 10) + (codepoint >> 10);
const uint32_t low_surrogate = 0xDC00 + (codepoint & 0x3FF);
result[++pos] = hexify[(high_surrogate >> 12) & 0x0F];
result[++pos] = hexify[(high_surrogate >> 8) & 0x0F];
result[++pos] = hexify[(high_surrogate >> 4) & 0x0F];
result[++pos] = hexify[high_surrogate & 0x0F];
++pos; // backslash is already in output
result[++pos] = 'u';
result[++pos] = hexify[(low_surrogate >> 12) & 0x0F];
result[++pos] = hexify[(low_surrogate >> 8) & 0x0F];
result[++pos] = hexify[(low_surrogate >> 4) & 0x0F];
result[++pos] = hexify[low_surrogate & 0x0F];
}
++pos;
} }
/*! /*!
@ -6712,30 +6817,9 @@ class serializer
string_t result(s.size() + space, '\\'); string_t result(s.size() + space, '\\');
std::size_t pos = 0; std::size_t pos = 0;
auto escape_character = [&result, &pos](const typename string_t::value_type c) for (std::size_t i = 0; i < s.size(); ++i)
{ {
// convert a number 0..15 to its hex representation switch (s[i])
// (0..f)
static const char hexify[16] =
{
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
// print character c as \uxxxx
for (const char m :
{ 'u', '0', '0', hexify[(c >> 4) & 0x0f], hexify[c & 0x0f]
})
{
result[++pos] = m;
}
++pos;
};
for (const auto& c : s)
{
switch (c)
{ {
// quotation mark (0x22) // quotation mark (0x22)
case '"': case '"':
@ -6793,55 +6877,74 @@ class serializer
break; break;
} }
case 0x00:
case 0x01:
case 0x02:
case 0x03:
case 0x04:
case 0x05:
case 0x06:
case 0x07:
case 0x0b:
case 0x0e:
case 0x0f:
case 0x10:
case 0x11:
case 0x12:
case 0x13:
case 0x14:
case 0x15:
case 0x16:
case 0x17:
case 0x18:
case 0x19:
case 0x1a:
case 0x1b:
case 0x1c:
case 0x1d:
case 0x1e:
case 0x1f:
{
escape_character(c);
break;
}
default: default:
{ {
if (c & 0x80 and ensure_ascii) // escape control characters (0x00..0x1F) or, if
// ensure_ascii paramter is used, non-ASCII characters
if ((0x00 <= s[i] and s[i] <= 0x1F) or
(ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F)))
{ {
escape_character(c); const std::size_t bytes = bytes_following(static_cast<uint8_t>(s[i]));
if (bytes == std::string::npos)
{
// copy invalid character as is
result[pos++] = s[i];
break;
}
assert(i + bytes < s.size());
// to use \uxxxx escaping, we first need to caluclate
// the codepoint from the UTF-8 bytes
uint32_t codepoint = 0;
switch (bytes)
{
case 0:
{
codepoint = static_cast<uint8_t>(s[i]);
break;
}
case 1:
{
codepoint = (static_cast<uint8_t>(s[i]) - 192) * 64
+ (static_cast<uint8_t>(s[i + 1]) - 128);
break;
}
case 2:
{
codepoint = (static_cast<uint8_t>(s[i]) - 224) * 4096
+ (static_cast<uint8_t>(s[i + 1]) - 128) * 64
+ (static_cast<uint8_t>(s[i + 2]) - 128);
break;
}
case 3:
{
codepoint = (static_cast<uint8_t>(s[i]) - 240) * 262144
+ (static_cast<uint8_t>(s[i + 1]) - 128) * 4096
+ (static_cast<uint8_t>(s[i + 2]) - 128) * 64
+ (static_cast<uint8_t>(s[i + 3]) - 128);
break;
}
}
escape_codepoint(codepoint, result, pos);
i += bytes;
} }
else else
{ {
// all other characters are added as-is // all other characters are added as-is
result[pos++] = c; result[pos++] = s[i];
} }
break; break;
} }
} }
} }
assert(pos == s.size() + space); assert(pos == result.size());
o->write_characters(result.c_str(), result.size()); o->write_characters(result.c_str(), result.size());
} }
@ -6869,7 +6972,7 @@ class serializer
} }
const bool is_negative = x < 0; const bool is_negative = x < 0;
size_t i = 0; std::size_t i = 0;
// spare 1 byte for '\0' // spare 1 byte for '\0'
while (x != 0 and i < number_buffer.size() - 1) while (x != 0 and i < number_buffer.size() - 1)
@ -7258,7 +7361,7 @@ class json_pointer
// - start: position after the previous slash // - start: position after the previous slash
for ( for (
// search for the first slash after the first character // search for the first slash after the first character
size_t slash = reference_string.find_first_of('/', 1), std::size_t slash = reference_string.find_first_of('/', 1),
// set the beginning of the first reference token // set the beginning of the first reference token
start = 1; start = 1;
// we can stop if start == string::npos+1 = 0 // we can stop if start == string::npos+1 = 0
@ -7274,7 +7377,7 @@ class json_pointer
auto reference_token = reference_string.substr(start, slash - start); auto reference_token = reference_string.substr(start, slash - start);
// check reference tokens are properly escaped // check reference tokens are properly escaped
for (size_t pos = reference_token.find_first_of('~'); for (std::size_t pos = reference_token.find_first_of('~');
pos != std::string::npos; pos != std::string::npos;
pos = reference_token.find_first_of('~', pos + 1)) pos = reference_token.find_first_of('~', pos + 1))
{ {
@ -7316,7 +7419,7 @@ class json_pointer
{ {
assert(not f.empty()); assert(not f.empty());
for (size_t pos = s.find(f); // find first occurrence of f for (std::size_t pos = s.find(f); // find first occurrence of f
pos != std::string::npos; // make sure f was found pos != std::string::npos; // make sure f was found
s.replace(pos, f.size(), t), // replace with t s.replace(pos, f.size(), t), // replace with t
pos = s.find(f, pos + t.size()) // find next occurrence of f pos = s.find(f, pos + t.size()) // find next occurrence of f
@ -13419,7 +13522,7 @@ class basic_json
@since version 2.0.9, parameter @a start_index since 2.1.1 @since version 2.0.9, parameter @a start_index since 2.1.1
*/ */
static basic_json from_cbor(const std::vector<uint8_t>& v, static basic_json from_cbor(const std::vector<uint8_t>& v,
const size_t start_index = 0) const std::size_t start_index = 0)
{ {
binary_reader br(detail::input_adapter_factory::create(v.begin() + static_cast<difference_type>(start_index), v.end())); binary_reader br(detail::input_adapter_factory::create(v.begin() + static_cast<difference_type>(start_index), v.end()));
return br.parse_cbor(); return br.parse_cbor();
@ -13494,7 +13597,7 @@ class basic_json
@since version 2.0.9, parameter @a start_index since 2.1.1 @since version 2.0.9, parameter @a start_index since 2.1.1
*/ */
static basic_json from_msgpack(const std::vector<uint8_t>& v, static basic_json from_msgpack(const std::vector<uint8_t>& v,
const size_t start_index = 0) const std::size_t start_index = 0)
{ {
binary_reader br(detail::input_adapter_factory::create(v.begin() + static_cast<difference_type>(start_index), v.end())); binary_reader br(detail::input_adapter_factory::create(v.begin() + static_cast<difference_type>(start_index), v.end()));
return br.parse_msgpack(); return br.parse_msgpack();
@ -14097,7 +14200,7 @@ class basic_json
case value_t::array: case value_t::array:
{ {
// first pass: traverse common elements // first pass: traverse common elements
size_t i = 0; std::size_t i = 0;
while (i < source.size() and i < target.size()) while (i < source.size() and i < target.size())
{ {
// recursive call to compare array values at index i // recursive call to compare array values at index i
@ -14565,7 +14668,7 @@ void json_pointer::flatten(const std::string& reference_string,
else else
{ {
// iterate array and use index as reference string // iterate array and use index as reference string
for (size_t i = 0; i < value.m_value.array->size(); ++i) for (std::size_t i = 0; i < value.m_value.array->size(); ++i)
{ {
flatten(reference_string + "/" + std::to_string(i), flatten(reference_string + "/" + std::to_string(i),
value.m_value.array->operator[](i), result); value.m_value.array->operator[](i), result);

File diff suppressed because it is too large Load diff

View file

@ -98,7 +98,9 @@ TEST_CASE("convenience functions")
check_escaped("\x1d", "\\u001d"); check_escaped("\x1d", "\\u001d");
check_escaped("\x1e", "\\u001e"); check_escaped("\x1e", "\\u001e");
check_escaped("\x1f", "\\u001f"); check_escaped("\x1f", "\\u001f");
check_escaped("\xA9", "\xA9");
check_escaped("\xA9", "\\u00a9", true); // invalid UTF-8 characters
check_escaped("ä\xA9ü", "ä\xA9ü");
check_escaped("ä\xA9ü", "\\u00e4\xA9\\u00fc", true);
} }
} }

View file

@ -28,6 +28,7 @@ SOFTWARE.
#include "catch.hpp" #include "catch.hpp"
#include <fstream>
#include "json.hpp" #include "json.hpp"
using nlohmann::json; using nlohmann::json;
@ -252,9 +253,35 @@ TEST_CASE("object inspection")
SECTION("dump with ensure_ascii and non-ASCII characters") SECTION("dump with ensure_ascii and non-ASCII characters")
{ {
CHECK(json("ä").dump(-1, ' ', true) == R"("\u00c3\u00a4")"); CHECK(json("ä").dump(-1, ' ', true) == "\"\\u00e4\"");
CHECK(json("Ö").dump(-1, ' ', true) == R"("\u00c3\u0096")"); CHECK(json("Ö").dump(-1, ' ', true) == "\"\\u00d6\"");
CHECK(json("❤️").dump(-1, ' ', true) == R"("\u00e2\u009d\u00a4\u00ef\u00b8\u008f")"); CHECK(json("❤️").dump(-1, ' ', true) == "\"\\u2764\\ufe0f\"");
}
SECTION("full Unicode escaping to ASCII")
{
SECTION("parsing yields the same JSON value")
{
std::ifstream f_escaped("test/data/json_nlohmann_tests/all_unicode_ascii.json");
std::ifstream f_unescaped("test/data/json_nlohmann_tests/all_unicode.json");
json j1 = json::parse(f_escaped);
json j2 = json::parse(f_unescaped);
CHECK(j1 == j2);
}
SECTION("dumping yields the same JSON text")
{
std::ifstream f_escaped("test/data/json_nlohmann_tests/all_unicode_ascii.json");
std::ifstream f_unescaped("test/data/json_nlohmann_tests/all_unicode.json");
json value = json::parse(f_unescaped);
std::string text = value.dump(4, ' ', true);
std::string expected((std::istreambuf_iterator<char>(f_escaped)),
std::istreambuf_iterator<char>());
CHECK(text == expected);
}
} }
SECTION("serialization of discarded element") SECTION("serialization of discarded element")