From 7456f1d87b2e6e2a62272d884dd09136c52b9447 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 16 Jan 2018 20:42:00 +0100 Subject: [PATCH] :recycle: re-used existing UTF-8 decoder to simplify string serialization --- develop/detail/serializer.hpp | 465 +++++++++-------------------- src/json.hpp | 533 +++++++++++----------------------- 2 files changed, 307 insertions(+), 691 deletions(-) diff --git a/develop/detail/serializer.hpp b/develop/detail/serializer.hpp index b5f2b06a..91ce4987 100644 --- a/develop/detail/serializer.hpp +++ b/develop/detail/serializer.hpp @@ -34,6 +34,9 @@ class serializer using number_float_t = typename BasicJsonType::number_float_t; using number_integer_t = typename BasicJsonType::number_integer_t; using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + static constexpr uint8_t UTF8_ACCEPT = 0; + static constexpr uint8_t UTF8_REJECT = 1; + public: /*! @param[in] s output stream to serialize to @@ -43,7 +46,8 @@ class serializer : o(std::move(s)), loc(std::localeconv()), thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), - indent_char(ichar), indent_string(512, indent_char) {} + indent_char(ichar), indent_string(512, indent_char) + {} // delete because of pointer members serializer(const serializer&) = delete; @@ -259,171 +263,6 @@ class serializer } private: - /*! - @brief returns the number of expected bytes following in UTF-8 string - - @param[in] u the first byte of a UTF-8 string - @return the number of expected bytes following - */ - static constexpr std::size_t bytes_following(const uint8_t u) - { - return ((u <= 127) ? 0 - : ((192 <= u and u <= 223) ? 1 - : ((224 <= u and u <= 239) ? 2 - : ((240 <= u and u <= 247) ? 3 : std::string::npos)))); - } - - /*! 
- @brief calculates the extra space to escape a JSON string - - @param[in] s the string to escape - @param[in] ensure_ascii whether to escape non-ASCII characters with - \uXXXX sequences - @return the number of characters required to escape string @a s - - @complexity Linear in the length of string @a s. - */ - static std::size_t extra_space(const string_t& s, - const bool ensure_ascii) noexcept - { - std::size_t res = 0; - - for (std::size_t i = 0; i < s.size(); ++i) - { - switch (s[i]) - { - // control characters that can be escaped with a backslash - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - res += 1; - break; - } - - // control characters that need \uxxxx escaping - case 0x00: - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0B: - case 0x0E: - case 0x0F: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1A: - case 0x1B: - case 0x1C: - case 0x1D: - case 0x1E: - case 0x1F: - { - // from c (1 byte) to \uxxxx (6 bytes) - res += 5; - break; - } - - default: - { - if (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F)) - { - const auto bytes = bytes_following(static_cast(s[i])); - // invalid characters will be detected by throw_if_invalid_utf8 - assert (bytes != std::string::npos); - - if (bytes == 3) - { - // codepoints that need 4 bytes (i.e., 3 additional - // bytes) in UTF-8 need a surrogate pair when \u - // escaping is used: from 4 bytes to \uxxxx\uxxxx - // (12 bytes) - res += (12 - bytes - 1); - } - else - { - // from x bytes to \uxxxx (6 bytes) - res += (6 - bytes - 1); - } - - // skip the additional bytes - i += bytes; - } - break; - } - } - } - - return res; - } - - static void escape_codepoint(int codepoint, string_t& result, std::size_t& pos) - { - // expecting a proper codepoint - assert(0x00 <= codepoint and codepoint <= 0x10FFFF); 
- - // the last written character was the backslash before the 'u' - assert(result[pos] == '\\'); - - // write the 'u' - result[++pos] = 'u'; - - // convert a number 0..15 to its hex representation (0..f) - static const std::array hexify = - { - { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' - } - }; - - if (codepoint < 0x10000) - { - // codepoints U+0000..U+FFFF can be represented as \uxxxx. - result[++pos] = hexify[(codepoint >> 12) & 0x0F]; - result[++pos] = hexify[(codepoint >> 8) & 0x0F]; - result[++pos] = hexify[(codepoint >> 4) & 0x0F]; - result[++pos] = hexify[codepoint & 0x0F]; - } - else - { - // codepoints U+10000..U+10FFFF need a surrogate pair to be - // represented as \uxxxx\uxxxx. - // http://www.unicode.org/faq/utf_bom.html#utf16-4 - codepoint -= 0x10000; - const int high_surrogate = 0xD800 | ((codepoint >> 10) & 0x3FF); - const int low_surrogate = 0xDC00 | (codepoint & 0x3FF); - result[++pos] = hexify[(high_surrogate >> 12) & 0x0F]; - result[++pos] = hexify[(high_surrogate >> 8) & 0x0F]; - result[++pos] = hexify[(high_surrogate >> 4) & 0x0F]; - result[++pos] = hexify[high_surrogate & 0x0F]; - ++pos; // backslash is already in output - result[++pos] = 'u'; - result[++pos] = hexify[(low_surrogate >> 12) & 0x0F]; - result[++pos] = hexify[(low_surrogate >> 8) & 0x0F]; - result[++pos] = hexify[(low_surrogate >> 4) & 0x0F]; - result[++pos] = hexify[low_surrogate & 0x0F]; - } - - ++pos; - } - /*! @brief dump escaped string @@ -438,145 +277,145 @@ class serializer @complexity Linear in the length of string @a s. 
*/ - void dump_escaped(const string_t& s, const bool ensure_ascii) const + void dump_escaped(const string_t& s, const bool ensure_ascii) { - throw_if_invalid_utf8(s); - - const auto space = extra_space(s, ensure_ascii); - if (space == 0) - { - o->write_characters(s.c_str(), s.size()); - return; - } - - // create a result string of necessary size - string_t result(s.size() + space, '\\'); - std::size_t pos = 0; + uint32_t codepoint; + uint8_t state = UTF8_ACCEPT; + std::size_t bytes = 0; // number of bytes written to string_buffer for (std::size_t i = 0; i < s.size(); ++i) { - switch (s[i]) + const auto byte = static_cast(s[i]); + + switch (decode(state, codepoint, byte)) { - case '"': // quotation mark (0x22) + case UTF8_ACCEPT: // decode found a new code point { - result[pos + 1] = '"'; - pos += 2; - break; - } - - case '\\': // reverse solidus (0x5C) - { - // nothing to change - pos += 2; - break; - } - - case '\b': // backspace (0x08) - { - result[pos + 1] = 'b'; - pos += 2; - break; - } - - case '\f': // formfeed (0x0C) - { - result[pos + 1] = 'f'; - pos += 2; - break; - } - - case '\n': // newline (0x0A) - { - result[pos + 1] = 'n'; - pos += 2; - break; - } - - case '\r': // carriage return (0x0D) - { - result[pos + 1] = 'r'; - pos += 2; - break; - } - - case '\t': // horizontal tab (0x09) - { - result[pos + 1] = 't'; - pos += 2; - break; - } - - default: - { - // escape control characters (0x00..0x1F) or, if - // ensure_ascii parameter is used, non-ASCII characters - if ((0x00 <= s[i] and s[i] <= 0x1F) or - (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F))) + switch (codepoint) { - const auto bytes = bytes_following(static_cast(s[i])); - // invalid characters will be detected by throw_if_invalid_utf8 - assert (bytes != std::string::npos); - - // check that the additional bytes are present - assert(i + bytes < s.size()); - - // to use \uxxxx escaping, we first need to calculate - // the codepoint from the UTF-8 bytes - int codepoint = 0; - - // bytes is 
unsigned type: - assert(bytes <= 3); - switch (bytes) + case 0x08: // backspace { - case 0: - { - codepoint = s[i] & 0xFF; - break; - } - - case 1: - { - codepoint = ((s[i] & 0x3F) << 6) - + (s[i + 1] & 0x7F); - break; - } - - case 2: - { - codepoint = ((s[i] & 0x1F) << 12) - + ((s[i + 1] & 0x7F) << 6) - + (s[i + 2] & 0x7F); - break; - } - - case 3: - { - codepoint = ((s[i] & 0xF) << 18) - + ((s[i + 1] & 0x7F) << 12) - + ((s[i + 2] & 0x7F) << 6) - + (s[i + 3] & 0x7F); - break; - } - - default: - break; // LCOV_EXCL_LINE + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'b'; + break; } - escape_codepoint(codepoint, result, pos); - i += bytes; + case 0x09: // horizontal tab + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 't'; + break; + } + + case 0x0A: // newline + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'n'; + break; + } + + case 0x0C: // formfeed + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'f'; + break; + } + + case 0x0D: // carriage return + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'r'; + break; + } + + case 0x22: // quotation mark + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\"'; + break; + } + + case 0x5C: // reverse solidus + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\\'; + break; + } + + default: + { + // escape control characters (0x00..0x1F) or, if + // ensure_ascii parameter is used, non-ASCII characters + if ((codepoint <= 0x1F) or (ensure_ascii and (codepoint >= 0x7F))) + { + if (codepoint <= 0xFFFF) + { + std::snprintf(string_buffer.data() + bytes, 7, "\\u%04x", codepoint); + bytes += 6; + } + else + { + std::snprintf(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x", + (0xD7C0 + (codepoint >> 10)), + (0xDC00 + (codepoint & 0x3FF))); + bytes += 12; + } + } + else + { + // copy byte to buffer (all previous bytes + // have been copied in default case above) + string_buffer[bytes++] = s[i]; + } + break; + } } - else + + // 
write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) { - // all other characters are added as-is - result[pos++] = s[i]; + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + break; + } + + case UTF8_REJECT: // decode found invalid UTF-8 byte + { + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + ss.str())); + } + + default: // decode found yet incomplete multi-byte code point + { + if (not ensure_ascii) + { + // code point will not be escaped - copy byte to buffer + string_buffer[bytes++] = s[i]; } break; } } } - assert(pos == result.size()); - o->write_characters(result.c_str(), result.size()); + if (JSON_LIKELY(state == UTF8_ACCEPT)) + { + // write buffer + if (bytes > 0) + { + o->write_characters(string_buffer.data(), bytes); + } + } + else + { + // we finish reading, but do not accept: string was incomplete + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + ss.str())); + } } /*! @@ -701,15 +540,16 @@ class serializer followed. @param[in,out] state the state of the decoding + @param[in,out] codep codepoint (valid only if resulting state is UTF8_ACCEPT) @param[in] byte next byte to decode + @return new state - @note The function has been edited: a std::array is used and the code - point is not calculated. + @note The function has been edited: a std::array is used. 
@copyright Copyright (c) 2008-2009 Bjoern Hoehrmann @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ - static void decode(uint8_t& state, const uint8_t byte) + static uint8_t decode(uint8_t& state, uint32_t& codep, const uint8_t byte) noexcept { static const std::array utf8d = { @@ -732,42 +572,13 @@ class serializer }; const uint8_t type = utf8d[byte]; + + codep = (state != UTF8_ACCEPT) + ? (byte & 0x3fu) | (codep << 6) + : (0xff >> type) & (byte); + state = utf8d[256u + state * 16u + type]; - } - - /*! - @brief throw an exception if a string is not UTF-8 encoded - - @param[in] str UTF-8 string to check - @throw type_error.316 if passed string is not UTF-8 encoded - - @since version 3.0.0 - */ - static void throw_if_invalid_utf8(const std::string& str) - { - // start with state 0 (= accept) - uint8_t state = 0; - - for (size_t i = 0; i < str.size(); ++i) - { - const auto byte = static_cast(str[i]); - decode(state, byte); - if (state == 1) - { - // state 1 means reject - std::stringstream ss; - ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + ss.str())); - } - } - - if (state != 0) - { - // we finish reading, but do not accept: string was incomplete - std::stringstream ss; - ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(static_cast(str.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + ss.str())); - } + return state; } private: @@ -784,9 +595,11 @@ class serializer /// the locale's decimal point character const char decimal_point = '\0'; + /// string buffer + std::array string_buffer{{}}; + /// the indentation character const char indent_char; - /// the indentation string string_t indent_string; }; diff --git a/src/json.hpp b/src/json.hpp index a80cc36a..7ce9aa57 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -109,8 +109,6 @@ using json = 
basic_json<>; // #include "detail/macro_scope.hpp" -#include // not - // This file contains all internal macro definitions // You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them @@ -147,7 +145,7 @@ using json = basic_json<>; #endif // allow to disable exceptions -#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && not defined(JSON_NOEXCEPTION) +#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION) #define JSON_THROW(exception) throw exception #define JSON_TRY try #define JSON_CATCH(exception) catch(exception) @@ -450,8 +448,7 @@ constexpr T static_const::value; } // #include "detail/exceptions.hpp" -#ifndef NLOHMANN_JSON_DETAIL_EXCEPTIONS_HPP -#define NLOHMANN_JSON_DETAIL_EXCEPTIONS_HPP + #include // exception #include // runtime_error @@ -780,8 +777,6 @@ class other_error : public exception } } -#endif - // #include "detail/value_t.hpp" @@ -3667,10 +3662,8 @@ class parser // #include "detail/iterators/primitive_iterator.hpp" -#include // not #include // ptrdiff_t #include // numeric_limits -#include // ostream namespace nlohmann { @@ -3687,9 +3680,15 @@ end_value (`1`) models past the end. 
*/ class primitive_iterator_t { - public: + private: using difference_type = std::ptrdiff_t; + static constexpr difference_type begin_value = 0; + static constexpr difference_type end_value = begin_value + 1; + /// iterator as signed integer type + difference_type m_it = (std::numeric_limits::min)(); + + public: constexpr difference_type get_value() const noexcept { return m_it; @@ -3729,10 +3728,10 @@ class primitive_iterator_t return lhs.m_it < rhs.m_it; } - primitive_iterator_t operator+(difference_type i) + primitive_iterator_t operator+(difference_type n) noexcept { auto result = *this; - result += i; + result += n; return result; } @@ -3741,55 +3740,43 @@ class primitive_iterator_t return lhs.m_it - rhs.m_it; } - friend std::ostream& operator<<(std::ostream& os, primitive_iterator_t it) - { - return os << it.m_it; - } - - primitive_iterator_t& operator++() + primitive_iterator_t& operator++() noexcept { ++m_it; return *this; } - primitive_iterator_t const operator++(int) + primitive_iterator_t const operator++(int) noexcept { auto result = *this; m_it++; return result; } - primitive_iterator_t& operator--() + primitive_iterator_t& operator--() noexcept { --m_it; return *this; } - primitive_iterator_t const operator--(int) + primitive_iterator_t const operator--(int) noexcept { auto result = *this; m_it--; return result; } - primitive_iterator_t& operator+=(difference_type n) + primitive_iterator_t& operator+=(difference_type n) noexcept { m_it += n; return *this; } - primitive_iterator_t& operator-=(difference_type n) + primitive_iterator_t& operator-=(difference_type n) noexcept { m_it -= n; return *this; } - - private: - static constexpr difference_type begin_value = 0; - static constexpr difference_type end_value = begin_value + 1; - - /// iterator as signed integer type - difference_type m_it = (std::numeric_limits::min)(); }; } } @@ -4527,7 +4514,7 @@ template class iteration_proxy public: /// construct iteration proxy from a container - explicit 
iteration_proxy(typename IteratorType::reference cont) + explicit iteration_proxy(typename IteratorType::reference cont) noexcept : container(cont) {} /// return iterator begin (needed for range-based for) @@ -6477,6 +6464,9 @@ class serializer using number_float_t = typename BasicJsonType::number_float_t; using number_integer_t = typename BasicJsonType::number_integer_t; using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + static constexpr uint8_t UTF8_ACCEPT = 0; + static constexpr uint8_t UTF8_REJECT = 1; + public: /*! @param[in] s output stream to serialize to @@ -6486,7 +6476,8 @@ class serializer : o(std::move(s)), loc(std::localeconv()), thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)), decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)), - indent_char(ichar), indent_string(512, indent_char) {} + indent_char(ichar), indent_string(512, indent_char) + {} // delete because of pointer members serializer(const serializer&) = delete; @@ -6702,171 +6693,6 @@ class serializer } private: - /*! - @brief returns the number of expected bytes following in UTF-8 string - - @param[in] u the first byte of a UTF-8 string - @return the number of expected bytes following - */ - static constexpr std::size_t bytes_following(const uint8_t u) - { - return ((u <= 127) ? 0 - : ((192 <= u and u <= 223) ? 1 - : ((224 <= u and u <= 239) ? 2 - : ((240 <= u and u <= 247) ? 3 : std::string::npos)))); - } - - /*! - @brief calculates the extra space to escape a JSON string - - @param[in] s the string to escape - @param[in] ensure_ascii whether to escape non-ASCII characters with - \uXXXX sequences - @return the number of characters required to escape string @a s - - @complexity Linear in the length of string @a s. 
- */ - static std::size_t extra_space(const string_t& s, - const bool ensure_ascii) noexcept - { - std::size_t res = 0; - - for (std::size_t i = 0; i < s.size(); ++i) - { - switch (s[i]) - { - // control characters that can be escaped with a backslash - case '"': - case '\\': - case '\b': - case '\f': - case '\n': - case '\r': - case '\t': - { - // from c (1 byte) to \x (2 bytes) - res += 1; - break; - } - - // control characters that need \uxxxx escaping - case 0x00: - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x0B: - case 0x0E: - case 0x0F: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1A: - case 0x1B: - case 0x1C: - case 0x1D: - case 0x1E: - case 0x1F: - { - // from c (1 byte) to \uxxxx (6 bytes) - res += 5; - break; - } - - default: - { - if (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F)) - { - const auto bytes = bytes_following(static_cast(s[i])); - // invalid characters will be detected by throw_if_invalid_utf8 - assert (bytes != std::string::npos); - - if (bytes == 3) - { - // codepoints that need 4 bytes (i.e., 3 additional - // bytes) in UTF-8 need a surrogate pair when \u - // escaping is used: from 4 bytes to \uxxxx\uxxxx - // (12 bytes) - res += (12 - bytes - 1); - } - else - { - // from x bytes to \uxxxx (6 bytes) - res += (6 - bytes - 1); - } - - // skip the additional bytes - i += bytes; - } - break; - } - } - } - - return res; - } - - static void escape_codepoint(int codepoint, string_t& result, std::size_t& pos) - { - // expecting a proper codepoint - assert(0x00 <= codepoint and codepoint <= 0x10FFFF); - - // the last written character was the backslash before the 'u' - assert(result[pos] == '\\'); - - // write the 'u' - result[++pos] = 'u'; - - // convert a number 0..15 to its hex representation (0..f) - static const std::array hexify = - { - { - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 
'a', 'b', 'c', 'd', 'e', 'f' - } - }; - - if (codepoint < 0x10000) - { - // codepoints U+0000..U+FFFF can be represented as \uxxxx. - result[++pos] = hexify[(codepoint >> 12) & 0x0F]; - result[++pos] = hexify[(codepoint >> 8) & 0x0F]; - result[++pos] = hexify[(codepoint >> 4) & 0x0F]; - result[++pos] = hexify[codepoint & 0x0F]; - } - else - { - // codepoints U+10000..U+10FFFF need a surrogate pair to be - // represented as \uxxxx\uxxxx. - // http://www.unicode.org/faq/utf_bom.html#utf16-4 - codepoint -= 0x10000; - const int high_surrogate = 0xD800 | ((codepoint >> 10) & 0x3FF); - const int low_surrogate = 0xDC00 | (codepoint & 0x3FF); - result[++pos] = hexify[(high_surrogate >> 12) & 0x0F]; - result[++pos] = hexify[(high_surrogate >> 8) & 0x0F]; - result[++pos] = hexify[(high_surrogate >> 4) & 0x0F]; - result[++pos] = hexify[high_surrogate & 0x0F]; - ++pos; // backslash is already in output - result[++pos] = 'u'; - result[++pos] = hexify[(low_surrogate >> 12) & 0x0F]; - result[++pos] = hexify[(low_surrogate >> 8) & 0x0F]; - result[++pos] = hexify[(low_surrogate >> 4) & 0x0F]; - result[++pos] = hexify[low_surrogate & 0x0F]; - } - - ++pos; - } - /*! @brief dump escaped string @@ -6881,145 +6707,145 @@ class serializer @complexity Linear in the length of string @a s. 
*/ - void dump_escaped(const string_t& s, const bool ensure_ascii) const + void dump_escaped(const string_t& s, const bool ensure_ascii) { - throw_if_invalid_utf8(s); - - const auto space = extra_space(s, ensure_ascii); - if (space == 0) - { - o->write_characters(s.c_str(), s.size()); - return; - } - - // create a result string of necessary size - string_t result(s.size() + space, '\\'); - std::size_t pos = 0; + uint32_t codepoint; + uint8_t state = UTF8_ACCEPT; + std::size_t bytes = 0; // number of bytes written to string_buffer for (std::size_t i = 0; i < s.size(); ++i) { - switch (s[i]) + const auto byte = static_cast(s[i]); + + switch (decode(state, codepoint, byte)) { - case '"': // quotation mark (0x22) + case UTF8_ACCEPT: // decode found a new code point { - result[pos + 1] = '"'; - pos += 2; - break; - } - - case '\\': // reverse solidus (0x5C) - { - // nothing to change - pos += 2; - break; - } - - case '\b': // backspace (0x08) - { - result[pos + 1] = 'b'; - pos += 2; - break; - } - - case '\f': // formfeed (0x0C) - { - result[pos + 1] = 'f'; - pos += 2; - break; - } - - case '\n': // newline (0x0A) - { - result[pos + 1] = 'n'; - pos += 2; - break; - } - - case '\r': // carriage return (0x0D) - { - result[pos + 1] = 'r'; - pos += 2; - break; - } - - case '\t': // horizontal tab (0x09) - { - result[pos + 1] = 't'; - pos += 2; - break; - } - - default: - { - // escape control characters (0x00..0x1F) or, if - // ensure_ascii parameter is used, non-ASCII characters - if ((0x00 <= s[i] and s[i] <= 0x1F) or - (ensure_ascii and (s[i] & 0x80 or s[i] == 0x7F))) + switch (codepoint) { - const auto bytes = bytes_following(static_cast(s[i])); - // invalid characters will be detected by throw_if_invalid_utf8 - assert (bytes != std::string::npos); - - // check that the additional bytes are present - assert(i + bytes < s.size()); - - // to use \uxxxx escaping, we first need to calculate - // the codepoint from the UTF-8 bytes - int codepoint = 0; - - // bytes is 
unsigned type: - assert(bytes <= 3); - switch (bytes) + case 0x08: // backspace { - case 0: - { - codepoint = s[i] & 0xFF; - break; - } - - case 1: - { - codepoint = ((s[i] & 0x3F) << 6) - + (s[i + 1] & 0x7F); - break; - } - - case 2: - { - codepoint = ((s[i] & 0x1F) << 12) - + ((s[i + 1] & 0x7F) << 6) - + (s[i + 2] & 0x7F); - break; - } - - case 3: - { - codepoint = ((s[i] & 0xF) << 18) - + ((s[i + 1] & 0x7F) << 12) - + ((s[i + 2] & 0x7F) << 6) - + (s[i + 3] & 0x7F); - break; - } - - default: - break; // LCOV_EXCL_LINE + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'b'; + break; } - escape_codepoint(codepoint, result, pos); - i += bytes; + case 0x09: // horizontal tab + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 't'; + break; + } + + case 0x0A: // newline + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'n'; + break; + } + + case 0x0C: // formfeed + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'f'; + break; + } + + case 0x0D: // carriage return + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = 'r'; + break; + } + + case 0x22: // quotation mark + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\"'; + break; + } + + case 0x5C: // reverse solidus + { + string_buffer[bytes++] = '\\'; + string_buffer[bytes++] = '\\'; + break; + } + + default: + { + // escape control characters (0x00..0x1F) or, if + // ensure_ascii parameter is used, non-ASCII characters + if ((codepoint <= 0x1F) or (ensure_ascii and (codepoint >= 0x7F))) + { + if (codepoint <= 0xFFFF) + { + std::snprintf(string_buffer.data() + bytes, 7, "\\u%04x", codepoint); + bytes += 6; + } + else + { + std::snprintf(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x", + (0xD7C0 + (codepoint >> 10)), + (0xDC00 + (codepoint & 0x3FF))); + bytes += 12; + } + } + else + { + // copy byte to buffer (all previous bytes + // have been copied in default case above) + string_buffer[bytes++] = s[i]; + } + break; + } } - else + + // 
write buffer and reset index; there must be 13 bytes + // left, as this is the maximal number of bytes to be + // written ("\uxxxx\uxxxx\0") for one code point + if (string_buffer.size() - bytes < 13) { - // all other characters are added as-is - result[pos++] = s[i]; + o->write_characters(string_buffer.data(), bytes); + bytes = 0; + } + break; + } + + case UTF8_REJECT: // decode found invalid UTF-8 byte + { + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(byte); + JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + ss.str())); + } + + default: // decode found yet incomplete multi-byte code point + { + if (not ensure_ascii) + { + // code point will not be escaped - copy byte to buffer + string_buffer[bytes++] = s[i]; } break; } } } - assert(pos == result.size()); - o->write_characters(result.c_str(), result.size()); + if (JSON_LIKELY(state == UTF8_ACCEPT)) + { + // write buffer + if (bytes > 0) + { + o->write_characters(string_buffer.data(), bytes); + } + } + else + { + // we finish reading, but do not accept: string was incomplete + std::stringstream ss; + ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(static_cast(s.back())); + JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + ss.str())); + } } /*! @@ -7144,15 +6970,16 @@ class serializer followed. @param[in,out] state the state of the decoding + @param[in,out] codep codepoint (valid only if resulting state is UTF8_ACCEPT) @param[in] byte next byte to decode + @return new state - @note The function has been edited: a std::array is used and the code - point is not calculated. + @note The function has been edited: a std::array is used. 
@copyright Copyright (c) 2008-2009 Bjoern Hoehrmann @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ - static void decode(uint8_t& state, const uint8_t byte) + static uint8_t decode(uint8_t& state, uint32_t& codep, const uint8_t byte) noexcept { static const std::array utf8d = { @@ -7175,42 +7002,13 @@ class serializer }; const uint8_t type = utf8d[byte]; + + codep = (state != UTF8_ACCEPT) + ? (byte & 0x3fu) | (codep << 6) + : (0xff >> type) & (byte); + state = utf8d[256u + state * 16u + type]; - } - - /*! - @brief throw an exception if a string is not UTF-8 encoded - - @param[in] str UTF-8 string to check - @throw type_error.316 if passed string is not UTF-8 encoded - - @since version 3.0.0 - */ - static void throw_if_invalid_utf8(const std::string& str) - { - // start with state 0 (= accept) - uint8_t state = 0; - - for (size_t i = 0; i < str.size(); ++i) - { - const auto byte = static_cast(str[i]); - decode(state, byte); - if (state == 1) - { - // state 1 means reject - std::stringstream ss; - ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(byte); - JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + ss.str())); - } - } - - if (state != 0) - { - // we finish reading, but do not accept: string was incomplete - std::stringstream ss; - ss << std::setw(2) << std::uppercase << std::setfill('0') << std::hex << static_cast(static_cast(str.back())); - JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + ss.str())); - } + return state; } private: @@ -7227,9 +7025,11 @@ class serializer /// the locale's decimal point character const char decimal_point = '\0'; + /// string buffer + std::array string_buffer{{}}; + /// the indentation character const char indent_char; - /// the indentation string string_t indent_string; }; @@ -8589,7 +8389,7 @@ class basic_json array = create(std::move(value)); } - void destroy(value_t t) + void destroy(value_t t) noexcept { switch 
(t) { @@ -8634,7 +8434,7 @@ class basic_json value is changed, because the invariant expresses a relationship between @a m_type and @a m_value. */ - void assert_invariant() const + void assert_invariant() const noexcept { assert(m_type != value_t::object or m_value.object != nullptr); assert(m_type != value_t::array or m_value.array != nullptr); @@ -9428,7 +9228,7 @@ class basic_json @since version 1.0.0 */ - ~basic_json() + ~basic_json() noexcept { assert_invariant(); m_value.destroy(m_type); @@ -11769,7 +11569,7 @@ class basic_json @note The name of this function is not yet final and may change in the future. */ - static iteration_proxy iterator_wrapper(reference ref) + static iteration_proxy iterator_wrapper(reference ref) noexcept { return iteration_proxy(ref); } @@ -11777,7 +11577,7 @@ class basic_json /*! @copydoc iterator_wrapper(reference) */ - static iteration_proxy iterator_wrapper(const_reference ref) + static iteration_proxy iterator_wrapper(const_reference ref) noexcept { return iteration_proxy(ref); } @@ -11819,7 +11619,8 @@ class basic_json @endcode @note When iterating over an array, `key()` will return the index of the - element as string (see example). + element as string (see example). For primitive types (e.g., numbers), + `key()` returns an empty string. @return iteration proxy object wrapping @a ref with an interface to use in range-based for loops @@ -11830,8 +11631,10 @@ class basic_json changes in the JSON value. @complexity Constant. + + @since version 3.x.x. */ - iteration_proxy items() + iteration_proxy items() noexcept { return iteration_proxy(*this); } @@ -11839,7 +11642,7 @@ class basic_json /*! @copydoc items() */ - iteration_proxy items() const + iteration_proxy items() const noexcept { return iteration_proxy(*this); }