From 7cf2fe149cdd70313c05146d82e82f2978a75670 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 09:42:37 +0200 Subject: [PATCH 01/14] :construction: support for UBJSON high-precision numbers #2286 --- .../nlohmann/detail/input/binary_reader.hpp | 30 + single_include/nlohmann/json.hpp | 3283 +++++++++-------- test/src/unit-ubjson.cpp | 10 +- 3 files changed, 1696 insertions(+), 1627 deletions(-) diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index a650b2d0..8e6493ee 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -2000,6 +2001,35 @@ class binary_reader return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); } + case 'H': + { + std::size_t size{}; + auto res = get_ubjson_size_value(size); + + std::string s; + for (int i = 0; i < size; ++i) + { + get(); + s.push_back(current); + } + + auto ia = detail::input_adapter(std::forward(s)); + auto l = detail::lexer(std::move(ia), false); + auto result = l.scan(); + + switch (result) + { + case detail::lexer_base::token_type::value_integer: + return sax->number_integer(l.get_number_integer()); + case detail::lexer_base::token_type::value_unsigned: + return sax->number_unsigned(l.get_number_unsigned()); + case detail::lexer_base::token_type::value_float: + return sax->number_float(l.get_number_float(), std::move(s)); + default: + return sax->parse_error(chars_read, s, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "invalid number", "number"))); + } + } + case 'C': // char { get(); diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index c639df9c..c9fa6f97 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -5733,6 +5733,1634 @@ class json_sax_acceptor } // namespace nlohmann +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. +*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. + + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer token_buffer. Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! + * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + break; + } + } + } + + default: + break; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 7159. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! + @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. + std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! + @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + if (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + { + std::array true_literal = {{'t', 'r', 'u', 'e'}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{'n', 'u', 'l', 'l'}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + // #include // #include @@ -7872,6 +9500,35 @@ class binary_reader return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); } + case 'H': + { + std::size_t size{}; + auto res = get_ubjson_size_value(size); + + std::string s; + for (int i = 0; i < size; ++i) + { + get(); + s.push_back(current); + } + + auto ia = detail::input_adapter(std::forward(s)); + auto l = detail::lexer(std::move(ia), false); + auto result = l.scan(); + + switch (result) + { + case detail::lexer_base::token_type::value_integer: + return sax->number_integer(l.get_number_integer()); + case detail::lexer_base::token_type::value_unsigned: + return sax->number_unsigned(l.get_number_unsigned()); + case detail::lexer_base::token_type::value_float: + return sax->number_float(l.get_number_float(), std::move(s)); + default: + return sax->parse_error(chars_read, s, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "invalid number", "number"))); + } + } + case 'C': // char { get(); @@ -8273,1632 +9930,6 @@ class binary_reader // #include - -#include // array -#include // localeconv -#include // size_t -#include // snprintf -#include // strtof, strtod, strtold, strtoll, strtoull -#include // initializer_list -#include // char_traits, string -#include // move -#include // vector - -// #include - -// #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -/////////// -// lexer // -/////////// - -template -class lexer_base -{ - public: - /// token types for the parser - enum class token_type - { - uninitialized, ///< indicating the scanner is uninitialized - literal_true, ///< the `true` literal - literal_false, ///< the `false` literal - literal_null, ///< the `null` literal - value_string, ///< a string -- use get_string() for actual value - value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value - value_integer, ///< a signed integer -- use get_number_integer() for actual value - value_float, ///< an floating point number -- use get_number_float() for actual value - begin_array, ///< the character for array begin `[` - begin_object, ///< the character for object begin `{` - end_array, ///< the character for array end `]` - end_object, ///< the character for object end `}` - name_separator, ///< the name separator `:` - value_separator, ///< the value separator `,` - parse_error, ///< indicating a parse error - end_of_input, ///< indicating the end of the input buffer - literal_or_value ///< a literal or the begin of a value (only for diagnostics) - }; - - /// return name of values of type token_type (only used for errors) - JSON_HEDLEY_RETURNS_NON_NULL - JSON_HEDLEY_CONST - static const char* token_type_name(const token_type t) noexcept - { - switch (t) - { - case token_type::uninitialized: - return ""; - case token_type::literal_true: - return "true literal"; - case token_type::literal_false: - return "false literal"; - case token_type::literal_null: - return "null literal"; - case token_type::value_string: - return "string literal"; - case token_type::value_unsigned: - case token_type::value_integer: - case token_type::value_float: - return "number literal"; - case token_type::begin_array: - return "'['"; - case token_type::begin_object: - return "'{'"; - case token_type::end_array: - return "']'"; - case token_type::end_object: - return "'}'"; - case token_type::name_separator: - return "':'"; - case token_type::value_separator: - return "','"; - case token_type::parse_error: - return ""; - case token_type::end_of_input: - return "end of input"; - case token_type::literal_or_value: - return "'[', '{', or a literal"; - // LCOV_EXCL_START - default: // catch non-enum values - return "unknown token"; - // LCOV_EXCL_STOP - } - } -}; -/*! -@brief lexical analysis - -This class organizes the lexical analysis during JSON deserialization. -*/ -template -class lexer : public lexer_base -{ - using number_integer_t = typename BasicJsonType::number_integer_t; - using number_unsigned_t = typename BasicJsonType::number_unsigned_t; - using number_float_t = typename BasicJsonType::number_float_t; - using string_t = typename BasicJsonType::string_t; - using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; - - public: - using token_type = typename lexer_base::token_type; - - explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) - : ia(std::move(adapter)) - , ignore_comments(ignore_comments_) - , decimal_point_char(static_cast(get_decimal_point())) - {} - - // delete because of pointer members - lexer(const lexer&) = delete; - lexer(lexer&&) = default; - lexer& operator=(lexer&) = delete; - lexer& operator=(lexer&&) = default; - ~lexer() = default; - - private: - ///////////////////// - // locales - ///////////////////// - - /// return the locale-dependent decimal point - JSON_HEDLEY_PURE - static char get_decimal_point() noexcept - { - const auto* loc = localeconv(); - JSON_ASSERT(loc != nullptr); - return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); - } - - ///////////////////// - // scan functions - ///////////////////// - - /*! - @brief get codepoint from 4 hex characters following `\u` - - For input "\u c1 c2 c3 c4" the codepoint is: - (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 - = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) - - Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' - must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The - conversion is done by subtracting the offset (0x30, 0x37, and 0x57) - between the ASCII value of the character and the desired integer value. - - @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or - non-hex character) - */ - int get_codepoint() - { - // this function only makes sense after reading `\u` - JSON_ASSERT(current == 'u'); - int codepoint = 0; - - const auto factors = { 12u, 8u, 4u, 0u }; - for (const auto factor : factors) - { - get(); - - if (current >= '0' && current <= '9') - { - codepoint += static_cast((static_cast(current) - 0x30u) << factor); - } - else if (current >= 'A' && current <= 'F') - { - codepoint += static_cast((static_cast(current) - 0x37u) << factor); - } - else if (current >= 'a' && current <= 'f') - { - codepoint += static_cast((static_cast(current) - 0x57u) << factor); - } - else - { - return -1; - } - } - - JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); - return codepoint; - } - - /*! - @brief check if the next byte(s) are inside a given range - - Adds the current byte and, for each passed range, reads a new byte and - checks if it is inside the range. If a violation was detected, set up an - error message and return false. Otherwise, return true. - - @param[in] ranges list of integers; interpreted as list of pairs of - inclusive lower and upper bound, respectively - - @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, - 1, 2, or 3 pairs. This precondition is enforced by an assertion. - - @return true if and only if no range violation was detected - */ - bool next_byte_in_range(std::initializer_list ranges) - { - JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); - add(current); - - for (auto range = ranges.begin(); range != ranges.end(); ++range) - { - get(); - if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) - { - add(current); - } - else - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return false; - } - } - - return true; - } - - /*! - @brief scan a string literal - - This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer token_buffer. Then the - function returns successfully, token_buffer is *not* null-terminated (as it - may contain \0 bytes), and token_buffer.size() is the number of bytes in the - string. - - @return token_type::value_string if string could be successfully scanned, - token_type::parse_error otherwise - - @note In case of errors, variable error_message contains a textual - description. - */ - token_type scan_string() - { - // reset token_buffer (ignore opening quote) - reset(); - - // we entered the function by reading an open quote - JSON_ASSERT(current == '\"'); - - while (true) - { - // get next character - switch (get()) - { - // end of file while parsing string - case std::char_traits::eof(): - { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // closing quote - case '\"': - { - return token_type::value_string; - } - - // escapes - case '\\': - { - switch (get()) - { - // quotation mark - case '\"': - add('\"'); - break; - // reverse solidus - case '\\': - add('\\'); - break; - // solidus - case '/': - add('/'); - break; - // backspace - case 'b': - add('\b'); - break; - // form feed - case 'f': - add('\f'); - break; - // line feed - case 'n': - add('\n'); - break; - // carriage return - case 'r': - add('\r'); - break; - // tab - case 't': - add('\t'); - break; - - // unicode escapes - case 'u': - { - const int codepoint1 = get_codepoint(); - int codepoint = codepoint1; // start with codepoint1 - - if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if code point is a high surrogate - if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) - { - // expect next \uxxxx entry - if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) - { - const int codepoint2 = get_codepoint(); - - if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if codepoint2 is a low surrogate - if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) - { - // overwrite codepoint - codepoint = static_cast( - // high surrogate occupies the most significant 22 bits - (static_cast(codepoint1) << 10u) - // low surrogate occupies the least significant 15 bits - + static_cast(codepoint2) - // there is still the 0xD800, 0xDC00 and 0x10000 noise - // in the result so we have to subtract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00u); - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; - return token_type::parse_error; - } - } - - // result of the above calculation yields a proper codepoint - JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); - - // translate codepoint into bytes - if (codepoint < 0x80) - { - // 1-byte characters: 0xxxxxxx (ASCII) - add(static_cast(codepoint)); - } - else if (codepoint <= 0x7FF) - { - // 2-byte characters: 110xxxxx 10xxxxxx - add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else if (codepoint <= 0xFFFF) - { - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else - { - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - - break; - } - - // other characters after escape - default: - error_message = "invalid string: forbidden character after backslash"; - return token_type::parse_error; - } - - break; - } - - // invalid control characters - case 0x00: - { - error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; - return token_type::parse_error; - } - - case 0x01: - { - error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; - return token_type::parse_error; - } - - case 0x02: - { - error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; - return token_type::parse_error; - } - - case 0x03: - { - error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; - return token_type::parse_error; - } - - case 0x04: - { - error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; - return token_type::parse_error; - } - - case 0x05: - { - error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; - return token_type::parse_error; - } - - case 0x06: - { - error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; - return token_type::parse_error; - } - - case 0x07: - { - error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; - return token_type::parse_error; - } - - case 0x08: - { - error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; - return token_type::parse_error; - } - - case 0x09: - { - error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; - return token_type::parse_error; - } - - case 0x0A: - { - error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; - return token_type::parse_error; - } - - case 0x0B: - { - error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; - return token_type::parse_error; - } - - case 0x0C: - { - error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; - return token_type::parse_error; - } - - case 0x0D: - { - error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; - return token_type::parse_error; - } - - case 0x0E: - { - error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; - return token_type::parse_error; - } - - case 0x0F: - { - error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; - return token_type::parse_error; - } - - case 0x10: - { - error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; - return token_type::parse_error; - } - - case 0x11: - { - error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; - return token_type::parse_error; - } - - case 0x12: - { - error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; - return token_type::parse_error; - } - - case 0x13: - { - error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; - return token_type::parse_error; - } - - case 0x14: - { - error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; - return token_type::parse_error; - } - - case 0x15: - { - error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; - return token_type::parse_error; - } - - case 0x16: - { - error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; - return token_type::parse_error; - } - - case 0x17: - { - error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; - return token_type::parse_error; - } - - case 0x18: - { - error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; - return token_type::parse_error; - } - - case 0x19: - { - error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; - return token_type::parse_error; - } - - case 0x1A: - { - error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; - return token_type::parse_error; - } - - case 0x1B: - { - error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; - return token_type::parse_error; - } - - case 0x1C: - { - error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; - return token_type::parse_error; - } - - case 0x1D: - { - error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; - return token_type::parse_error; - } - - case 0x1E: - { - error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; - return token_type::parse_error; - } - - case 0x1F: - { - error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; - return token_type::parse_error; - } - - // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) - case 0x20: - case 0x21: - case 0x23: - case 0x24: - case 0x25: - case 0x26: - case 0x27: - case 0x28: - case 0x29: - case 0x2A: - case 0x2B: - case 0x2C: - case 0x2D: - case 0x2E: - case 0x2F: - case 0x30: - case 0x31: - case 0x32: - case 0x33: - case 0x34: - case 0x35: - case 0x36: - case 0x37: - case 0x38: - case 0x39: - case 0x3A: - case 0x3B: - case 0x3C: - case 0x3D: - case 0x3E: - case 0x3F: - case 0x40: - case 0x41: - case 0x42: - case 0x43: - case 0x44: - case 0x45: - case 0x46: - case 0x47: - case 0x48: - case 0x49: - case 0x4A: - case 0x4B: - case 0x4C: - case 0x4D: - case 0x4E: - case 0x4F: - case 0x50: - case 0x51: - case 0x52: - case 0x53: - case 0x54: - case 0x55: - case 0x56: - case 0x57: - case 0x58: - case 0x59: - case 0x5A: - case 0x5B: - case 0x5D: - case 0x5E: - case 0x5F: - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6A: - case 0x6B: - case 0x6C: - case 0x6D: - case 0x6E: - case 0x6F: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - case 0x78: - case 0x79: - case 0x7A: - case 0x7B: - case 0x7C: - case 0x7D: - case 0x7E: - case 0x7F: - { - add(current); - break; - } - - // U+0080..U+07FF: bytes C2..DF 80..BF - case 0xC2: - case 0xC3: - case 0xC4: - case 0xC5: - case 0xC6: - case 0xC7: - case 0xC8: - case 0xC9: - case 0xCA: - case 0xCB: - case 0xCC: - case 0xCD: - case 0xCE: - case 0xCF: - case 0xD0: - case 0xD1: - case 0xD2: - case 0xD3: - case 0xD4: - case 0xD5: - case 0xD6: - case 0xD7: - case 0xD8: - case 0xD9: - case 0xDA: - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - { - if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) - { - return token_type::parse_error; - } - break; - } - - // U+0800..U+0FFF: bytes E0 A0..BF 80..BF - case 0xE0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF - // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF - case 0xE1: - case 0xE2: - case 0xE3: - case 0xE4: - case 0xE5: - case 0xE6: - case 0xE7: - case 0xE8: - case 0xE9: - case 0xEA: - case 0xEB: - case 0xEC: - case 0xEE: - case 0xEF: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+D000..U+D7FF: bytes ED 80..9F 80..BF - case 0xED: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - case 0xF0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - case 0xF1: - case 0xF2: - case 0xF3: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - case 0xF4: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // remaining bytes (80..C1 and F5..FF) are ill-formed - default: - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - } - } - } - - /*! - * @brief scan a comment - * @return whether comment could be scanned successfully - */ - bool scan_comment() - { - switch (get()) - { - // single-line comments skip input until a newline or EOF is read - case '/': - { - while (true) - { - switch (get()) - { - case '\n': - case '\r': - case std::char_traits::eof(): - case '\0': - return true; - - default: - break; - } - } - } - - // multi-line comments skip input until */ is read - case '*': - { - while (true) - { - switch (get()) - { - case std::char_traits::eof(): - case '\0': - { - error_message = "invalid comment; missing closing '*/'"; - return false; - } - - case '*': - { - switch (get()) - { - case '/': - return true; - - default: - { - unget(); - break; - } - } - } - - default: - break; - } - } - } - - // unexpected character after reading '/' - default: - { - error_message = "invalid comment; expecting '/' or '*' after '/'"; - return false; - } - } - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(float& f, const char* str, char** endptr) noexcept - { - f = std::strtof(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(double& f, const char* str, char** endptr) noexcept - { - f = std::strtod(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(long double& f, const char* str, char** endptr) noexcept - { - f = std::strtold(str, endptr); - } - - /*! - @brief scan a number literal - - This function scans a string according to Sect. 6 of RFC 7159. - - The function is realized with a deterministic finite state machine derived - from the grammar described in RFC 7159. Starting in state "init", the - input is read and used to determined the next state. Only state "done" - accepts the number. State "error" is a trap state to model errors. In the - table below, "anything" means any character but the ones listed before. - - state | 0 | 1-9 | e E | + | - | . | anything - ---------|----------|----------|----------|---------|---------|----------|----------- - init | zero | any1 | [error] | [error] | minus | [error] | [error] - minus | zero | any1 | [error] | [error] | [error] | [error] | [error] - zero | done | done | exponent | done | done | decimal1 | done - any1 | any1 | any1 | exponent | done | done | decimal1 | done - decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] - decimal2 | decimal2 | decimal2 | exponent | done | done | done | done - exponent | any2 | any2 | [error] | sign | sign | [error] | [error] - sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] - any2 | any2 | any2 | done | done | done | done | done - - The state machine is realized with one label per state (prefixed with - "scan_number_") and `goto` statements between them. The state machine - contains cycles, but any cycle can be left when EOF is read. Therefore, - the function is guaranteed to terminate. - - During scanning, the read bytes are stored in token_buffer. This string is - then converted to a signed integer, an unsigned integer, or a - floating-point number. - - @return token_type::value_unsigned, token_type::value_integer, or - token_type::value_float if number could be successfully scanned, - token_type::parse_error otherwise - - @note The scanner is independent of the current locale. Internally, the - locale's decimal point is used instead of `.` to work with the - locale-dependent converters. - */ - token_type scan_number() // lgtm [cpp/use-of-goto] - { - // reset token_buffer to store the number's bytes - reset(); - - // the type of the parsed number; initially set to unsigned; will be - // changed if minus sign, decimal point or exponent is read - token_type number_type = token_type::value_unsigned; - - // state (init): we just found out we need to scan a number - switch (current) - { - case '-': - { - add(current); - goto scan_number_minus; - } - - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - // all other characters are rejected outside scan_number() - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // LCOV_EXCL_LINE - } - -scan_number_minus: - // state: we just parsed a leading minus sign - number_type = token_type::value_integer; - switch (get()) - { - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - error_message = "invalid number; expected digit after '-'"; - return token_type::parse_error; - } - } - -scan_number_zero: - // state: we just parse a zero (maybe with a leading minus sign) - switch (get()) - { - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_any1: - // state: we just parsed a number 0-9 (maybe with a leading minus sign) - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_decimal1: - // state: we just parsed a decimal point - number_type = token_type::value_float; - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - default: - { - error_message = "invalid number; expected digit after '.'"; - return token_type::parse_error; - } - } - -scan_number_decimal2: - // we just parsed at least one number after a decimal point - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_exponent: - // we just parsed an exponent - number_type = token_type::value_float; - switch (get()) - { - case '+': - case '-': - { - add(current); - goto scan_number_sign; - } - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = - "invalid number; expected '+', '-', or digit after exponent"; - return token_type::parse_error; - } - } - -scan_number_sign: - // we just parsed an exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected digit after exponent sign"; - return token_type::parse_error; - } - } - -scan_number_any2: - // we just parsed a number after the exponent or exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - goto scan_number_done; - } - -scan_number_done: - // unget the character after the number (we only read it to know that - // we are done scanning a number) - unget(); - - char* endptr = nullptr; - errno = 0; - - // try to parse integers first and fall back to floats - if (number_type == token_type::value_unsigned) - { - const auto x = std::strtoull(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_unsigned = static_cast(x); - if (value_unsigned == x) - { - return token_type::value_unsigned; - } - } - } - else if (number_type == token_type::value_integer) - { - const auto x = std::strtoll(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_integer = static_cast(x); - if (value_integer == x) - { - return token_type::value_integer; - } - } - } - - // this code is reached if we parse a floating-point number or if an - // integer conversion above failed - strtof(value_float, token_buffer.data(), &endptr); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - return token_type::value_float; - } - - /*! - @param[in] literal_text the literal text to expect - @param[in] length the length of the passed literal text - @param[in] return_type the token type to return on success - */ - JSON_HEDLEY_NON_NULL(2) - token_type scan_literal(const char_type* literal_text, const std::size_t length, - token_type return_type) - { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); - for (std::size_t i = 1; i < length; ++i) - { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) - { - error_message = "invalid literal"; - return token_type::parse_error; - } - } - return return_type; - } - - ///////////////////// - // input management - ///////////////////// - - /// reset token_buffer; current character is beginning of token - void reset() noexcept - { - token_buffer.clear(); - token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); - } - - /* - @brief get next character from the input - - This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters - for use in error messages. - - @return character read from the input - */ - char_int_type get() - { - ++position.chars_read_total; - ++position.chars_read_current_line; - - if (next_unget) - { - // just reset the next_unget variable and work with current - next_unget = false; - } - else - { - current = ia.get_character(); - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - token_string.push_back(std::char_traits::to_char_type(current)); - } - - if (current == '\n') - { - ++position.lines_read; - position.chars_read_current_line = 0; - } - - return current; - } - - /*! - @brief unget current character (read it again on next get) - - We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read_total, - chars_read_current_line, and token_string. The next call to get() will - behave as if the unget character is read again. - */ - void unget() - { - next_unget = true; - - --position.chars_read_total; - - // in case we "unget" a newline, we have to also decrement the lines_read - if (position.chars_read_current_line == 0) - { - if (position.lines_read > 0) - { - --position.lines_read; - } - } - else - { - --position.chars_read_current_line; - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - JSON_ASSERT(!token_string.empty()); - token_string.pop_back(); - } - } - - /// add a character to token_buffer - void add(char_int_type c) - { - token_buffer.push_back(static_cast(c)); - } - - public: - ///////////////////// - // value getters - ///////////////////// - - /// return integer value - constexpr number_integer_t get_number_integer() const noexcept - { - return value_integer; - } - - /// return unsigned integer value - constexpr number_unsigned_t get_number_unsigned() const noexcept - { - return value_unsigned; - } - - /// return floating-point value - constexpr number_float_t get_number_float() const noexcept - { - return value_float; - } - - /// return current string value (implicitly resets the token; useful only once) - string_t& get_string() - { - return token_buffer; - } - - ///////////////////// - // diagnostics - ///////////////////// - - /// return position of last read token - constexpr position_t get_position() const noexcept - { - return position; - } - - /// return the last read token (for errors only). Will never contain EOF - /// (an arbitrary value that is not a valid char value, often -1), because - /// 255 may legitimately occur. May contain NUL, which should be escaped. - std::string get_token_string() const - { - // escape control characters - std::string result; - for (const auto c : token_string) - { - if (static_cast(c) <= '\x1F') - { - // escape control characters - std::array cs{{}}; - (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); - result += cs.data(); - } - else - { - // add character as is - result.push_back(static_cast(c)); - } - } - - return result; - } - - /// return syntax error message - JSON_HEDLEY_RETURNS_NON_NULL - constexpr const char* get_error_message() const noexcept - { - return error_message; - } - - ///////////////////// - // actual scanner - ///////////////////// - - /*! - @brief skip the UTF-8 byte order mark - @return true iff there is no BOM or the correct BOM has been skipped - */ - bool skip_bom() - { - if (get() == 0xEF) - { - // check if we completely parse the BOM - return get() == 0xBB && get() == 0xBF; - } - - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; - } - - void skip_whitespace() - { - do - { - get(); - } - while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); - } - - token_type scan() - { - // initially, skip the BOM - if (position.chars_read_total == 0 && !skip_bom()) - { - error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; - } - - // read next character and ignore whitespace - skip_whitespace(); - - // ignore comments - if (ignore_comments && current == '/') - { - if (!scan_comment()) - { - return token_type::parse_error; - } - - // skip following whitespace - skip_whitespace(); - } - - switch (current) - { - // structural characters - case '[': - return token_type::begin_array; - case ']': - return token_type::end_array; - case '{': - return token_type::begin_object; - case '}': - return token_type::end_object; - case ':': - return token_type::name_separator; - case ',': - return token_type::value_separator; - - // literals - case 't': - { - std::array true_literal = {{'t', 'r', 'u', 'e'}}; - return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); - } - case 'f': - { - std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; - return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); - } - case 'n': - { - std::array null_literal = {{'n', 'u', 'l', 'l'}}; - return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); - } - - // string - case '\"': - return scan_string(); - - // number - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return scan_number(); - - // end of input (the null byte is needed when parsing from - // string literals) - case '\0': - case std::char_traits::eof(): - return token_type::end_of_input; - - // error - default: - error_message = "invalid literal"; - return token_type::parse_error; - } - } - - private: - /// input adapter - InputAdapterType ia; - - /// whether comments should be ignored (true) or signaled as errors (false) - const bool ignore_comments = false; - - /// the current character - char_int_type current = std::char_traits::eof(); - - /// whether the next get() call should just return current - bool next_unget = false; - - /// the start position of the current token - position_t position {}; - - /// raw input token string (for error messages) - std::vector token_string {}; - - /// buffer for variable-length tokens (numbers, strings) - string_t token_buffer {}; - - /// a description of occurred lexer errors - const char* error_message = ""; - - // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; - - /// the decimal point - const char_int_type decimal_point_char = '.'; -}; -} // namespace detail -} // namespace nlohmann - // #include diff --git a/test/src/unit-ubjson.cpp b/test/src/unit-ubjson.cpp index 0f95bcec..61b39139 100644 --- a/test/src/unit-ubjson.cpp +++ b/test/src/unit-ubjson.cpp @@ -767,6 +767,14 @@ TEST_CASE("UBJSON") CHECK(json::from_ubjson(result, true, false) == j); } } + + SECTION("high-precision number") + { + std::vector vec = {'H', 'i', 0x16, '3', '.', '1', '4', '1', '5', '9', '2', '6', '5', '3', '5', '8', '9', '7', '9', '3', '2', '3', '8', '4', '6'}; + const auto j = json::from_ubjson(vec); + CHECK(j.is_number_float()); + CHECK(j.dump() == "3.141592653589793"); + } } SECTION("string") @@ -2369,7 +2377,7 @@ TEST_CASE("all UBJSON first bytes") // these bytes will fail immediately with exception parse_error.112 std::set supported = { - 'T', 'F', 'Z', 'U', 'i', 'I', 'l', 'L', 'd', 'D', 'C', 'S', '[', '{', 'N' + 'T', 'F', 'Z', 'U', 'i', 'I', 'l', 'L', 'd', 'D', 'C', 'S', '[', '{', 'N', 'H' }; for (auto i = 0; i < 256; ++i) From 7360e09830a10b26c182db0e0eccb964847b9361 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 13:12:20 +0200 Subject: [PATCH 02/14] :construction: support for UBJSON high-precision numbers #2286 --- include/nlohmann/detail/exceptions.hpp | 1 + .../nlohmann/detail/input/binary_reader.hpp | 29 +++++++++++---- include/nlohmann/json.hpp | 1 + single_include/nlohmann/json.hpp | 31 ++++++++++++---- test/src/unit-ubjson.cpp | 37 +++++++++++++++++-- 5 files changed, 81 insertions(+), 18 deletions(-) diff --git a/include/nlohmann/detail/exceptions.hpp b/include/nlohmann/detail/exceptions.hpp index ed836188..9ead6855 100644 --- a/include/nlohmann/detail/exceptions.hpp +++ b/include/nlohmann/detail/exceptions.hpp @@ -97,6 +97,7 @@ json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vect json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. @note For an input with n bytes, 1 is the index of the first character and n+1 is the index of the terminating null byte or the end of file. This also diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 8e6493ee..693f078c 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -2003,30 +2003,45 @@ class binary_reader case 'H': { + // get size of following number string std::size_t size{}; auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + // get number string std::string s; - for (int i = 0; i < size; ++i) + for (std::size_t i = 0; i < size; ++i) { get(); s.push_back(current); } + // parse number string auto ia = detail::input_adapter(std::forward(s)); auto l = detail::lexer(std::move(ia), false); - auto result = l.scan(); + const auto result_number = l.scan(); + const auto result_remainder = l.scan(); - switch (result) + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) { - case detail::lexer_base::token_type::value_integer: + return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: return sax->number_integer(l.get_number_integer()); - case detail::lexer_base::token_type::value_unsigned: + case token_type::value_unsigned: return sax->number_unsigned(l.get_number_unsigned()); - case detail::lexer_base::token_type::value_float: + case token_type::value_float: return sax->number_float(l.get_number_float(), std::move(s)); default: - return sax->parse_error(chars_read, s, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "invalid number", "number"))); + return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); } } diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp index 986cf71f..dd8842a6 100644 --- a/include/nlohmann/json.hpp +++ b/include/nlohmann/json.hpp @@ -7686,6 +7686,7 @@ class basic_json int16 | number_integer | `I` int32 | number_integer | `l` int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' string | string | `S` char | string | `C` array | array (optimized values are supported) | `[` diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index c9fa6f97..4ecaac4e 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -2280,6 +2280,7 @@ json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vect json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. @note For an input with n bytes, 1 is the index of the first character and n+1 is the index of the terminating null byte or the end of file. This also @@ -9502,30 +9503,45 @@ class binary_reader case 'H': { + // get size of following number string std::size_t size{}; auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + // get number string std::string s; - for (int i = 0; i < size; ++i) + for (std::size_t i = 0; i < size; ++i) { get(); s.push_back(current); } + // parse number string auto ia = detail::input_adapter(std::forward(s)); auto l = detail::lexer(std::move(ia), false); - auto result = l.scan(); + const auto result_number = l.scan(); + const auto result_remainder = l.scan(); - switch (result) + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) { - case detail::lexer_base::token_type::value_integer: + return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: return sax->number_integer(l.get_number_integer()); - case detail::lexer_base::token_type::value_unsigned: + case token_type::value_unsigned: return sax->number_unsigned(l.get_number_unsigned()); - case detail::lexer_base::token_type::value_float: + case token_type::value_float: return sax->number_float(l.get_number_float(), std::move(s)); default: - return sax->parse_error(chars_read, s, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "invalid number", "number"))); + return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); } } @@ -23881,6 +23897,7 @@ class basic_json int16 | number_integer | `I` int32 | number_integer | `l` int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' string | string | `S` char | string | `C` array | array (optimized values are supported) | `[` diff --git a/test/src/unit-ubjson.cpp b/test/src/unit-ubjson.cpp index 61b39139..ea15ba1c 100644 --- a/test/src/unit-ubjson.cpp +++ b/test/src/unit-ubjson.cpp @@ -770,10 +770,39 @@ TEST_CASE("UBJSON") SECTION("high-precision number") { - std::vector vec = {'H', 'i', 0x16, '3', '.', '1', '4', '1', '5', '9', '2', '6', '5', '3', '5', '8', '9', '7', '9', '3', '2', '3', '8', '4', '6'}; - const auto j = json::from_ubjson(vec); - CHECK(j.is_number_float()); - CHECK(j.dump() == "3.141592653589793"); + SECTION("unsigned integer number") + { + std::vector vec = {'H', 'i', 0x14, '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}; + const auto j = json::from_ubjson(vec); + CHECK(j.is_number_unsigned()); + CHECK(j.dump() == "12345678901234567890"); + } + + SECTION("signed integer number") + { + std::vector vec = {'H', 'i', 0x13, '-', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5', '6', '7', '8'}; + const auto j = json::from_ubjson(vec); + CHECK(j.is_number_integer()); + CHECK(j.dump() == "-123456789012345678"); + } + + SECTION("floating-point number") + { + std::vector vec = {'H', 'i', 0x16, '3', '.', '1', '4', '1', '5', '9', '2', '6', '5', '3', '5', '8', '9', '7', '9', '3', '2', '3', '8', '4', '6'}; + const auto j = json::from_ubjson(vec); + CHECK(j.is_number_float()); + CHECK(j.dump() == "3.141592653589793"); + } + + SECTION("errors") + { + std::vector vec1 = {'H', 'i', 2, '1', 'A', '3'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec1), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A", json::parse_error); + std::vector vec2 = {'H', 'i', 2, '1', '.'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec2), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1.", json::parse_error); + std::vector vec3 = {'H', 2, '1', '0'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec3), "[json.exception.parse_error.113] parse error at byte 2: syntax error while parsing UBJSON size: expected length type specification (U, i, I, l, L) after '#'; last byte: 0x02", json::parse_error); + } } } From 8aa6da61dc992fec50c5e06b4d72b71008cb2b62 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 13:57:19 +0200 Subject: [PATCH 03/14] :construction: support for UBJSON high-precision numbers #2286 --- include/nlohmann/detail/exceptions.hpp | 2 +- .../nlohmann/detail/output/binary_writer.hpp | 46 +++++++++++++----- single_include/nlohmann/json.hpp | 48 ++++++++++++++----- test/src/unit-ubjson.cpp | 25 ++++++---- 4 files changed, 86 insertions(+), 35 deletions(-) diff --git a/include/nlohmann/detail/exceptions.hpp b/include/nlohmann/detail/exceptions.hpp index 9ead6855..dd92897d 100644 --- a/include/nlohmann/detail/exceptions.hpp +++ b/include/nlohmann/detail/exceptions.hpp @@ -286,7 +286,7 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. -json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. | +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | diff --git a/include/nlohmann/detail/output/binary_writer.hpp b/include/nlohmann/detail/output/binary_writer.hpp index 342cb478..b707f772 100644 --- a/include/nlohmann/detail/output/binary_writer.hpp +++ b/include/nlohmann/detail/output/binary_writer.hpp @@ -1319,7 +1319,17 @@ class binary_writer } else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(number[i])); + } } } @@ -1373,19 +1383,23 @@ class binary_writer // LCOV_EXCL_START else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(number[i])); + } } // LCOV_EXCL_STOP } /*! @brief determine the type prefix of container values - - @note This function does not need to be 100% accurate when it comes to - integer limits. In case a number exceeds the limits of int64_t, - this will be detected by a later call to function - write_number_with_ubjson_prefix. Therefore, we return 'L' for any - value that does not fit the previous limits. */ CharType ubjson_prefix(const BasicJsonType& j) const noexcept { @@ -1415,8 +1429,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_unsigned: @@ -1437,8 +1455,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_float: diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 4ecaac4e..48504a85 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -2469,7 +2469,7 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. -json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. | +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | @@ -13889,7 +13889,17 @@ class binary_writer } else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(number[i])); + } } } @@ -13943,19 +13953,23 @@ class binary_writer // LCOV_EXCL_START else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(number[i])); + } } // LCOV_EXCL_STOP } /*! @brief determine the type prefix of container values - - @note This function does not need to be 100% accurate when it comes to - integer limits. In case a number exceeds the limits of int64_t, - this will be detected by a later call to function - write_number_with_ubjson_prefix. Therefore, we return 'L' for any - value that does not fit the previous limits. */ CharType ubjson_prefix(const BasicJsonType& j) const noexcept { @@ -13985,8 +13999,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_unsigned: @@ -14007,8 +14025,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_float: diff --git a/test/src/unit-ubjson.cpp b/test/src/unit-ubjson.cpp index ea15ba1c..c8b5f6bc 100644 --- a/test/src/unit-ubjson.cpp +++ b/test/src/unit-ubjson.cpp @@ -32,6 +32,7 @@ SOFTWARE. #include using nlohmann::json; +#include #include #include #include @@ -803,6 +804,21 @@ TEST_CASE("UBJSON") std::vector vec3 = {'H', 2, '1', '0'}; CHECK_THROWS_WITH_AS(json::from_ubjson(vec3), "[json.exception.parse_error.113] parse error at byte 2: syntax error while parsing UBJSON size: expected length type specification (U, i, I, l, L) after '#'; last byte: 0x02", json::parse_error); } + + SECTION("serialization") + { + // number that does not fit int64 + json j = 11111111111111111111ULL; + CHECK(j.is_number_unsigned()); + + // number will be serialized to high-precision number + const auto vec = json::to_ubjson(j); + std::vector expected = {'H', 'i', 0x14, '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'}; + CHECK(vec == expected); + + // roundtrip + CHECK(json::from_ubjson(vec) == j); + } } } @@ -1576,15 +1592,6 @@ TEST_CASE("UBJSON") } } - SECTION("number out of range") - { - // larger than max int64 - json j = 9223372036854775808llu; - json _; - CHECK_THROWS_AS(_ = json::to_ubjson(j), json::out_of_range&); - CHECK_THROWS_WITH(_ = json::to_ubjson(j), "[json.exception.out_of_range.407] integer number 9223372036854775808 cannot be represented by UBJSON as it does not fit int64"); - } - SECTION("excessive size") { SECTION("array") From 4a5277d09db5546645b95df3896eb2c50499d9d2 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 14:11:43 +0200 Subject: [PATCH 04/14] :pencil: update documentation --- doc/mkdocs/docs/features/binary_formats/ubjson.md | 2 +- doc/mkdocs/docs/home/exceptions.md | 14 ++++++++++++++ include/nlohmann/json.hpp | 2 +- single_include/nlohmann/json.hpp | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/mkdocs/docs/features/binary_formats/ubjson.md b/doc/mkdocs/docs/features/binary_formats/ubjson.md index cb01cfde..050b4ec5 100644 --- a/doc/mkdocs/docs/features/binary_formats/ubjson.md +++ b/doc/mkdocs/docs/features/binary_formats/ubjson.md @@ -28,6 +28,7 @@ number_unsigned | 128..255 | uint8 | `U` number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` +number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -44,7 +45,6 @@ object | *see notes on optimized format* | map | `{` The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 !!! info "Unused UBJSON markers" diff --git a/doc/mkdocs/docs/home/exceptions.md b/doc/mkdocs/docs/home/exceptions.md index d7430ccc..e1e1d13b 100644 --- a/doc/mkdocs/docs/home/exceptions.md +++ b/doc/mkdocs/docs/home/exceptions.md @@ -279,6 +279,16 @@ The parsing of the corresponding BSON record type is not implemented (yet). [json.exception.parse_error.114] parse error at byte 5: Unsupported BSON record type 0xFF ``` +### json.exception.parse_error.115 + +A UBJSON high-precision number could not be parsed. + +!!! failure "Example message" + + ``` + [json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A + ``` + ## Iterator errors This exception is thrown if iterators passed to a library function do not match @@ -765,6 +775,10 @@ UBJSON and BSON only support integer numbers up to 9223372036854775807. number overflow serializing '9223372036854775808' ``` +!!! note + + Since version 3.9.0, integer numbers beyond int64 are serialized as high-precision UBJSON numbers, and this exception does not further occur. + ### json.exception.out_of_range.408 The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp index dd8842a6..e9f57795 100644 --- a/include/nlohmann/json.hpp +++ b/include/nlohmann/json.hpp @@ -7200,6 +7200,7 @@ class basic_json number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -7210,7 +7211,6 @@ class basic_json @note The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 @note The following markers are not used in the conversion: - `Z`: no-op values are not created. diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 48504a85..30e994a3 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -23433,6 +23433,7 @@ class basic_json number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -23443,7 +23444,6 @@ class basic_json @note The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 @note The following markers are not used in the conversion: - `Z`: no-op values are not created. From a9117828e15d4f8089f4e454aaa74ba95ee28e12 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 22:38:00 +0200 Subject: [PATCH 05/14] :rotating_light: fix warnings --- .../nlohmann/detail/input/binary_reader.hpp | 23 ++++++++------- .../nlohmann/detail/output/binary_writer.hpp | 4 +-- include/nlohmann/json.hpp | 2 +- single_include/nlohmann/json.hpp | 29 ++++++++++--------- test/src/unit-cbor.cpp | 8 ++--- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 693f078c..6f06856a 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -2012,36 +2012,37 @@ class binary_reader } // get number string - std::string s; + std::vector number_vector; for (std::size_t i = 0; i < size; ++i) { get(); - s.push_back(current); + number_vector.push_back(current); } // parse number string - auto ia = detail::input_adapter(std::forward(s)); - auto l = detail::lexer(std::move(ia), false); - const auto result_number = l.scan(); - const auto result_remainder = l.scan(); + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); using token_type = typename detail::lexer_base::token_type; if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) { - return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); } switch (result_number) { case token_type::value_integer: - return sax->number_integer(l.get_number_integer()); + return sax->number_integer(number_lexer.get_number_integer()); case token_type::value_unsigned: - return sax->number_unsigned(l.get_number_unsigned()); + return sax->number_unsigned(number_lexer.get_number_unsigned()); case token_type::value_float: - return sax->number_float(l.get_number_float(), std::move(s)); + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); default: - return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); } } diff --git a/include/nlohmann/detail/output/binary_writer.hpp b/include/nlohmann/detail/output/binary_writer.hpp index b707f772..8b864b57 100644 --- a/include/nlohmann/detail/output/binary_writer.hpp +++ b/include/nlohmann/detail/output/binary_writer.hpp @@ -1328,7 +1328,7 @@ class binary_writer write_number_with_ubjson_prefix(number.size(), true); for (std::size_t i = 0; i < number.size(); ++i) { - oa->write_character(to_char_type(number[i])); + oa->write_character(to_char_type(static_cast(number[i]))); } } } @@ -1392,7 +1392,7 @@ class binary_writer write_number_with_ubjson_prefix(number.size(), true); for (std::size_t i = 0; i < number.size(); ++i) { - oa->write_character(to_char_type(number[i])); + oa->write_character(to_char_type(static_cast(number[i]))); } } // LCOV_EXCL_STOP diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp index e9f57795..19cf0e9c 100644 --- a/include/nlohmann/json.hpp +++ b/include/nlohmann/json.hpp @@ -7507,7 +7507,7 @@ class basic_json const bool allow_exceptions = true, const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) { - return from_cbor(ptr, ptr + len, strict, tag_handler); + return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 30e994a3..0b9c4edb 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9512,36 +9512,37 @@ class binary_reader } // get number string - std::string s; + std::vector number_vector; for (std::size_t i = 0; i < size; ++i) { get(); - s.push_back(current); + number_vector.push_back(current); } // parse number string - auto ia = detail::input_adapter(std::forward(s)); - auto l = detail::lexer(std::move(ia), false); - const auto result_number = l.scan(); - const auto result_remainder = l.scan(); + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); using token_type = typename detail::lexer_base::token_type; if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) { - return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); } switch (result_number) { case token_type::value_integer: - return sax->number_integer(l.get_number_integer()); + return sax->number_integer(number_lexer.get_number_integer()); case token_type::value_unsigned: - return sax->number_unsigned(l.get_number_unsigned()); + return sax->number_unsigned(number_lexer.get_number_unsigned()); case token_type::value_float: - return sax->number_float(l.get_number_float(), std::move(s)); + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); default: - return sax->parse_error(chars_read, s, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + s, "high-precision number"))); + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); } } @@ -13898,7 +13899,7 @@ class binary_writer write_number_with_ubjson_prefix(number.size(), true); for (std::size_t i = 0; i < number.size(); ++i) { - oa->write_character(to_char_type(number[i])); + oa->write_character(to_char_type(static_cast(number[i]))); } } } @@ -13962,7 +13963,7 @@ class binary_writer write_number_with_ubjson_prefix(number.size(), true); for (std::size_t i = 0; i < number.size(); ++i) { - oa->write_character(to_char_type(number[i])); + oa->write_character(to_char_type(static_cast(number[i]))); } } // LCOV_EXCL_STOP @@ -23740,7 +23741,7 @@ class basic_json const bool allow_exceptions = true, const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) { - return from_cbor(ptr, ptr + len, strict, tag_handler); + return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); } diff --git a/test/src/unit-cbor.cpp b/test/src/unit-cbor.cpp index 2aeac10d..ee32983a 100644 --- a/test/src/unit-cbor.cpp +++ b/test/src/unit-cbor.cpp @@ -2558,10 +2558,10 @@ TEST_CASE("Tagged values") SECTION("0xC6..0xD4") { - for (std::uint8_t b : - { - 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4 - }) + for (auto b : std::vector + { + 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4 + }) { // add tag to value auto v_tagged = v; From 1339d6b683d3fb9d31efdf2b18d84ea36cac54c4 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 20 Jul 2020 22:40:28 +0200 Subject: [PATCH 06/14] :bug: add missing EOF check --- include/nlohmann/detail/input/binary_reader.hpp | 4 ++++ single_include/nlohmann/json.hpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 6f06856a..9085f1cb 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -2016,6 +2016,10 @@ class binary_reader for (std::size_t i = 0; i < size; ++i) { get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } number_vector.push_back(current); } diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 0b9c4edb..0d60de15 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9516,6 +9516,10 @@ class binary_reader for (std::size_t i = 0; i < size; ++i) { get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } number_vector.push_back(current); } From b1903fff1f17af5adaaa0e2db76d26ad73548894 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 21 Jul 2020 09:09:30 +0200 Subject: [PATCH 07/14] :green_heart: fix build --- include/nlohmann/detail/input/binary_reader.hpp | 4 ++-- single_include/nlohmann/json.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index 9085f1cb..d2cf313c 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -2012,7 +2012,7 @@ class binary_reader } // get number string - std::vector number_vector; + std::vector number_vector; for (std::size_t i = 0; i < size; ++i) { get(); @@ -2020,7 +2020,7 @@ class binary_reader { return false; } - number_vector.push_back(current); + number_vector.push_back(static_cast(current)); } // parse number string diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 0d60de15..ba17cadb 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9512,7 +9512,7 @@ class binary_reader } // get number string - std::vector number_vector; + std::vector number_vector; for (std::size_t i = 0; i < size; ++i) { get(); @@ -9520,7 +9520,7 @@ class binary_reader { return false; } - number_vector.push_back(current); + number_vector.push_back(static_cast(current)); } // parse number string From 8046754c15006c8a2fced2f0c8c7e2f0b257045f Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 21 Jul 2020 12:44:38 +0200 Subject: [PATCH 08/14] :loud_sound: add debug output to tests --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 76ba31d8..6c01767a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -160,7 +160,7 @@ foreach(file ${files}) endif() add_test(NAME "${testcase}" - COMMAND ${testcase} ${DOCTEST_TEST_FILTER} --no-skip + COMMAND ${testcase} ${DOCTEST_TEST_FILTER} --no-skip -s WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) set_tests_properties("${testcase}" PROPERTIES LABELS "all" FIXTURES_REQUIRED TEST_DATA) From 96e9f6602577543e6e3e60ced301804e95d81cc5 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 21 Jul 2020 22:01:15 +0200 Subject: [PATCH 09/14] :mute: remove debug output --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6c01767a..76ba31d8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -160,7 +160,7 @@ foreach(file ${files}) endif() add_test(NAME "${testcase}" - COMMAND ${testcase} ${DOCTEST_TEST_FILTER} --no-skip -s + COMMAND ${testcase} ${DOCTEST_TEST_FILTER} --no-skip WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} ) set_tests_properties("${testcase}" PROPERTIES LABELS "all" FIXTURES_REQUIRED TEST_DATA) From 3bb43ecd517afdc402d538beb6ce382f718022df Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Tue, 21 Jul 2020 22:02:14 +0200 Subject: [PATCH 10/14] :twisted_rightwards_arrows: merge develop branch --- single_include/nlohmann/json.hpp | 3357 +++++++++++++++--------------- 1 file changed, 1716 insertions(+), 1641 deletions(-) diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 9476994a..8b234047 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -2393,6 +2393,7 @@ json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vect json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. @note For an input with n bytes, 1 is the index of the first character and n+1 is the index of the terminating null byte or the end of file. This also @@ -2581,7 +2582,7 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. -json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. | +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | @@ -5846,6 +5847,1634 @@ class json_sax_acceptor } // namespace nlohmann +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. +*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. + + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer token_buffer. Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! + * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + break; + } + } + } + + default: + break; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 7159. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! + @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. + std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! + @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + if (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + { + std::array true_literal = {{'t', 'r', 'u', 'e'}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{'n', 'u', 'l', 'l'}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + // #include // #include @@ -7985,6 +9614,55 @@ class binary_reader return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); } + case 'H': + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + case 'C': // char { get(); @@ -8388,1632 +10066,6 @@ class binary_reader // #include - -#include // array -#include // localeconv -#include // size_t -#include // snprintf -#include // strtof, strtod, strtold, strtoll, strtoull -#include // initializer_list -#include // char_traits, string -#include // move -#include // vector - -// #include - -// #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -/////////// -// lexer // -/////////// - -template -class lexer_base -{ - public: - /// token types for the parser - enum class token_type - { - uninitialized, ///< indicating the scanner is uninitialized - literal_true, ///< the `true` literal - literal_false, ///< the `false` literal - literal_null, ///< the `null` literal - value_string, ///< a string -- use get_string() for actual value - value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value - value_integer, ///< a signed integer -- use get_number_integer() for actual value - value_float, ///< an floating point number -- use get_number_float() for actual value - begin_array, ///< the character for array begin `[` - begin_object, ///< the character for object begin `{` - end_array, ///< the character for array end `]` - end_object, ///< the character for object end `}` - name_separator, ///< the name separator `:` - value_separator, ///< the value separator `,` - parse_error, ///< indicating a parse error - end_of_input, ///< indicating the end of the input buffer - literal_or_value ///< a literal or the begin of a value (only for diagnostics) - }; - - /// return name of values of type token_type (only used for errors) - JSON_HEDLEY_RETURNS_NON_NULL - JSON_HEDLEY_CONST - static const char* token_type_name(const token_type t) noexcept - { - switch (t) - { - case token_type::uninitialized: - return ""; - case token_type::literal_true: - return "true literal"; - case token_type::literal_false: - return "false literal"; - case token_type::literal_null: - return "null literal"; - case token_type::value_string: - return "string literal"; - case token_type::value_unsigned: - case token_type::value_integer: - case token_type::value_float: - return "number literal"; - case token_type::begin_array: - return "'['"; - case token_type::begin_object: - return "'{'"; - case token_type::end_array: - return "']'"; - case token_type::end_object: - return "'}'"; - case token_type::name_separator: - return "':'"; - case token_type::value_separator: - return "','"; - case token_type::parse_error: - return ""; - case token_type::end_of_input: - return "end of input"; - case token_type::literal_or_value: - return "'[', '{', or a literal"; - // LCOV_EXCL_START - default: // catch non-enum values - return "unknown token"; - // LCOV_EXCL_STOP - } - } -}; -/*! -@brief lexical analysis - -This class organizes the lexical analysis during JSON deserialization. -*/ -template -class lexer : public lexer_base -{ - using number_integer_t = typename BasicJsonType::number_integer_t; - using number_unsigned_t = typename BasicJsonType::number_unsigned_t; - using number_float_t = typename BasicJsonType::number_float_t; - using string_t = typename BasicJsonType::string_t; - using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; - - public: - using token_type = typename lexer_base::token_type; - - explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) - : ia(std::move(adapter)) - , ignore_comments(ignore_comments_) - , decimal_point_char(static_cast(get_decimal_point())) - {} - - // delete because of pointer members - lexer(const lexer&) = delete; - lexer(lexer&&) = default; - lexer& operator=(lexer&) = delete; - lexer& operator=(lexer&&) = default; - ~lexer() = default; - - private: - ///////////////////// - // locales - ///////////////////// - - /// return the locale-dependent decimal point - JSON_HEDLEY_PURE - static char get_decimal_point() noexcept - { - const auto* loc = localeconv(); - JSON_ASSERT(loc != nullptr); - return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); - } - - ///////////////////// - // scan functions - ///////////////////// - - /*! - @brief get codepoint from 4 hex characters following `\u` - - For input "\u c1 c2 c3 c4" the codepoint is: - (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 - = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) - - Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' - must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The - conversion is done by subtracting the offset (0x30, 0x37, and 0x57) - between the ASCII value of the character and the desired integer value. - - @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or - non-hex character) - */ - int get_codepoint() - { - // this function only makes sense after reading `\u` - JSON_ASSERT(current == 'u'); - int codepoint = 0; - - const auto factors = { 12u, 8u, 4u, 0u }; - for (const auto factor : factors) - { - get(); - - if (current >= '0' && current <= '9') - { - codepoint += static_cast((static_cast(current) - 0x30u) << factor); - } - else if (current >= 'A' && current <= 'F') - { - codepoint += static_cast((static_cast(current) - 0x37u) << factor); - } - else if (current >= 'a' && current <= 'f') - { - codepoint += static_cast((static_cast(current) - 0x57u) << factor); - } - else - { - return -1; - } - } - - JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); - return codepoint; - } - - /*! - @brief check if the next byte(s) are inside a given range - - Adds the current byte and, for each passed range, reads a new byte and - checks if it is inside the range. If a violation was detected, set up an - error message and return false. Otherwise, return true. - - @param[in] ranges list of integers; interpreted as list of pairs of - inclusive lower and upper bound, respectively - - @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, - 1, 2, or 3 pairs. This precondition is enforced by an assertion. - - @return true if and only if no range violation was detected - */ - bool next_byte_in_range(std::initializer_list ranges) - { - JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); - add(current); - - for (auto range = ranges.begin(); range != ranges.end(); ++range) - { - get(); - if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) - { - add(current); - } - else - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return false; - } - } - - return true; - } - - /*! - @brief scan a string literal - - This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer token_buffer. Then the - function returns successfully, token_buffer is *not* null-terminated (as it - may contain \0 bytes), and token_buffer.size() is the number of bytes in the - string. - - @return token_type::value_string if string could be successfully scanned, - token_type::parse_error otherwise - - @note In case of errors, variable error_message contains a textual - description. - */ - token_type scan_string() - { - // reset token_buffer (ignore opening quote) - reset(); - - // we entered the function by reading an open quote - JSON_ASSERT(current == '\"'); - - while (true) - { - // get next character - switch (get()) - { - // end of file while parsing string - case std::char_traits::eof(): - { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // closing quote - case '\"': - { - return token_type::value_string; - } - - // escapes - case '\\': - { - switch (get()) - { - // quotation mark - case '\"': - add('\"'); - break; - // reverse solidus - case '\\': - add('\\'); - break; - // solidus - case '/': - add('/'); - break; - // backspace - case 'b': - add('\b'); - break; - // form feed - case 'f': - add('\f'); - break; - // line feed - case 'n': - add('\n'); - break; - // carriage return - case 'r': - add('\r'); - break; - // tab - case 't': - add('\t'); - break; - - // unicode escapes - case 'u': - { - const int codepoint1 = get_codepoint(); - int codepoint = codepoint1; // start with codepoint1 - - if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if code point is a high surrogate - if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) - { - // expect next \uxxxx entry - if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) - { - const int codepoint2 = get_codepoint(); - - if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if codepoint2 is a low surrogate - if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) - { - // overwrite codepoint - codepoint = static_cast( - // high surrogate occupies the most significant 22 bits - (static_cast(codepoint1) << 10u) - // low surrogate occupies the least significant 15 bits - + static_cast(codepoint2) - // there is still the 0xD800, 0xDC00 and 0x10000 noise - // in the result so we have to subtract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00u); - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; - return token_type::parse_error; - } - } - - // result of the above calculation yields a proper codepoint - JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); - - // translate codepoint into bytes - if (codepoint < 0x80) - { - // 1-byte characters: 0xxxxxxx (ASCII) - add(static_cast(codepoint)); - } - else if (codepoint <= 0x7FF) - { - // 2-byte characters: 110xxxxx 10xxxxxx - add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else if (codepoint <= 0xFFFF) - { - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else - { - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - - break; - } - - // other characters after escape - default: - error_message = "invalid string: forbidden character after backslash"; - return token_type::parse_error; - } - - break; - } - - // invalid control characters - case 0x00: - { - error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; - return token_type::parse_error; - } - - case 0x01: - { - error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; - return token_type::parse_error; - } - - case 0x02: - { - error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; - return token_type::parse_error; - } - - case 0x03: - { - error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; - return token_type::parse_error; - } - - case 0x04: - { - error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; - return token_type::parse_error; - } - - case 0x05: - { - error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; - return token_type::parse_error; - } - - case 0x06: - { - error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; - return token_type::parse_error; - } - - case 0x07: - { - error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; - return token_type::parse_error; - } - - case 0x08: - { - error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; - return token_type::parse_error; - } - - case 0x09: - { - error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; - return token_type::parse_error; - } - - case 0x0A: - { - error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; - return token_type::parse_error; - } - - case 0x0B: - { - error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; - return token_type::parse_error; - } - - case 0x0C: - { - error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; - return token_type::parse_error; - } - - case 0x0D: - { - error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; - return token_type::parse_error; - } - - case 0x0E: - { - error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; - return token_type::parse_error; - } - - case 0x0F: - { - error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; - return token_type::parse_error; - } - - case 0x10: - { - error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; - return token_type::parse_error; - } - - case 0x11: - { - error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; - return token_type::parse_error; - } - - case 0x12: - { - error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; - return token_type::parse_error; - } - - case 0x13: - { - error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; - return token_type::parse_error; - } - - case 0x14: - { - error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; - return token_type::parse_error; - } - - case 0x15: - { - error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; - return token_type::parse_error; - } - - case 0x16: - { - error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; - return token_type::parse_error; - } - - case 0x17: - { - error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; - return token_type::parse_error; - } - - case 0x18: - { - error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; - return token_type::parse_error; - } - - case 0x19: - { - error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; - return token_type::parse_error; - } - - case 0x1A: - { - error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; - return token_type::parse_error; - } - - case 0x1B: - { - error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; - return token_type::parse_error; - } - - case 0x1C: - { - error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; - return token_type::parse_error; - } - - case 0x1D: - { - error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; - return token_type::parse_error; - } - - case 0x1E: - { - error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; - return token_type::parse_error; - } - - case 0x1F: - { - error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; - return token_type::parse_error; - } - - // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) - case 0x20: - case 0x21: - case 0x23: - case 0x24: - case 0x25: - case 0x26: - case 0x27: - case 0x28: - case 0x29: - case 0x2A: - case 0x2B: - case 0x2C: - case 0x2D: - case 0x2E: - case 0x2F: - case 0x30: - case 0x31: - case 0x32: - case 0x33: - case 0x34: - case 0x35: - case 0x36: - case 0x37: - case 0x38: - case 0x39: - case 0x3A: - case 0x3B: - case 0x3C: - case 0x3D: - case 0x3E: - case 0x3F: - case 0x40: - case 0x41: - case 0x42: - case 0x43: - case 0x44: - case 0x45: - case 0x46: - case 0x47: - case 0x48: - case 0x49: - case 0x4A: - case 0x4B: - case 0x4C: - case 0x4D: - case 0x4E: - case 0x4F: - case 0x50: - case 0x51: - case 0x52: - case 0x53: - case 0x54: - case 0x55: - case 0x56: - case 0x57: - case 0x58: - case 0x59: - case 0x5A: - case 0x5B: - case 0x5D: - case 0x5E: - case 0x5F: - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6A: - case 0x6B: - case 0x6C: - case 0x6D: - case 0x6E: - case 0x6F: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - case 0x78: - case 0x79: - case 0x7A: - case 0x7B: - case 0x7C: - case 0x7D: - case 0x7E: - case 0x7F: - { - add(current); - break; - } - - // U+0080..U+07FF: bytes C2..DF 80..BF - case 0xC2: - case 0xC3: - case 0xC4: - case 0xC5: - case 0xC6: - case 0xC7: - case 0xC8: - case 0xC9: - case 0xCA: - case 0xCB: - case 0xCC: - case 0xCD: - case 0xCE: - case 0xCF: - case 0xD0: - case 0xD1: - case 0xD2: - case 0xD3: - case 0xD4: - case 0xD5: - case 0xD6: - case 0xD7: - case 0xD8: - case 0xD9: - case 0xDA: - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - { - if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) - { - return token_type::parse_error; - } - break; - } - - // U+0800..U+0FFF: bytes E0 A0..BF 80..BF - case 0xE0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF - // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF - case 0xE1: - case 0xE2: - case 0xE3: - case 0xE4: - case 0xE5: - case 0xE6: - case 0xE7: - case 0xE8: - case 0xE9: - case 0xEA: - case 0xEB: - case 0xEC: - case 0xEE: - case 0xEF: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+D000..U+D7FF: bytes ED 80..9F 80..BF - case 0xED: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - case 0xF0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - case 0xF1: - case 0xF2: - case 0xF3: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - case 0xF4: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // remaining bytes (80..C1 and F5..FF) are ill-formed - default: - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - } - } - } - - /*! - * @brief scan a comment - * @return whether comment could be scanned successfully - */ - bool scan_comment() - { - switch (get()) - { - // single-line comments skip input until a newline or EOF is read - case '/': - { - while (true) - { - switch (get()) - { - case '\n': - case '\r': - case std::char_traits::eof(): - case '\0': - return true; - - default: - break; - } - } - } - - // multi-line comments skip input until */ is read - case '*': - { - while (true) - { - switch (get()) - { - case std::char_traits::eof(): - case '\0': - { - error_message = "invalid comment; missing closing '*/'"; - return false; - } - - case '*': - { - switch (get()) - { - case '/': - return true; - - default: - { - unget(); - break; - } - } - } - - default: - break; - } - } - } - - // unexpected character after reading '/' - default: - { - error_message = "invalid comment; expecting '/' or '*' after '/'"; - return false; - } - } - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(float& f, const char* str, char** endptr) noexcept - { - f = std::strtof(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(double& f, const char* str, char** endptr) noexcept - { - f = std::strtod(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(long double& f, const char* str, char** endptr) noexcept - { - f = std::strtold(str, endptr); - } - - /*! - @brief scan a number literal - - This function scans a string according to Sect. 6 of RFC 7159. - - The function is realized with a deterministic finite state machine derived - from the grammar described in RFC 7159. Starting in state "init", the - input is read and used to determined the next state. Only state "done" - accepts the number. State "error" is a trap state to model errors. In the - table below, "anything" means any character but the ones listed before. - - state | 0 | 1-9 | e E | + | - | . | anything - ---------|----------|----------|----------|---------|---------|----------|----------- - init | zero | any1 | [error] | [error] | minus | [error] | [error] - minus | zero | any1 | [error] | [error] | [error] | [error] | [error] - zero | done | done | exponent | done | done | decimal1 | done - any1 | any1 | any1 | exponent | done | done | decimal1 | done - decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] - decimal2 | decimal2 | decimal2 | exponent | done | done | done | done - exponent | any2 | any2 | [error] | sign | sign | [error] | [error] - sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] - any2 | any2 | any2 | done | done | done | done | done - - The state machine is realized with one label per state (prefixed with - "scan_number_") and `goto` statements between them. The state machine - contains cycles, but any cycle can be left when EOF is read. Therefore, - the function is guaranteed to terminate. - - During scanning, the read bytes are stored in token_buffer. This string is - then converted to a signed integer, an unsigned integer, or a - floating-point number. - - @return token_type::value_unsigned, token_type::value_integer, or - token_type::value_float if number could be successfully scanned, - token_type::parse_error otherwise - - @note The scanner is independent of the current locale. Internally, the - locale's decimal point is used instead of `.` to work with the - locale-dependent converters. - */ - token_type scan_number() // lgtm [cpp/use-of-goto] - { - // reset token_buffer to store the number's bytes - reset(); - - // the type of the parsed number; initially set to unsigned; will be - // changed if minus sign, decimal point or exponent is read - token_type number_type = token_type::value_unsigned; - - // state (init): we just found out we need to scan a number - switch (current) - { - case '-': - { - add(current); - goto scan_number_minus; - } - - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - // all other characters are rejected outside scan_number() - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // LCOV_EXCL_LINE - } - -scan_number_minus: - // state: we just parsed a leading minus sign - number_type = token_type::value_integer; - switch (get()) - { - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - error_message = "invalid number; expected digit after '-'"; - return token_type::parse_error; - } - } - -scan_number_zero: - // state: we just parse a zero (maybe with a leading minus sign) - switch (get()) - { - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_any1: - // state: we just parsed a number 0-9 (maybe with a leading minus sign) - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_decimal1: - // state: we just parsed a decimal point - number_type = token_type::value_float; - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - default: - { - error_message = "invalid number; expected digit after '.'"; - return token_type::parse_error; - } - } - -scan_number_decimal2: - // we just parsed at least one number after a decimal point - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_exponent: - // we just parsed an exponent - number_type = token_type::value_float; - switch (get()) - { - case '+': - case '-': - { - add(current); - goto scan_number_sign; - } - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = - "invalid number; expected '+', '-', or digit after exponent"; - return token_type::parse_error; - } - } - -scan_number_sign: - // we just parsed an exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected digit after exponent sign"; - return token_type::parse_error; - } - } - -scan_number_any2: - // we just parsed a number after the exponent or exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - goto scan_number_done; - } - -scan_number_done: - // unget the character after the number (we only read it to know that - // we are done scanning a number) - unget(); - - char* endptr = nullptr; - errno = 0; - - // try to parse integers first and fall back to floats - if (number_type == token_type::value_unsigned) - { - const auto x = std::strtoull(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_unsigned = static_cast(x); - if (value_unsigned == x) - { - return token_type::value_unsigned; - } - } - } - else if (number_type == token_type::value_integer) - { - const auto x = std::strtoll(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_integer = static_cast(x); - if (value_integer == x) - { - return token_type::value_integer; - } - } - } - - // this code is reached if we parse a floating-point number or if an - // integer conversion above failed - strtof(value_float, token_buffer.data(), &endptr); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - return token_type::value_float; - } - - /*! - @param[in] literal_text the literal text to expect - @param[in] length the length of the passed literal text - @param[in] return_type the token type to return on success - */ - JSON_HEDLEY_NON_NULL(2) - token_type scan_literal(const char_type* literal_text, const std::size_t length, - token_type return_type) - { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); - for (std::size_t i = 1; i < length; ++i) - { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) - { - error_message = "invalid literal"; - return token_type::parse_error; - } - } - return return_type; - } - - ///////////////////// - // input management - ///////////////////// - - /// reset token_buffer; current character is beginning of token - void reset() noexcept - { - token_buffer.clear(); - token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); - } - - /* - @brief get next character from the input - - This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters - for use in error messages. - - @return character read from the input - */ - char_int_type get() - { - ++position.chars_read_total; - ++position.chars_read_current_line; - - if (next_unget) - { - // just reset the next_unget variable and work with current - next_unget = false; - } - else - { - current = ia.get_character(); - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - token_string.push_back(std::char_traits::to_char_type(current)); - } - - if (current == '\n') - { - ++position.lines_read; - position.chars_read_current_line = 0; - } - - return current; - } - - /*! - @brief unget current character (read it again on next get) - - We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read_total, - chars_read_current_line, and token_string. The next call to get() will - behave as if the unget character is read again. - */ - void unget() - { - next_unget = true; - - --position.chars_read_total; - - // in case we "unget" a newline, we have to also decrement the lines_read - if (position.chars_read_current_line == 0) - { - if (position.lines_read > 0) - { - --position.lines_read; - } - } - else - { - --position.chars_read_current_line; - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - JSON_ASSERT(!token_string.empty()); - token_string.pop_back(); - } - } - - /// add a character to token_buffer - void add(char_int_type c) - { - token_buffer.push_back(static_cast(c)); - } - - public: - ///////////////////// - // value getters - ///////////////////// - - /// return integer value - constexpr number_integer_t get_number_integer() const noexcept - { - return value_integer; - } - - /// return unsigned integer value - constexpr number_unsigned_t get_number_unsigned() const noexcept - { - return value_unsigned; - } - - /// return floating-point value - constexpr number_float_t get_number_float() const noexcept - { - return value_float; - } - - /// return current string value (implicitly resets the token; useful only once) - string_t& get_string() - { - return token_buffer; - } - - ///////////////////// - // diagnostics - ///////////////////// - - /// return position of last read token - constexpr position_t get_position() const noexcept - { - return position; - } - - /// return the last read token (for errors only). Will never contain EOF - /// (an arbitrary value that is not a valid char value, often -1), because - /// 255 may legitimately occur. May contain NUL, which should be escaped. - std::string get_token_string() const - { - // escape control characters - std::string result; - for (const auto c : token_string) - { - if (static_cast(c) <= '\x1F') - { - // escape control characters - std::array cs{{}}; - (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); - result += cs.data(); - } - else - { - // add character as is - result.push_back(static_cast(c)); - } - } - - return result; - } - - /// return syntax error message - JSON_HEDLEY_RETURNS_NON_NULL - constexpr const char* get_error_message() const noexcept - { - return error_message; - } - - ///////////////////// - // actual scanner - ///////////////////// - - /*! - @brief skip the UTF-8 byte order mark - @return true iff there is no BOM or the correct BOM has been skipped - */ - bool skip_bom() - { - if (get() == 0xEF) - { - // check if we completely parse the BOM - return get() == 0xBB && get() == 0xBF; - } - - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; - } - - void skip_whitespace() - { - do - { - get(); - } - while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); - } - - token_type scan() - { - // initially, skip the BOM - if (position.chars_read_total == 0 && !skip_bom()) - { - error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; - } - - // read next character and ignore whitespace - skip_whitespace(); - - // ignore comments - if (ignore_comments && current == '/') - { - if (!scan_comment()) - { - return token_type::parse_error; - } - - // skip following whitespace - skip_whitespace(); - } - - switch (current) - { - // structural characters - case '[': - return token_type::begin_array; - case ']': - return token_type::end_array; - case '{': - return token_type::begin_object; - case '}': - return token_type::end_object; - case ':': - return token_type::name_separator; - case ',': - return token_type::value_separator; - - // literals - case 't': - { - std::array true_literal = {{'t', 'r', 'u', 'e'}}; - return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); - } - case 'f': - { - std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; - return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); - } - case 'n': - { - std::array null_literal = {{'n', 'u', 'l', 'l'}}; - return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); - } - - // string - case '\"': - return scan_string(); - - // number - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return scan_number(); - - // end of input (the null byte is needed when parsing from - // string literals) - case '\0': - case std::char_traits::eof(): - return token_type::end_of_input; - - // error - default: - error_message = "invalid literal"; - return token_type::parse_error; - } - } - - private: - /// input adapter - InputAdapterType ia; - - /// whether comments should be ignored (true) or signaled as errors (false) - const bool ignore_comments = false; - - /// the current character - char_int_type current = std::char_traits::eof(); - - /// whether the next get() call should just return current - bool next_unget = false; - - /// the start position of the current token - position_t position {}; - - /// raw input token string (for error messages) - std::vector token_string {}; - - /// buffer for variable-length tokens (numbers, strings) - string_t token_buffer {}; - - /// a description of occurred lexer errors - const char* error_message = ""; - - // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; - - /// the decimal point - const char_int_type decimal_point_char = '.'; -}; -} // namespace detail -} // namespace nlohmann - // #include @@ -13957,7 +14009,17 @@ class binary_writer } else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } } @@ -14011,19 +14073,23 @@ class binary_writer // LCOV_EXCL_START else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } // LCOV_EXCL_STOP } /*! @brief determine the type prefix of container values - - @note This function does not need to be 100% accurate when it comes to - integer limits. In case a number exceeds the limits of int64_t, - this will be detected by a later call to function - write_number_with_ubjson_prefix. Therefore, we return 'L' for any - value that does not fit the previous limits. */ CharType ubjson_prefix(const BasicJsonType& j) const noexcept { @@ -14053,8 +14119,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_unsigned: @@ -14075,8 +14145,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_float: @@ -23479,6 +23553,7 @@ class basic_json number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -23489,7 +23564,6 @@ class basic_json @note The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 @note The following markers are not used in the conversion: - `Z`: no-op values are not created. @@ -23786,7 +23860,7 @@ class basic_json const bool allow_exceptions = true, const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error) { - return from_cbor(ptr, ptr + len, strict, tag_handler); + return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); } @@ -23965,6 +24039,7 @@ class basic_json int16 | number_integer | `I` int32 | number_integer | `l` int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' string | string | `S` char | string | `C` array | array (optimized values are supported) | `[` From 88122467bd84264eea71d8dabc61c6c70d69cb77 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Thu, 23 Jul 2020 12:16:18 +0200 Subject: [PATCH 11/14] :recycle: pull code into function --- .../nlohmann/detail/input/binary_reader.hpp | 95 ++++++++++--------- single_include/nlohmann/json.hpp | 95 ++++++++++--------- 2 files changed, 100 insertions(+), 90 deletions(-) diff --git a/include/nlohmann/detail/input/binary_reader.hpp b/include/nlohmann/detail/input/binary_reader.hpp index b9c2532d..7d74f369 100644 --- a/include/nlohmann/detail/input/binary_reader.hpp +++ b/include/nlohmann/detail/input/binary_reader.hpp @@ -2003,51 +2003,7 @@ class binary_reader case 'H': { - // get size of following number string - std::size_t size{}; - auto res = get_ubjson_size_value(size); - if (JSON_HEDLEY_UNLIKELY(!res)) - { - return res; - } - - // get number string - std::vector number_vector; - for (std::size_t i = 0; i < size; ++i) - { - get(); - if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) - { - return false; - } - number_vector.push_back(static_cast(current)); - } - - // parse number string - auto number_ia = detail::input_adapter(std::forward(number_vector)); - auto number_lexer = detail::lexer(std::move(number_ia), false); - const auto result_number = number_lexer.scan(); - const auto number_string = number_lexer.get_token_string(); - const auto result_remainder = number_lexer.scan(); - - using token_type = typename detail::lexer_base::token_type; - - if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) - { - return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); - } - - switch (result_number) - { - case token_type::value_integer: - return sax->number_integer(number_lexer.get_number_integer()); - case token_type::value_unsigned: - return sax->number_unsigned(number_lexer.get_number_unsigned()); - case token_type::value_float: - return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); - default: - return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); - } + return get_ubjson_high_precision_number(); } case 'C': // char @@ -2226,6 +2182,55 @@ class binary_reader // Note, no reader for UBJSON binary types is implemented because they do // not exist + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + /////////////////////// // Utility functions // /////////////////////// diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 8b234047..655eb80c 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -9616,51 +9616,7 @@ class binary_reader case 'H': { - // get size of following number string - std::size_t size{}; - auto res = get_ubjson_size_value(size); - if (JSON_HEDLEY_UNLIKELY(!res)) - { - return res; - } - - // get number string - std::vector number_vector; - for (std::size_t i = 0; i < size; ++i) - { - get(); - if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) - { - return false; - } - number_vector.push_back(static_cast(current)); - } - - // parse number string - auto number_ia = detail::input_adapter(std::forward(number_vector)); - auto number_lexer = detail::lexer(std::move(number_ia), false); - const auto result_number = number_lexer.scan(); - const auto number_string = number_lexer.get_token_string(); - const auto result_remainder = number_lexer.scan(); - - using token_type = typename detail::lexer_base::token_type; - - if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) - { - return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); - } - - switch (result_number) - { - case token_type::value_integer: - return sax->number_integer(number_lexer.get_number_integer()); - case token_type::value_unsigned: - return sax->number_unsigned(number_lexer.get_number_unsigned()); - case token_type::value_float: - return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); - default: - return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); - } + return get_ubjson_high_precision_number(); } case 'C': // char @@ -9839,6 +9795,55 @@ class binary_reader // Note, no reader for UBJSON binary types is implemented because they do // not exist + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + /////////////////////// // Utility functions // /////////////////////// From b315fe484a3ba2655403458c9a162de6311c6050 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Thu, 23 Jul 2020 12:30:26 +0200 Subject: [PATCH 12/14] :twisted_rightwards_arrows: merge develop branch --- single_include/nlohmann/json.hpp | 3360 +++++++++++++++--------------- 1 file changed, 1720 insertions(+), 1640 deletions(-) diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 0b7d2411..a845846a 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -2403,6 +2403,7 @@ json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vect json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. @note For an input with n bytes, 1 is the index of the first character and n+1 is the index of the terminating null byte or the end of file. This also @@ -2591,7 +2592,7 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. -json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. | +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | @@ -5870,6 +5871,1634 @@ class json_sax_acceptor } // namespace nlohmann +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. +*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. + + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer token_buffer. Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! + * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + break; + } + } + } + + default: + break; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 7159. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! + @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. + std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! + @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + if (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + { + std::array true_literal = {{'t', 'r', 'u', 'e'}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{'n', 'u', 'l', 'l'}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + // #include // #include @@ -8009,6 +9638,11 @@ class binary_reader return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); } + case 'H': + { + return get_ubjson_high_precision_number(); + } + case 'C': // char { get(); @@ -8185,6 +9819,55 @@ class binary_reader // Note, no reader for UBJSON binary types is implemented because they do // not exist + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + /////////////////////// // Utility functions // /////////////////////// @@ -8412,1632 +10095,6 @@ class binary_reader // #include - -#include // array -#include // localeconv -#include // size_t -#include // snprintf -#include // strtof, strtod, strtold, strtoll, strtoull -#include // initializer_list -#include // char_traits, string -#include // move -#include // vector - -// #include - -// #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -/////////// -// lexer // -/////////// - -template -class lexer_base -{ - public: - /// token types for the parser - enum class token_type - { - uninitialized, ///< indicating the scanner is uninitialized - literal_true, ///< the `true` literal - literal_false, ///< the `false` literal - literal_null, ///< the `null` literal - value_string, ///< a string -- use get_string() for actual value - value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value - value_integer, ///< a signed integer -- use get_number_integer() for actual value - value_float, ///< an floating point number -- use get_number_float() for actual value - begin_array, ///< the character for array begin `[` - begin_object, ///< the character for object begin `{` - end_array, ///< the character for array end `]` - end_object, ///< the character for object end `}` - name_separator, ///< the name separator `:` - value_separator, ///< the value separator `,` - parse_error, ///< indicating a parse error - end_of_input, ///< indicating the end of the input buffer - literal_or_value ///< a literal or the begin of a value (only for diagnostics) - }; - - /// return name of values of type token_type (only used for errors) - JSON_HEDLEY_RETURNS_NON_NULL - JSON_HEDLEY_CONST - static const char* token_type_name(const token_type t) noexcept - { - switch (t) - { - case token_type::uninitialized: - return ""; - case token_type::literal_true: - return "true literal"; - case token_type::literal_false: - return "false literal"; - case token_type::literal_null: - return "null literal"; - case token_type::value_string: - return "string literal"; - case token_type::value_unsigned: - case token_type::value_integer: - case token_type::value_float: - return "number literal"; - case token_type::begin_array: - return "'['"; - case token_type::begin_object: - return "'{'"; - case token_type::end_array: - return "']'"; - case token_type::end_object: - return "'}'"; - case token_type::name_separator: - return "':'"; - case token_type::value_separator: - return "','"; - case token_type::parse_error: - return ""; - case token_type::end_of_input: - return "end of input"; - case token_type::literal_or_value: - return "'[', '{', or a literal"; - // LCOV_EXCL_START - default: // catch non-enum values - return "unknown token"; - // LCOV_EXCL_STOP - } - } -}; -/*! -@brief lexical analysis - -This class organizes the lexical analysis during JSON deserialization. -*/ -template -class lexer : public lexer_base -{ - using number_integer_t = typename BasicJsonType::number_integer_t; - using number_unsigned_t = typename BasicJsonType::number_unsigned_t; - using number_float_t = typename BasicJsonType::number_float_t; - using string_t = typename BasicJsonType::string_t; - using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; - - public: - using token_type = typename lexer_base::token_type; - - explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) - : ia(std::move(adapter)) - , ignore_comments(ignore_comments_) - , decimal_point_char(static_cast(get_decimal_point())) - {} - - // delete because of pointer members - lexer(const lexer&) = delete; - lexer(lexer&&) = default; - lexer& operator=(lexer&) = delete; - lexer& operator=(lexer&&) = default; - ~lexer() = default; - - private: - ///////////////////// - // locales - ///////////////////// - - /// return the locale-dependent decimal point - JSON_HEDLEY_PURE - static char get_decimal_point() noexcept - { - const auto* loc = localeconv(); - JSON_ASSERT(loc != nullptr); - return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); - } - - ///////////////////// - // scan functions - ///////////////////// - - /*! - @brief get codepoint from 4 hex characters following `\u` - - For input "\u c1 c2 c3 c4" the codepoint is: - (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 - = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) - - Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' - must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The - conversion is done by subtracting the offset (0x30, 0x37, and 0x57) - between the ASCII value of the character and the desired integer value. - - @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or - non-hex character) - */ - int get_codepoint() - { - // this function only makes sense after reading `\u` - JSON_ASSERT(current == 'u'); - int codepoint = 0; - - const auto factors = { 12u, 8u, 4u, 0u }; - for (const auto factor : factors) - { - get(); - - if (current >= '0' && current <= '9') - { - codepoint += static_cast((static_cast(current) - 0x30u) << factor); - } - else if (current >= 'A' && current <= 'F') - { - codepoint += static_cast((static_cast(current) - 0x37u) << factor); - } - else if (current >= 'a' && current <= 'f') - { - codepoint += static_cast((static_cast(current) - 0x57u) << factor); - } - else - { - return -1; - } - } - - JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); - return codepoint; - } - - /*! - @brief check if the next byte(s) are inside a given range - - Adds the current byte and, for each passed range, reads a new byte and - checks if it is inside the range. If a violation was detected, set up an - error message and return false. Otherwise, return true. - - @param[in] ranges list of integers; interpreted as list of pairs of - inclusive lower and upper bound, respectively - - @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, - 1, 2, or 3 pairs. This precondition is enforced by an assertion. - - @return true if and only if no range violation was detected - */ - bool next_byte_in_range(std::initializer_list ranges) - { - JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); - add(current); - - for (auto range = ranges.begin(); range != ranges.end(); ++range) - { - get(); - if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) - { - add(current); - } - else - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return false; - } - } - - return true; - } - - /*! - @brief scan a string literal - - This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer token_buffer. Then the - function returns successfully, token_buffer is *not* null-terminated (as it - may contain \0 bytes), and token_buffer.size() is the number of bytes in the - string. - - @return token_type::value_string if string could be successfully scanned, - token_type::parse_error otherwise - - @note In case of errors, variable error_message contains a textual - description. - */ - token_type scan_string() - { - // reset token_buffer (ignore opening quote) - reset(); - - // we entered the function by reading an open quote - JSON_ASSERT(current == '\"'); - - while (true) - { - // get next character - switch (get()) - { - // end of file while parsing string - case std::char_traits::eof(): - { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // closing quote - case '\"': - { - return token_type::value_string; - } - - // escapes - case '\\': - { - switch (get()) - { - // quotation mark - case '\"': - add('\"'); - break; - // reverse solidus - case '\\': - add('\\'); - break; - // solidus - case '/': - add('/'); - break; - // backspace - case 'b': - add('\b'); - break; - // form feed - case 'f': - add('\f'); - break; - // line feed - case 'n': - add('\n'); - break; - // carriage return - case 'r': - add('\r'); - break; - // tab - case 't': - add('\t'); - break; - - // unicode escapes - case 'u': - { - const int codepoint1 = get_codepoint(); - int codepoint = codepoint1; // start with codepoint1 - - if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if code point is a high surrogate - if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) - { - // expect next \uxxxx entry - if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) - { - const int codepoint2 = get_codepoint(); - - if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if codepoint2 is a low surrogate - if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) - { - // overwrite codepoint - codepoint = static_cast( - // high surrogate occupies the most significant 22 bits - (static_cast(codepoint1) << 10u) - // low surrogate occupies the least significant 15 bits - + static_cast(codepoint2) - // there is still the 0xD800, 0xDC00 and 0x10000 noise - // in the result so we have to subtract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00u); - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; - return token_type::parse_error; - } - } - - // result of the above calculation yields a proper codepoint - JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); - - // translate codepoint into bytes - if (codepoint < 0x80) - { - // 1-byte characters: 0xxxxxxx (ASCII) - add(static_cast(codepoint)); - } - else if (codepoint <= 0x7FF) - { - // 2-byte characters: 110xxxxx 10xxxxxx - add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else if (codepoint <= 0xFFFF) - { - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else - { - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - - break; - } - - // other characters after escape - default: - error_message = "invalid string: forbidden character after backslash"; - return token_type::parse_error; - } - - break; - } - - // invalid control characters - case 0x00: - { - error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; - return token_type::parse_error; - } - - case 0x01: - { - error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; - return token_type::parse_error; - } - - case 0x02: - { - error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; - return token_type::parse_error; - } - - case 0x03: - { - error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; - return token_type::parse_error; - } - - case 0x04: - { - error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; - return token_type::parse_error; - } - - case 0x05: - { - error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; - return token_type::parse_error; - } - - case 0x06: - { - error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; - return token_type::parse_error; - } - - case 0x07: - { - error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; - return token_type::parse_error; - } - - case 0x08: - { - error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; - return token_type::parse_error; - } - - case 0x09: - { - error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; - return token_type::parse_error; - } - - case 0x0A: - { - error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; - return token_type::parse_error; - } - - case 0x0B: - { - error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; - return token_type::parse_error; - } - - case 0x0C: - { - error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; - return token_type::parse_error; - } - - case 0x0D: - { - error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; - return token_type::parse_error; - } - - case 0x0E: - { - error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; - return token_type::parse_error; - } - - case 0x0F: - { - error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; - return token_type::parse_error; - } - - case 0x10: - { - error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; - return token_type::parse_error; - } - - case 0x11: - { - error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; - return token_type::parse_error; - } - - case 0x12: - { - error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; - return token_type::parse_error; - } - - case 0x13: - { - error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; - return token_type::parse_error; - } - - case 0x14: - { - error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; - return token_type::parse_error; - } - - case 0x15: - { - error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; - return token_type::parse_error; - } - - case 0x16: - { - error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; - return token_type::parse_error; - } - - case 0x17: - { - error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; - return token_type::parse_error; - } - - case 0x18: - { - error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; - return token_type::parse_error; - } - - case 0x19: - { - error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; - return token_type::parse_error; - } - - case 0x1A: - { - error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; - return token_type::parse_error; - } - - case 0x1B: - { - error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; - return token_type::parse_error; - } - - case 0x1C: - { - error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; - return token_type::parse_error; - } - - case 0x1D: - { - error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; - return token_type::parse_error; - } - - case 0x1E: - { - error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; - return token_type::parse_error; - } - - case 0x1F: - { - error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; - return token_type::parse_error; - } - - // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) - case 0x20: - case 0x21: - case 0x23: - case 0x24: - case 0x25: - case 0x26: - case 0x27: - case 0x28: - case 0x29: - case 0x2A: - case 0x2B: - case 0x2C: - case 0x2D: - case 0x2E: - case 0x2F: - case 0x30: - case 0x31: - case 0x32: - case 0x33: - case 0x34: - case 0x35: - case 0x36: - case 0x37: - case 0x38: - case 0x39: - case 0x3A: - case 0x3B: - case 0x3C: - case 0x3D: - case 0x3E: - case 0x3F: - case 0x40: - case 0x41: - case 0x42: - case 0x43: - case 0x44: - case 0x45: - case 0x46: - case 0x47: - case 0x48: - case 0x49: - case 0x4A: - case 0x4B: - case 0x4C: - case 0x4D: - case 0x4E: - case 0x4F: - case 0x50: - case 0x51: - case 0x52: - case 0x53: - case 0x54: - case 0x55: - case 0x56: - case 0x57: - case 0x58: - case 0x59: - case 0x5A: - case 0x5B: - case 0x5D: - case 0x5E: - case 0x5F: - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6A: - case 0x6B: - case 0x6C: - case 0x6D: - case 0x6E: - case 0x6F: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - case 0x78: - case 0x79: - case 0x7A: - case 0x7B: - case 0x7C: - case 0x7D: - case 0x7E: - case 0x7F: - { - add(current); - break; - } - - // U+0080..U+07FF: bytes C2..DF 80..BF - case 0xC2: - case 0xC3: - case 0xC4: - case 0xC5: - case 0xC6: - case 0xC7: - case 0xC8: - case 0xC9: - case 0xCA: - case 0xCB: - case 0xCC: - case 0xCD: - case 0xCE: - case 0xCF: - case 0xD0: - case 0xD1: - case 0xD2: - case 0xD3: - case 0xD4: - case 0xD5: - case 0xD6: - case 0xD7: - case 0xD8: - case 0xD9: - case 0xDA: - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - { - if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) - { - return token_type::parse_error; - } - break; - } - - // U+0800..U+0FFF: bytes E0 A0..BF 80..BF - case 0xE0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF - // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF - case 0xE1: - case 0xE2: - case 0xE3: - case 0xE4: - case 0xE5: - case 0xE6: - case 0xE7: - case 0xE8: - case 0xE9: - case 0xEA: - case 0xEB: - case 0xEC: - case 0xEE: - case 0xEF: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+D000..U+D7FF: bytes ED 80..9F 80..BF - case 0xED: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - case 0xF0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - case 0xF1: - case 0xF2: - case 0xF3: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - case 0xF4: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // remaining bytes (80..C1 and F5..FF) are ill-formed - default: - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - } - } - } - - /*! - * @brief scan a comment - * @return whether comment could be scanned successfully - */ - bool scan_comment() - { - switch (get()) - { - // single-line comments skip input until a newline or EOF is read - case '/': - { - while (true) - { - switch (get()) - { - case '\n': - case '\r': - case std::char_traits::eof(): - case '\0': - return true; - - default: - break; - } - } - } - - // multi-line comments skip input until */ is read - case '*': - { - while (true) - { - switch (get()) - { - case std::char_traits::eof(): - case '\0': - { - error_message = "invalid comment; missing closing '*/'"; - return false; - } - - case '*': - { - switch (get()) - { - case '/': - return true; - - default: - { - unget(); - break; - } - } - } - - default: - break; - } - } - } - - // unexpected character after reading '/' - default: - { - error_message = "invalid comment; expecting '/' or '*' after '/'"; - return false; - } - } - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(float& f, const char* str, char** endptr) noexcept - { - f = std::strtof(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(double& f, const char* str, char** endptr) noexcept - { - f = std::strtod(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(long double& f, const char* str, char** endptr) noexcept - { - f = std::strtold(str, endptr); - } - - /*! - @brief scan a number literal - - This function scans a string according to Sect. 6 of RFC 7159. - - The function is realized with a deterministic finite state machine derived - from the grammar described in RFC 7159. Starting in state "init", the - input is read and used to determined the next state. Only state "done" - accepts the number. State "error" is a trap state to model errors. In the - table below, "anything" means any character but the ones listed before. - - state | 0 | 1-9 | e E | + | - | . | anything - ---------|----------|----------|----------|---------|---------|----------|----------- - init | zero | any1 | [error] | [error] | minus | [error] | [error] - minus | zero | any1 | [error] | [error] | [error] | [error] | [error] - zero | done | done | exponent | done | done | decimal1 | done - any1 | any1 | any1 | exponent | done | done | decimal1 | done - decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] - decimal2 | decimal2 | decimal2 | exponent | done | done | done | done - exponent | any2 | any2 | [error] | sign | sign | [error] | [error] - sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] - any2 | any2 | any2 | done | done | done | done | done - - The state machine is realized with one label per state (prefixed with - "scan_number_") and `goto` statements between them. The state machine - contains cycles, but any cycle can be left when EOF is read. Therefore, - the function is guaranteed to terminate. - - During scanning, the read bytes are stored in token_buffer. This string is - then converted to a signed integer, an unsigned integer, or a - floating-point number. - - @return token_type::value_unsigned, token_type::value_integer, or - token_type::value_float if number could be successfully scanned, - token_type::parse_error otherwise - - @note The scanner is independent of the current locale. Internally, the - locale's decimal point is used instead of `.` to work with the - locale-dependent converters. - */ - token_type scan_number() // lgtm [cpp/use-of-goto] - { - // reset token_buffer to store the number's bytes - reset(); - - // the type of the parsed number; initially set to unsigned; will be - // changed if minus sign, decimal point or exponent is read - token_type number_type = token_type::value_unsigned; - - // state (init): we just found out we need to scan a number - switch (current) - { - case '-': - { - add(current); - goto scan_number_minus; - } - - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - // all other characters are rejected outside scan_number() - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // LCOV_EXCL_LINE - } - -scan_number_minus: - // state: we just parsed a leading minus sign - number_type = token_type::value_integer; - switch (get()) - { - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - error_message = "invalid number; expected digit after '-'"; - return token_type::parse_error; - } - } - -scan_number_zero: - // state: we just parse a zero (maybe with a leading minus sign) - switch (get()) - { - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_any1: - // state: we just parsed a number 0-9 (maybe with a leading minus sign) - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_decimal1: - // state: we just parsed a decimal point - number_type = token_type::value_float; - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - default: - { - error_message = "invalid number; expected digit after '.'"; - return token_type::parse_error; - } - } - -scan_number_decimal2: - // we just parsed at least one number after a decimal point - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_exponent: - // we just parsed an exponent - number_type = token_type::value_float; - switch (get()) - { - case '+': - case '-': - { - add(current); - goto scan_number_sign; - } - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = - "invalid number; expected '+', '-', or digit after exponent"; - return token_type::parse_error; - } - } - -scan_number_sign: - // we just parsed an exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected digit after exponent sign"; - return token_type::parse_error; - } - } - -scan_number_any2: - // we just parsed a number after the exponent or exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - goto scan_number_done; - } - -scan_number_done: - // unget the character after the number (we only read it to know that - // we are done scanning a number) - unget(); - - char* endptr = nullptr; - errno = 0; - - // try to parse integers first and fall back to floats - if (number_type == token_type::value_unsigned) - { - const auto x = std::strtoull(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_unsigned = static_cast(x); - if (value_unsigned == x) - { - return token_type::value_unsigned; - } - } - } - else if (number_type == token_type::value_integer) - { - const auto x = std::strtoll(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_integer = static_cast(x); - if (value_integer == x) - { - return token_type::value_integer; - } - } - } - - // this code is reached if we parse a floating-point number or if an - // integer conversion above failed - strtof(value_float, token_buffer.data(), &endptr); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - return token_type::value_float; - } - - /*! - @param[in] literal_text the literal text to expect - @param[in] length the length of the passed literal text - @param[in] return_type the token type to return on success - */ - JSON_HEDLEY_NON_NULL(2) - token_type scan_literal(const char_type* literal_text, const std::size_t length, - token_type return_type) - { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); - for (std::size_t i = 1; i < length; ++i) - { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) - { - error_message = "invalid literal"; - return token_type::parse_error; - } - } - return return_type; - } - - ///////////////////// - // input management - ///////////////////// - - /// reset token_buffer; current character is beginning of token - void reset() noexcept - { - token_buffer.clear(); - token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); - } - - /* - @brief get next character from the input - - This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters - for use in error messages. - - @return character read from the input - */ - char_int_type get() - { - ++position.chars_read_total; - ++position.chars_read_current_line; - - if (next_unget) - { - // just reset the next_unget variable and work with current - next_unget = false; - } - else - { - current = ia.get_character(); - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - token_string.push_back(std::char_traits::to_char_type(current)); - } - - if (current == '\n') - { - ++position.lines_read; - position.chars_read_current_line = 0; - } - - return current; - } - - /*! - @brief unget current character (read it again on next get) - - We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read_total, - chars_read_current_line, and token_string. The next call to get() will - behave as if the unget character is read again. - */ - void unget() - { - next_unget = true; - - --position.chars_read_total; - - // in case we "unget" a newline, we have to also decrement the lines_read - if (position.chars_read_current_line == 0) - { - if (position.lines_read > 0) - { - --position.lines_read; - } - } - else - { - --position.chars_read_current_line; - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - JSON_ASSERT(!token_string.empty()); - token_string.pop_back(); - } - } - - /// add a character to token_buffer - void add(char_int_type c) - { - token_buffer.push_back(static_cast(c)); - } - - public: - ///////////////////// - // value getters - ///////////////////// - - /// return integer value - constexpr number_integer_t get_number_integer() const noexcept - { - return value_integer; - } - - /// return unsigned integer value - constexpr number_unsigned_t get_number_unsigned() const noexcept - { - return value_unsigned; - } - - /// return floating-point value - constexpr number_float_t get_number_float() const noexcept - { - return value_float; - } - - /// return current string value (implicitly resets the token; useful only once) - string_t& get_string() - { - return token_buffer; - } - - ///////////////////// - // diagnostics - ///////////////////// - - /// return position of last read token - constexpr position_t get_position() const noexcept - { - return position; - } - - /// return the last read token (for errors only). Will never contain EOF - /// (an arbitrary value that is not a valid char value, often -1), because - /// 255 may legitimately occur. May contain NUL, which should be escaped. - std::string get_token_string() const - { - // escape control characters - std::string result; - for (const auto c : token_string) - { - if (static_cast(c) <= '\x1F') - { - // escape control characters - std::array cs{{}}; - (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); - result += cs.data(); - } - else - { - // add character as is - result.push_back(static_cast(c)); - } - } - - return result; - } - - /// return syntax error message - JSON_HEDLEY_RETURNS_NON_NULL - constexpr const char* get_error_message() const noexcept - { - return error_message; - } - - ///////////////////// - // actual scanner - ///////////////////// - - /*! - @brief skip the UTF-8 byte order mark - @return true iff there is no BOM or the correct BOM has been skipped - */ - bool skip_bom() - { - if (get() == 0xEF) - { - // check if we completely parse the BOM - return get() == 0xBB && get() == 0xBF; - } - - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; - } - - void skip_whitespace() - { - do - { - get(); - } - while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); - } - - token_type scan() - { - // initially, skip the BOM - if (position.chars_read_total == 0 && !skip_bom()) - { - error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; - } - - // read next character and ignore whitespace - skip_whitespace(); - - // ignore comments - if (ignore_comments && current == '/') - { - if (!scan_comment()) - { - return token_type::parse_error; - } - - // skip following whitespace - skip_whitespace(); - } - - switch (current) - { - // structural characters - case '[': - return token_type::begin_array; - case ']': - return token_type::end_array; - case '{': - return token_type::begin_object; - case '}': - return token_type::end_object; - case ':': - return token_type::name_separator; - case ',': - return token_type::value_separator; - - // literals - case 't': - { - std::array true_literal = {{'t', 'r', 'u', 'e'}}; - return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); - } - case 'f': - { - std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; - return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); - } - case 'n': - { - std::array null_literal = {{'n', 'u', 'l', 'l'}}; - return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); - } - - // string - case '\"': - return scan_string(); - - // number - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return scan_number(); - - // end of input (the null byte is needed when parsing from - // string literals) - case '\0': - case std::char_traits::eof(): - return token_type::end_of_input; - - // error - default: - error_message = "invalid literal"; - return token_type::parse_error; - } - } - - private: - /// input adapter - InputAdapterType ia; - - /// whether comments should be ignored (true) or signaled as errors (false) - const bool ignore_comments = false; - - /// the current character - char_int_type current = std::char_traits::eof(); - - /// whether the next get() call should just return current - bool next_unget = false; - - /// the start position of the current token - position_t position {}; - - /// raw input token string (for error messages) - std::vector token_string {}; - - /// buffer for variable-length tokens (numbers, strings) - string_t token_buffer {}; - - /// a description of occurred lexer errors - const char* error_message = ""; - - // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; - - /// the decimal point - const char_int_type decimal_point_char = '.'; -}; -} // namespace detail -} // namespace nlohmann - // #include @@ -13981,7 +14038,17 @@ class binary_writer } else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } } @@ -14035,19 +14102,23 @@ class binary_writer // LCOV_EXCL_START else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } // LCOV_EXCL_STOP } /*! @brief determine the type prefix of container values - - @note This function does not need to be 100% accurate when it comes to - integer limits. In case a number exceeds the limits of int64_t, - this will be detected by a later call to function - write_number_with_ubjson_prefix. Therefore, we return 'L' for any - value that does not fit the previous limits. */ CharType ubjson_prefix(const BasicJsonType& j) const noexcept { @@ -14077,8 +14148,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_unsigned: @@ -14099,8 +14174,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; } case value_t::number_float: @@ -23504,6 +23583,7 @@ class basic_json number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -23514,7 +23594,6 @@ class basic_json @note The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 @note The following markers are not used in the conversion: - `Z`: no-op values are not created. @@ -23990,6 +24069,7 @@ class basic_json int16 | number_integer | `I` int32 | number_integer | `l` int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' string | string | `S` char | string | `C` array | array (optimized values are supported) | `[` From 33662417c167892ba31e78c91fbd16ddaaf5741d Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Fri, 24 Jul 2020 09:32:03 +0200 Subject: [PATCH 13/14] :white_check_mark: improve coverage --- .../nlohmann/detail/output/binary_writer.hpp | 4 ++-- single_include/nlohmann/json.hpp | 4 ++-- test/src/unit-ubjson.cpp | 19 +++++++++++++------ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/nlohmann/detail/output/binary_writer.hpp b/include/nlohmann/detail/output/binary_writer.hpp index 8b864b57..0c6185e0 100644 --- a/include/nlohmann/detail/output/binary_writer.hpp +++ b/include/nlohmann/detail/output/binary_writer.hpp @@ -1434,7 +1434,7 @@ class binary_writer return 'L'; } // anything else is treated as high-precision number - return 'H'; + return 'H'; // LCOV_EXCL_LINE } case value_t::number_unsigned: @@ -1460,7 +1460,7 @@ class binary_writer return 'L'; } // anything else is treated as high-precision number - return 'H'; + return 'H'; // LCOV_EXCL_LINE } case value_t::number_float: diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index a845846a..9770712a 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -14153,7 +14153,7 @@ class binary_writer return 'L'; } // anything else is treated as high-precision number - return 'H'; + return 'H'; // LCOV_EXCL_LINE } case value_t::number_unsigned: @@ -14179,7 +14179,7 @@ class binary_writer return 'L'; } // anything else is treated as high-precision number - return 'H'; + return 'H'; // LCOV_EXCL_LINE } case value_t::number_float: diff --git a/test/src/unit-ubjson.cpp b/test/src/unit-ubjson.cpp index c8b5f6bc..d55f52e8 100644 --- a/test/src/unit-ubjson.cpp +++ b/test/src/unit-ubjson.cpp @@ -797,12 +797,19 @@ TEST_CASE("UBJSON") SECTION("errors") { - std::vector vec1 = {'H', 'i', 2, '1', 'A', '3'}; - CHECK_THROWS_WITH_AS(json::from_ubjson(vec1), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A", json::parse_error); - std::vector vec2 = {'H', 'i', 2, '1', '.'}; - CHECK_THROWS_WITH_AS(json::from_ubjson(vec2), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1.", json::parse_error); - std::vector vec3 = {'H', 2, '1', '0'}; - CHECK_THROWS_WITH_AS(json::from_ubjson(vec3), "[json.exception.parse_error.113] parse error at byte 2: syntax error while parsing UBJSON size: expected length type specification (U, i, I, l, L) after '#'; last byte: 0x02", json::parse_error); + // error while parsing length + std::vector vec0 = {'H', 'i'}; + CHECK(json::from_ubjson(vec0, true, false).is_discarded()); + // error while parsing string + std::vector vec1 = {'H', 'i', '1'}; + CHECK(json::from_ubjson(vec1, true, false).is_discarded()); + + std::vector vec2 = {'H', 'i', 2, '1', 'A', '3'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec2), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A", json::parse_error); + std::vector vec3 = {'H', 'i', 2, '1', '.'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec3), "[json.exception.parse_error.115] parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1.", json::parse_error); + std::vector vec4 = {'H', 2, '1', '0'}; + CHECK_THROWS_WITH_AS(json::from_ubjson(vec4), "[json.exception.parse_error.113] parse error at byte 2: syntax error while parsing UBJSON size: expected length type specification (U, i, I, l, L) after '#'; last byte: 0x02", json::parse_error); } SECTION("serialization") From 980f8c6f6157b11dc908554d56ca75028f5a9657 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sat, 25 Jul 2020 21:45:47 +0200 Subject: [PATCH 14/14] :twisted_rightwards_arrows: merge develop branch --- single_include/nlohmann/json.hpp | 3364 +++++++++++++++--------------- 1 file changed, 1722 insertions(+), 1642 deletions(-) diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 68198efb..8743c76b 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -2403,6 +2403,7 @@ json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vect json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read. json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read. json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet). +json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed. @note For an input with n bytes, 1 is the index of the first character and n+1 is the index of the terminating null byte or the end of file. This also @@ -2591,7 +2592,7 @@ json.exception.out_of_range.403 | key 'foo' not found | The provided key was not json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved. json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' can not be applied to the root element of the JSON value. json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored as without changing it to NaN or INF. -json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. | +json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0) | json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity. | json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as zero-terminated c-string | @@ -5870,6 +5871,1634 @@ class json_sax_acceptor } // namespace nlohmann +// #include + + +#include // array +#include // localeconv +#include // size_t +#include // snprintf +#include // strtof, strtod, strtold, strtoll, strtoull +#include // initializer_list +#include // char_traits, string +#include // move +#include // vector + +// #include + +// #include + +// #include + + +namespace nlohmann +{ +namespace detail +{ +/////////// +// lexer // +/////////// + +template +class lexer_base +{ + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual value + value_float, ///< an floating point number -- use get_number_float() for actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + JSON_HEDLEY_RETURNS_NON_NULL + JSON_HEDLEY_CONST + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case token_type::value_unsigned: + case token_type::value_integer: + case token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + // LCOV_EXCL_START + default: // catch non-enum values + return "unknown token"; + // LCOV_EXCL_STOP + } + } +}; +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. +*/ +template +class lexer : public lexer_base +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + using string_t = typename BasicJsonType::string_t; + using char_type = typename InputAdapterType::char_type; + using char_int_type = typename std::char_traits::int_type; + + public: + using token_type = typename lexer_base::token_type; + + explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) + : ia(std::move(adapter)) + , ignore_comments(ignore_comments_) + , decimal_point_char(static_cast(get_decimal_point())) + {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer(lexer&&) = default; + lexer& operator=(lexer&) = delete; + lexer& operator=(lexer&&) = default; + ~lexer() = default; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + JSON_HEDLEY_PURE + static char get_decimal_point() noexcept + { + const auto* loc = localeconv(); + JSON_ASSERT(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! + @brief get codepoint from 4 hex characters following `\u` + + For input "\u c1 c2 c3 c4" the codepoint is: + (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 + = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) + + Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' + must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The + conversion is done by subtracting the offset (0x30, 0x37, and 0x57) + between the ASCII value of the character and the desired integer value. + + @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or + non-hex character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + JSON_ASSERT(current == 'u'); + int codepoint = 0; + + const auto factors = { 12u, 8u, 4u, 0u }; + for (const auto factor : factors) + { + get(); + + if (current >= '0' && current <= '9') + { + codepoint += static_cast((static_cast(current) - 0x30u) << factor); + } + else if (current >= 'A' && current <= 'F') + { + codepoint += static_cast((static_cast(current) - 0x37u) << factor); + } + else if (current >= 'a' && current <= 'f') + { + codepoint += static_cast((static_cast(current) - 0x57u) << factor); + } + else + { + return -1; + } + } + + JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); + return codepoint; + } + + /*! + @brief check if the next byte(s) are inside a given range + + Adds the current byte and, for each passed range, reads a new byte and + checks if it is inside the range. If a violation was detected, set up an + error message and return false. Otherwise, return true. + + @param[in] ranges list of integers; interpreted as list of pairs of + inclusive lower and upper bound, respectively + + @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, + 1, 2, or 3 pairs. This precondition is enforced by an assertion. + + @return true if and only if no range violation was detected + */ + bool next_byte_in_range(std::initializer_list ranges) + { + JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); + add(current); + + for (auto range = ranges.begin(); range != ranges.end(); ++range) + { + get(); + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + { + add(current); + } + else + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return false; + } + } + + return true; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer token_buffer. Then the + function returns successfully, token_buffer is *not* null-terminated (as it + may contain \0 bytes), and token_buffer.size() is the number of bytes in the + string. + + @return token_type::value_string if string could be successfully scanned, + token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset token_buffer (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + JSON_ASSERT(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + const int codepoint1 = get_codepoint(); + int codepoint = codepoint1; // start with codepoint1 + + if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) + { + error_message = "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) + { + // overwrite codepoint + codepoint = static_cast( + // high surrogate occupies the most significant 22 bits + (static_cast(codepoint1) << 10u) + // low surrogate occupies the least significant 15 bits + + static_cast(codepoint2) + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00u); + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; + return token_type::parse_error; + } + } + + // result of the above calculation yields a proper codepoint + JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); + + // translate codepoint into bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(static_cast(codepoint)); + } + else if (codepoint <= 0x7FF) + { + // 2-byte characters: 110xxxxx 10xxxxxx + add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else if (codepoint <= 0xFFFF) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); + add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); + add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + { + error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; + return token_type::parse_error; + } + + case 0x01: + { + error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; + return token_type::parse_error; + } + + case 0x02: + { + error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; + return token_type::parse_error; + } + + case 0x03: + { + error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; + return token_type::parse_error; + } + + case 0x04: + { + error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; + return token_type::parse_error; + } + + case 0x05: + { + error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; + return token_type::parse_error; + } + + case 0x06: + { + error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; + return token_type::parse_error; + } + + case 0x07: + { + error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; + return token_type::parse_error; + } + + case 0x08: + { + error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; + return token_type::parse_error; + } + + case 0x09: + { + error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; + return token_type::parse_error; + } + + case 0x0A: + { + error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; + return token_type::parse_error; + } + + case 0x0B: + { + error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; + return token_type::parse_error; + } + + case 0x0C: + { + error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; + return token_type::parse_error; + } + + case 0x0D: + { + error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; + return token_type::parse_error; + } + + case 0x0E: + { + error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; + return token_type::parse_error; + } + + case 0x0F: + { + error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; + return token_type::parse_error; + } + + case 0x10: + { + error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; + return token_type::parse_error; + } + + case 0x11: + { + error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; + return token_type::parse_error; + } + + case 0x12: + { + error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; + return token_type::parse_error; + } + + case 0x13: + { + error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; + return token_type::parse_error; + } + + case 0x14: + { + error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; + return token_type::parse_error; + } + + case 0x15: + { + error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; + return token_type::parse_error; + } + + case 0x16: + { + error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; + return token_type::parse_error; + } + + case 0x17: + { + error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; + return token_type::parse_error; + } + + case 0x18: + { + error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; + return token_type::parse_error; + } + + case 0x19: + { + error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; + return token_type::parse_error; + } + + case 0x1A: + { + error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; + return token_type::parse_error; + } + + case 0x1B: + { + error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; + return token_type::parse_error; + } + + case 0x1C: + { + error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; + return token_type::parse_error; + } + + case 0x1D: + { + error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; + return token_type::parse_error; + } + + case 0x1E: + { + error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; + return token_type::parse_error; + } + + case 0x1F: + { + error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2A: + case 0x2B: + case 0x2C: + case 0x2D: + case 0x2E: + case 0x2F: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3A: + case 0x3B: + case 0x3C: + case 0x3D: + case 0x3E: + case 0x3F: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4A: + case 0x4B: + case 0x4C: + case 0x4D: + case 0x4E: + case 0x4F: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5A: + case 0x5B: + case 0x5D: + case 0x5E: + case 0x5F: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6A: + case 0x6B: + case 0x6C: + case 0x6D: + case 0x6E: + case 0x6F: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7A: + case 0x7B: + case 0x7C: + case 0x7D: + case 0x7E: + case 0x7F: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xC2: + case 0xC3: + case 0xC4: + case 0xC5: + case 0xC6: + case 0xC7: + case 0xC8: + case 0xC9: + case 0xCA: + case 0xCB: + case 0xCC: + case 0xCD: + case 0xCE: + case 0xCF: + case 0xD0: + case 0xD1: + case 0xD2: + case 0xD3: + case 0xD4: + case 0xD5: + case 0xD6: + case 0xD7: + case 0xD8: + case 0xD9: + case 0xDA: + case 0xDB: + case 0xDC: + case 0xDD: + case 0xDE: + case 0xDF: + { + if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) + { + return token_type::parse_error; + } + break; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xE0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xE1: + case 0xE2: + case 0xE3: + case 0xE4: + case 0xE5: + case 0xE6: + case 0xE7: + case 0xE8: + case 0xE9: + case 0xEA: + case 0xEB: + case 0xEC: + case 0xEE: + case 0xEF: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xED: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xF0: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xF1: + case 0xF2: + case 0xF3: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xF4: + { + if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) + { + return token_type::parse_error; + } + break; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + /*! + * @brief scan a comment + * @return whether comment could be scanned successfully + */ + bool scan_comment() + { + switch (get()) + { + // single-line comments skip input until a newline or EOF is read + case '/': + { + while (true) + { + switch (get()) + { + case '\n': + case '\r': + case std::char_traits::eof(): + case '\0': + return true; + + default: + break; + } + } + } + + // multi-line comments skip input until */ is read + case '*': + { + while (true) + { + switch (get()) + { + case std::char_traits::eof(): + case '\0': + { + error_message = "invalid comment; missing closing '*/'"; + return false; + } + + case '*': + { + switch (get()) + { + case '/': + return true; + + default: + { + unget(); + break; + } + } + } + + default: + break; + } + } + } + + // unexpected character after reading '/' + default: + { + error_message = "invalid comment; expecting '/' or '*' after '/'"; + return false; + } + } + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + JSON_HEDLEY_NON_NULL(2) + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine derived + from the grammar described in RFC 7159. Starting in state "init", the + input is read and used to determined the next state. Only state "done" + accepts the number. State "error" is a trap state to model errors. In the + table below, "anything" means any character but the ones listed before. + + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in token_buffer. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() // lgtm [cpp/use-of-goto] + { + // reset token_buffer to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + // all other characters are rejected outside scan_number() + default: // LCOV_EXCL_LINE + JSON_ASSERT(false); // LCOV_EXCL_LINE + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + goto scan_number_done; + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + goto scan_number_done; + } + +scan_number_done: + // unget the character after the number (we only read it to know that + // we are done scanning a number) + unget(); + + char* endptr = nullptr; + errno = 0; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + const auto x = std::strtoull(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + const auto x = std::strtoll(token_buffer.data(), &endptr, 10); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if an + // integer conversion above failed + strtof(value_float, token_buffer.data(), &endptr); + + // we checked the number format before + JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); + + return token_type::value_float; + } + + /*! + @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + JSON_HEDLEY_NON_NULL(2) + token_type scan_literal(const char_type* literal_text, const std::size_t length, + token_type return_type) + { + JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + for (std::size_t i = 1; i < length; ++i) + { + if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset token_buffer; current character is beginning of token + void reset() noexcept + { + token_buffer.clear(); + token_string.clear(); + token_string.push_back(std::char_traits::to_char_type(current)); + } + + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + char_int_type get() + { + ++position.chars_read_total; + ++position.chars_read_current_line; + + if (next_unget) + { + // just reset the next_unget variable and work with current + next_unget = false; + } + else + { + current = ia.get_character(); + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + + if (current == '\n') + { + ++position.lines_read; + position.chars_read_current_line = 0; + } + + return current; + } + + /*! + @brief unget current character (read it again on next get) + + We implement unget by setting variable next_unget to true. The input is not + changed - we just simulate ungetting by modifying chars_read_total, + chars_read_current_line, and token_string. The next call to get() will + behave as if the unget character is read again. + */ + void unget() + { + next_unget = true; + + --position.chars_read_total; + + // in case we "unget" a newline, we have to also decrement the lines_read + if (position.chars_read_current_line == 0) + { + if (position.lines_read > 0) + { + --position.lines_read; + } + } + else + { + --position.chars_read_current_line; + } + + if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + { + JSON_ASSERT(!token_string.empty()); + token_string.pop_back(); + } + } + + /// add a character to token_buffer + void add(char_int_type c) + { + token_buffer.push_back(static_cast(c)); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return current string value (implicitly resets the token; useful only once) + string_t& get_string() + { + return token_buffer; + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr position_t get_position() const noexcept + { + return position; + } + + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. + std::string get_token_string() const + { + // escape control characters + std::string result; + for (const auto c : token_string) + { + if (static_cast(c) <= '\x1F') + { + // escape control characters + std::array cs{{}}; + (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); + result += cs.data(); + } + else + { + // add character as is + result.push_back(static_cast(c)); + } + } + + return result; + } + + /// return syntax error message + JSON_HEDLEY_RETURNS_NON_NULL + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + /*! + @brief skip the UTF-8 byte order mark + @return true iff there is no BOM or the correct BOM has been skipped + */ + bool skip_bom() + { + if (get() == 0xEF) + { + // check if we completely parse the BOM + return get() == 0xBB && get() == 0xBF; + } + + // the first character is not the beginning of the BOM; unget it to + // process is later + unget(); + return true; + } + + void skip_whitespace() + { + do + { + get(); + } + while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); + } + + token_type scan() + { + // initially, skip the BOM + if (position.chars_read_total == 0 && !skip_bom()) + { + error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; + return token_type::parse_error; + } + + // read next character and ignore whitespace + skip_whitespace(); + + // ignore comments + if (ignore_comments && current == '/') + { + if (!scan_comment()) + { + return token_type::parse_error; + } + + // skip following whitespace + skip_whitespace(); + } + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + { + std::array true_literal = {{'t', 'r', 'u', 'e'}}; + return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); + } + case 'f': + { + std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; + return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); + } + case 'n': + { + std::array null_literal = {{'n', 'u', 'l', 'l'}}; + return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); + } + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + InputAdapterType ia; + + /// whether comments should be ignored (true) or signaled as errors (false) + const bool ignore_comments = false; + + /// the current character + char_int_type current = std::char_traits::eof(); + + /// whether the next get() call should just return current + bool next_unget = false; + + /// the start position of the current token + position_t position {}; + + /// raw input token string (for error messages) + std::vector token_string {}; + + /// buffer for variable-length tokens (numbers, strings) + string_t token_buffer {}; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char_int_type decimal_point_char = '.'; +}; +} // namespace detail +} // namespace nlohmann + // #include // #include @@ -8013,6 +9642,11 @@ class binary_reader return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast(number), ""); } + case 'H': + { + return get_ubjson_high_precision_number(); + } + case 'C': // char { get(); @@ -8189,6 +9823,55 @@ class binary_reader // Note, no reader for UBJSON binary types is implemented because they do // not exist + bool get_ubjson_high_precision_number() + { + // get size of following number string + std::size_t size{}; + auto res = get_ubjson_size_value(size); + if (JSON_HEDLEY_UNLIKELY(!res)) + { + return res; + } + + // get number string + std::vector number_vector; + for (std::size_t i = 0; i < size; ++i) + { + get(); + if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number"))) + { + return false; + } + number_vector.push_back(static_cast(current)); + } + + // parse number string + auto number_ia = detail::input_adapter(std::forward(number_vector)); + auto number_lexer = detail::lexer(std::move(number_ia), false); + const auto result_number = number_lexer.scan(); + const auto number_string = number_lexer.get_token_string(); + const auto result_remainder = number_lexer.scan(); + + using token_type = typename detail::lexer_base::token_type; + + if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input)) + { + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + + switch (result_number) + { + case token_type::value_integer: + return sax->number_integer(number_lexer.get_number_integer()); + case token_type::value_unsigned: + return sax->number_unsigned(number_lexer.get_number_unsigned()); + case token_type::value_float: + return sax->number_float(number_lexer.get_number_float(), std::move(number_string)); + default: + return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"))); + } + } + /////////////////////// // Utility functions // /////////////////////// @@ -8416,1632 +10099,6 @@ class binary_reader // #include - -#include // array -#include // localeconv -#include // size_t -#include // snprintf -#include // strtof, strtod, strtold, strtoll, strtoull -#include // initializer_list -#include // char_traits, string -#include // move -#include // vector - -// #include - -// #include - -// #include - - -namespace nlohmann -{ -namespace detail -{ -/////////// -// lexer // -/////////// - -template -class lexer_base -{ - public: - /// token types for the parser - enum class token_type - { - uninitialized, ///< indicating the scanner is uninitialized - literal_true, ///< the `true` literal - literal_false, ///< the `false` literal - literal_null, ///< the `null` literal - value_string, ///< a string -- use get_string() for actual value - value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value - value_integer, ///< a signed integer -- use get_number_integer() for actual value - value_float, ///< an floating point number -- use get_number_float() for actual value - begin_array, ///< the character for array begin `[` - begin_object, ///< the character for object begin `{` - end_array, ///< the character for array end `]` - end_object, ///< the character for object end `}` - name_separator, ///< the name separator `:` - value_separator, ///< the value separator `,` - parse_error, ///< indicating a parse error - end_of_input, ///< indicating the end of the input buffer - literal_or_value ///< a literal or the begin of a value (only for diagnostics) - }; - - /// return name of values of type token_type (only used for errors) - JSON_HEDLEY_RETURNS_NON_NULL - JSON_HEDLEY_CONST - static const char* token_type_name(const token_type t) noexcept - { - switch (t) - { - case token_type::uninitialized: - return ""; - case token_type::literal_true: - return "true literal"; - case token_type::literal_false: - return "false literal"; - case token_type::literal_null: - return "null literal"; - case token_type::value_string: - return "string literal"; - case token_type::value_unsigned: - case token_type::value_integer: - case token_type::value_float: - return "number literal"; - case token_type::begin_array: - return "'['"; - case token_type::begin_object: - return "'{'"; - case token_type::end_array: - return "']'"; - case token_type::end_object: - return "'}'"; - case token_type::name_separator: - return "':'"; - case token_type::value_separator: - return "','"; - case token_type::parse_error: - return ""; - case token_type::end_of_input: - return "end of input"; - case token_type::literal_or_value: - return "'[', '{', or a literal"; - // LCOV_EXCL_START - default: // catch non-enum values - return "unknown token"; - // LCOV_EXCL_STOP - } - } -}; -/*! -@brief lexical analysis - -This class organizes the lexical analysis during JSON deserialization. -*/ -template -class lexer : public lexer_base -{ - using number_integer_t = typename BasicJsonType::number_integer_t; - using number_unsigned_t = typename BasicJsonType::number_unsigned_t; - using number_float_t = typename BasicJsonType::number_float_t; - using string_t = typename BasicJsonType::string_t; - using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; - - public: - using token_type = typename lexer_base::token_type; - - explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) - : ia(std::move(adapter)) - , ignore_comments(ignore_comments_) - , decimal_point_char(static_cast(get_decimal_point())) - {} - - // delete because of pointer members - lexer(const lexer&) = delete; - lexer(lexer&&) = default; - lexer& operator=(lexer&) = delete; - lexer& operator=(lexer&&) = default; - ~lexer() = default; - - private: - ///////////////////// - // locales - ///////////////////// - - /// return the locale-dependent decimal point - JSON_HEDLEY_PURE - static char get_decimal_point() noexcept - { - const auto* loc = localeconv(); - JSON_ASSERT(loc != nullptr); - return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); - } - - ///////////////////// - // scan functions - ///////////////////// - - /*! - @brief get codepoint from 4 hex characters following `\u` - - For input "\u c1 c2 c3 c4" the codepoint is: - (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 - = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) - - Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' - must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The - conversion is done by subtracting the offset (0x30, 0x37, and 0x57) - between the ASCII value of the character and the desired integer value. - - @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or - non-hex character) - */ - int get_codepoint() - { - // this function only makes sense after reading `\u` - JSON_ASSERT(current == 'u'); - int codepoint = 0; - - const auto factors = { 12u, 8u, 4u, 0u }; - for (const auto factor : factors) - { - get(); - - if (current >= '0' && current <= '9') - { - codepoint += static_cast((static_cast(current) - 0x30u) << factor); - } - else if (current >= 'A' && current <= 'F') - { - codepoint += static_cast((static_cast(current) - 0x37u) << factor); - } - else if (current >= 'a' && current <= 'f') - { - codepoint += static_cast((static_cast(current) - 0x57u) << factor); - } - else - { - return -1; - } - } - - JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); - return codepoint; - } - - /*! - @brief check if the next byte(s) are inside a given range - - Adds the current byte and, for each passed range, reads a new byte and - checks if it is inside the range. If a violation was detected, set up an - error message and return false. Otherwise, return true. - - @param[in] ranges list of integers; interpreted as list of pairs of - inclusive lower and upper bound, respectively - - @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, - 1, 2, or 3 pairs. This precondition is enforced by an assertion. - - @return true if and only if no range violation was detected - */ - bool next_byte_in_range(std::initializer_list ranges) - { - JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); - add(current); - - for (auto range = ranges.begin(); range != ranges.end(); ++range) - { - get(); - if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) - { - add(current); - } - else - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return false; - } - } - - return true; - } - - /*! - @brief scan a string literal - - This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer token_buffer. Then the - function returns successfully, token_buffer is *not* null-terminated (as it - may contain \0 bytes), and token_buffer.size() is the number of bytes in the - string. - - @return token_type::value_string if string could be successfully scanned, - token_type::parse_error otherwise - - @note In case of errors, variable error_message contains a textual - description. - */ - token_type scan_string() - { - // reset token_buffer (ignore opening quote) - reset(); - - // we entered the function by reading an open quote - JSON_ASSERT(current == '\"'); - - while (true) - { - // get next character - switch (get()) - { - // end of file while parsing string - case std::char_traits::eof(): - { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // closing quote - case '\"': - { - return token_type::value_string; - } - - // escapes - case '\\': - { - switch (get()) - { - // quotation mark - case '\"': - add('\"'); - break; - // reverse solidus - case '\\': - add('\\'); - break; - // solidus - case '/': - add('/'); - break; - // backspace - case 'b': - add('\b'); - break; - // form feed - case 'f': - add('\f'); - break; - // line feed - case 'n': - add('\n'); - break; - // carriage return - case 'r': - add('\r'); - break; - // tab - case 't': - add('\t'); - break; - - // unicode escapes - case 'u': - { - const int codepoint1 = get_codepoint(); - int codepoint = codepoint1; // start with codepoint1 - - if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if code point is a high surrogate - if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) - { - // expect next \uxxxx entry - if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) - { - const int codepoint2 = get_codepoint(); - - if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if codepoint2 is a low surrogate - if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) - { - // overwrite codepoint - codepoint = static_cast( - // high surrogate occupies the most significant 22 bits - (static_cast(codepoint1) << 10u) - // low surrogate occupies the least significant 15 bits - + static_cast(codepoint2) - // there is still the 0xD800, 0xDC00 and 0x10000 noise - // in the result so we have to subtract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00u); - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; - return token_type::parse_error; - } - } - - // result of the above calculation yields a proper codepoint - JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); - - // translate codepoint into bytes - if (codepoint < 0x80) - { - // 1-byte characters: 0xxxxxxx (ASCII) - add(static_cast(codepoint)); - } - else if (codepoint <= 0x7FF) - { - // 2-byte characters: 110xxxxx 10xxxxxx - add(static_cast(0xC0u | (static_cast(codepoint) >> 6u))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else if (codepoint <= 0xFFFF) - { - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - add(static_cast(0xE0u | (static_cast(codepoint) >> 12u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - else - { - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - add(static_cast(0xF0u | (static_cast(codepoint) >> 18u))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 12u) & 0x3Fu))); - add(static_cast(0x80u | ((static_cast(codepoint) >> 6u) & 0x3Fu))); - add(static_cast(0x80u | (static_cast(codepoint) & 0x3Fu))); - } - - break; - } - - // other characters after escape - default: - error_message = "invalid string: forbidden character after backslash"; - return token_type::parse_error; - } - - break; - } - - // invalid control characters - case 0x00: - { - error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000"; - return token_type::parse_error; - } - - case 0x01: - { - error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001"; - return token_type::parse_error; - } - - case 0x02: - { - error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002"; - return token_type::parse_error; - } - - case 0x03: - { - error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003"; - return token_type::parse_error; - } - - case 0x04: - { - error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004"; - return token_type::parse_error; - } - - case 0x05: - { - error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005"; - return token_type::parse_error; - } - - case 0x06: - { - error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006"; - return token_type::parse_error; - } - - case 0x07: - { - error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007"; - return token_type::parse_error; - } - - case 0x08: - { - error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b"; - return token_type::parse_error; - } - - case 0x09: - { - error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t"; - return token_type::parse_error; - } - - case 0x0A: - { - error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n"; - return token_type::parse_error; - } - - case 0x0B: - { - error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B"; - return token_type::parse_error; - } - - case 0x0C: - { - error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f"; - return token_type::parse_error; - } - - case 0x0D: - { - error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r"; - return token_type::parse_error; - } - - case 0x0E: - { - error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E"; - return token_type::parse_error; - } - - case 0x0F: - { - error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F"; - return token_type::parse_error; - } - - case 0x10: - { - error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010"; - return token_type::parse_error; - } - - case 0x11: - { - error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011"; - return token_type::parse_error; - } - - case 0x12: - { - error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012"; - return token_type::parse_error; - } - - case 0x13: - { - error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013"; - return token_type::parse_error; - } - - case 0x14: - { - error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014"; - return token_type::parse_error; - } - - case 0x15: - { - error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015"; - return token_type::parse_error; - } - - case 0x16: - { - error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016"; - return token_type::parse_error; - } - - case 0x17: - { - error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017"; - return token_type::parse_error; - } - - case 0x18: - { - error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018"; - return token_type::parse_error; - } - - case 0x19: - { - error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019"; - return token_type::parse_error; - } - - case 0x1A: - { - error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A"; - return token_type::parse_error; - } - - case 0x1B: - { - error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B"; - return token_type::parse_error; - } - - case 0x1C: - { - error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C"; - return token_type::parse_error; - } - - case 0x1D: - { - error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D"; - return token_type::parse_error; - } - - case 0x1E: - { - error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E"; - return token_type::parse_error; - } - - case 0x1F: - { - error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F"; - return token_type::parse_error; - } - - // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) - case 0x20: - case 0x21: - case 0x23: - case 0x24: - case 0x25: - case 0x26: - case 0x27: - case 0x28: - case 0x29: - case 0x2A: - case 0x2B: - case 0x2C: - case 0x2D: - case 0x2E: - case 0x2F: - case 0x30: - case 0x31: - case 0x32: - case 0x33: - case 0x34: - case 0x35: - case 0x36: - case 0x37: - case 0x38: - case 0x39: - case 0x3A: - case 0x3B: - case 0x3C: - case 0x3D: - case 0x3E: - case 0x3F: - case 0x40: - case 0x41: - case 0x42: - case 0x43: - case 0x44: - case 0x45: - case 0x46: - case 0x47: - case 0x48: - case 0x49: - case 0x4A: - case 0x4B: - case 0x4C: - case 0x4D: - case 0x4E: - case 0x4F: - case 0x50: - case 0x51: - case 0x52: - case 0x53: - case 0x54: - case 0x55: - case 0x56: - case 0x57: - case 0x58: - case 0x59: - case 0x5A: - case 0x5B: - case 0x5D: - case 0x5E: - case 0x5F: - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6A: - case 0x6B: - case 0x6C: - case 0x6D: - case 0x6E: - case 0x6F: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - case 0x78: - case 0x79: - case 0x7A: - case 0x7B: - case 0x7C: - case 0x7D: - case 0x7E: - case 0x7F: - { - add(current); - break; - } - - // U+0080..U+07FF: bytes C2..DF 80..BF - case 0xC2: - case 0xC3: - case 0xC4: - case 0xC5: - case 0xC6: - case 0xC7: - case 0xC8: - case 0xC9: - case 0xCA: - case 0xCB: - case 0xCC: - case 0xCD: - case 0xCE: - case 0xCF: - case 0xD0: - case 0xD1: - case 0xD2: - case 0xD3: - case 0xD4: - case 0xD5: - case 0xD6: - case 0xD7: - case 0xD8: - case 0xD9: - case 0xDA: - case 0xDB: - case 0xDC: - case 0xDD: - case 0xDE: - case 0xDF: - { - if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) - { - return token_type::parse_error; - } - break; - } - - // U+0800..U+0FFF: bytes E0 A0..BF 80..BF - case 0xE0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF - // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF - case 0xE1: - case 0xE2: - case 0xE3: - case 0xE4: - case 0xE5: - case 0xE6: - case 0xE7: - case 0xE8: - case 0xE9: - case 0xEA: - case 0xEB: - case 0xEC: - case 0xEE: - case 0xEF: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+D000..U+D7FF: bytes ED 80..9F 80..BF - case 0xED: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - case 0xF0: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - case 0xF1: - case 0xF2: - case 0xF3: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - case 0xF4: - { - if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) - { - return token_type::parse_error; - } - break; - } - - // remaining bytes (80..C1 and F5..FF) are ill-formed - default: - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - } - } - } - - /*! - * @brief scan a comment - * @return whether comment could be scanned successfully - */ - bool scan_comment() - { - switch (get()) - { - // single-line comments skip input until a newline or EOF is read - case '/': - { - while (true) - { - switch (get()) - { - case '\n': - case '\r': - case std::char_traits::eof(): - case '\0': - return true; - - default: - break; - } - } - } - - // multi-line comments skip input until */ is read - case '*': - { - while (true) - { - switch (get()) - { - case std::char_traits::eof(): - case '\0': - { - error_message = "invalid comment; missing closing '*/'"; - return false; - } - - case '*': - { - switch (get()) - { - case '/': - return true; - - default: - { - unget(); - break; - } - } - } - - default: - break; - } - } - } - - // unexpected character after reading '/' - default: - { - error_message = "invalid comment; expecting '/' or '*' after '/'"; - return false; - } - } - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(float& f, const char* str, char** endptr) noexcept - { - f = std::strtof(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(double& f, const char* str, char** endptr) noexcept - { - f = std::strtod(str, endptr); - } - - JSON_HEDLEY_NON_NULL(2) - static void strtof(long double& f, const char* str, char** endptr) noexcept - { - f = std::strtold(str, endptr); - } - - /*! - @brief scan a number literal - - This function scans a string according to Sect. 6 of RFC 7159. - - The function is realized with a deterministic finite state machine derived - from the grammar described in RFC 7159. Starting in state "init", the - input is read and used to determined the next state. Only state "done" - accepts the number. State "error" is a trap state to model errors. In the - table below, "anything" means any character but the ones listed before. - - state | 0 | 1-9 | e E | + | - | . | anything - ---------|----------|----------|----------|---------|---------|----------|----------- - init | zero | any1 | [error] | [error] | minus | [error] | [error] - minus | zero | any1 | [error] | [error] | [error] | [error] | [error] - zero | done | done | exponent | done | done | decimal1 | done - any1 | any1 | any1 | exponent | done | done | decimal1 | done - decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] - decimal2 | decimal2 | decimal2 | exponent | done | done | done | done - exponent | any2 | any2 | [error] | sign | sign | [error] | [error] - sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] - any2 | any2 | any2 | done | done | done | done | done - - The state machine is realized with one label per state (prefixed with - "scan_number_") and `goto` statements between them. The state machine - contains cycles, but any cycle can be left when EOF is read. Therefore, - the function is guaranteed to terminate. - - During scanning, the read bytes are stored in token_buffer. This string is - then converted to a signed integer, an unsigned integer, or a - floating-point number. - - @return token_type::value_unsigned, token_type::value_integer, or - token_type::value_float if number could be successfully scanned, - token_type::parse_error otherwise - - @note The scanner is independent of the current locale. Internally, the - locale's decimal point is used instead of `.` to work with the - locale-dependent converters. - */ - token_type scan_number() // lgtm [cpp/use-of-goto] - { - // reset token_buffer to store the number's bytes - reset(); - - // the type of the parsed number; initially set to unsigned; will be - // changed if minus sign, decimal point or exponent is read - token_type number_type = token_type::value_unsigned; - - // state (init): we just found out we need to scan a number - switch (current) - { - case '-': - { - add(current); - goto scan_number_minus; - } - - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - // all other characters are rejected outside scan_number() - default: // LCOV_EXCL_LINE - JSON_ASSERT(false); // LCOV_EXCL_LINE - } - -scan_number_minus: - // state: we just parsed a leading minus sign - number_type = token_type::value_integer; - switch (get()) - { - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - error_message = "invalid number; expected digit after '-'"; - return token_type::parse_error; - } - } - -scan_number_zero: - // state: we just parse a zero (maybe with a leading minus sign) - switch (get()) - { - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_any1: - // state: we just parsed a number 0-9 (maybe with a leading minus sign) - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_decimal1: - // state: we just parsed a decimal point - number_type = token_type::value_float; - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - default: - { - error_message = "invalid number; expected digit after '.'"; - return token_type::parse_error; - } - } - -scan_number_decimal2: - // we just parsed at least one number after a decimal point - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - goto scan_number_done; - } - -scan_number_exponent: - // we just parsed an exponent - number_type = token_type::value_float; - switch (get()) - { - case '+': - case '-': - { - add(current); - goto scan_number_sign; - } - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = - "invalid number; expected '+', '-', or digit after exponent"; - return token_type::parse_error; - } - } - -scan_number_sign: - // we just parsed an exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected digit after exponent sign"; - return token_type::parse_error; - } - } - -scan_number_any2: - // we just parsed a number after the exponent or exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - goto scan_number_done; - } - -scan_number_done: - // unget the character after the number (we only read it to know that - // we are done scanning a number) - unget(); - - char* endptr = nullptr; - errno = 0; - - // try to parse integers first and fall back to floats - if (number_type == token_type::value_unsigned) - { - const auto x = std::strtoull(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_unsigned = static_cast(x); - if (value_unsigned == x) - { - return token_type::value_unsigned; - } - } - } - else if (number_type == token_type::value_integer) - { - const auto x = std::strtoll(token_buffer.data(), &endptr, 10); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - if (errno == 0) - { - value_integer = static_cast(x); - if (value_integer == x) - { - return token_type::value_integer; - } - } - } - - // this code is reached if we parse a floating-point number or if an - // integer conversion above failed - strtof(value_float, token_buffer.data(), &endptr); - - // we checked the number format before - JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); - - return token_type::value_float; - } - - /*! - @param[in] literal_text the literal text to expect - @param[in] length the length of the passed literal text - @param[in] return_type the token type to return on success - */ - JSON_HEDLEY_NON_NULL(2) - token_type scan_literal(const char_type* literal_text, const std::size_t length, - token_type return_type) - { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); - for (std::size_t i = 1; i < length; ++i) - { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) - { - error_message = "invalid literal"; - return token_type::parse_error; - } - } - return return_type; - } - - ///////////////////// - // input management - ///////////////////// - - /// reset token_buffer; current character is beginning of token - void reset() noexcept - { - token_buffer.clear(); - token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); - } - - /* - @brief get next character from the input - - This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters - for use in error messages. - - @return character read from the input - */ - char_int_type get() - { - ++position.chars_read_total; - ++position.chars_read_current_line; - - if (next_unget) - { - // just reset the next_unget variable and work with current - next_unget = false; - } - else - { - current = ia.get_character(); - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - token_string.push_back(std::char_traits::to_char_type(current)); - } - - if (current == '\n') - { - ++position.lines_read; - position.chars_read_current_line = 0; - } - - return current; - } - - /*! - @brief unget current character (read it again on next get) - - We implement unget by setting variable next_unget to true. The input is not - changed - we just simulate ungetting by modifying chars_read_total, - chars_read_current_line, and token_string. The next call to get() will - behave as if the unget character is read again. - */ - void unget() - { - next_unget = true; - - --position.chars_read_total; - - // in case we "unget" a newline, we have to also decrement the lines_read - if (position.chars_read_current_line == 0) - { - if (position.lines_read > 0) - { - --position.lines_read; - } - } - else - { - --position.chars_read_current_line; - } - - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) - { - JSON_ASSERT(!token_string.empty()); - token_string.pop_back(); - } - } - - /// add a character to token_buffer - void add(char_int_type c) - { - token_buffer.push_back(static_cast(c)); - } - - public: - ///////////////////// - // value getters - ///////////////////// - - /// return integer value - constexpr number_integer_t get_number_integer() const noexcept - { - return value_integer; - } - - /// return unsigned integer value - constexpr number_unsigned_t get_number_unsigned() const noexcept - { - return value_unsigned; - } - - /// return floating-point value - constexpr number_float_t get_number_float() const noexcept - { - return value_float; - } - - /// return current string value (implicitly resets the token; useful only once) - string_t& get_string() - { - return token_buffer; - } - - ///////////////////// - // diagnostics - ///////////////////// - - /// return position of last read token - constexpr position_t get_position() const noexcept - { - return position; - } - - /// return the last read token (for errors only). Will never contain EOF - /// (an arbitrary value that is not a valid char value, often -1), because - /// 255 may legitimately occur. May contain NUL, which should be escaped. - std::string get_token_string() const - { - // escape control characters - std::string result; - for (const auto c : token_string) - { - if (static_cast(c) <= '\x1F') - { - // escape control characters - std::array cs{{}}; - (std::snprintf)(cs.data(), cs.size(), "", static_cast(c)); - result += cs.data(); - } - else - { - // add character as is - result.push_back(static_cast(c)); - } - } - - return result; - } - - /// return syntax error message - JSON_HEDLEY_RETURNS_NON_NULL - constexpr const char* get_error_message() const noexcept - { - return error_message; - } - - ///////////////////// - // actual scanner - ///////////////////// - - /*! - @brief skip the UTF-8 byte order mark - @return true iff there is no BOM or the correct BOM has been skipped - */ - bool skip_bom() - { - if (get() == 0xEF) - { - // check if we completely parse the BOM - return get() == 0xBB && get() == 0xBF; - } - - // the first character is not the beginning of the BOM; unget it to - // process is later - unget(); - return true; - } - - void skip_whitespace() - { - do - { - get(); - } - while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); - } - - token_type scan() - { - // initially, skip the BOM - if (position.chars_read_total == 0 && !skip_bom()) - { - error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given"; - return token_type::parse_error; - } - - // read next character and ignore whitespace - skip_whitespace(); - - // ignore comments - if (ignore_comments && current == '/') - { - if (!scan_comment()) - { - return token_type::parse_error; - } - - // skip following whitespace - skip_whitespace(); - } - - switch (current) - { - // structural characters - case '[': - return token_type::begin_array; - case ']': - return token_type::end_array; - case '{': - return token_type::begin_object; - case '}': - return token_type::end_object; - case ':': - return token_type::name_separator; - case ',': - return token_type::value_separator; - - // literals - case 't': - { - std::array true_literal = {{'t', 'r', 'u', 'e'}}; - return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true); - } - case 'f': - { - std::array false_literal = {{'f', 'a', 'l', 's', 'e'}}; - return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false); - } - case 'n': - { - std::array null_literal = {{'n', 'u', 'l', 'l'}}; - return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null); - } - - // string - case '\"': - return scan_string(); - - // number - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return scan_number(); - - // end of input (the null byte is needed when parsing from - // string literals) - case '\0': - case std::char_traits::eof(): - return token_type::end_of_input; - - // error - default: - error_message = "invalid literal"; - return token_type::parse_error; - } - } - - private: - /// input adapter - InputAdapterType ia; - - /// whether comments should be ignored (true) or signaled as errors (false) - const bool ignore_comments = false; - - /// the current character - char_int_type current = std::char_traits::eof(); - - /// whether the next get() call should just return current - bool next_unget = false; - - /// the start position of the current token - position_t position {}; - - /// raw input token string (for error messages) - std::vector token_string {}; - - /// buffer for variable-length tokens (numbers, strings) - string_t token_buffer {}; - - /// a description of occurred lexer errors - const char* error_message = ""; - - // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; - - /// the decimal point - const char_int_type decimal_point_char = '.'; -}; -} // namespace detail -} // namespace nlohmann - // #include @@ -13985,7 +14042,17 @@ class binary_writer } else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } } @@ -14039,19 +14106,23 @@ class binary_writer // LCOV_EXCL_START else { - JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(n) + " cannot be represented by UBJSON as it does not fit int64")); + if (add_prefix) + { + oa->write_character(to_char_type('H')); // high-precision number + } + + const auto number = BasicJsonType(n).dump(); + write_number_with_ubjson_prefix(number.size(), true); + for (std::size_t i = 0; i < number.size(); ++i) + { + oa->write_character(to_char_type(static_cast(number[i]))); + } } // LCOV_EXCL_STOP } /*! @brief determine the type prefix of container values - - @note This function does not need to be 100% accurate when it comes to - integer limits. In case a number exceeds the limits of int64_t, - this will be detected by a later call to function - write_number_with_ubjson_prefix. Therefore, we return 'L' for any - value that does not fit the previous limits. */ CharType ubjson_prefix(const BasicJsonType& j) const noexcept { @@ -14081,8 +14152,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE } case value_t::number_unsigned: @@ -14103,8 +14178,12 @@ class binary_writer { return 'l'; } - // no check and assume int64_t (see note above) - return 'L'; + if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + { + return 'L'; + } + // anything else is treated as high-precision number + return 'H'; // LCOV_EXCL_LINE } case value_t::number_float: @@ -16505,7 +16584,7 @@ class basic_json detail::parser_callback_tcb = nullptr, const bool allow_exceptions = true, const bool ignore_comments = false - ) + ) { return ::nlohmann::detail::parser(std::move(adapter), std::move(cb), allow_exceptions, ignore_comments); @@ -23508,6 +23587,7 @@ class basic_json number_unsigned | 256..32767 | int16 | `I` number_unsigned | 32768..2147483647 | int32 | `l` number_unsigned | 2147483648..9223372036854775807 | int64 | `L` + number_unsigned | 2147483649..18446744073709551615 | high-precision | `H` number_float | *any value* | float64 | `D` string | *with shortest length indicator* | string | `S` array | *see notes on optimized format* | array | `[` @@ -23518,7 +23598,6 @@ class basic_json @note The following values can **not** be converted to a UBJSON value: - strings with more than 9223372036854775807 bytes (theoretical) - - unsigned integer numbers above 9223372036854775807 @note The following markers are not used in the conversion: - `Z`: no-op values are not created. @@ -23994,6 +24073,7 @@ class basic_json int16 | number_integer | `I` int32 | number_integer | `l` int64 | number_integer | `L` + high-precision number | number_integer, number_unsigned, or number_float - depends on number string | 'H' string | string | `S` char | string | `C` array | array (optimized values are supported) | `[` @@ -25047,7 +25127,7 @@ template<> inline void swap(nlohmann::json& j1, nlohmann::json& j2) noexcept( is_nothrow_move_constructible::value&& is_nothrow_move_assignable::value -) + ) { j1.swap(j2); }