diff --git a/src/json.hpp b/src/json.hpp index fd63fe13..8459c1a6 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1529,8 +1529,2047 @@ struct input_adapter_factory } }; -} // namespace detail +////////////////////// +// lexer and parser // +////////////////////// +/*! +@brief lexical analysis + +This class organizes the lexical analysis during JSON deserialization. +*/ +template +class lexer +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + + public: + /// token types for the parser + enum class token_type + { + uninitialized, ///< indicating the scanner is uninitialized + literal_true, ///< the `true` literal + literal_false, ///< the `false` literal + literal_null, ///< the `null` literal + value_string, ///< a string -- use get_string() for actual value + value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for + ///actual value + value_integer, ///< a signed integer -- use get_number_integer() for actual + ///value + value_float, ///< an floating point number -- use get_number_float() for + ///actual value + begin_array, ///< the character for array begin `[` + begin_object, ///< the character for object begin `{` + end_array, ///< the character for array end `]` + end_object, ///< the character for object end `}` + name_separator, ///< the name separator `:` + value_separator, ///< the value separator `,` + parse_error, ///< indicating a parse error + end_of_input, ///< indicating the end of the input buffer + literal_or_value ///< a literal or the begin of a value (only for + ///diagnostics) + }; + + /// return name of values of type token_type (only used for errors) + static const char* token_type_name(const token_type t) noexcept + { + switch (t) + { + case token_type::uninitialized: + return ""; + case token_type::literal_true: + return "true literal"; + case token_type::literal_false: + return "false literal"; + case token_type::literal_null: + return "null literal"; + case token_type::value_string: + return "string literal"; + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: + case lexer::token_type::value_float: + return "number literal"; + case token_type::begin_array: + return "'['"; + case token_type::begin_object: + return "'{'"; + case token_type::end_array: + return "']'"; + case token_type::end_object: + return "'}'"; + case token_type::name_separator: + return "':'"; + case token_type::value_separator: + return "','"; + case token_type::parse_error: + return ""; + case token_type::end_of_input: + return "end of input"; + case token_type::literal_or_value: + return "'[', '{', or a literal"; + default: + { + // catch non-enum values + return "unknown token"; // LCOV_EXCL_LINE + } + } + } + + explicit lexer(detail::input_adapter_t adapter) + : ia(adapter), decimal_point_char(get_decimal_point()) {} + + // delete because of pointer members + lexer(const lexer&) = delete; + lexer& operator=(lexer&) = delete; + + private: + ///////////////////// + // locales + ///////////////////// + + /// return the locale-dependent decimal point + static char get_decimal_point() noexcept + { + const auto loc = localeconv(); + assert(loc != nullptr); + return (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; + } + + ///////////////////// + // scan functions + ///////////////////// + + /*! 
+ @brief get codepoint from 4 hex characters following `\u` + + @return codepoint or -1 in case of an error (e.g. EOF or non-hex + character) + */ + int get_codepoint() + { + // this function only makes sense after reading `\u` + assert(current == 'u'); + int codepoint = 0; + + // byte 1: \uXxxx + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x1000; + break; + case '2': + codepoint += 0x2000; + break; + case '3': + codepoint += 0x3000; + break; + case '4': + codepoint += 0x4000; + break; + case '5': + codepoint += 0x5000; + break; + case '6': + codepoint += 0x6000; + break; + case '7': + codepoint += 0x7000; + break; + case '8': + codepoint += 0x8000; + break; + case '9': + codepoint += 0x9000; + break; + case 'A': + case 'a': + codepoint += 0xa000; + break; + case 'B': + case 'b': + codepoint += 0xb000; + break; + case 'C': + case 'c': + codepoint += 0xc000; + break; + case 'D': + case 'd': + codepoint += 0xd000; + break; + case 'E': + case 'e': + codepoint += 0xe000; + break; + case 'F': + case 'f': + codepoint += 0xf000; + break; + default: + return -1; + } + + // byte 2: \uxXxx + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0100; + break; + case '2': + codepoint += 0x0200; + break; + case '3': + codepoint += 0x0300; + break; + case '4': + codepoint += 0x0400; + break; + case '5': + codepoint += 0x0500; + break; + case '6': + codepoint += 0x0600; + break; + case '7': + codepoint += 0x0700; + break; + case '8': + codepoint += 0x0800; + break; + case '9': + codepoint += 0x0900; + break; + case 'A': + case 'a': + codepoint += 0x0a00; + break; + case 'B': + case 'b': + codepoint += 0x0b00; + break; + case 'C': + case 'c': + codepoint += 0x0c00; + break; + case 'D': + case 'd': + codepoint += 0x0d00; + break; + case 'E': + case 'e': + codepoint += 0x0e00; + break; + case 'F': + case 'f': + codepoint += 0x0f00; + break; + default: + return -1; + } + + // byte 3: \uxxXx + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0010; + break; + case '2': + codepoint += 0x0020; + break; + case '3': + codepoint += 0x0030; + break; + case '4': + codepoint += 0x0040; + break; + case '5': + codepoint += 0x0050; + break; + case '6': + codepoint += 0x0060; + break; + case '7': + codepoint += 0x0070; + break; + case '8': + codepoint += 0x0080; + break; + case '9': + codepoint += 0x0090; + break; + case 'A': + case 'a': + codepoint += 0x00a0; + break; + case 'B': + case 'b': + codepoint += 0x00b0; + break; + case 'C': + case 'c': + codepoint += 0x00c0; + break; + case 'D': + case 'd': + codepoint += 0x00d0; + break; + case 'E': + case 'e': + codepoint += 0x00e0; + break; + case 'F': + case 'f': + codepoint += 0x00f0; + break; + default: + return -1; + } + + // byte 4: \uxxxX + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0001; + break; + case '2': + codepoint += 0x0002; + break; + case '3': + codepoint += 0x0003; + break; + case '4': + codepoint += 0x0004; + break; + case '5': + codepoint += 0x0005; + break; + case '6': + codepoint += 0x0006; + break; + case '7': + codepoint += 0x0007; + break; + case '8': + codepoint += 0x0008; + break; + case '9': + codepoint += 0x0009; + break; + case 'A': + case 'a': + codepoint += 0x000a; + break; + case 'B': + case 'b': + codepoint += 0x000b; + break; + case 'C': + case 'c': + codepoint += 0x000c; + break; + case 'D': + case 'd': + codepoint += 0x000d; + break; + case 'E': + case 'e': + codepoint += 0x000e; + break; + case 'F': + case 'f': + codepoint += 0x000f; + break; + default: 
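// Illustrative trace of the four nibble switches above: for the escape
// \u00e9 the bytes '0', '0', 'e', '9' contribute
// 0x0000 + 0x0000 + 0x00e0 + 0x0009, so get_codepoint() returns 0x00E9;
// any non-hex byte takes a default branch and yields -1.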
+ return -1; + } + + return codepoint; + } + + /*! + @brief scan a string literal + + This function scans a string according to Sect. 7 of RFC 7159. While + scanning, bytes are escaped and copied into buffer yytext. Then the + function returns successfully, yytext is null-terminated and yylen + contains the number of bytes in the string. + + @return token_type::value_string if string could be successfully + scanned, token_type::parse_error otherwise + + @note In case of errors, variable error_message contains a textual + description. + */ + token_type scan_string() + { + // reset yytext (ignore opening quote) + reset(); + + // we entered the function by reading an open quote + assert(current == '\"'); + + while (true) + { + // get next character + switch (get()) + { + // end of file while parsing string + case std::char_traits::eof(): + { + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; + } + + // closing quote + case '\"': + { + // terminate yytext + add('\0'); + --yylen; + return token_type::value_string; + } + + // escapes + case '\\': + { + switch (get()) + { + // quotation mark + case '\"': + add('\"'); + break; + // reverse solidus + case '\\': + add('\\'); + break; + // solidus + case '/': + add('/'); + break; + // backspace + case 'b': + add('\b'); + break; + // form feed + case 'f': + add('\f'); + break; + // line feed + case 'n': + add('\n'); + break; + // carriage return + case 'r': + add('\r'); + break; + // tab + case 't': + add('\t'); + break; + + // unicode escapes + case 'u': + { + int codepoint; + int codepoint1 = get_codepoint(); + + if (JSON_UNLIKELY(codepoint1 == -1)) + { + error_message = + "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if code point is a high surrogate + if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF) + { + // expect next \uxxxx entry + if (JSON_LIKELY(get() == '\\' and get() == 'u')) + { + const int codepoint2 = get_codepoint(); + + if (JSON_UNLIKELY(codepoint2 == -1)) + { + error_message = + "invalid string: '\\u' must be followed by 4 hex digits"; + return token_type::parse_error; + } + + // check if codepoint2 is a low surrogate + if (JSON_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF)) + { + codepoint = + // high surrogate occupies the most significant 22 bits + (codepoint1 << 10) + // low surrogate occupies the least significant 15 bits + + codepoint2 + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to subtract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + } + else + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must " + "be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must " + "be followed by U+DC00..U+DFFF"; + return token_type::parse_error; + } + } + else + { + if (JSON_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF)) + { + error_message = "invalid string: surrogate U+DC00..U+DFFF must " + "follow U+D800..U+DBFF"; + return token_type::parse_error; + } + + // only work with first code point + codepoint = codepoint1; + } + + // result of the above calculation yields a proper codepoint + assert(0x00 <= codepoint and codepoint <= 0x10FFFF); + + // translate code point to bytes + if (codepoint < 0x80) + { + // 1-byte characters: 0xxxxxxx (ASCII) + add(codepoint); + } + else if (codepoint <= 0x7ff) + { + // 2-byte characters: 110xxxxx 10xxxxxx + 
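// Worked example of the surrogate arithmetic above (an illustrative
// sketch): for the input \uD83D\uDE00, codepoint1 = 0xD83D and
// codepoint2 = 0xDE00, and (0xD83D << 10) + 0xDE00 - 0x35FDC00 = 0x1F600,
// the intended code point. In the 2-byte branch below, a code point such
// as U+00FC becomes 0xC0 | (0xFC >> 6) = 0xC3 and
// 0x80 | (0xFC & 0x3F) = 0xBC, i.e. the UTF-8 sequence C3 BC.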
add(0xC0 | (codepoint >> 6)); + add(0x80 | (codepoint & 0x3F)); + } + else if (codepoint <= 0xffff) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + add(0xE0 | (codepoint >> 12)); + add(0x80 | ((codepoint >> 6) & 0x3F)); + add(0x80 | (codepoint & 0x3F)); + } + else + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + add(0xF0 | (codepoint >> 18)); + add(0x80 | ((codepoint >> 12) & 0x3F)); + add(0x80 | ((codepoint >> 6) & 0x3F)); + add(0x80 | (codepoint & 0x3F)); + } + + break; + } + + // other characters after escape + default: + error_message = "invalid string: forbidden character after backslash"; + return token_type::parse_error; + } + + break; + } + + // invalid control characters + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0a: + case 0x0b: + case 0x0c: + case 0x0d: + case 0x0e: + case 0x0f: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1a: + case 0x1b: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + { + error_message = "invalid string: control character must be escaped"; + return token_type::parse_error; + } + + // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) + case 0x20: + case 0x21: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x3d: + case 0x3e: + case 0x3f: + case 0x40: + case 0x41: + case 0x42: + case 0x43: + case 0x44: + case 0x45: + case 0x46: + case 0x47: + case 0x48: + case 0x49: + case 0x4a: + case 0x4b: + case 0x4c: + case 0x4d: + case 0x4e: + case 0x4f: + case 0x50: + case 0x51: + case 0x52: + case 0x53: + case 0x54: + case 0x55: + case 0x56: + case 0x57: + case 0x58: + case 0x59: + case 0x5a: + case 0x5b: + case 0x5d: + case 0x5e: + case 0x5f: + case 0x60: + case 0x61: + case 0x62: + case 0x63: + case 0x64: + case 0x65: + case 0x66: + case 0x67: + case 0x68: + case 0x69: + case 0x6a: + case 0x6b: + case 0x6c: + case 0x6d: + case 0x6e: + case 0x6f: + case 0x70: + case 0x71: + case 0x72: + case 0x73: + case 0x74: + case 0x75: + case 0x76: + case 0x77: + case 0x78: + case 0x79: + case 0x7a: + case 0x7b: + case 0x7c: + case 0x7d: + case 0x7e: + case 0x7f: + { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case 0xc2: + case 0xc3: + case 0xc4: + case 0xc5: + case 0xc6: + case 0xc7: + case 0xc8: + case 0xc9: + case 0xca: + case 0xcb: + case 0xcc: + case 0xcd: + case 0xce: + case 0xcf: + case 0xd0: + case 0xd1: + case 0xd2: + case 0xd3: + case 0xd4: + case 0xd5: + case 0xd6: + case 0xd7: + case 0xd8: + case 0xd9: + case 0xda: + case 0xdb: + case 0xdc: + case 0xdd: + case 0xde: + case 0xdf: + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case 0xe0: + { + add(current); + get(); + if (JSON_LIKELY(0xa0 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + 
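// Note: the narrower A0..BF range for the byte after 0xE0 (and, in the
// cases below, 80..9F after 0xED, 90..BF after 0xF0, 80..8F after 0xF4)
// follows the UTF-8 well-formedness table of the Unicode standard: it
// rejects overlong encodings, encoded surrogates and code points above
// U+10FFFF instead of accepting every 80..BF continuation byte.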
return token_type::parse_error; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case 0xe1: + case 0xe2: + case 0xe3: + case 0xe4: + case 0xe5: + case 0xe6: + case 0xe7: + case 0xe8: + case 0xe9: + case 0xea: + case 0xeb: + case 0xec: + case 0xee: + case 0xef: + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case 0xed: + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0x9f)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case 0xf0: + { + add(current); + get(); + if (JSON_LIKELY(0x90 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case 0xf1: + case 0xf2: + case 0xf3: + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case 0xf4: + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0x8f)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + get(); + if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + + // remaining bytes (80..C1 and F5..FF) are ill-formed + default: + { + error_message = "invalid string: ill-formed UTF-8 byte"; + return token_type::parse_error; + } + } + } + } + + static void strtof(float& f, const char* str, char** endptr) noexcept + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, const char* str, char** endptr) noexcept + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, const char* str, char** endptr) noexcept + { + f = std::strtold(str, endptr); + } + + /*! + @brief scan a number literal + + This function scans a string according to Sect. 6 of RFC 7159. + + The function is realized with a deterministic finite state machine + derived from the grammar described in RFC 7159. Starting in state + "init", the input is read and used to determined the next state. Only + state "done" accepts the number. State "error" is a trap state to model + errors. In the table below, "anything" means any character but the ones + listed before. + + state | 0 | 1-9 | e E | + | - | . 
| + anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | + [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | + [error] + zero | done | done | exponent | done | done | decimal1 | + done + any1 | any1 | any1 | exponent | done | done | decimal1 | + done + decimal1 | decimal2 | [error] | [error] | [error] | [error] | [error] | + [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | + done + exponent | any2 | any2 | [error] | sign | sign | [error] | + [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | + [error] + any2 | any2 | any2 | done | done | done | done | + done + + The state machine is realized with one label per state (prefixed with + "scan_number_") and `goto` statements between them. The state machine + contains cycles, but any cycle can be left when EOF is read. Therefore, + the function is guaranteed to terminate. + + During scanning, the read bytes are stored in yytext. This string is + then converted to a signed integer, an unsigned integer, or a + floating-point number. + + @return token_type::value_unsigned, token_type::value_integer, or + token_type::value_float if number could be successfully scanned, + token_type::parse_error otherwise + + @note The scanner is independent of the current locale. Internally, the + locale's decimal point is used instead of `.` to work with the + locale-dependent converters. + */ + token_type scan_number() + { + // reset yytext to store the number's bytes + reset(); + + // the type of the parsed number; initially set to unsigned; will be + // changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; + + // state (init): we just found out we need to scan a number + switch (current) + { + case '-': + { + add(current); + goto scan_number_minus; + } + + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + // all other characters are rejected outside scan_number() + assert(false); // LCOV_EXCL_LINE + } + } + +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parse a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto 
scan_number_done; + } + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = + "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + } + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_done: + // unget the character after the number (we only read it to know + // that we are done scanning a number) + --chars_read; + next_unget = true; + + // terminate token + add('\0'); + --yylen; + + // try to parse integers first and fall back to floats + if (number_type == token_type::value_unsigned) + { + char* endptr = nullptr; + errno = 0; + const auto x = std::strtoull(yytext.data(), &endptr, 10); + + // we checked the number format before + assert(endptr == yytext.data() + yylen); + + if (errno == 0) + { + value_unsigned = static_cast(x); + if (value_unsigned == x) + { + return token_type::value_unsigned; + } + } + } + else if (number_type == token_type::value_integer) + { + char* endptr = nullptr; + errno = 0; + const auto x = std::strtoll(yytext.data(), &endptr, 10); + + // we checked the number format before + assert(endptr == yytext.data() + yylen); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } + + // this code is reached if we parse a floating-point number or if + // an integer conversion above failed + strtof(value_float, yytext.data(), nullptr); + return token_type::value_float; + } + + /*! 
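@brief scan a literal such as `true`, `false`, or `null`

For example, after reading 't', scan_literal("true", 4,
token_type::literal_true) checks that the next three input characters are
'r', 'u' and 'e' and returns literal_true; any mismatch sets error_message
to "invalid literal" and returns token_type::parse_error.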
+ @param[in] literal_text the literal text to expect + @param[in] length the length of the passed literal text + @param[in] return_type the token type to return on success + */ + token_type scan_literal(const char* literal_text, const size_t length, + token_type return_type) + { + assert(current == literal_text[0]); + for (size_t i = 1; i < length; ++i) + { + if (JSON_UNLIKELY(get() != literal_text[i])) + { + error_message = "invalid literal"; + return token_type::parse_error; + } + } + return return_type; + } + + ///////////////////// + // input management + ///////////////////// + + /// reset yytext + void reset() noexcept + { + yylen = 0; + start_pos = chars_read - 1; + } + + /// get a character from the input + int get() + { + ++chars_read; + return next_unget ? (next_unget = false, current) + : (current = ia->get_character()); + } + + /// add a character to yytext + void add(int c) + { + // resize yytext if necessary; this condition is deemed unlikely, + // because we start with a 1024-byte buffer + if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) + { + yytext.resize(2 * yytext.capacity(), '\0'); + } + assert(yylen < yytext.size()); + yytext[yylen++] = static_cast(c); + } + + public: + ///////////////////// + // value getters + ///////////////////// + + /// return integer value + constexpr number_integer_t get_number_integer() const noexcept + { + return value_integer; + } + + /// return unsigned integer value + constexpr number_unsigned_t get_number_unsigned() const noexcept + { + return value_unsigned; + } + + /// return floating-point value + constexpr number_float_t get_number_float() const noexcept + { + return value_float; + } + + /// return string value + const std::string get_string() + { + // yytext cannot be returned as char*, because it may contain a + // null byte (parsed as "\u0000") + return std::string(yytext.data(), yylen); + } + + ///////////////////// + // diagnostics + ///////////////////// + + /// return position of last read token + constexpr size_t get_position() const noexcept + { + return chars_read; + } + + /// return the last read token (for errors only) + std::string get_token_string() const + { + // get the raw byte sequence of the last token + std::string s = ia->read(start_pos, chars_read - start_pos); + + // escape control characters + std::string result; + for (auto c : s) + { + if (c == '\0' or c == std::char_traits::eof()) + { + // ignore EOF + continue; + } + else if ('\x00' <= c and c <= '\x1f') + { + // escape control characters + std::stringstream ss; + ss << "(c) << ">"; + result += ss.str(); + } + else + { + // add character as is + result.append(1, c); + } + } + + return result; + } + + /// return syntax error message + constexpr const char* get_error_message() const noexcept + { + return error_message; + } + + ///////////////////// + // actual scanner + ///////////////////// + + token_type scan() + { + // read next character and ignore whitespace + do + { + get(); + } + while (current == ' ' or current == '\t' or current == '\n' or + current == '\r'); + + switch (current) + { + // structural characters + case '[': + return token_type::begin_array; + case ']': + return token_type::end_array; + case '{': + return token_type::begin_object; + case '}': + return token_type::end_object; + case ':': + return token_type::name_separator; + case ',': + return token_type::value_separator; + + // literals + case 't': + return scan_literal("true", 4, token_type::literal_true); + case 'f': + return scan_literal("false", 5, token_type::literal_false); + 
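// An illustrative trace (assuming the default 64-bit unsigned integer
// type): scanning {"pi": 3.14, "big": 18446744073709551616} yields
// begin_object, value_string ("pi"), name_separator, value_float (3.14),
// value_separator, value_string ("big"), name_separator, value_float
// (2^64 overflows std::strtoull in scan_number() and falls back to a
// float), end_object, end_of_input.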
case 'n': + return scan_literal("null", 4, token_type::literal_null); + + // string + case '\"': + return scan_string(); + + // number + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return scan_number(); + + // end of input (the null byte is needed when parsing from + // string literals) + case '\0': + case std::char_traits::eof(): + return token_type::end_of_input; + + // error + default: + error_message = "invalid literal"; + return token_type::parse_error; + } + } + + private: + /// input adapter + detail::input_adapter_t ia = nullptr; + + /// the current character + int current = std::char_traits::eof(); + + /// whether get() should return the last character again + bool next_unget = false; + + /// the number of characters read + size_t chars_read = 0; + /// the start position of the current token + size_t start_pos = 0; + + /// buffer for variable-length tokens (numbers, strings) + std::vector yytext = std::vector(1024, '\0'); + /// current index in yytext + size_t yylen = 0; + + /// a description of occurred lexer errors + const char* error_message = ""; + + // number values + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; + + /// the decimal point + const char decimal_point_char = '.'; +}; + +/*! +@brief syntax analysis + +This class implements a recursive decent parser. +*/ +template +class parser +{ + using number_integer_t = typename BasicJsonType::number_integer_t; + using number_unsigned_t = typename BasicJsonType::number_unsigned_t; + using number_float_t = typename BasicJsonType::number_float_t; + + using lexer_t = lexer; + using token_type = typename lexer_t::token_type; + + public: + enum class parse_event_t : uint8_t + { + /// the parser read `{` and started to process a JSON object + object_start, + /// the parser read `}` and finished processing a JSON object + object_end, + /// the parser read `[` and started to process a JSON array + array_start, + /// the parser read `]` and finished processing a JSON array + array_end, + /// the parser read a key of a value in an object + key, + /// the parser finished reading a JSON value + value + }; + + using parser_callback_t = + std::function; + + /// a parser reading from an input adapter + explicit parser(detail::input_adapter_t adapter, + const parser_callback_t cb = nullptr) + : callback(cb), m_lexer(adapter) {} + + /*! + @brief public parser interface + + @param[in] strict whether to expect the last token to be EOF + @return parsed JSON value + + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + BasicJsonType parse(const bool strict = true) + { + // read first token + get_token(); + + BasicJsonType result = parse_internal(true); + result.assert_invariant(); + + if (strict) + { + get_token(); + expect(token_type::end_of_input); + } + + // return parser result and replace it with null in case the + // top-level value was discarded by the callback function + return result.is_discarded() ? BasicJsonType() : std::move(result); + } + + /*! 
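A typical use of the parser_callback_t defined above (a sketch, assuming the
default `json` alias and a parse() overload taking the text plus a
callback): discard every object entry whose key is "comment":

    json::parser_callback_t cb = [](int, json::parse_event_t event, json& parsed)
    {
        // skip object elements with key "comment"
        return not (event == json::parse_event_t::key and parsed == json("comment"));
    };
    json j = json::parse(R"({"x": 1, "comment": "drop me"})", cb);
    // j contains {"x": 1}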
+ @brief public accept interface + + @param[in] strict whether to expect the last token to be EOF + @return whether the input is a proper JSON text + */ + bool accept(const bool strict = true) + { + // read first token + get_token(); + + if (not accept_internal()) + { + return false; + } + + if (strict and get_token() != token_type::end_of_input) + { + return false; + } + + return true; + } + + private: + /*! + @brief the actual parser + @throw parse_error.101 in case of an unexpected token + @throw parse_error.102 if to_unicode fails or surrogate error + @throw parse_error.103 if to_unicode fails + */ + BasicJsonType parse_internal(bool keep) + { + auto result = BasicJsonType(value_t::discarded); + + switch (last_token) + { + case token_type::begin_object: + { + if (keep and (not callback or + ((keep = callback(depth++, parse_event_t::object_start, + result)) != 0))) + { + // explicitly set result to object to cope with {} + result.m_type = value_t::object; + result.m_value = value_t::object; + } + + // read next token + get_token(); + + // closing } -> we are done + if (last_token == token_type::end_object) + { + if (keep and callback and + not callback(--depth, parse_event_t::object_end, result)) + { + result = BasicJsonType(value_t::discarded); + } + return result; + } + + // parse values + while (true) + { + // store key + expect(token_type::value_string); + const auto key = m_lexer.get_string(); + + bool keep_tag = false; + if (keep) + { + if (callback) + { + BasicJsonType k(key); + keep_tag = callback(depth, parse_event_t::key, k); + } + else + { + keep_tag = true; + } + } + + // parse separator (:) + get_token(); + expect(token_type::name_separator); + + // parse and add value + get_token(); + auto value = parse_internal(keep); + if (keep and keep_tag and not value.is_discarded()) + { + result[key] = std::move(value); + } + + // comma -> next value + get_token(); + if (last_token == token_type::value_separator) + { + get_token(); + continue; + } + + // closing } + expect(token_type::end_object); + break; + } + + if (keep and callback and + not callback(--depth, parse_event_t::object_end, result)) + { + result = BasicJsonType(value_t::discarded); + } + + return result; + } + + case token_type::begin_array: + { + if (keep and (not callback or + ((keep = callback(depth++, parse_event_t::array_start, + result)) != 0))) + { + // explicitly set result to object to cope with [] + result.m_type = value_t::array; + result.m_value = value_t::array; + } + + // read next token + get_token(); + + // closing ] -> we are done + if (last_token == token_type::end_array) + { + if (callback and + not callback(--depth, parse_event_t::array_end, result)) + { + result = BasicJsonType(value_t::discarded); + } + return result; + } + + // parse values + while (true) + { + // parse value + auto value = parse_internal(keep); + if (keep and not value.is_discarded()) + { + result.push_back(std::move(value)); + } + + // comma -> next value + get_token(); + if (last_token == token_type::value_separator) + { + get_token(); + continue; + } + + // closing ] + expect(token_type::end_array); + break; + } + + if (keep and callback and + not callback(--depth, parse_event_t::array_end, result)) + { + result = BasicJsonType(value_t::discarded); + } + + return result; + } + + case token_type::literal_null: + { + result.m_type = value_t::null; + break; + } + + case token_type::value_string: + { + result = BasicJsonType(m_lexer.get_string()); + break; + } + + case token_type::literal_true: + { + result.m_type = 
value_t::boolean; + result.m_value = true; + break; + } + + case token_type::literal_false: + { + result.m_type = value_t::boolean; + result.m_value = false; + break; + } + + case token_type::value_unsigned: + { + result.m_type = value_t::number_unsigned; + result.m_value = m_lexer.get_number_unsigned(); + break; + } + + case token_type::value_integer: + { + result.m_type = value_t::number_integer; + result.m_value = m_lexer.get_number_integer(); + break; + } + + case token_type::value_float: + { + result.m_type = value_t::number_float; + result.m_value = m_lexer.get_number_float(); + + // throw in case of infinity or NAN + if (JSON_UNLIKELY(not std::isfinite(result.m_value.number_float))) + { + JSON_THROW(out_of_range::create(406, "number overflow parsing '" + + m_lexer.get_token_string() + + "'")); + } + + break; + } + + case token_type::parse_error: + { + // using "uninitialized" to avoid "expected" message + expect(token_type::uninitialized); + break; // LCOV_EXCL_LINE + } + + default: + { + // the last token was unexpected; we expected a value + expect(token_type::literal_or_value); + break; // LCOV_EXCL_LINE + } + } + + if (keep and callback and + not callback(depth, parse_event_t::value, result)) + { + result = BasicJsonType(value_t::discarded); + } + return result; + } + + /*! + @brief the acutal acceptor + + @invariant 1. The last token is not yet processed. Therefore, the + caller of this function must make sure a token has + been read. + 2. When this function returns, the last token is processed. + That is, the last read character was already considered. + + This invariant makes sure that no token needs to be "unput". + */ + bool accept_internal() + { + switch (last_token) + { + case token_type::begin_object: + { + // read next token + get_token(); + + // closing } -> we are done + if (last_token == token_type::end_object) + { + return true; + } + + // parse values + while (true) + { + // parse key + if (last_token != token_type::value_string) + { + return false; + } + + // parse separator (:) + get_token(); + if (last_token != token_type::name_separator) + { + return false; + } + + // parse value + get_token(); + if (not accept_internal()) + { + return false; + } + + // comma -> next value + get_token(); + if (last_token == token_type::value_separator) + { + get_token(); + continue; + } + + // closing } + if (last_token != token_type::end_object) + { + return false; + } + + return true; + } + } + + case token_type::begin_array: + { + // read next token + get_token(); + + // closing ] -> we are done + if (last_token == token_type::end_array) + { + return true; + } + + // parse values + while (true) + { + // parse value + if (not accept_internal()) + { + return false; + } + + // comma -> next value + get_token(); + if (last_token == token_type::value_separator) + { + get_token(); + continue; + } + + // closing ] + if (last_token != token_type::end_array) + { + return false; + } + + return true; + } + } + + case token_type::literal_false: + case token_type::literal_null: + case token_type::literal_true: + case token_type::value_float: + case token_type::value_integer: + case token_type::value_string: + case token_type::value_unsigned: + { + return true; + } + + default: + { + // the last token was unexpected + return false; + } + } + } + + /// get next token from lexer + token_type get_token() + { + return (last_token = m_lexer.scan()); + } + + /*! 
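@brief raise a parse error if the last read token does not match the expected token

For instance, parsing the input {"a" 1} ends up here: after the key "a" the
parser expects token_type::name_separator, but the last token is
value_unsigned, so throw_exception() reports parse_error.101 with the
message "syntax error - unexpected number literal; expected ':'".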
+ @throw parse_error.101 if expected token did not occur + */ + void expect(token_type t) + { + if (JSON_UNLIKELY(t != last_token)) + { + errored = true; + expected = t; + throw_exception(); + } + } + + [[noreturn]] void throw_exception() const + { + std::string error_msg = "syntax error - "; + if (last_token == token_type::parse_error) + { + error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + + m_lexer.get_token_string() + "'"; + } + else + { + error_msg += + "unexpected " + std::string(lexer_t::token_type_name(last_token)); + } + + if (expected != token_type::uninitialized) + { + error_msg += + "; expected " + std::string(lexer_t::token_type_name(expected)); + } + + JSON_THROW(parse_error::create(101, m_lexer.get_position(), error_msg)); + } + + private: + /// current level of recursion + int depth = 0; + /// callback function + const parser_callback_t callback = nullptr; + /// the type of the last read token + token_type last_token = token_type::uninitialized; + /// the lexer + lexer_t m_lexer; + /// whether a syntax error occurred + bool errored = false; + /// possible reason for the syntax error + token_type expected = token_type::uninitialized; +}; +} // namespace detail /// namespace to hold default `to_json` / `from_json` functions namespace @@ -1999,9 +4038,14 @@ class basic_json { private: template friend struct detail::external_constructor; + friend ::nlohmann::json_pointer; + friend ::nlohmann::detail::parser; /// workaround type for MSVC using basic_json_t = NLOHMANN_BASIC_JSON_TPL; - friend ::nlohmann::json_pointer; + + // convenience aliases for types residing in namespace detail; + using lexer = ::nlohmann::detail::lexer; + using parser = ::nlohmann::detail::parser; public: using value_t = detail::value_t; @@ -2774,31 +4818,7 @@ class basic_json // JSON parser callback // ////////////////////////// - /*! - @brief JSON callback events - - This enumeration lists the parser events that can trigger calling a - callback function of type @ref parser_callback_t during parsing. - - @image html callback_events.png "Example when certain parse events are triggered" - - @since version 1.0.0 - */ - enum class parse_event_t : uint8_t - { - /// the parser read `{` and started to process a JSON object - object_start, - /// the parser read `}` and finished processing a JSON object - object_end, - /// the parser read `[` and started to process a JSON array - array_start, - /// the parser read `]` and finished processing a JSON array - array_end, - /// the parser read a key of a value in an object - key, - /// the parser finished reading a JSON value - value - }; + using parse_event_t = typename parser::parse_event_t; /*! @brief per-element parser callback type @@ -2852,9 +4872,7 @@ class basic_json @since version 1.0.0 */ - using parser_callback_t = std::function; + using parser_callback_t = typename parser::parser_callback_t; ////////////////// @@ -11386,1986 +13404,6 @@ class basic_json /// @} - ////////////////////// - // lexer and parser // - ////////////////////// - - private: - /*! - @brief lexical analysis - - This class organizes the lexical analysis during JSON deserialization. 
- */ - class lexer - { - public: - /// token types for the parser - enum class token_type - { - uninitialized, ///< indicating the scanner is uninitialized - literal_true, ///< the `true` literal - literal_false, ///< the `false` literal - literal_null, ///< the `null` literal - value_string, ///< a string -- use get_string() for actual value - value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value - value_integer, ///< a signed integer -- use get_number_integer() for actual value - value_float, ///< an floating point number -- use get_number_float() for actual value - begin_array, ///< the character for array begin `[` - begin_object, ///< the character for object begin `{` - end_array, ///< the character for array end `]` - end_object, ///< the character for object end `}` - name_separator, ///< the name separator `:` - value_separator, ///< the value separator `,` - parse_error, ///< indicating a parse error - end_of_input, ///< indicating the end of the input buffer - literal_or_value ///< a literal or the begin of a value (only for diagnostics) - }; - - /// return name of values of type token_type (only used for errors) - static const char* token_type_name(const token_type t) noexcept - { - switch (t) - { - case token_type::uninitialized: - return ""; - case token_type::literal_true: - return "true literal"; - case token_type::literal_false: - return "false literal"; - case token_type::literal_null: - return "null literal"; - case token_type::value_string: - return "string literal"; - case lexer::token_type::value_unsigned: - case lexer::token_type::value_integer: - case lexer::token_type::value_float: - return "number literal"; - case token_type::begin_array: - return "'['"; - case token_type::begin_object: - return "'{'"; - case token_type::end_array: - return "']'"; - case token_type::end_object: - return "'}'"; - case token_type::name_separator: - return "':'"; - case token_type::value_separator: - return "','"; - case token_type::parse_error: - return ""; - case token_type::end_of_input: - return "end of input"; - case token_type::literal_or_value: - return "'[', '{', or a literal"; - default: - { - // catch non-enum values - return "unknown token"; // LCOV_EXCL_LINE - } - } - } - - explicit lexer(detail::input_adapter_t adapter) - : ia(adapter), decimal_point_char(get_decimal_point()) - {} - - // delete because of pointer members - lexer(const lexer&) = delete; - lexer& operator=(lexer&) = delete; - - private: - ///////////////////// - // locales - ///////////////////// - - /// return the locale-dependent decimal point - static char get_decimal_point() noexcept - { - const auto loc = localeconv(); - assert(loc != nullptr); - return (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; - } - - ///////////////////// - // scan functions - ///////////////////// - - /*! - @brief get codepoint from 4 hex characters following `\u` - - @return codepoint or -1 in case of an error (e.g. 
EOF or non-hex - character) - */ - int get_codepoint() - { - // this function only makes sense after reading `\u` - assert(current == 'u'); - int codepoint = 0; - - // byte 1: \uXxxx - switch (get()) - { - case '0': - break; - case '1': - codepoint += 0x1000; - break; - case '2': - codepoint += 0x2000; - break; - case '3': - codepoint += 0x3000; - break; - case '4': - codepoint += 0x4000; - break; - case '5': - codepoint += 0x5000; - break; - case '6': - codepoint += 0x6000; - break; - case '7': - codepoint += 0x7000; - break; - case '8': - codepoint += 0x8000; - break; - case '9': - codepoint += 0x9000; - break; - case 'A': - case 'a': - codepoint += 0xa000; - break; - case 'B': - case 'b': - codepoint += 0xb000; - break; - case 'C': - case 'c': - codepoint += 0xc000; - break; - case 'D': - case 'd': - codepoint += 0xd000; - break; - case 'E': - case 'e': - codepoint += 0xe000; - break; - case 'F': - case 'f': - codepoint += 0xf000; - break; - default: - return -1; - } - - // byte 2: \uxXxx - switch (get()) - { - case '0': - break; - case '1': - codepoint += 0x0100; - break; - case '2': - codepoint += 0x0200; - break; - case '3': - codepoint += 0x0300; - break; - case '4': - codepoint += 0x0400; - break; - case '5': - codepoint += 0x0500; - break; - case '6': - codepoint += 0x0600; - break; - case '7': - codepoint += 0x0700; - break; - case '8': - codepoint += 0x0800; - break; - case '9': - codepoint += 0x0900; - break; - case 'A': - case 'a': - codepoint += 0x0a00; - break; - case 'B': - case 'b': - codepoint += 0x0b00; - break; - case 'C': - case 'c': - codepoint += 0x0c00; - break; - case 'D': - case 'd': - codepoint += 0x0d00; - break; - case 'E': - case 'e': - codepoint += 0x0e00; - break; - case 'F': - case 'f': - codepoint += 0x0f00; - break; - default: - return -1; - } - - // byte 3: \uxxXx - switch (get()) - { - case '0': - break; - case '1': - codepoint += 0x0010; - break; - case '2': - codepoint += 0x0020; - break; - case '3': - codepoint += 0x0030; - break; - case '4': - codepoint += 0x0040; - break; - case '5': - codepoint += 0x0050; - break; - case '6': - codepoint += 0x0060; - break; - case '7': - codepoint += 0x0070; - break; - case '8': - codepoint += 0x0080; - break; - case '9': - codepoint += 0x0090; - break; - case 'A': - case 'a': - codepoint += 0x00a0; - break; - case 'B': - case 'b': - codepoint += 0x00b0; - break; - case 'C': - case 'c': - codepoint += 0x00c0; - break; - case 'D': - case 'd': - codepoint += 0x00d0; - break; - case 'E': - case 'e': - codepoint += 0x00e0; - break; - case 'F': - case 'f': - codepoint += 0x00f0; - break; - default: - return -1; - } - - // byte 4: \uxxxX - switch (get()) - { - case '0': - break; - case '1': - codepoint += 0x0001; - break; - case '2': - codepoint += 0x0002; - break; - case '3': - codepoint += 0x0003; - break; - case '4': - codepoint += 0x0004; - break; - case '5': - codepoint += 0x0005; - break; - case '6': - codepoint += 0x0006; - break; - case '7': - codepoint += 0x0007; - break; - case '8': - codepoint += 0x0008; - break; - case '9': - codepoint += 0x0009; - break; - case 'A': - case 'a': - codepoint += 0x000a; - break; - case 'B': - case 'b': - codepoint += 0x000b; - break; - case 'C': - case 'c': - codepoint += 0x000c; - break; - case 'D': - case 'd': - codepoint += 0x000d; - break; - case 'E': - case 'e': - codepoint += 0x000e; - break; - case 'F': - case 'f': - codepoint += 0x000f; - break; - default: - return -1; - } - - return codepoint; - } - - /*! 
- @brief scan a string literal - - This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer yytext. Then the - function returns successfully, yytext is null-terminated and yylen - contains the number of bytes in the string. - - @return token_type::value_string if string could be successfully - scanned, token_type::parse_error otherwise - - @note In case of errors, variable error_message contains a textual - description. - */ - token_type scan_string() - { - // reset yytext (ignore opening quote) - reset(); - - // we entered the function by reading an open quote - assert(current == '\"'); - - while (true) - { - // get next character - switch (get()) - { - // end of file while parsing string - case std::char_traits::eof(): - { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // closing quote - case '\"': - { - // terminate yytext - add('\0'); - --yylen; - return token_type::value_string; - } - - // escapes - case '\\': - { - switch (get()) - { - // quotation mark - case '\"': - add('\"'); - break; - // reverse solidus - case '\\': - add('\\'); - break; - // solidus - case '/': - add('/'); - break; - // backspace - case 'b': - add('\b'); - break; - // form feed - case 'f': - add('\f'); - break; - // line feed - case 'n': - add('\n'); - break; - // carriage return - case 'r': - add('\r'); - break; - // tab - case 't': - add('\t'); - break; - - // unicode escapes - case 'u': - { - int codepoint; - int codepoint1 = get_codepoint(); - - if (JSON_UNLIKELY(codepoint1 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if code point is a high surrogate - if (0xD800 <= codepoint1 and codepoint1 <= 0xDBFF) - { - // expect next \uxxxx entry - if (JSON_LIKELY(get() == '\\' and get() == 'u')) - { - const int codepoint2 = get_codepoint(); - - if (JSON_UNLIKELY(codepoint2 == -1)) - { - error_message = "invalid string: '\\u' must be followed by 4 hex digits"; - return token_type::parse_error; - } - - // check if codepoint2 is a low surrogate - if (JSON_LIKELY(0xDC00 <= codepoint2 and codepoint2 <= 0xDFFF)) - { - codepoint = - // high surrogate occupies the most significant 22 bits - (codepoint1 << 10) - // low surrogate occupies the least significant 15 bits - + codepoint2 - // there is still the 0xD800, 0xDC00 and 0x10000 noise - // in the result so we have to subtract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00; - } - else - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must be followed by U+DC00..U+DFFF"; - return token_type::parse_error; - } - } - else - { - if (JSON_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF)) - { - error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF"; - return token_type::parse_error; - } - - // only work with first code point - codepoint = codepoint1; - } - - // result of the above calculation yields a proper codepoint - assert(0x00 <= codepoint and codepoint <= 0x10FFFF); - - // translate code point to bytes - if (codepoint < 0x80) - { - // 1-byte characters: 0xxxxxxx (ASCII) - add(codepoint); - } - else if (codepoint <= 0x7ff) - { - // 2-byte characters: 110xxxxx 10xxxxxx - add(0xC0 | (codepoint >> 6)); - add(0x80 | (codepoint & 0x3F)); - } - else if 
(codepoint <= 0xffff) - { - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - add(0xE0 | (codepoint >> 12)); - add(0x80 | ((codepoint >> 6) & 0x3F)); - add(0x80 | (codepoint & 0x3F)); - } - else - { - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - add(0xF0 | (codepoint >> 18)); - add(0x80 | ((codepoint >> 12) & 0x3F)); - add(0x80 | ((codepoint >> 6) & 0x3F)); - add(0x80 | (codepoint & 0x3F)); - } - - break; - } - - // other characters after escape - default: - error_message = "invalid string: forbidden character after backslash"; - return token_type::parse_error; - } - - break; - } - - // invalid control characters - case 0x00: - case 0x01: - case 0x02: - case 0x03: - case 0x04: - case 0x05: - case 0x06: - case 0x07: - case 0x08: - case 0x09: - case 0x0a: - case 0x0b: - case 0x0c: - case 0x0d: - case 0x0e: - case 0x0f: - case 0x10: - case 0x11: - case 0x12: - case 0x13: - case 0x14: - case 0x15: - case 0x16: - case 0x17: - case 0x18: - case 0x19: - case 0x1a: - case 0x1b: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: - { - error_message = "invalid string: control character must be escaped"; - return token_type::parse_error; - } - - // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) - case 0x20: - case 0x21: - case 0x23: - case 0x24: - case 0x25: - case 0x26: - case 0x27: - case 0x28: - case 0x29: - case 0x2a: - case 0x2b: - case 0x2c: - case 0x2d: - case 0x2e: - case 0x2f: - case 0x30: - case 0x31: - case 0x32: - case 0x33: - case 0x34: - case 0x35: - case 0x36: - case 0x37: - case 0x38: - case 0x39: - case 0x3a: - case 0x3b: - case 0x3c: - case 0x3d: - case 0x3e: - case 0x3f: - case 0x40: - case 0x41: - case 0x42: - case 0x43: - case 0x44: - case 0x45: - case 0x46: - case 0x47: - case 0x48: - case 0x49: - case 0x4a: - case 0x4b: - case 0x4c: - case 0x4d: - case 0x4e: - case 0x4f: - case 0x50: - case 0x51: - case 0x52: - case 0x53: - case 0x54: - case 0x55: - case 0x56: - case 0x57: - case 0x58: - case 0x59: - case 0x5a: - case 0x5b: - case 0x5d: - case 0x5e: - case 0x5f: - case 0x60: - case 0x61: - case 0x62: - case 0x63: - case 0x64: - case 0x65: - case 0x66: - case 0x67: - case 0x68: - case 0x69: - case 0x6a: - case 0x6b: - case 0x6c: - case 0x6d: - case 0x6e: - case 0x6f: - case 0x70: - case 0x71: - case 0x72: - case 0x73: - case 0x74: - case 0x75: - case 0x76: - case 0x77: - case 0x78: - case 0x79: - case 0x7a: - case 0x7b: - case 0x7c: - case 0x7d: - case 0x7e: - case 0x7f: - { - add(current); - break; - } - - // U+0080..U+07FF: bytes C2..DF 80..BF - case 0xc2: - case 0xc3: - case 0xc4: - case 0xc5: - case 0xc6: - case 0xc7: - case 0xc8: - case 0xc9: - case 0xca: - case 0xcb: - case 0xcc: - case 0xcd: - case 0xce: - case 0xcf: - case 0xd0: - case 0xd1: - case 0xd2: - case 0xd3: - case 0xd4: - case 0xd5: - case 0xd6: - case 0xd7: - case 0xd8: - case 0xd9: - case 0xda: - case 0xdb: - case 0xdc: - case 0xdd: - case 0xde: - case 0xdf: - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+0800..U+0FFF: bytes E0 A0..BF 80..BF - case 0xe0: - { - add(current); - get(); - if (JSON_LIKELY(0xa0 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+1000..U+CFFF: bytes E1..EC 80..BF 
80..BF - // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF - case 0xe1: - case 0xe2: - case 0xe3: - case 0xe4: - case 0xe5: - case 0xe6: - case 0xe7: - case 0xe8: - case 0xe9: - case 0xea: - case 0xeb: - case 0xec: - case 0xee: - case 0xef: - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+D000..U+D7FF: bytes ED 80..9F 80..BF - case 0xed: - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0x9f)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF - case 0xf0: - { - add(current); - get(); - if (JSON_LIKELY(0x90 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF - case 0xf1: - case 0xf2: - case 0xf3: - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF - case 0xf4: - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0x8f)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - get(); - if (JSON_LIKELY(0x80 <= current and current <= 0xbf)) - { - add(current); - continue; - } - } - } - - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - - // remaining bytes (80..C1 and F5..FF) are ill-formed - default: - { - error_message = "invalid string: ill-formed UTF-8 byte"; - return token_type::parse_error; - } - } - } - } - - static void strtof(float& f, const char* str, char** endptr) noexcept - { - f = std::strtof(str, endptr); - } - - static void strtof(double& f, const char* str, char** endptr) noexcept - { - f = std::strtod(str, endptr); - } - - static void strtof(long double& f, const char* str, char** endptr) noexcept - { - f = std::strtold(str, endptr); - } - - /*! - @brief scan a number literal - - This function scans a string according to Sect. 6 of RFC 7159. - - The function is realized with a deterministic finite state machine - derived from the grammar described in RFC 7159. Starting in state - "init", the input is read and used to determined the next state. Only - state "done" accepts the number. State "error" is a trap state to model - errors. In the table below, "anything" means any character but the ones - listed before. - - state | 0 | 1-9 | e E | + | - | . 
| anything - ---------|----------|----------|----------|---------|---------|----------|----------- - init | zero | any1 | [error] | [error] | minus | [error] | [error] - minus | zero | any1 | [error] | [error] | [error] | [error] | [error] - zero | done | done | exponent | done | done | decimal1 | done - any1 | any1 | any1 | exponent | done | done | decimal1 | done - decimal1 | decimal2 | [error] | [error] | [error] | [error] | [error] | [error] - decimal2 | decimal2 | decimal2 | exponent | done | done | done | done - exponent | any2 | any2 | [error] | sign | sign | [error] | [error] - sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] - any2 | any2 | any2 | done | done | done | done | done - - The state machine is realized with one label per state (prefixed with - "scan_number_") and `goto` statements between them. The state machine - contains cycles, but any cycle can be left when EOF is read. Therefore, - the function is guaranteed to terminate. - - During scanning, the read bytes are stored in yytext. This string is - then converted to a signed integer, an unsigned integer, or a - floating-point number. - - @return token_type::value_unsigned, token_type::value_integer, or - token_type::value_float if number could be successfully scanned, - token_type::parse_error otherwise - - @note The scanner is independent of the current locale. Internally, the - locale's decimal point is used instead of `.` to work with the - locale-dependent converters. - */ - token_type scan_number() - { - // reset yytext to store the number's bytes - reset(); - - // the type of the parsed number; initially set to unsigned; will be - // changed if minus sign, decimal point or exponent is read - token_type number_type = token_type::value_unsigned; - - // state (init): we just found out we need to scan a number - switch (current) - { - case '-': - { - add(current); - goto scan_number_minus; - } - - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - // all other characters are rejected outside scan_number() - assert(false); // LCOV_EXCL_LINE - } - } - -scan_number_minus: - // state: we just parsed a leading minus sign - number_type = token_type::value_integer; - switch (get()) - { - case '0': - { - add(current); - goto scan_number_zero; - } - - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - default: - { - error_message = "invalid number; expected digit after '-'"; - return token_type::parse_error; - } - } - -scan_number_zero: - // state: we just parse a zero (maybe with a leading minus sign) - switch (get()) - { - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - { - goto scan_number_done; - } - } - -scan_number_any1: - // state: we just parsed a number 0-9 (maybe with a leading minus sign) - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any1; - } - - case '.': - { - add(decimal_point_char); - goto scan_number_decimal1; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - { - goto scan_number_done; - } - } - 
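// Worked example (illustrative, not part of the patch): scanning the number at
// the start of the input "-12.3e+4," walks the states described above as follows;
// the trailing ',' is the first character that does not belong to the number and
// is therefore unget so the next scan() call sees it again.
//
//   '-' : init     -> minus      (number_type becomes value_integer)
//   '1' : minus    -> any1
//   '2' : any1     -> any1
//   '.' : any1     -> decimal1   (number_type becomes value_float,
//                                 the locale's decimal point is stored)
//   '3' : decimal1 -> decimal2
//   'e' : decimal2 -> exponent
//   '+' : exponent -> sign
//   '4' : sign     -> any2
//   ',' : any2     -> done       (unget, then yytext is converted to a value)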
-scan_number_decimal1: - // state: we just parsed a decimal point - number_type = token_type::value_float; - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - default: - { - error_message = "invalid number; expected digit after '.'"; - return token_type::parse_error; - } - } - -scan_number_decimal2: - // we just parsed at least one number after a decimal point - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_decimal2; - } - - case 'e': - case 'E': - { - add(current); - goto scan_number_exponent; - } - - default: - { - goto scan_number_done; - } - } - -scan_number_exponent: - // we just parsed an exponent - number_type = token_type::value_float; - switch (get()) - { - case '+': - case '-': - { - add(current); - goto scan_number_sign; - } - - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected '+', '-', or digit after exponent"; - return token_type::parse_error; - } - } - -scan_number_sign: - // we just parsed an exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - error_message = "invalid number; expected digit after exponent sign"; - return token_type::parse_error; - } - } - -scan_number_any2: - // we just parsed a number after the exponent or exponent sign - switch (get()) - { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - add(current); - goto scan_number_any2; - } - - default: - { - goto scan_number_done; - } - } - -scan_number_done: - // unget the character after the number (we only read it to know - // that we are done scanning a number) - --chars_read; - next_unget = true; - - // terminate token - add('\0'); - --yylen; - - // try to parse integers first and fall back to floats - if (number_type == token_type::value_unsigned) - { - char* endptr = nullptr; - errno = 0; - const auto x = std::strtoull(yytext.data(), &endptr, 10); - - // we checked the number format before - assert(endptr == yytext.data() + yylen); - - if (errno == 0) - { - value_unsigned = static_cast(x); - if (value_unsigned == x) - { - return token_type::value_unsigned; - } - } - } - else if (number_type == token_type::value_integer) - { - char* endptr = nullptr; - errno = 0; - const auto x = std::strtoll(yytext.data(), &endptr, 10); - - // we checked the number format before - assert(endptr == yytext.data() + yylen); - - if (errno == 0) - { - value_integer = static_cast(x); - if (value_integer == x) - { - return token_type::value_integer; - } - } - } - - // this code is reached if we parse a floating-point number or if - // an integer conversion above failed - strtof(value_float, yytext.data(), nullptr); - return token_type::value_float; - } - - /*! 
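// Illustrative sketch (not part of the patch): the "integers first, float as a
// fallback" conversion performed after scan_number_done, written as a free
// function. The names, the flags and the plain double/(u)int64_t types are
// hypothetical simplifications; the real code writes into the lexer's
// value_unsigned/value_integer/value_float members and dispatches to strtof,
// strtod or strtold depending on number_float_t.
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <string>

enum class number_kind { unsigned_integer, signed_integer, floating_point };

number_kind convert_number(const std::string& text, bool had_minus, bool had_float_syntax,
                           std::uint64_t& u, std::int64_t& i, double& d)
{
    if (!had_float_syntax)
    {
        errno = 0;
        if (!had_minus)
        {
            // unsigned: use strtoull and check that the value round-trips
            const unsigned long long x = std::strtoull(text.c_str(), nullptr, 10);
            u = static_cast<std::uint64_t>(x);
            if (errno == 0 && u == x)
            {
                return number_kind::unsigned_integer;
            }
        }
        else
        {
            // signed: use strtoll and check that the value round-trips
            const long long x = std::strtoll(text.c_str(), nullptr, 10);
            i = static_cast<std::int64_t>(x);
            if (errno == 0 && i == x)
            {
                return number_kind::signed_integer;
            }
        }
    }

    // reached for decimal/exponent syntax or when the integer conversion overflowed
    d = std::strtod(text.c_str(), nullptr);
    return number_kind::floating_point;
}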
- @param[in] literal_text the literal text to expect - @param[in] length the length of the passed literal text - @param[in] return_type the token type to return on success - */ - token_type scan_literal(const char* literal_text, const size_t length, - token_type return_type) - { - assert(current == literal_text[0]); - for (size_t i = 1; i < length; ++i) - { - if (JSON_UNLIKELY(get() != literal_text[i])) - { - error_message = "invalid literal"; - return token_type::parse_error; - } - } - return return_type; - } - - ///////////////////// - // input management - ///////////////////// - - /// reset yytext - void reset() noexcept - { - yylen = 0; - start_pos = chars_read - 1; - } - - /// get a character from the input - int get() - { - ++chars_read; - return next_unget - ? (next_unget = false, current) - : (current = ia->get_character()); - } - - /// add a character to yytext - void add(int c) - { - // resize yytext if necessary; this condition is deemed unlikely, - // because we start with a 1024-byte buffer - if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) - { - yytext.resize(2 * yytext.capacity(), '\0'); - } - assert(yylen < yytext.size()); - yytext[yylen++] = static_cast(c); - } - - public: - ///////////////////// - // value getters - ///////////////////// - - /// return integer value - constexpr number_integer_t get_number_integer() const noexcept - { - return value_integer; - } - - /// return unsigned integer value - constexpr number_unsigned_t get_number_unsigned() const noexcept - { - return value_unsigned; - } - - /// return floating-point value - constexpr number_float_t get_number_float() const noexcept - { - return value_float; - } - - /// return string value - const std::string get_string() - { - // yytext cannot be returned as char*, because it may contain a - // null byte (parsed as "\u0000") - return std::string(yytext.data(), yylen); - } - - ///////////////////// - // diagnostics - ///////////////////// - - /// return position of last read token - constexpr size_t get_position() const noexcept - { - return chars_read; - } - - /// return the last read token (for errors only) - std::string get_token_string() const - { - // get the raw byte sequence of the last token - std::string s = ia->read(start_pos, chars_read - start_pos); - - // escape control characters - std::string result; - for (auto c : s) - { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') - { - // escape control characters - std::stringstream ss; - ss << "(c) << ">"; - result += ss.str(); - } - else - { - // add character as is - result.append(1, c); - } - } - - return result; - } - - /// return syntax error message - constexpr const char* get_error_message() const noexcept - { - return error_message; - } - - ///////////////////// - // actual scanner - ///////////////////// - - token_type scan() - { - // read next character and ignore whitespace - do - { - get(); - } - while (current == ' ' or current == '\t' or current == '\n' or current == '\r'); - - switch (current) - { - // structural characters - case '[': - return token_type::begin_array; - case ']': - return token_type::end_array; - case '{': - return token_type::begin_object; - case '}': - return token_type::end_object; - case ':': - return token_type::name_separator; - case ',': - return token_type::value_separator; - - // literals - case 't': - return scan_literal("true", 4, token_type::literal_true); - case 'f': - return scan_literal("false", 5, token_type::literal_false); - 
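// Illustrative sketch (not part of the patch): how get_token_string above renders
// unprintable bytes of the offending token as "<U+XXXX>" for diagnostics; the
// standalone helper name escape_control_characters is hypothetical.
#include <iomanip>
#include <sstream>
#include <string>

std::string escape_control_characters(const std::string& s)
{
    std::string result;
    for (const char c : s)
    {
        if (c >= '\x00' && c <= '\x1f')
        {
            // escape control characters as <U+XXXX>
            std::stringstream ss;
            ss << "<U+" << std::setw(4) << std::uppercase << std::setfill('0')
               << std::hex << static_cast<int>(c) << ">";
            result += ss.str();
        }
        else
        {
            // add character as is
            result.push_back(c);
        }
    }
    return result;
}
// Example: escape_control_characters("a\tb") yields "a<U+0009>b".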
case 'n': - return scan_literal("null", 4, token_type::literal_null); - - // string - case '\"': - return scan_string(); - - // number - case '-': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return scan_number(); - - // end of input (the null byte is needed when parsing from - // string literals) - case '\0': - case std::char_traits::eof(): - return token_type::end_of_input; - - // error - default: - error_message = "invalid literal"; - return token_type::parse_error; - } - } - - private: - /// input adapter - detail::input_adapter_t ia = nullptr; - - /// the current character - int current = std::char_traits::eof(); - - /// whether get() should return the last character again - bool next_unget = false; - - /// the number of characters read - size_t chars_read = 0; - /// the start position of the current token - size_t start_pos = 0; - - /// buffer for variable-length tokens (numbers, strings) - std::vector yytext = std::vector(1024, '\0'); - /// current index in yytext - size_t yylen = 0; - - /// a description of occurred lexer errors - const char* error_message = ""; - - // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; - - /// the decimal point - const char decimal_point_char = '.'; - }; - - /*! - @brief syntax analysis - - This class implements a recursive decent parser. - */ - class parser - { - public: - /// a parser reading from an input adapter - explicit parser(detail::input_adapter_t adapter, - const parser_callback_t cb = nullptr) - : callback(cb), m_lexer(adapter) - {} - - /*! - @brief public parser interface - - @param[in] strict whether to expect the last token to be EOF - @return parsed JSON value - - @throw parse_error.101 in case of an unexpected token - @throw parse_error.102 if to_unicode fails or surrogate error - @throw parse_error.103 if to_unicode fails - */ - basic_json parse(const bool strict = true) - { - // read first token - get_token(); - - basic_json result = parse_internal(true); - result.assert_invariant(); - - if (strict) - { - get_token(); - expect(lexer::token_type::end_of_input); - } - - // return parser result and replace it with null in case the - // top-level value was discarded by the callback function - return result.is_discarded() ? basic_json() : std::move(result); - } - - /*! - @brief public accept interface - - @param[in] strict whether to expect the last token to be EOF - @return whether the input is a proper JSON text - */ - bool accept(const bool strict = true) - { - // read first token - get_token(); - - if (not accept_internal()) - { - return false; - } - - if (strict and get_token() != lexer::token_type::end_of_input) - { - return false; - } - - return true; - } - - private: - /*! 
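// Illustrative sketch (not part of the patch): what the strict flag of parse()
// means at the public interface that forwards to this parser. With strict == true
// (the default), the token after the top-level value must be end_of_input, so
// trailing content raises parse_error.101. The json::parse front end is assumed
// here.
#include <iostream>
#include "json.hpp"
using nlohmann::json;

int main()
{
    const json ok = json::parse("[1, 2, 3]");         // fine: nothing follows the array
    std::cout << ok.size() << '\n';                    // prints 3

    try
    {
        const json bad = json::parse("[1, 2, 3] x");  // trailing 'x' after the value
    }
    catch (const json::parse_error& e)
    {
        std::cout << e.what() << '\n';                 // reports error 101 (unexpected token)
    }
}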
- @brief the actual parser - @throw parse_error.101 in case of an unexpected token - @throw parse_error.102 if to_unicode fails or surrogate error - @throw parse_error.103 if to_unicode fails - */ - basic_json parse_internal(bool keep) - { - auto result = basic_json(value_t::discarded); - - switch (last_token) - { - case lexer::token_type::begin_object: - { - if (keep and (not callback - or ((keep = callback(depth++, parse_event_t::object_start, result)) != 0))) - { - // explicitly set result to object to cope with {} - result.m_type = value_t::object; - result.m_value = value_t::object; - } - - // read next token - get_token(); - - // closing } -> we are done - if (last_token == lexer::token_type::end_object) - { - if (keep and callback and not callback(--depth, parse_event_t::object_end, result)) - { - result = basic_json(value_t::discarded); - } - return result; - } - - // parse values - while (true) - { - // store key - expect(lexer::token_type::value_string); - const auto key = m_lexer.get_string(); - - bool keep_tag = false; - if (keep) - { - if (callback) - { - basic_json k(key); - keep_tag = callback(depth, parse_event_t::key, k); - } - else - { - keep_tag = true; - } - } - - // parse separator (:) - get_token(); - expect(lexer::token_type::name_separator); - - // parse and add value - get_token(); - auto value = parse_internal(keep); - if (keep and keep_tag and not value.is_discarded()) - { - result[key] = std::move(value); - } - - // comma -> next value - get_token(); - if (last_token == lexer::token_type::value_separator) - { - get_token(); - continue; - } - - // closing } - expect(lexer::token_type::end_object); - break; - } - - if (keep and callback and not callback(--depth, parse_event_t::object_end, result)) - { - result = basic_json(value_t::discarded); - } - - return result; - } - - case lexer::token_type::begin_array: - { - if (keep and (not callback - or ((keep = callback(depth++, parse_event_t::array_start, result)) != 0))) - { - // explicitly set result to object to cope with [] - result.m_type = value_t::array; - result.m_value = value_t::array; - } - - // read next token - get_token(); - - // closing ] -> we are done - if (last_token == lexer::token_type::end_array) - { - if (callback and not callback(--depth, parse_event_t::array_end, result)) - { - result = basic_json(value_t::discarded); - } - return result; - } - - // parse values - while (true) - { - // parse value - auto value = parse_internal(keep); - if (keep and not value.is_discarded()) - { - result.push_back(std::move(value)); - } - - // comma -> next value - get_token(); - if (last_token == lexer::token_type::value_separator) - { - get_token(); - continue; - } - - // closing ] - expect(lexer::token_type::end_array); - break; - } - - if (keep and callback and not callback(--depth, parse_event_t::array_end, result)) - { - result = basic_json(value_t::discarded); - } - - return result; - } - - case lexer::token_type::literal_null: - { - result.m_type = value_t::null; - break; - } - - case lexer::token_type::value_string: - { - result = basic_json(m_lexer.get_string()); - break; - } - - case lexer::token_type::literal_true: - { - result.m_type = value_t::boolean; - result.m_value = true; - break; - } - - case lexer::token_type::literal_false: - { - result.m_type = value_t::boolean; - result.m_value = false; - break; - } - - case lexer::token_type::value_unsigned: - { - result.m_type = value_t::number_unsigned; - result.m_value = m_lexer.get_number_unsigned(); - break; - } - - case 
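// Illustrative sketch (not part of the patch): how the callback/keep machinery
// above is driven from the public interface. Returning false for a key event
// makes the parser skip that member: its value is still parsed, but it ends up
// discarded and is not inserted into the enclosing object. The json::parse
// overload taking a parser_callback_t is assumed here.
#include <iostream>
#include "json.hpp"
using nlohmann::json;

int main()
{
    // drop every object member whose key is "password"
    json::parser_callback_t filter = [](int /*depth*/, json::parse_event_t event, json& parsed)
    {
        return !(event == json::parse_event_t::key && parsed == json("password"));
    };

    const json j = json::parse(R"({"user":"alice","password":"secret"})", filter);
    std::cout << j << '\n';   // prints {"user":"alice"}
}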
lexer::token_type::value_integer: - { - result.m_type = value_t::number_integer; - result.m_value = m_lexer.get_number_integer(); - break; - } - - case lexer::token_type::value_float: - { - result.m_type = value_t::number_float; - result.m_value = m_lexer.get_number_float(); - - // throw in case of infinity or NAN - if (JSON_UNLIKELY(not std::isfinite(result.m_value.number_float))) - { - JSON_THROW(out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'")); - } - - break; - } - - case lexer::token_type::parse_error: - { - // using "uninitialized" to avoid "expected" message - expect(lexer::token_type::uninitialized); - break; // LCOV_EXCL_LINE - } - - default: - { - // the last token was unexpected; we expected a value - expect(lexer::token_type::literal_or_value); - break; // LCOV_EXCL_LINE - } - } - - if (keep and callback and not callback(depth, parse_event_t::value, result)) - { - result = basic_json(value_t::discarded); - } - return result; - } - - /*! - @brief the acutal acceptor - - @invariant 1. The last token is not yet processed. Therefore, the - caller of this function must make sure a token has - been read. - 2. When this function returns, the last token is processed. - That is, the last read character was already considered. - - This invariant makes sure that no token needs to be "unput". - */ - bool accept_internal() - { - switch (last_token) - { - case lexer::token_type::begin_object: - { - // read next token - get_token(); - - // closing } -> we are done - if (last_token == lexer::token_type::end_object) - { - return true; - } - - // parse values - while (true) - { - // parse key - if (last_token != lexer::token_type::value_string) - { - return false; - } - - // parse separator (:) - get_token(); - if (last_token != lexer::token_type::name_separator) - { - return false; - } - - // parse value - get_token(); - if (not accept_internal()) - { - return false; - } - - // comma -> next value - get_token(); - if (last_token == lexer::token_type::value_separator) - { - get_token(); - continue; - } - - // closing } - if (last_token != lexer::token_type::end_object) - { - return false; - } - - return true; - } - } - - case lexer::token_type::begin_array: - { - // read next token - get_token(); - - // closing ] -> we are done - if (last_token == lexer::token_type::end_array) - { - return true; - } - - // parse values - while (true) - { - // parse value - if (not accept_internal()) - { - return false; - } - - // comma -> next value - get_token(); - if (last_token == lexer::token_type::value_separator) - { - get_token(); - continue; - } - - // closing ] - if (last_token != lexer::token_type::end_array) - { - return false; - } - - return true; - } - } - - case lexer::token_type::literal_false: - case lexer::token_type::literal_null: - case lexer::token_type::literal_true: - case lexer::token_type::value_float: - case lexer::token_type::value_integer: - case lexer::token_type::value_string: - case lexer::token_type::value_unsigned: - { - return true; - } - - default: - { - // the last token was unexpected - return false; - } - } - } - - /// get next token from lexer - typename lexer::token_type get_token() - { - return (last_token = m_lexer.scan()); - } - - /*! 
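// Illustrative sketch (not part of the patch): the only way the value_float case
// above can see a non-finite number is overflow in the string-to-float conversion
// (JSON itself has no literal for infinity or NaN), so an out-of-range literal is
// reported as error 406. The public json::parse front end and the exact wording
// of the message are assumptions.
#include <iostream>
#include "json.hpp"
using nlohmann::json;

int main()
{
    try
    {
        const json j = json::parse("1e99999");   // overflows double to infinity
    }
    catch (const json::out_of_range& e)
    {
        std::cout << e.what() << '\n';
        // prints something like:
        //   [json.exception.out_of_range.406] number overflow parsing '1e99999'
    }
}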
-    @throw parse_error.101 if expected token did not occur
-    */
-    void expect(typename lexer::token_type t)
-    {
-        if (JSON_UNLIKELY(t != last_token))
-        {
-            errored = true;
-            expected = t;
-            throw_exception();
-        }
-    }
-
-    [[noreturn]] void throw_exception() const
-    {
-        std::string error_msg = "syntax error - ";
-        if (last_token == lexer::token_type::parse_error)
-        {
-            error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" + m_lexer.get_token_string() + "'";
-        }
-        else
-        {
-            error_msg += "unexpected " + std::string(lexer::token_type_name(last_token));
-        }
-
-        if (expected != lexer::token_type::uninitialized)
-        {
-            error_msg += "; expected " + std::string(lexer::token_type_name(expected));
-        }
-
-        JSON_THROW(parse_error::create(101, m_lexer.get_position(), error_msg));
-    }
-
-  private:
-    /// current level of recursion
-    int depth = 0;
-    /// callback function
-    const parser_callback_t callback = nullptr;
-    /// the type of the last read token
-    typename lexer::token_type last_token = lexer::token_type::uninitialized;
-    /// the lexer
-    lexer m_lexer;
-    /// whether a syntax error occurred
-    bool errored = false;
-    /// possible reason for the syntax error
-    typename lexer::token_type expected = lexer::token_type::uninitialized;
-  };
-
-  public:
     //////////////////////////
     // JSON Pointer support //
     //////////////////////////
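// Illustrative sketch (not part of the patch): the shape of the message that
// expect()/throw_exception() above produce for a syntax error. The public
// json::parse front end and the exact byte position in the message are
// assumptions.
#include <iostream>
#include "json.hpp"
using nlohmann::json;

int main()
{
    try
    {
        const json j = json::parse("[1,]");   // a value is required after the ','
    }
    catch (const json::parse_error& e)
    {
        std::cout << e.what() << '\n';
        // prints something like:
        //   [json.exception.parse_error.101] parse error at 4:
        //   syntax error - unexpected ']'; expected '[', '{', or a literal
    }
}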