From 50ee0a62f5eaf40d62c67115ebf58f959ab4dc60 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Fri, 31 Mar 2017 23:24:33 +0200 Subject: [PATCH] :hammer: replaced lookup-tables by switches --- src/json.hpp | 1314 ++++++++++++++++++++++---------- test/src/unit-class_parser.cpp | 2 +- 2 files changed, 921 insertions(+), 395 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index f3e20f06..51154ca1 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -10552,7 +10552,7 @@ class basic_json } explicit lexer(std::istream& i) - : ia(new cached_input_stream_adapter(i, 1024 * 1024)), + : ia(new cached_input_stream_adapter(i, 16384)), decimal_point_char(get_decimal_point()) {} @@ -10591,29 +10591,243 @@ class basic_json // must be called after \u was read; returns following xxxx as hex or -1 when error int get_codepoint() { - // a mapping to discover hex numbers - static int8_t ascii_to_hex[256] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; - + assert(current == 'u'); int codepoint = 0; - // check the next 4 bytes - for 
(size_t i = 0; i < 4; ++i) + switch (get()) { - const int8_t digit = ascii_to_hex[static_cast(get())]; - if (JSON_UNLIKELY(digit == -1)) - { + case '0': + break; + case '1': + codepoint += 0x1000; + break; + case '2': + codepoint += 0x2000; + break; + case '3': + codepoint += 0x3000; + break; + case '4': + codepoint += 0x4000; + break; + case '5': + codepoint += 0x5000; + break; + case '6': + codepoint += 0x6000; + break; + case '7': + codepoint += 0x7000; + break; + case '8': + codepoint += 0x8000; + break; + case '9': + codepoint += 0x9000; + break; + case 'A': + case 'a': + codepoint += 0xa000; + break; + case 'B': + case 'b': + codepoint += 0xb000; + break; + case 'C': + case 'c': + codepoint += 0xc000; + break; + case 'D': + case 'd': + codepoint += 0xd000; + break; + case 'E': + case 'e': + codepoint += 0xe000; + break; + case 'F': + case 'f': + codepoint += 0xf000; + break; + default: return -1; - } - else - { - codepoint += digit; - } + } - // except the last byte, result must be multiplied by 16 - if (i != 3) - { - codepoint <<= 4; - } + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0100; + break; + case '2': + codepoint += 0x0200; + break; + case '3': + codepoint += 0x0300; + break; + case '4': + codepoint += 0x0400; + break; + case '5': + codepoint += 0x0500; + break; + case '6': + codepoint += 0x0600; + break; + case '7': + codepoint += 0x0700; + break; + case '8': + codepoint += 0x0800; + break; + case '9': + codepoint += 0x0900; + break; + case 'A': + case 'a': + codepoint += 0x0a00; + break; + case 'B': + case 'b': + codepoint += 0x0b00; + break; + case 'C': + case 'c': + codepoint += 0x0c00; + break; + case 'D': + case 'd': + codepoint += 0x0d00; + break; + case 'E': + case 'e': + codepoint += 0x0e00; + break; + case 'F': + case 'f': + codepoint += 0x0f00; + break; + default: + return -1; + } + + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0010; + break; + case '2': + codepoint += 0x0020; + break; + 
case '3': + codepoint += 0x0030; + break; + case '4': + codepoint += 0x0040; + break; + case '5': + codepoint += 0x0050; + break; + case '6': + codepoint += 0x0060; + break; + case '7': + codepoint += 0x0070; + break; + case '8': + codepoint += 0x0080; + break; + case '9': + codepoint += 0x0090; + break; + case 'A': + case 'a': + codepoint += 0x00a0; + break; + case 'B': + case 'b': + codepoint += 0x00b0; + break; + case 'C': + case 'c': + codepoint += 0x00c0; + break; + case 'D': + case 'd': + codepoint += 0x00d0; + break; + case 'E': + case 'e': + codepoint += 0x00e0; + break; + case 'F': + case 'f': + codepoint += 0x00f0; + break; + default: + return -1; + } + + switch (get()) + { + case '0': + break; + case '1': + codepoint += 0x0001; + break; + case '2': + codepoint += 0x0002; + break; + case '3': + codepoint += 0x0003; + break; + case '4': + codepoint += 0x0004; + break; + case '5': + codepoint += 0x0005; + break; + case '6': + codepoint += 0x0006; + break; + case '7': + codepoint += 0x0007; + break; + case '8': + codepoint += 0x0008; + break; + case '9': + codepoint += 0x0009; + break; + case 'A': + case 'a': + codepoint += 0x000a; + break; + case 'B': + case 'b': + codepoint += 0x000b; + break; + case 'C': + case 'c': + codepoint += 0x000c; + break; + case 'D': + case 'd': + codepoint += 0x000d; + break; + case 'E': + case 'e': + codepoint += 0x000e; + break; + case 'F': + case 'f': + codepoint += 0x000f; + break; + default: + return -1; } return codepoint; @@ -10627,260 +10841,31 @@ class basic_json // we entered the function by reading an open quote assert(current == '\"'); - static unsigned char next[256] = {17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 6, 3, 3, 3, 7, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; - - // state variable - int state = -1; - - // whether the state is already set - bool state_set = false; - while (true) { // get next character get(); - // end of file while parsing string - if (JSON_UNLIKELY(current == std::char_traits::eof())) + switch (current) { - error_message = "invalid string: missing closing quote"; - return token_type::parse_error; - } - - // after coping with EOF, we only cope with bytes - //assert(0 <= current and current <= 255); - unsigned char ch = static_cast(current); - - // get next state - state = state_set ? 
state : next[ch]; - // reset variable - state_set = false; - - // 'add': 0, - // 'add_check1': 1, - // 'add_check2': 2, - // 'add_check3': 3, - // 'add_check_e0': 4, - // 'add_check_ed': 5, - // 'add_check_f0': 6, - // 'add_check_f4': 7, - // 'check1': 8, - // 'check2': 9, - // 'check3': 10, - // 'check_e0': 11, - // 'check_ed': 12, - // 'check_f0': 13, - // 'check_f4': 14, - // 'escape': 15, - // 'end': 16, - // 'error_invalid': 17, - // 'error_utf8': 18 - assert(0 <= state and state <= 18); - - switch (state) - { - // add - case 0: + // end of file while parsing string + case std::char_traits::eof(): { - add(current); - break; + error_message = "invalid string: missing closing quote"; + return token_type::parse_error; } - // add_check1 - case 1: + // closing quote + case '\"': { - add(current); - // next state is check1 - state = 8; - state_set = true; - break; + // terminate yytext + add('\0'); + --yylen; + return token_type::value_string; } - // add_check2 - case 2: - { - add(current); - // next state is check2 - state = 9; - state_set = true; - break; - } - - // add_check3 - case 3: - { - add(current); - // next state is check3 - state = 10; - state_set = true; - break; - } - - // add_check_e0 - case 4: - { - add(current); - // next state is check_e0 - state = 11; - state_set = true; - break; - } - - // add_check_ed - case 5: - { - add(current); - // next state is check_ed - state = 12; - state_set = true; - break; - } - - // add_check_f0 - case 6: - { - add(current); - // next state is check_f0 - state = 13; - state_set = true; - break; - } - - // add_check_f4 - case 7: - { - add(current); - // next state is check_f4 - state = 14; - state_set = true; - break; - } - - // check1 - case 8: - { - if (JSON_LIKELY(0x80 <= ch and ch <= 0xBF)) - { - add(current); - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check2 - case 9: - { - if (JSON_LIKELY(0x80 <= ch and ch <= 0xBF)) - { 
- add(current); - // next state is check1 - state = 8; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check3 - case 10: - { - if (JSON_LIKELY(0x80 <= ch and ch <= 0xBF)) - { - add(current); - // next state is check2 - state = 9; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check_e0 - case 11: - { - if (JSON_LIKELY(0xA0 <= ch and ch <= 0xBF)) - { - add(current); - // next state is check1 - state = 8; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check_ed - case 12: - { - if (JSON_LIKELY(0x80 <= ch and ch <= 0x9F)) - { - add(current); - // next state is check1 - state = 8; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check_f0 - case 13: - { - if (JSON_LIKELY(0x90 <= ch and ch <= 0xBF)) - { - add(current); - // next state is check2 - state = 9; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // check_f4 - case 14: - { - if (JSON_LIKELY(0x80 <= ch and ch <= 0x8F)) - { - add(current); - // next state is check2 - state = 9; - state_set = true; - break; - } - else - { - error_message = "invalid string: not well-formed UTF-8 byte"; - return token_type::parse_error; - } - } - - // escape - case 15: + // escapes + case '\\': { switch (get()) { @@ -10935,7 +10920,7 @@ class basic_json // expect next \uxxxx entry if (JSON_LIKELY(get() == '\\' and get() == 'u')) { - int codepoint2 = get_codepoint(); + const int codepoint2 = get_codepoint(); if (JSON_UNLIKELY(codepoint2 == -1)) { @@ -11025,32 +11010,344 @@ class basic_json break; } - // 
end - case 16: - { - // terminate yytext - add('\0'); - --yylen; - return token_type::value_string; - } - - // error_invalid - case 17: + // invalid control characters + case '\x00': + case '\x01': + case '\x02': + case '\x03': + case '\x04': + case '\x05': + case '\x06': + case '\x07': + case '\x08': + case '\x09': + case '\x0a': + case '\x0b': + case '\x0c': + case '\x0d': + case '\x0e': + case '\x0f': + case '\x10': + case '\x11': + case '\x12': + case '\x13': + case '\x14': + case '\x15': + case '\x16': + case '\x17': + case '\x18': + case '\x19': + case '\x1a': + case '\x1b': + case '\x1c': + case '\x1d': + case '\x1e': + case '\x1f': { error_message = "invalid string: control characters (U+0000 through U+001f) must be escaped"; return token_type::parse_error; } - // error_utf8 - case 18: + // U+0020..U+007F (except U+0022 (quote) and U+005C (backslash)) + case '\x20': + case '\x21': + case '\x23': + case '\x24': + case '\x25': + case '\x26': + case '\x27': + case '\x28': + case '\x29': + case '\x2a': + case '\x2b': + case '\x2c': + case '\x2d': + case '\x2e': + case '\x2f': + case '\x30': + case '\x31': + case '\x32': + case '\x33': + case '\x34': + case '\x35': + case '\x36': + case '\x37': + case '\x38': + case '\x39': + case '\x3a': + case '\x3b': + case '\x3c': + case '\x3d': + case '\x3e': + case '\x3f': + case '\x40': + case '\x41': + case '\x42': + case '\x43': + case '\x44': + case '\x45': + case '\x46': + case '\x47': + case '\x48': + case '\x49': + case '\x4a': + case '\x4b': + case '\x4c': + case '\x4d': + case '\x4e': + case '\x4f': + case '\x50': + case '\x51': + case '\x52': + case '\x53': + case '\x54': + case '\x55': + case '\x56': + case '\x57': + case '\x58': + case '\x59': + case '\x5a': + case '\x5b': + case '\x5d': + case '\x5e': + case '\x5f': + case '\x60': + case '\x61': + case '\x62': + case '\x63': + case '\x64': + case '\x65': + case '\x66': + case '\x67': + case '\x68': + case '\x69': + case '\x6a': + case '\x6b': + case '\x6c': + 
case '\x6d': + case '\x6e': + case '\x6f': + case '\x70': + case '\x71': + case '\x72': + case '\x73': + case '\x74': + case '\x75': + case '\x76': + case '\x77': + case '\x78': + case '\x79': + case '\x7a': + case '\x7b': + case '\x7c': + case '\x7d': + case '\x7e': + case '\x7f': { + add(current); + break; + } + + // U+0080..U+07FF: bytes C2..DF 80..BF + case '\xc2': + case '\xc3': + case '\xc4': + case '\xc5': + case '\xc6': + case '\xc7': + case '\xc8': + case '\xc9': + case '\xca': + case '\xcb': + case '\xcc': + case '\xcd': + case '\xce': + case '\xcf': + case '\xd0': + case '\xd1': + case '\xd2': + case '\xd3': + case '\xd4': + case '\xd5': + case '\xd6': + case '\xd7': + case '\xd8': + case '\xd9': + case '\xda': + case '\xdb': + case '\xdc': + case '\xdd': + case '\xde': + case '\xdf': + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + error_message = "invalid string: not well-formed UTF-8 byte"; return token_type::parse_error; } + // U+0800..U+0FFF: bytes E0 A0..BF 80..BF + case '\xe0': + { + add(current); + get(); + if (JSON_LIKELY('\xa0' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF + // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF + case '\xe1': + case '\xe2': + case '\xe3': + case '\xe4': + case '\xe5': + case '\xe6': + case '\xe7': + case '\xe8': + case '\xe9': + case '\xea': + case '\xeb': + case '\xec': + case '\xee': + case '\xef': + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + + error_message = "invalid string: not well-formed UTF-8 
byte"; + return token_type::parse_error; + } + + // U+D000..U+D7FF: bytes ED 80..9F 80..BF + case '\xed': + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\x9f')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF + case '\xf0': + { + add(current); + get(); + if (JSON_LIKELY('\x90' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF + case '\xf1': + case '\xf2': + case '\xf3': + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; + } + + // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF + case '\xf4': + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\x8f')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + get(); + if (JSON_LIKELY('\x80' <= current and current <= '\xbf')) + { + add(current); + continue; + } + } + } + + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; + } + + // remaining bytes (80..C1 and F5..FF) are not well-formed default: { - assert(false); // 
LCOV_EXCL_LINE + error_message = "invalid string: not well-formed UTF-8 byte"; + return token_type::parse_error; } } } @@ -11071,70 +11368,301 @@ class basic_json f = std::strtold(str, endptr); } + /*! + state | 0 | 1-9 | e E | + | - | . | anything + ---------|----------|----------|----------|---------|---------|----------|----------- + init | zero | any1 | [error] | [error] | minus | [error] | [error] + minus | zero | any1 | [error] | [error] | [error] | [error] | [error] + zero | done | done | exponent | done | done | decimal1 | done + any1 | any1 | any1 | exponent | done | done | decimal1 | done + decimal1 | decimal2 | [error] | [error] | [error] | [error] | [error] | [error] + decimal2 | decimal2 | decimal2 | exponent | done | done | done | done + exponent | any2 | any2 | [error] | sign | sign | [error] | [error] + sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] + any2 | any2 | any2 | done | done | done | done | done + */ token_type scan_number() { - static unsigned char lookup[9][256] = - { - {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 10, 10, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}, - {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}, - {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 4, 9, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}, - {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 4, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}, - {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}, - {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10, 8, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}, - {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}, - {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}, - {10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10} - }; - reset(); - bool has_sign = false; - bool has_exp = false; - bool has_point = false; + // the type of the parsed number; initially set to unsigned; will + // be changed if minus sign, decimal point or exponent is read + token_type number_type = token_type::value_unsigned; - int state = lookup[0][static_cast(current)]; - int old_state = 0; - - while (state != 9) + // state: we just found out we need to scan a number + switch (current) { - has_sign = has_sign or (state == 1); - has_point = has_point or (state == 4); - has_exp = has_exp or (state == 5); - - if (JSON_UNLIKELY(state == 10)) + case '-': { - // create error message based on previous state - switch (old_state) - { - case 0: - error_message = "invalid number; expected '-' or digit"; - break; - case 1: - 
error_message = "invalid number; expected digit after '-'"; - break; - case 4: - error_message = "invalid number; expected digit after '.'"; - break; - case 5: - error_message = "invalid number; expected '+', '-', or digit after exponent"; - break; - case 8: - error_message = "invalid number; expected digit after exponent sign"; - break; - default: - assert(false); // no error in the other states - break; - } - return token_type::parse_error; + add(current); + goto scan_number_minus; } - // add current character and fix decimal point - add((state == 4) ? decimal_point_char : current); - get(); - old_state = state; - state = lookup[state][static_cast(current)]; + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + // all other characters are rejected outside scan_number() + assert(false); // LCOV_EXCL_LINE + } } +scan_number_minus: + // state: we just parsed a leading minus sign + number_type = token_type::value_integer; + switch (get()) + { + case '0': + { + add(current); + goto scan_number_zero; + } + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + default: + { + error_message = "invalid number; expected digit after '-'"; + return token_type::parse_error; + } + } + +scan_number_zero: + // state: we just parsed a zero (maybe with a leading minus sign) + switch (get()) + { + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_any1: + // state: we just parsed a number 0-9 (maybe with a leading minus sign) + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + 
case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any1; + } + + case '.': + { + add(decimal_point_char); + goto scan_number_decimal1; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_decimal1: + // state: we just parsed a decimal point + number_type = token_type::value_float; + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + default: + { + error_message = "invalid number; expected digit after '.'"; + return token_type::parse_error; + } + } + +scan_number_decimal2: + // we just parsed at least one number after a decimal point + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_decimal2; + } + + case 'e': + case 'E': + { + add(current); + goto scan_number_exponent; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_exponent: + // we just parsed an exponent + number_type = token_type::value_float; + switch (get()) + { + case '+': + case '-': + { + add(current); + goto scan_number_sign; + } + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected '+', '-', or digit after exponent"; + return token_type::parse_error; + } + } + +scan_number_sign: + // we just parsed an exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + error_message = "invalid number; expected digit after exponent sign"; + return token_type::parse_error; + 
} + } + +scan_number_any2: + // we just parsed a number after the exponent or exponent sign + switch (get()) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + add(current); + goto scan_number_any2; + } + + default: + { + goto scan_number_done; + } + } + +scan_number_done: // unget the character after the number unget(); @@ -11143,30 +11671,42 @@ class basic_json --yylen; // try to parse integers first and fall back to floats - if (not has_exp and not has_point) + if (number_type == token_type::value_unsigned) { + char* endptr = nullptr; errno = 0; - if (has_sign) + const auto x = std::strtoull(yytext.data(), &endptr, 10); + + // we checked the number format before + assert(endptr == yytext.data() + yylen); + + if (errno == 0) { - char* endptr = nullptr; - const auto x = std::strtoll(yytext.data(), &endptr, 10); - value_integer = static_cast(x); - if (errno == 0 and endptr == yytext.data() + yylen and value_integer == x) - { - return token_type::value_integer; - } - } - else - { - char* endptr = nullptr; - const auto x = std::strtoull(yytext.data(), &endptr, 10); value_unsigned = static_cast(x); - if (errno == 0 and endptr == yytext.data() + yylen and value_unsigned == x) + if (value_unsigned == x) { return token_type::value_unsigned; } } } + else if (number_type == token_type::value_integer) + { + char* endptr = nullptr; + errno = 0; + const auto x = std::strtoll(yytext.data(), &endptr, 10); + + // we checked the number format before + assert(endptr == yytext.data() + yylen); + + if (errno == 0) + { + value_integer = static_cast(x); + if (value_integer == x) + { + return token_type::value_integer; + } + } + } strtof(value_float, yytext.data(), nullptr); return token_type::value_float; @@ -11223,17 +11763,9 @@ class basic_json int get() { ++chars_read; - - if (JSON_UNLIKELY(next_unget)) - { - next_unget = false; - } - else - { - current = ia->get_character(); - } - - return current; + 
return next_unget + ? (next_unget = false, current) + : (current = ia->get_character()); } /// unget a character to the input @@ -11317,7 +11849,6 @@ class basic_json } else { - // add character as is ss << c; } @@ -11441,8 +11972,7 @@ class basic_json public: /// a parser reading from a string literal parser(const char* buff, const parser_callback_t cb = nullptr) - : callback(cb), - m_lexer(buff, std::strlen(buff)) + : callback(cb), m_lexer(buff, std::strlen(buff)) {} /*! @@ -11528,22 +12058,11 @@ class basic_json return result; } - // no comma is expected here - unexpect(lexer::token_type::value_separator); - - // otherwise: parse key-value pairs - do + // parse values + while (true) { - // ugly, but could be fixed with loop reorganization - if (last_token == lexer::token_type::value_separator) - { - get_token(); - } - // store key expect(lexer::token_type::value_string); - // FIXME get_string returns const char*; maybe we can - // avoid this copy in the future const auto key = m_lexer.get_string(); bool keep_tag = false; @@ -11571,12 +12090,20 @@ class basic_json { result[key] = std::move(value); } - } - while (last_token == lexer::token_type::value_separator); - // closing } - expect(lexer::token_type::end_object); - get_token(); + // comma -> next value + if (last_token == lexer::token_type::value_separator) + { + get_token(); + continue; + } + + // closing } + expect(lexer::token_type::end_object); + get_token(); + break; + } + if (keep and callback and not callback(--depth, parse_event_t::object_end, result)) { result = basic_json(value_t::discarded); @@ -11609,30 +12136,29 @@ class basic_json return result; } - // no comma is expected here - unexpect(lexer::token_type::value_separator); - - // otherwise: parse values - do + // parse values + while (true) { - // ugly, but could be fixed with loop reorganization - if (last_token == lexer::token_type::value_separator) - { - get_token(); - } - // parse value auto value = parse_internal(keep); if (keep and not 
value.is_discarded()) { result.push_back(std::move(value)); } - } - while (last_token == lexer::token_type::value_separator); - // closing ] - expect(lexer::token_type::end_array); - get_token(); + // comma -> next value + if (last_token == lexer::token_type::value_separator) + { + get_token(); + continue; + } + + // closing ] + expect(lexer::token_type::end_array); + get_token(); + break; + } + if (keep and callback and not callback(--depth, parse_event_t::array_end, result)) { result = basic_json(value_t::discarded); @@ -11728,7 +12254,7 @@ class basic_json */ void expect(typename lexer::token_type t) const { - if (t != last_token) + if (JSON_UNLIKELY(t != last_token)) { std::string error_msg = "syntax error - "; if (last_token == lexer::token_type::parse_error) @@ -11750,7 +12276,7 @@ class basic_json */ void unexpect(typename lexer::token_type t) const { - if (t == last_token) + if (JSON_UNLIKELY(t == last_token)) { std::string error_msg = "syntax error - "; if (last_token == lexer::token_type::parse_error) diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index f36eb900..864b7be1 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -659,7 +659,7 @@ TEST_CASE("parser class") // test case to make sure no comma preceeds the first key CHECK_THROWS_AS(json::parser("{,\"key\": false}").parse(), json::parse_error); CHECK_THROWS_WITH(json::parser("{,\"key\": false}").parse(), - "[json.exception.parse_error.101] parse error at 2: syntax error - unexpected ','"); + "[json.exception.parse_error.101] parse error at 2: syntax error - unexpected ','; expected string literal"); // test case to make sure an object is properly closed CHECK_THROWS_AS(json::parser("[{\"key\": false true]").parse(), json::parse_error); CHECK_THROWS_WITH(json::parser("[{\"key\": false true]").parse(),