From 2fc82358cecda0490b9422dfe30c748716caf2cc Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Feb 2015 13:35:51 +0100 Subject: [PATCH] clean up --- src/json.hpp | 1355 ++++++++++++++++++++++----------------------- src/json.hpp.re2c | 179 +++--- test/unit.cpp | 4 +- 3 files changed, 759 insertions(+), 779 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index e86fec11..cfa9baab 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2456,7 +2456,10 @@ class basic_json end_of_input }; - inline lexer(const char* s) : m_content(s) + /// the char type to use in the lexer + using lexer_char_t = typename string_t::value_type; + + inline lexer(const typename string_t::value_type* s) : m_content(s) { m_start = m_cursor = m_content; m_limit = m_content + strlen(m_content); @@ -2464,46 +2467,39 @@ class basic_json inline lexer() = default; - template - inline static std::basic_string to_unicode(const long codepoint) + inline static string_t to_unicode(const long codepoint) { - std::string result; + string_t result; if (codepoint <= 0x7f) { - // 1-byte (ASCII) characters: 0xxxxxxx - result.append(1, static_cast(codepoint)); + // 1-byte characters: 0xxxxxxx (ASCI) + result.append(1, static_cast(codepoint)); } else if (codepoint <= 0x7ff) { // 2-byte characters: 110xxxxx 10xxxxxx - // the 0xC0 enables the two most significant bits to make this - // a 2-byte UTF-8 character - result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else if (codepoint <= 0xffff) { // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - // the 0xE0 enables the three most significant bits to make - // this a 3-byte UTF-8 character - result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); - result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else if (codepoint <= 0x10ffff) { // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // the 0xF0 enables the four most significant bits to make this - // a 4-byte UTF-8 character - result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); - result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); - result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else { - throw std::out_of_range("code point is invalid"); + throw std::out_of_range("code points above 0x10FFFF are invalid"); } return result; @@ -2553,353 +2549,359 @@ class basic_json with goto jumps. @return the class of the next token read from the buffer - - @todo Unicode support needs to be checked. */ inline token_type scan() { // pointer for backtracking information - const char* m_marker = nullptr; + const typename string_t::value_type* m_marker = nullptr; + + // remember the begin of the token + m_start = m_cursor; + - while (true) { - // remember the begin of the token - m_start = m_cursor; - - + lexer_char_t yych; + unsigned int yyaccept = 0; + static const unsigned char yybm[] = { - char yych; - unsigned int yyaccept = 0; - static const unsigned char yybm[] = - { - 0, 64, 64, 64, 64, 64, 64, 64, - 64, 96, 96, 64, 64, 96, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 96, 64, 0, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 192, 192, 192, 192, 192, 192, 192, 192, - 192, 192, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 0, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 64, 64, 64, 64, 64, 64, - }; + 0, 64, 64, 64, 64, 64, 64, 64, + 64, 96, 96, 64, 64, 96, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 96, 64, 0, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 0, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + }; - yych = *m_cursor; - if (yych <= '9') + yych = *m_cursor; + if (yych <= '9') + { + if (yych <= ' ') { - if (yych <= ' ') + if (yych <= '\n') { - if (yych <= '\n') + if (yych <= 0x00) { - if (yych <= 0x00) - { - goto basic_json_parser_27; - } - if (yych <= 0x08) - { - goto basic_json_parser_29; - } - if (yych >= '\n') - { - goto basic_json_parser_4; - } + goto basic_json_parser_27; } - else + if (yych <= 0x08) { - if (yych == '\r') - { - goto basic_json_parser_2; - } - if (yych <= 0x1F) - { - goto basic_json_parser_29; - } + goto basic_json_parser_29; + } + if (yych >= '\n') + { + goto basic_json_parser_4; } } else { - if (yych <= ',') + if (yych == '\r') { - if (yych == '"') - { - goto basic_json_parser_26; - } - if (yych <= '+') - { - goto basic_json_parser_29; - } - goto basic_json_parser_14; + goto basic_json_parser_2; } - else + if (yych <= 0x1F) { - if (yych <= '-') - { - goto basic_json_parser_22; - } - if (yych <= '/') - { - goto basic_json_parser_29; - } - if (yych <= '0') - { - goto basic_json_parser_23; - } - goto basic_json_parser_25; + goto basic_json_parser_29; } } } else { - if (yych <= 'm') - { - if (yych <= '\\') - { - if (yych <= ':') - { - goto basic_json_parser_16; - } - if (yych == '[') - { - goto basic_json_parser_6; - } - goto basic_json_parser_29; - } - else - { - if (yych <= ']') - { - goto basic_json_parser_8; - } - if (yych == 'f') - { - goto basic_json_parser_21; - } - goto basic_json_parser_29; - } - } - else - { - if (yych <= 'z') - { - if (yych <= 'n') - { - goto basic_json_parser_18; - } - if (yych == 't') - { - goto basic_json_parser_20; - } - goto basic_json_parser_29; - } - else - { - if (yych <= '{') - { - goto basic_json_parser_10; - } - if (yych == '}') - { - goto basic_json_parser_12; - } - goto basic_json_parser_29; - } - } - } -basic_json_parser_2: - ++m_cursor; - yych = *m_cursor; - goto basic_json_parser_5; -basic_json_parser_3: - { - continue; - } -basic_json_parser_4: - ++m_cursor; - yych = *m_cursor; -basic_json_parser_5: - if (yybm[0 + yych] & 32) - { - goto basic_json_parser_4; - } - goto basic_json_parser_3; -basic_json_parser_6: - ++m_cursor; - { - return token_type::begin_array; - } -basic_json_parser_8: - ++m_cursor; - { - return token_type::end_array; - } -basic_json_parser_10: - ++m_cursor; - { - return token_type::begin_object; - } -basic_json_parser_12: - ++m_cursor; - { - return token_type::end_object; - } -basic_json_parser_14: - ++m_cursor; - { - return token_type::value_separator; - } -basic_json_parser_16: - ++m_cursor; - { - return token_type::name_separator; - } -basic_json_parser_18: - yyaccept = 0; - yych = *(m_marker = ++m_cursor); - if (yych == 'u') - { - goto basic_json_parser_59; - } -basic_json_parser_19: - { - return token_type::parse_error; - } -basic_json_parser_20: - yyaccept = 0; - yych = *(m_marker = ++m_cursor); - if (yych == 'r') - { - goto basic_json_parser_55; - } - goto basic_json_parser_19; -basic_json_parser_21: - yyaccept = 0; - yych = *(m_marker = ++m_cursor); - if (yych == 'a') - { - goto basic_json_parser_50; - } - goto basic_json_parser_19; -basic_json_parser_22: - yych = *++m_cursor; - if (yych <= '/') - { - goto basic_json_parser_19; - } - if (yych <= '0') - { - goto basic_json_parser_49; - } - if (yych <= '9') - { - goto basic_json_parser_40; - } - goto basic_json_parser_19; -basic_json_parser_23: - yyaccept = 1; - yych = *(m_marker = ++m_cursor); - if (yych <= 'D') - { - if (yych == '.') - { - goto basic_json_parser_42; - } - } - else - { - if (yych <= 'E') - { - goto basic_json_parser_43; - } - if (yych == 'e') - { - goto basic_json_parser_43; - } - } -basic_json_parser_24: - { - return token_type::value_number; - } -basic_json_parser_25: - yyaccept = 1; - yych = *(m_marker = ++m_cursor); - goto basic_json_parser_41; -basic_json_parser_26: - yyaccept = 0; - yych = *(m_marker = ++m_cursor); - if (yych <= 0x00) - { - goto basic_json_parser_19; - } - goto basic_json_parser_31; -basic_json_parser_27: - ++m_cursor; - { - return token_type::end_of_input; - } -basic_json_parser_29: - yych = *++m_cursor; - goto basic_json_parser_19; -basic_json_parser_30: - ++m_cursor; - yych = *m_cursor; -basic_json_parser_31: - if (yybm[0 + yych] & 64) - { - goto basic_json_parser_30; - } - if (yych <= 0x00) - { - goto basic_json_parser_32; - } - if (yych <= '"') - { - goto basic_json_parser_34; - } - goto basic_json_parser_33; -basic_json_parser_32: - m_cursor = m_marker; - if (yyaccept == 0) - { - goto basic_json_parser_19; - } - else - { - goto basic_json_parser_24; - } -basic_json_parser_33: - ++m_cursor; - yych = *m_cursor; - if (yych <= 'e') - { - if (yych <= '/') + if (yych <= ',') { if (yych == '"') { - goto basic_json_parser_30; + goto basic_json_parser_26; } - if (yych <= '.') + if (yych <= '+') + { + goto basic_json_parser_29; + } + goto basic_json_parser_14; + } + else + { + if (yych <= '-') + { + goto basic_json_parser_22; + } + if (yych <= '/') + { + goto basic_json_parser_29; + } + if (yych <= '0') + { + goto basic_json_parser_23; + } + goto basic_json_parser_25; + } + } + } + else + { + if (yych <= 'm') + { + if (yych <= '\\') + { + if (yych <= ':') + { + goto basic_json_parser_16; + } + if (yych == '[') + { + goto basic_json_parser_6; + } + goto basic_json_parser_29; + } + else + { + if (yych <= ']') + { + goto basic_json_parser_8; + } + if (yych == 'f') + { + goto basic_json_parser_21; + } + goto basic_json_parser_29; + } + } + else + { + if (yych <= 'z') + { + if (yych <= 'n') + { + goto basic_json_parser_18; + } + if (yych == 't') + { + goto basic_json_parser_20; + } + goto basic_json_parser_29; + } + else + { + if (yych <= '{') + { + goto basic_json_parser_10; + } + if (yych == '}') + { + goto basic_json_parser_12; + } + goto basic_json_parser_29; + } + } + } +basic_json_parser_2: + ++m_cursor; + yych = *m_cursor; + goto basic_json_parser_5; +basic_json_parser_3: + { + return scan(); + } +basic_json_parser_4: + ++m_cursor; + yych = *m_cursor; +basic_json_parser_5: + if (yybm[0 + yych] & 32) + { + goto basic_json_parser_4; + } + goto basic_json_parser_3; +basic_json_parser_6: + ++m_cursor; + { + return token_type::begin_array; + } +basic_json_parser_8: + ++m_cursor; + { + return token_type::end_array; + } +basic_json_parser_10: + ++m_cursor; + { + return token_type::begin_object; + } +basic_json_parser_12: + ++m_cursor; + { + return token_type::end_object; + } +basic_json_parser_14: + ++m_cursor; + { + return token_type::value_separator; + } +basic_json_parser_16: + ++m_cursor; + { + return token_type::name_separator; + } +basic_json_parser_18: + yyaccept = 0; + yych = *(m_marker = ++m_cursor); + if (yych == 'u') + { + goto basic_json_parser_59; + } +basic_json_parser_19: + { + return token_type::parse_error; + } +basic_json_parser_20: + yyaccept = 0; + yych = *(m_marker = ++m_cursor); + if (yych == 'r') + { + goto basic_json_parser_55; + } + goto basic_json_parser_19; +basic_json_parser_21: + yyaccept = 0; + yych = *(m_marker = ++m_cursor); + if (yych == 'a') + { + goto basic_json_parser_50; + } + goto basic_json_parser_19; +basic_json_parser_22: + yych = *++m_cursor; + if (yych <= '/') + { + goto basic_json_parser_19; + } + if (yych <= '0') + { + goto basic_json_parser_49; + } + if (yych <= '9') + { + goto basic_json_parser_40; + } + goto basic_json_parser_19; +basic_json_parser_23: + yyaccept = 1; + yych = *(m_marker = ++m_cursor); + if (yych <= 'D') + { + if (yych == '.') + { + goto basic_json_parser_42; + } + } + else + { + if (yych <= 'E') + { + goto basic_json_parser_43; + } + if (yych == 'e') + { + goto basic_json_parser_43; + } + } +basic_json_parser_24: + { + return token_type::value_number; + } +basic_json_parser_25: + yyaccept = 1; + yych = *(m_marker = ++m_cursor); + goto basic_json_parser_41; +basic_json_parser_26: + yyaccept = 0; + yych = *(m_marker = ++m_cursor); + if (yych <= 0x00) + { + goto basic_json_parser_19; + } + goto basic_json_parser_31; +basic_json_parser_27: + ++m_cursor; + { + return token_type::end_of_input; + } +basic_json_parser_29: + yych = *++m_cursor; + goto basic_json_parser_19; +basic_json_parser_30: + ++m_cursor; + yych = *m_cursor; +basic_json_parser_31: + if (yybm[0 + yych] & 64) + { + goto basic_json_parser_30; + } + if (yych <= 0x00) + { + goto basic_json_parser_32; + } + if (yych <= '"') + { + goto basic_json_parser_34; + } + goto basic_json_parser_33; +basic_json_parser_32: + m_cursor = m_marker; + if (yyaccept == 0) + { + goto basic_json_parser_19; + } + else + { + goto basic_json_parser_24; + } +basic_json_parser_33: + ++m_cursor; + yych = *m_cursor; + if (yych <= 'e') + { + if (yych <= '/') + { + if (yych == '"') + { + goto basic_json_parser_30; + } + if (yych <= '.') + { + goto basic_json_parser_32; + } + goto basic_json_parser_30; + } + else + { + if (yych <= '\\') + { + if (yych <= '[') { goto basic_json_parser_32; } @@ -2907,251 +2909,62 @@ basic_json_parser_33: } else { - if (yych <= '\\') - { - if (yych <= '[') - { - goto basic_json_parser_32; - } - goto basic_json_parser_30; - } - else - { - if (yych == 'b') - { - goto basic_json_parser_30; - } - goto basic_json_parser_32; - } - } - } - else - { - if (yych <= 'q') - { - if (yych <= 'f') - { - goto basic_json_parser_30; - } - if (yych == 'n') + if (yych == 'b') { goto basic_json_parser_30; } goto basic_json_parser_32; } - else - { - if (yych <= 's') - { - if (yych <= 'r') - { - goto basic_json_parser_30; - } - goto basic_json_parser_32; - } - else - { - if (yych <= 't') - { - goto basic_json_parser_30; - } - if (yych <= 'u') - { - goto basic_json_parser_36; - } - goto basic_json_parser_32; - } - } } -basic_json_parser_34: - ++m_cursor; + } + else + { + if (yych <= 'q') { - return token_type::value_string; - } -basic_json_parser_36: - ++m_cursor; - yych = *m_cursor; - if (yych <= '@') - { - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych >= ':') - { - goto basic_json_parser_32; - } - } - else - { - if (yych <= 'F') - { - goto basic_json_parser_37; - } - if (yych <= '`') - { - goto basic_json_parser_32; - } - if (yych >= 'g') - { - goto basic_json_parser_32; - } - } -basic_json_parser_37: - ++m_cursor; - yych = *m_cursor; - if (yych <= '@') - { - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych >= ':') - { - goto basic_json_parser_32; - } - } - else - { - if (yych <= 'F') - { - goto basic_json_parser_38; - } - if (yych <= '`') - { - goto basic_json_parser_32; - } - if (yych >= 'g') - { - goto basic_json_parser_32; - } - } -basic_json_parser_38: - ++m_cursor; - yych = *m_cursor; - if (yych <= '@') - { - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych >= ':') - { - goto basic_json_parser_32; - } - } - else - { - if (yych <= 'F') - { - goto basic_json_parser_39; - } - if (yych <= '`') - { - goto basic_json_parser_32; - } - if (yych >= 'g') - { - goto basic_json_parser_32; - } - } -basic_json_parser_39: - ++m_cursor; - yych = *m_cursor; - if (yych <= '@') - { - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych <= '9') - { - goto basic_json_parser_30; - } - goto basic_json_parser_32; - } - else - { - if (yych <= 'F') - { - goto basic_json_parser_30; - } - if (yych <= '`') - { - goto basic_json_parser_32; - } if (yych <= 'f') { goto basic_json_parser_30; } - goto basic_json_parser_32; - } -basic_json_parser_40: - yyaccept = 1; - m_marker = ++m_cursor; - yych = *m_cursor; -basic_json_parser_41: - if (yybm[0 + yych] & 128) - { - goto basic_json_parser_40; - } - if (yych <= 'D') - { - if (yych != '.') + if (yych == 'n') { - goto basic_json_parser_24; + goto basic_json_parser_30; } + goto basic_json_parser_32; } else { - if (yych <= 'E') + if (yych <= 's') { - goto basic_json_parser_43; + if (yych <= 'r') + { + goto basic_json_parser_30; + } + goto basic_json_parser_32; } - if (yych == 'e') - { - goto basic_json_parser_43; - } - goto basic_json_parser_24; - } -basic_json_parser_42: - yych = *++m_cursor; - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych <= '9') - { - goto basic_json_parser_47; - } - goto basic_json_parser_32; -basic_json_parser_43: - yych = *++m_cursor; - if (yych <= ',') - { - if (yych != '+') + else { + if (yych <= 't') + { + goto basic_json_parser_30; + } + if (yych <= 'u') + { + goto basic_json_parser_36; + } goto basic_json_parser_32; } } - else - { - if (yych <= '-') - { - goto basic_json_parser_44; - } - if (yych <= '/') - { - goto basic_json_parser_32; - } - if (yych <= '9') - { - goto basic_json_parser_45; - } - goto basic_json_parser_32; - } -basic_json_parser_44: - yych = *++m_cursor; + } +basic_json_parser_34: + ++m_cursor; + { + return token_type::value_string; + } +basic_json_parser_36: + ++m_cursor; + yych = *m_cursor; + if (yych <= '@') + { if (yych <= '/') { goto basic_json_parser_32; @@ -3160,127 +2973,305 @@ basic_json_parser_44: { goto basic_json_parser_32; } + } + else + { + if (yych <= 'F') + { + goto basic_json_parser_37; + } + if (yych <= '`') + { + goto basic_json_parser_32; + } + if (yych >= 'g') + { + goto basic_json_parser_32; + } + } +basic_json_parser_37: + ++m_cursor; + yych = *m_cursor; + if (yych <= '@') + { + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych >= ':') + { + goto basic_json_parser_32; + } + } + else + { + if (yych <= 'F') + { + goto basic_json_parser_38; + } + if (yych <= '`') + { + goto basic_json_parser_32; + } + if (yych >= 'g') + { + goto basic_json_parser_32; + } + } +basic_json_parser_38: + ++m_cursor; + yych = *m_cursor; + if (yych <= '@') + { + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych >= ':') + { + goto basic_json_parser_32; + } + } + else + { + if (yych <= 'F') + { + goto basic_json_parser_39; + } + if (yych <= '`') + { + goto basic_json_parser_32; + } + if (yych >= 'g') + { + goto basic_json_parser_32; + } + } +basic_json_parser_39: + ++m_cursor; + yych = *m_cursor; + if (yych <= '@') + { + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych <= '9') + { + goto basic_json_parser_30; + } + goto basic_json_parser_32; + } + else + { + if (yych <= 'F') + { + goto basic_json_parser_30; + } + if (yych <= '`') + { + goto basic_json_parser_32; + } + if (yych <= 'f') + { + goto basic_json_parser_30; + } + goto basic_json_parser_32; + } +basic_json_parser_40: + yyaccept = 1; + m_marker = ++m_cursor; + yych = *m_cursor; +basic_json_parser_41: + if (yybm[0 + yych] & 128) + { + goto basic_json_parser_40; + } + if (yych <= 'D') + { + if (yych != '.') + { + goto basic_json_parser_24; + } + } + else + { + if (yych <= 'E') + { + goto basic_json_parser_43; + } + if (yych == 'e') + { + goto basic_json_parser_43; + } + goto basic_json_parser_24; + } +basic_json_parser_42: + yych = *++m_cursor; + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych <= '9') + { + goto basic_json_parser_47; + } + goto basic_json_parser_32; +basic_json_parser_43: + yych = *++m_cursor; + if (yych <= ',') + { + if (yych != '+') + { + goto basic_json_parser_32; + } + } + else + { + if (yych <= '-') + { + goto basic_json_parser_44; + } + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych <= '9') + { + goto basic_json_parser_45; + } + goto basic_json_parser_32; + } +basic_json_parser_44: + yych = *++m_cursor; + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych >= ':') + { + goto basic_json_parser_32; + } basic_json_parser_45: - ++m_cursor; - yych = *m_cursor; + ++m_cursor; + yych = *m_cursor; + if (yych <= '/') + { + goto basic_json_parser_24; + } + if (yych <= '9') + { + goto basic_json_parser_45; + } + goto basic_json_parser_24; +basic_json_parser_47: + yyaccept = 1; + m_marker = ++m_cursor; + yych = *m_cursor; + if (yych <= 'D') + { if (yych <= '/') { goto basic_json_parser_24; } if (yych <= '9') { - goto basic_json_parser_45; + goto basic_json_parser_47; } goto basic_json_parser_24; -basic_json_parser_47: - yyaccept = 1; - m_marker = ++m_cursor; - yych = *m_cursor; - if (yych <= 'D') - { - if (yych <= '/') - { - goto basic_json_parser_24; - } - if (yych <= '9') - { - goto basic_json_parser_47; - } - goto basic_json_parser_24; - } - else - { - if (yych <= 'E') - { - goto basic_json_parser_43; - } - if (yych == 'e') - { - goto basic_json_parser_43; - } - goto basic_json_parser_24; - } -basic_json_parser_49: - yyaccept = 1; - yych = *(m_marker = ++m_cursor); - if (yych <= 'D') - { - if (yych == '.') - { - goto basic_json_parser_42; - } - goto basic_json_parser_24; - } - else - { - if (yych <= 'E') - { - goto basic_json_parser_43; - } - if (yych == 'e') - { - goto basic_json_parser_43; - } - goto basic_json_parser_24; - } -basic_json_parser_50: - yych = *++m_cursor; - if (yych != 'l') - { - goto basic_json_parser_32; - } - yych = *++m_cursor; - if (yych != 's') - { - goto basic_json_parser_32; - } - yych = *++m_cursor; - if (yych != 'e') - { - goto basic_json_parser_32; - } - ++m_cursor; - { - return token_type::literal_false; - } -basic_json_parser_55: - yych = *++m_cursor; - if (yych != 'u') - { - goto basic_json_parser_32; - } - yych = *++m_cursor; - if (yych != 'e') - { - goto basic_json_parser_32; - } - ++m_cursor; - { - return token_type::literal_true; - } -basic_json_parser_59: - yych = *++m_cursor; - if (yych != 'l') - { - goto basic_json_parser_32; - } - yych = *++m_cursor; - if (yych != 'l') - { - goto basic_json_parser_32; - } - ++m_cursor; - { - return token_type::literal_null; - } } - + else + { + if (yych <= 'E') + { + goto basic_json_parser_43; + } + if (yych == 'e') + { + goto basic_json_parser_43; + } + goto basic_json_parser_24; + } +basic_json_parser_49: + yyaccept = 1; + yych = *(m_marker = ++m_cursor); + if (yych <= 'D') + { + if (yych == '.') + { + goto basic_json_parser_42; + } + goto basic_json_parser_24; + } + else + { + if (yych <= 'E') + { + goto basic_json_parser_43; + } + if (yych == 'e') + { + goto basic_json_parser_43; + } + goto basic_json_parser_24; + } +basic_json_parser_50: + yych = *++m_cursor; + if (yych != 'l') + { + goto basic_json_parser_32; + } + yych = *++m_cursor; + if (yych != 's') + { + goto basic_json_parser_32; + } + yych = *++m_cursor; + if (yych != 'e') + { + goto basic_json_parser_32; + } + ++m_cursor; + { + return token_type::literal_false; + } +basic_json_parser_55: + yych = *++m_cursor; + if (yych != 'u') + { + goto basic_json_parser_32; + } + yych = *++m_cursor; + if (yych != 'e') + { + goto basic_json_parser_32; + } + ++m_cursor; + { + return token_type::literal_true; + } +basic_json_parser_59: + yych = *++m_cursor; + if (yych != 'l') + { + goto basic_json_parser_32; + } + yych = *++m_cursor; + if (yych != 'l') + { + goto basic_json_parser_32; + } + ++m_cursor; + { + return token_type::literal_null; + } } + } - inline std::string get_token() const + inline string_t get_token() const { - return std::string(m_start, static_cast(m_cursor - m_start)); + return string_t(m_start, static_cast(m_cursor - m_start)); } /*! @@ -3291,16 +3282,14 @@ basic_json_parser_59: from the pointer difference of the two pointers). @return string value of current token without opening and closing quotes - - @todo Take care of Unicode. */ - inline std::string get_string() const + inline string_t get_string() const { - std::string result; + string_t result; result.reserve(static_cast(m_cursor - m_start - 2)); // iterate the result between the quotes - for (const char* i = m_start + 1; i < m_cursor - 1; ++i) + for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i) { // process escaped characters if (*i == '\\') @@ -3360,7 +3349,7 @@ basic_json_parser_59: // get code xxxx from \uxxxx auto codepoint = strtol(i + 1, nullptr, 16); // add unicode character(s) - result += to_unicode(codepoint); + result += to_unicode(codepoint); // skip the next four characters (\uxxxx) i += 4; break; @@ -3399,20 +3388,20 @@ basic_json_parser_59: private: /// the buffer - const char* m_content = nullptr; + const typename string_t::value_type* m_content = nullptr; /// pointer to he beginning of the current symbol - const char* m_start = nullptr; + const typename string_t::value_type* m_start = nullptr; /// pointer to the current symbol - const char* m_cursor = nullptr; + const typename string_t::value_type* m_cursor = nullptr; /// pointer to the end of the buffer - const char* m_limit = nullptr; + const typename string_t::value_type* m_limit = nullptr; }; class parser { public: /// constructor for strings - inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) + inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str()) { // read first token get_token(); @@ -3423,7 +3412,7 @@ basic_json_parser_59: { while (_is) { - std::string input_line; + string_t input_line; std::getline(_is, input_line); m_buffer += input_line; } @@ -3617,7 +3606,7 @@ basic_json_parser_59: private: /// the buffer - std::string m_buffer; + string_t m_buffer; /// the type of the last read token typename lexer::token_type last_token = lexer::token_type::uninitialized; /// the lexer diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index c31d4518..ed6dc746 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -2456,7 +2456,10 @@ class basic_json end_of_input }; - inline lexer(const char* s) : m_content(s) + /// the char type to use in the lexer + using lexer_char_t = typename string_t::value_type; + + inline lexer(const typename string_t::value_type* s) : m_content(s) { m_start = m_cursor = m_content; m_limit = m_content + strlen(m_content); @@ -2464,46 +2467,39 @@ class basic_json inline lexer() = default; - template - inline static std::basic_string to_unicode(const long codepoint) + inline static string_t to_unicode(const long codepoint) { - std::string result; + string_t result; if (codepoint <= 0x7f) { - // 1-byte (ASCII) characters: 0xxxxxxx - result.append(1, static_cast(codepoint)); + // 1-byte characters: 0xxxxxxx (ASCI) + result.append(1, static_cast(codepoint)); } else if (codepoint <= 0x7ff) { // 2-byte characters: 110xxxxx 10xxxxxx - // the 0xC0 enables the two most significant bits to make this - // a 2-byte UTF-8 character - result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else if (codepoint <= 0xffff) { // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx - // the 0xE0 enables the three most significant bits to make - // this a 3-byte UTF-8 character - result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); - result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else if (codepoint <= 0x10ffff) { // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // the 0xF0 enables the four most significant bits to make this - // a 4-byte UTF-8 character - result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); - result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); - result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); } else { - throw std::out_of_range("code point is invalid"); + throw std::out_of_range("code points above 0x10FFFF are invalid"); } return result; @@ -2557,77 +2553,74 @@ class basic_json inline token_type scan() { // pointer for backtracking information - const char* m_marker = nullptr; + const typename string_t::value_type* m_marker = nullptr; - while (true) - { - // remember the begin of the token - m_start = m_cursor; + // remember the begin of the token + m_start = m_cursor; - /*!re2c - re2c:define:YYCTYPE = char; - re2c:define:YYCURSOR = m_cursor; - re2c:define:YYLIMIT = m_limit; - re2c:define:YYMARKER = m_marker; - re2c:indent:string = " "; - re2c:indent:top = 1; - re2c:labelprefix = "basic_json_parser_"; - re2c:yyfill:enable = 0; + /*!re2c + re2c:define:YYCTYPE = lexer_char_t; + re2c:define:YYCURSOR = m_cursor; + re2c:define:YYLIMIT = m_limit; + re2c:define:YYMARKER = m_marker; + re2c:indent:string = " "; + re2c:indent:top = 1; + re2c:labelprefix = "basic_json_parser_"; + re2c:yyfill:enable = 0; - // whitespace - ws = [ \t\n\r]+; - ws { continue; } + // whitespace + ws = [ \t\n\r]+; + ws { return scan(); } - // structural characters - "[" { return token_type::begin_array; } - "]" { return token_type::end_array; } - "{" { return token_type::begin_object; } - "}" { return token_type::end_object; } - "," { return token_type::value_separator; } - ":" { return token_type::name_separator; } + // structural characters + "[" { return token_type::begin_array; } + "]" { return token_type::end_array; } + "{" { return token_type::begin_object; } + "}" { return token_type::end_object; } + "," { return token_type::value_separator; } + ":" { return token_type::name_separator; } - // literal names - "null" { return token_type::literal_null; } - "true" { return token_type::literal_true; } - "false" { return token_type::literal_false; } + // literal names + "null" { return token_type::literal_null; } + "true" { return token_type::literal_true; } + "false" { return token_type::literal_false; } - // number - decimal_point = [.]; - digit = [0-9]; - digit_1_9 = [1-9]; - e = [eE]; - minus = [-]; - plus = [+]; - zero = [0]; - exp = e (minus|plus)? digit+; - frac = decimal_point digit+; - int = (zero|digit_1_9 digit*); - number = minus? int frac? exp?; - number { return token_type::value_number; } + // number + decimal_point = [.]; + digit = [0-9]; + digit_1_9 = [1-9]; + e = [eE]; + minus = [-]; + plus = [+]; + zero = [0]; + exp = e (minus|plus)? digit+; + frac = decimal_point digit+; + int = (zero|digit_1_9 digit*); + number = minus? int frac? exp?; + number { return token_type::value_number; } - // string - quotation_mark = [\"]; - escape = [\\]; - unescaped = [^\"\\\000]; - single_escaped = [\"\\/bfnrt]; - unicode_escaped = [u][0-9a-fA-F]{4}; - escaped = escape (single_escaped | unicode_escaped); - char = unescaped | escaped; - string = quotation_mark char* quotation_mark; - string { return token_type::value_string; } + // string + quotation_mark = [\"]; + escape = [\\]; + unescaped = [^\"\\\000]; + single_escaped = [\"\\/bfnrt]; + unicode_escaped = [u][0-9a-fA-F]{4}; + escaped = escape (single_escaped | unicode_escaped); + char = unescaped | escaped; + string = quotation_mark char* quotation_mark; + string { return token_type::value_string; } - // end of file - '\000' { return token_type::end_of_input; } + // end of file + '\000' { return token_type::end_of_input; } - // anything else is an error - . { return token_type::parse_error; } - */ - } + // anything else is an error + . { return token_type::parse_error; } + */ } - inline std::string get_token() const + inline string_t get_token() const { - return std::string(m_start, static_cast(m_cursor - m_start)); + return string_t(m_start, static_cast(m_cursor - m_start)); } /*! @@ -2638,16 +2631,14 @@ class basic_json from the pointer difference of the two pointers). @return string value of current token without opening and closing quotes - - @todo Take care of Unicode. */ - inline std::string get_string() const + inline string_t get_string() const { - std::string result; + string_t result; result.reserve(static_cast(m_cursor - m_start - 2)); // iterate the result between the quotes - for (const char* i = m_start + 1; i < m_cursor - 1; ++i) + for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i) { // process escaped characters if (*i == '\\') @@ -2707,7 +2698,7 @@ class basic_json // get code xxxx from \uxxxx auto codepoint = strtol(i + 1, nullptr, 16); // add unicode character(s) - result += to_unicode(codepoint); + result += to_unicode(codepoint); // skip the next four characters (\uxxxx) i += 4; break; @@ -2746,20 +2737,20 @@ class basic_json private: /// the buffer - const char* m_content = nullptr; + const typename string_t::value_type* m_content = nullptr; /// pointer to he beginning of the current symbol - const char* m_start = nullptr; + const typename string_t::value_type* m_start = nullptr; /// pointer to the current symbol - const char* m_cursor = nullptr; + const typename string_t::value_type* m_cursor = nullptr; /// pointer to the end of the buffer - const char* m_limit = nullptr; + const typename string_t::value_type* m_limit = nullptr; }; class parser { public: /// constructor for strings - inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) + inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str()) { // read first token get_token(); @@ -2770,7 +2761,7 @@ class basic_json { while (_is) { - std::string input_line; + string_t input_line; std::getline(_is, input_line); m_buffer += input_line; } @@ -2964,7 +2955,7 @@ class basic_json private: /// the buffer - std::string m_buffer; + string_t m_buffer; /// the type of the last read token typename lexer::token_type last_token = lexer::token_type::uninitialized; /// the lexer diff --git a/test/unit.cpp b/test/unit.cpp index 4dd4edc0..0c6495d9 100644 --- a/test/unit.cpp +++ b/test/unit.cpp @@ -5517,8 +5517,8 @@ TEST_CASE("lexer class") SECTION("to_unicode") { - CHECK(json::lexer::to_unicode(0x1F4A9) == "💩"); - CHECK_THROWS_AS(json::lexer::to_unicode(0x110000), std::out_of_range); + CHECK(json::lexer::to_unicode(0x1F4A9) == "💩"); + CHECK_THROWS_AS(json::lexer::to_unicode(0x200000), std::out_of_range); } }