This commit is contained in:
Niels 2015-02-15 13:35:51 +01:00
parent b21bf95620
commit 2fc82358ce
3 changed files with 759 additions and 779 deletions

View file

@ -2456,7 +2456,10 @@ class basic_json
end_of_input end_of_input
}; };
inline lexer(const char* s) : m_content(s) /// the char type to use in the lexer
using lexer_char_t = typename string_t::value_type;
inline lexer(const typename string_t::value_type* s) : m_content(s)
{ {
m_start = m_cursor = m_content; m_start = m_cursor = m_content;
m_limit = m_content + strlen(m_content); m_limit = m_content + strlen(m_content);
@ -2464,46 +2467,39 @@ class basic_json
inline lexer() = default; inline lexer() = default;
template<typename CharT> inline static string_t to_unicode(const long codepoint)
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{ {
std::string result; string_t result;
if (codepoint <= 0x7f) if (codepoint <= 0x7f)
{ {
// 1-byte (ASCII) characters: 0xxxxxxx // 1-byte characters: 0xxxxxxx (ASCII)
result.append(1, static_cast<char>(codepoint)); result.append(1, static_cast<typename string_t::value_type>(codepoint));
} }
else if (codepoint <= 0x7ff) else if (codepoint <= 0x7ff)
{ {
// 2-byte characters: 110xxxxx 10xxxxxx // 2-byte characters: 110xxxxx 10xxxxxx
// the 0xC0 enables the two most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
// a 2-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0xffff) else if (codepoint <= 0xffff)
{ {
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
// the 0xE0 enables the three most significant bits to make result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
// this a 3-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0x10ffff) else if (codepoint <= 0x10ffff)
{ {
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// the 0xF0 enables the four most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
// a 4-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07))); result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else else
{ {
throw std::out_of_range("code point is invalid"); throw std::out_of_range("code points above 0x10FFFF are invalid");
} }
return result; return result;
@ -2553,22 +2549,18 @@ class basic_json
with goto jumps. with goto jumps.
@return the class of the next token read from the buffer @return the class of the next token read from the buffer
@todo Unicode support needs to be checked.
*/ */
inline token_type scan() inline token_type scan()
{ {
// pointer for backtracking information // pointer for backtracking information
const char* m_marker = nullptr; const typename string_t::value_type* m_marker = nullptr;
while (true)
{
// remember the beginning of the token // remember the beginning of the token
m_start = m_cursor; m_start = m_cursor;
{ {
char yych; lexer_char_t yych;
unsigned int yyaccept = 0; unsigned int yyaccept = 0;
static const unsigned char yybm[] = static const unsigned char yybm[] =
{ {
@ -2733,7 +2725,7 @@ basic_json_parser_2:
goto basic_json_parser_5; goto basic_json_parser_5;
basic_json_parser_3: basic_json_parser_3:
{ {
continue; return scan();
} }
basic_json_parser_4: basic_json_parser_4:
++m_cursor; ++m_cursor;
@ -3276,11 +3268,10 @@ basic_json_parser_59:
} }
} }
}
inline std::string get_token() const inline string_t get_token() const
{ {
return std::string(m_start, static_cast<size_t>(m_cursor - m_start)); return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
} }
/*! /*!
@ -3291,16 +3282,14 @@ basic_json_parser_59:
from the pointer difference of the two pointers). from the pointer difference of the two pointers).
@return string value of current token without opening and closing quotes @return string value of current token without opening and closing quotes
@todo Take care of Unicode.
*/ */
inline std::string get_string() const inline string_t get_string() const
{ {
std::string result; string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2)); result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
// iterate the result between the quotes // iterate the result between the quotes
for (const char* i = m_start + 1; i < m_cursor - 1; ++i) for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
{ {
// process escaped characters // process escaped characters
if (*i == '\\') if (*i == '\\')
@ -3360,7 +3349,7 @@ basic_json_parser_59:
// get code xxxx from \uxxxx // get code xxxx from \uxxxx
auto codepoint = strtol(i + 1, nullptr, 16); auto codepoint = strtol(i + 1, nullptr, 16);
// add unicode character(s) // add unicode character(s)
result += to_unicode<char>(codepoint); result += to_unicode(codepoint);
// skip the next four characters (\uxxxx) // skip the next four characters (\uxxxx)
i += 4; i += 4;
break; break;
@ -3399,20 +3388,20 @@ basic_json_parser_59:
private: private:
/// the buffer /// the buffer
const char* m_content = nullptr; const typename string_t::value_type* m_content = nullptr;
/// pointer to the beginning of the current symbol /// pointer to the beginning of the current symbol
const char* m_start = nullptr; const typename string_t::value_type* m_start = nullptr;
/// pointer to the current symbol /// pointer to the current symbol
const char* m_cursor = nullptr; const typename string_t::value_type* m_cursor = nullptr;
/// pointer to the end of the buffer /// pointer to the end of the buffer
const char* m_limit = nullptr; const typename string_t::value_type* m_limit = nullptr;
}; };
class parser class parser
{ {
public: public:
/// constructor for strings /// constructor for strings
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
{ {
// read first token // read first token
get_token(); get_token();
@ -3423,7 +3412,7 @@ basic_json_parser_59:
{ {
while (_is) while (_is)
{ {
std::string input_line; string_t input_line;
std::getline(_is, input_line); std::getline(_is, input_line);
m_buffer += input_line; m_buffer += input_line;
} }
@ -3617,7 +3606,7 @@ basic_json_parser_59:
private: private:
/// the buffer /// the buffer
std::string m_buffer; string_t m_buffer;
/// the type of the last read token /// the type of the last read token
typename lexer::token_type last_token = lexer::token_type::uninitialized; typename lexer::token_type last_token = lexer::token_type::uninitialized;
/// the lexer /// the lexer

View file

@ -2456,7 +2456,10 @@ class basic_json
end_of_input end_of_input
}; };
inline lexer(const char* s) : m_content(s) /// the char type to use in the lexer
using lexer_char_t = typename string_t::value_type;
inline lexer(const typename string_t::value_type* s) : m_content(s)
{ {
m_start = m_cursor = m_content; m_start = m_cursor = m_content;
m_limit = m_content + strlen(m_content); m_limit = m_content + strlen(m_content);
@ -2464,46 +2467,39 @@ class basic_json
inline lexer() = default; inline lexer() = default;
template<typename CharT> inline static string_t to_unicode(const long codepoint)
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{ {
std::string result; string_t result;
if (codepoint <= 0x7f) if (codepoint <= 0x7f)
{ {
// 1-byte (ASCII) characters: 0xxxxxxx // 1-byte characters: 0xxxxxxx (ASCII)
result.append(1, static_cast<char>(codepoint)); result.append(1, static_cast<typename string_t::value_type>(codepoint));
} }
else if (codepoint <= 0x7ff) else if (codepoint <= 0x7ff)
{ {
// 2-byte characters: 110xxxxx 10xxxxxx // 2-byte characters: 110xxxxx 10xxxxxx
// the 0xC0 enables the two most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
// a 2-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0xffff) else if (codepoint <= 0xffff)
{ {
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
// the 0xE0 enables the three most significant bits to make result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
// this a 3-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else if (codepoint <= 0x10ffff) else if (codepoint <= 0x10ffff)
{ {
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// the 0xF0 enables the four most significant bits to make this result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
// a 4-byte UTF-8 character result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07))); result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F))); result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
} }
else else
{ {
throw std::out_of_range("code point is invalid"); throw std::out_of_range("code points above 0x10FFFF are invalid");
} }
return result; return result;
@ -2557,15 +2553,13 @@ class basic_json
inline token_type scan() inline token_type scan()
{ {
// pointer for backtracking information // pointer for backtracking information
const char* m_marker = nullptr; const typename string_t::value_type* m_marker = nullptr;
while (true)
{
// remember the beginning of the token // remember the beginning of the token
m_start = m_cursor; m_start = m_cursor;
/*!re2c /*!re2c
re2c:define:YYCTYPE = char; re2c:define:YYCTYPE = lexer_char_t;
re2c:define:YYCURSOR = m_cursor; re2c:define:YYCURSOR = m_cursor;
re2c:define:YYLIMIT = m_limit; re2c:define:YYLIMIT = m_limit;
re2c:define:YYMARKER = m_marker; re2c:define:YYMARKER = m_marker;
@ -2576,7 +2570,7 @@ class basic_json
// whitespace // whitespace
ws = [ \t\n\r]+; ws = [ \t\n\r]+;
ws { continue; } ws { return scan(); }
// structural characters // structural characters
"[" { return token_type::begin_array; } "[" { return token_type::begin_array; }
@ -2623,11 +2617,10 @@ class basic_json
. { return token_type::parse_error; } . { return token_type::parse_error; }
*/ */
} }
}
inline std::string get_token() const inline string_t get_token() const
{ {
return std::string(m_start, static_cast<size_t>(m_cursor - m_start)); return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
} }
/*! /*!
@ -2638,16 +2631,14 @@ class basic_json
from the pointer difference of the two pointers). from the pointer difference of the two pointers).
@return string value of current token without opening and closing quotes @return string value of current token without opening and closing quotes
@todo Take care of Unicode.
*/ */
inline std::string get_string() const inline string_t get_string() const
{ {
std::string result; string_t result;
result.reserve(static_cast<size_t>(m_cursor - m_start - 2)); result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
// iterate the result between the quotes // iterate the result between the quotes
for (const char* i = m_start + 1; i < m_cursor - 1; ++i) for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
{ {
// process escaped characters // process escaped characters
if (*i == '\\') if (*i == '\\')
@ -2707,7 +2698,7 @@ class basic_json
// get code xxxx from \uxxxx // get code xxxx from \uxxxx
auto codepoint = strtol(i + 1, nullptr, 16); auto codepoint = strtol(i + 1, nullptr, 16);
// add unicode character(s) // add unicode character(s)
result += to_unicode<char>(codepoint); result += to_unicode(codepoint);
// skip the next four characters (\uxxxx) // skip the next four characters (\uxxxx)
i += 4; i += 4;
break; break;
@ -2746,20 +2737,20 @@ class basic_json
private: private:
/// the buffer /// the buffer
const char* m_content = nullptr; const typename string_t::value_type* m_content = nullptr;
/// pointer to the beginning of the current symbol /// pointer to the beginning of the current symbol
const char* m_start = nullptr; const typename string_t::value_type* m_start = nullptr;
/// pointer to the current symbol /// pointer to the current symbol
const char* m_cursor = nullptr; const typename string_t::value_type* m_cursor = nullptr;
/// pointer to the end of the buffer /// pointer to the end of the buffer
const char* m_limit = nullptr; const typename string_t::value_type* m_limit = nullptr;
}; };
class parser class parser
{ {
public: public:
/// constructor for strings /// constructor for strings
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str()) inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
{ {
// read first token // read first token
get_token(); get_token();
@ -2770,7 +2761,7 @@ class basic_json
{ {
while (_is) while (_is)
{ {
std::string input_line; string_t input_line;
std::getline(_is, input_line); std::getline(_is, input_line);
m_buffer += input_line; m_buffer += input_line;
} }
@ -2964,7 +2955,7 @@ class basic_json
private: private:
/// the buffer /// the buffer
std::string m_buffer; string_t m_buffer;
/// the type of the last read token /// the type of the last read token
typename lexer::token_type last_token = lexer::token_type::uninitialized; typename lexer::token_type last_token = lexer::token_type::uninitialized;
/// the lexer /// the lexer

View file

@ -5517,8 +5517,8 @@ TEST_CASE("lexer class")
SECTION("to_unicode") SECTION("to_unicode")
{ {
CHECK(json::lexer::to_unicode<char>(0x1F4A9) == "💩"); CHECK(json::lexer::to_unicode(0x1F4A9) == "💩");
CHECK_THROWS_AS(json::lexer::to_unicode<char>(0x110000), std::out_of_range); CHECK_THROWS_AS(json::lexer::to_unicode(0x200000), std::out_of_range);
} }
} }