clean up
This commit is contained in:
parent
b21bf95620
commit
2fc82358ce
3 changed files with 759 additions and 779 deletions
79
src/json.hpp
79
src/json.hpp
|
@ -2456,7 +2456,10 @@ class basic_json
|
||||||
end_of_input
|
end_of_input
|
||||||
};
|
};
|
||||||
|
|
||||||
inline lexer(const char* s) : m_content(s)
|
/// the char type to use in the lexer
|
||||||
|
using lexer_char_t = typename string_t::value_type;
|
||||||
|
|
||||||
|
inline lexer(const typename string_t::value_type* s) : m_content(s)
|
||||||
{
|
{
|
||||||
m_start = m_cursor = m_content;
|
m_start = m_cursor = m_content;
|
||||||
m_limit = m_content + strlen(m_content);
|
m_limit = m_content + strlen(m_content);
|
||||||
|
@ -2464,46 +2467,39 @@ class basic_json
|
||||||
|
|
||||||
inline lexer() = default;
|
inline lexer() = default;
|
||||||
|
|
||||||
template<typename CharT>
|
inline static string_t to_unicode(const long codepoint)
|
||||||
inline static std::basic_string<CharT> to_unicode(const long codepoint)
|
|
||||||
{
|
{
|
||||||
std::string result;
|
string_t result;
|
||||||
|
|
||||||
if (codepoint <= 0x7f)
|
if (codepoint <= 0x7f)
|
||||||
{
|
{
|
||||||
// 1-byte (ASCII) characters: 0xxxxxxx
|
// 1-byte characters: 0xxxxxxx (ASCI)
|
||||||
result.append(1, static_cast<char>(codepoint));
|
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0x7ff)
|
else if (codepoint <= 0x7ff)
|
||||||
{
|
{
|
||||||
// 2-byte characters: 110xxxxx 10xxxxxx
|
// 2-byte characters: 110xxxxx 10xxxxxx
|
||||||
// the 0xC0 enables the two most significant bits to make this
|
result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
|
||||||
// a 2-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0xffff)
|
else if (codepoint <= 0xffff)
|
||||||
{
|
{
|
||||||
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
||||||
// the 0xE0 enables the three most significant bits to make
|
result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
|
||||||
// this a 3-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0x10ffff)
|
else if (codepoint <= 0x10ffff)
|
||||||
{
|
{
|
||||||
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
// the 0xF0 enables the four most significant bits to make this
|
result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
|
||||||
// a 4-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw std::out_of_range("code point is invalid");
|
throw std::out_of_range("code points above 0x10FFFF are invalid");
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
@ -2553,22 +2549,18 @@ class basic_json
|
||||||
with goto jumps.
|
with goto jumps.
|
||||||
|
|
||||||
@return the class of the next token read from the buffer
|
@return the class of the next token read from the buffer
|
||||||
|
|
||||||
@todo Unicode support needs to be checked.
|
|
||||||
*/
|
*/
|
||||||
inline token_type scan()
|
inline token_type scan()
|
||||||
{
|
{
|
||||||
// pointer for backtracking information
|
// pointer for backtracking information
|
||||||
const char* m_marker = nullptr;
|
const typename string_t::value_type* m_marker = nullptr;
|
||||||
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
// remember the begin of the token
|
// remember the begin of the token
|
||||||
m_start = m_cursor;
|
m_start = m_cursor;
|
||||||
|
|
||||||
|
|
||||||
{
|
{
|
||||||
char yych;
|
lexer_char_t yych;
|
||||||
unsigned int yyaccept = 0;
|
unsigned int yyaccept = 0;
|
||||||
static const unsigned char yybm[] =
|
static const unsigned char yybm[] =
|
||||||
{
|
{
|
||||||
|
@ -2733,7 +2725,7 @@ basic_json_parser_2:
|
||||||
goto basic_json_parser_5;
|
goto basic_json_parser_5;
|
||||||
basic_json_parser_3:
|
basic_json_parser_3:
|
||||||
{
|
{
|
||||||
continue;
|
return scan();
|
||||||
}
|
}
|
||||||
basic_json_parser_4:
|
basic_json_parser_4:
|
||||||
++m_cursor;
|
++m_cursor;
|
||||||
|
@ -3276,11 +3268,10 @@ basic_json_parser_59:
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
inline std::string get_token() const
|
inline string_t get_token() const
|
||||||
{
|
{
|
||||||
return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
|
return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -3291,16 +3282,14 @@ basic_json_parser_59:
|
||||||
from the pointer difference of the two pointers).
|
from the pointer difference of the two pointers).
|
||||||
|
|
||||||
@return string value of current token without opening and closing quotes
|
@return string value of current token without opening and closing quotes
|
||||||
|
|
||||||
@todo Take care of Unicode.
|
|
||||||
*/
|
*/
|
||||||
inline std::string get_string() const
|
inline string_t get_string() const
|
||||||
{
|
{
|
||||||
std::string result;
|
string_t result;
|
||||||
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
||||||
|
|
||||||
// iterate the result between the quotes
|
// iterate the result between the quotes
|
||||||
for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
|
for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
|
||||||
{
|
{
|
||||||
// process escaped characters
|
// process escaped characters
|
||||||
if (*i == '\\')
|
if (*i == '\\')
|
||||||
|
@ -3360,7 +3349,7 @@ basic_json_parser_59:
|
||||||
// get code xxxx from \uxxxx
|
// get code xxxx from \uxxxx
|
||||||
auto codepoint = strtol(i + 1, nullptr, 16);
|
auto codepoint = strtol(i + 1, nullptr, 16);
|
||||||
// add unicode character(s)
|
// add unicode character(s)
|
||||||
result += to_unicode<char>(codepoint);
|
result += to_unicode(codepoint);
|
||||||
// skip the next four characters (\uxxxx)
|
// skip the next four characters (\uxxxx)
|
||||||
i += 4;
|
i += 4;
|
||||||
break;
|
break;
|
||||||
|
@ -3399,20 +3388,20 @@ basic_json_parser_59:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// the buffer
|
/// the buffer
|
||||||
const char* m_content = nullptr;
|
const typename string_t::value_type* m_content = nullptr;
|
||||||
/// pointer to he beginning of the current symbol
|
/// pointer to he beginning of the current symbol
|
||||||
const char* m_start = nullptr;
|
const typename string_t::value_type* m_start = nullptr;
|
||||||
/// pointer to the current symbol
|
/// pointer to the current symbol
|
||||||
const char* m_cursor = nullptr;
|
const typename string_t::value_type* m_cursor = nullptr;
|
||||||
/// pointer to the end of the buffer
|
/// pointer to the end of the buffer
|
||||||
const char* m_limit = nullptr;
|
const typename string_t::value_type* m_limit = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
class parser
|
class parser
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
/// constructor for strings
|
/// constructor for strings
|
||||||
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str())
|
inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
|
||||||
{
|
{
|
||||||
// read first token
|
// read first token
|
||||||
get_token();
|
get_token();
|
||||||
|
@ -3423,7 +3412,7 @@ basic_json_parser_59:
|
||||||
{
|
{
|
||||||
while (_is)
|
while (_is)
|
||||||
{
|
{
|
||||||
std::string input_line;
|
string_t input_line;
|
||||||
std::getline(_is, input_line);
|
std::getline(_is, input_line);
|
||||||
m_buffer += input_line;
|
m_buffer += input_line;
|
||||||
}
|
}
|
||||||
|
@ -3617,7 +3606,7 @@ basic_json_parser_59:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// the buffer
|
/// the buffer
|
||||||
std::string m_buffer;
|
string_t m_buffer;
|
||||||
/// the type of the last read token
|
/// the type of the last read token
|
||||||
typename lexer::token_type last_token = lexer::token_type::uninitialized;
|
typename lexer::token_type last_token = lexer::token_type::uninitialized;
|
||||||
/// the lexer
|
/// the lexer
|
||||||
|
|
|
@ -2456,7 +2456,10 @@ class basic_json
|
||||||
end_of_input
|
end_of_input
|
||||||
};
|
};
|
||||||
|
|
||||||
inline lexer(const char* s) : m_content(s)
|
/// the char type to use in the lexer
|
||||||
|
using lexer_char_t = typename string_t::value_type;
|
||||||
|
|
||||||
|
inline lexer(const typename string_t::value_type* s) : m_content(s)
|
||||||
{
|
{
|
||||||
m_start = m_cursor = m_content;
|
m_start = m_cursor = m_content;
|
||||||
m_limit = m_content + strlen(m_content);
|
m_limit = m_content + strlen(m_content);
|
||||||
|
@ -2464,46 +2467,39 @@ class basic_json
|
||||||
|
|
||||||
inline lexer() = default;
|
inline lexer() = default;
|
||||||
|
|
||||||
template<typename CharT>
|
inline static string_t to_unicode(const long codepoint)
|
||||||
inline static std::basic_string<CharT> to_unicode(const long codepoint)
|
|
||||||
{
|
{
|
||||||
std::string result;
|
string_t result;
|
||||||
|
|
||||||
if (codepoint <= 0x7f)
|
if (codepoint <= 0x7f)
|
||||||
{
|
{
|
||||||
// 1-byte (ASCII) characters: 0xxxxxxx
|
// 1-byte characters: 0xxxxxxx (ASCI)
|
||||||
result.append(1, static_cast<char>(codepoint));
|
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0x7ff)
|
else if (codepoint <= 0x7ff)
|
||||||
{
|
{
|
||||||
// 2-byte characters: 110xxxxx 10xxxxxx
|
// 2-byte characters: 110xxxxx 10xxxxxx
|
||||||
// the 0xC0 enables the two most significant bits to make this
|
result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
|
||||||
// a 2-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0xffff)
|
else if (codepoint <= 0xffff)
|
||||||
{
|
{
|
||||||
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
|
||||||
// the 0xE0 enables the three most significant bits to make
|
result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
|
||||||
// this a 3-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else if (codepoint <= 0x10ffff)
|
else if (codepoint <= 0x10ffff)
|
||||||
{
|
{
|
||||||
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
// the 0xF0 enables the four most significant bits to make this
|
result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
|
||||||
// a 4-byte UTF-8 character
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
|
result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
|
||||||
result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
|
|
||||||
result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
throw std::out_of_range("code point is invalid");
|
throw std::out_of_range("code points above 0x10FFFF are invalid");
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
@ -2557,15 +2553,13 @@ class basic_json
|
||||||
inline token_type scan()
|
inline token_type scan()
|
||||||
{
|
{
|
||||||
// pointer for backtracking information
|
// pointer for backtracking information
|
||||||
const char* m_marker = nullptr;
|
const typename string_t::value_type* m_marker = nullptr;
|
||||||
|
|
||||||
while (true)
|
|
||||||
{
|
|
||||||
// remember the begin of the token
|
// remember the begin of the token
|
||||||
m_start = m_cursor;
|
m_start = m_cursor;
|
||||||
|
|
||||||
/*!re2c
|
/*!re2c
|
||||||
re2c:define:YYCTYPE = char;
|
re2c:define:YYCTYPE = lexer_char_t;
|
||||||
re2c:define:YYCURSOR = m_cursor;
|
re2c:define:YYCURSOR = m_cursor;
|
||||||
re2c:define:YYLIMIT = m_limit;
|
re2c:define:YYLIMIT = m_limit;
|
||||||
re2c:define:YYMARKER = m_marker;
|
re2c:define:YYMARKER = m_marker;
|
||||||
|
@ -2576,7 +2570,7 @@ class basic_json
|
||||||
|
|
||||||
// whitespace
|
// whitespace
|
||||||
ws = [ \t\n\r]+;
|
ws = [ \t\n\r]+;
|
||||||
ws { continue; }
|
ws { return scan(); }
|
||||||
|
|
||||||
// structural characters
|
// structural characters
|
||||||
"[" { return token_type::begin_array; }
|
"[" { return token_type::begin_array; }
|
||||||
|
@ -2623,11 +2617,10 @@ class basic_json
|
||||||
. { return token_type::parse_error; }
|
. { return token_type::parse_error; }
|
||||||
*/
|
*/
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
inline std::string get_token() const
|
inline string_t get_token() const
|
||||||
{
|
{
|
||||||
return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
|
return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -2638,16 +2631,14 @@ class basic_json
|
||||||
from the pointer difference of the two pointers).
|
from the pointer difference of the two pointers).
|
||||||
|
|
||||||
@return string value of current token without opening and closing quotes
|
@return string value of current token without opening and closing quotes
|
||||||
|
|
||||||
@todo Take care of Unicode.
|
|
||||||
*/
|
*/
|
||||||
inline std::string get_string() const
|
inline string_t get_string() const
|
||||||
{
|
{
|
||||||
std::string result;
|
string_t result;
|
||||||
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
||||||
|
|
||||||
// iterate the result between the quotes
|
// iterate the result between the quotes
|
||||||
for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
|
for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
|
||||||
{
|
{
|
||||||
// process escaped characters
|
// process escaped characters
|
||||||
if (*i == '\\')
|
if (*i == '\\')
|
||||||
|
@ -2707,7 +2698,7 @@ class basic_json
|
||||||
// get code xxxx from \uxxxx
|
// get code xxxx from \uxxxx
|
||||||
auto codepoint = strtol(i + 1, nullptr, 16);
|
auto codepoint = strtol(i + 1, nullptr, 16);
|
||||||
// add unicode character(s)
|
// add unicode character(s)
|
||||||
result += to_unicode<char>(codepoint);
|
result += to_unicode(codepoint);
|
||||||
// skip the next four characters (\uxxxx)
|
// skip the next four characters (\uxxxx)
|
||||||
i += 4;
|
i += 4;
|
||||||
break;
|
break;
|
||||||
|
@ -2746,20 +2737,20 @@ class basic_json
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// the buffer
|
/// the buffer
|
||||||
const char* m_content = nullptr;
|
const typename string_t::value_type* m_content = nullptr;
|
||||||
/// pointer to he beginning of the current symbol
|
/// pointer to he beginning of the current symbol
|
||||||
const char* m_start = nullptr;
|
const typename string_t::value_type* m_start = nullptr;
|
||||||
/// pointer to the current symbol
|
/// pointer to the current symbol
|
||||||
const char* m_cursor = nullptr;
|
const typename string_t::value_type* m_cursor = nullptr;
|
||||||
/// pointer to the end of the buffer
|
/// pointer to the end of the buffer
|
||||||
const char* m_limit = nullptr;
|
const typename string_t::value_type* m_limit = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
class parser
|
class parser
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
/// constructor for strings
|
/// constructor for strings
|
||||||
inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str())
|
inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
|
||||||
{
|
{
|
||||||
// read first token
|
// read first token
|
||||||
get_token();
|
get_token();
|
||||||
|
@ -2770,7 +2761,7 @@ class basic_json
|
||||||
{
|
{
|
||||||
while (_is)
|
while (_is)
|
||||||
{
|
{
|
||||||
std::string input_line;
|
string_t input_line;
|
||||||
std::getline(_is, input_line);
|
std::getline(_is, input_line);
|
||||||
m_buffer += input_line;
|
m_buffer += input_line;
|
||||||
}
|
}
|
||||||
|
@ -2964,7 +2955,7 @@ class basic_json
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// the buffer
|
/// the buffer
|
||||||
std::string m_buffer;
|
string_t m_buffer;
|
||||||
/// the type of the last read token
|
/// the type of the last read token
|
||||||
typename lexer::token_type last_token = lexer::token_type::uninitialized;
|
typename lexer::token_type last_token = lexer::token_type::uninitialized;
|
||||||
/// the lexer
|
/// the lexer
|
||||||
|
|
|
@ -5517,8 +5517,8 @@ TEST_CASE("lexer class")
|
||||||
|
|
||||||
SECTION("to_unicode")
|
SECTION("to_unicode")
|
||||||
{
|
{
|
||||||
CHECK(json::lexer::to_unicode<char>(0x1F4A9) == "💩");
|
CHECK(json::lexer::to_unicode(0x1F4A9) == "💩");
|
||||||
CHECK_THROWS_AS(json::lexer::to_unicode<char>(0x110000), std::out_of_range);
|
CHECK_THROWS_AS(json::lexer::to_unicode(0x200000), std::out_of_range);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue