added Unicode support
This commit is contained in:
parent
6ef3cb51a4
commit
bb558d8e6f
4 changed files with 277 additions and 24 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -2,3 +2,5 @@
|
||||||
utf8_test
|
utf8_test
|
||||||
|
|
||||||
json_unit
|
json_unit
|
||||||
|
|
||||||
|
html
|
||||||
|
|
130
src/json.hpp
130
src/json.hpp
|
@ -11,6 +11,7 @@
|
||||||
#define _NLOHMANN_JSON
|
#define _NLOHMANN_JSON
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <initializer_list>
|
#include <initializer_list>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -22,7 +23,6 @@
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
- ObjectType trick from http://stackoverflow.com/a/9860911
|
- ObjectType trick from http://stackoverflow.com/a/9860911
|
||||||
|
@ -2464,6 +2464,51 @@ class basic_json
|
||||||
|
|
||||||
inline lexer() = default;
|
inline lexer() = default;
|
||||||
|
|
||||||
|
/*!
@brief encode a Unicode code point as a sequence of UTF-8 code units

Every UTF-8 code unit is stored in one element of the returned string.

@tparam CharT  character type of the returned string
@param codepoint  the code point to encode; must be within 0 .. 0x10FFFF
@return a string holding one to four UTF-8 code units

@throw std::out_of_range if @a codepoint is negative or above 0x10FFFF

@note Values in the surrogate range 0xD800 .. 0xDFFF are not rejected; they
      are encoded with the regular 3-byte pattern.
*/
template<typename CharT>
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{
    // use the requested character type; a plain std::string would only
    // convert to the return type for CharT = char
    std::basic_string<CharT> result;

    if (codepoint < 0)
    {
        // negative values are no code points; without this check they
        // would be emitted by the 1-byte branch below
        throw std::out_of_range("code point is invalid");
    }
    else if (codepoint <= 0x7f)
    {
        // 1-byte (ASCII) characters: 0xxxxxxx
        result.append(1, static_cast<CharT>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        // 2-byte characters: 110xxxxx 10xxxxxx
        // the 0xC0 enables the two most significant bits to make this
        // a 2-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else if (codepoint <= 0xffff)
    {
        // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
        // the 0xE0 enables the three most significant bits to make
        // this a 3-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else if (codepoint <= 0x10ffff)
    {
        // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // the 0xF0 enables the four most significant bits to make this
        // a 4-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else
    {
        // beyond the last Unicode code point
        throw std::out_of_range("code point is invalid");
    }

    return result;
}
|
||||||
|
|
||||||
inline static std::string token_type_name(token_type t)
|
inline static std::string token_type_name(token_type t)
|
||||||
{
|
{
|
||||||
switch (t)
|
switch (t)
|
||||||
|
@ -3241,7 +3286,7 @@ basic_json_parser_59:
|
||||||
/*!
|
/*!
|
||||||
The pointer m_start points to the opening quote of the string, and
|
The pointer m_start points to the opening quote of the string, and
|
||||||
m_cursor past the closing quote of the string. We create a std::string
|
m_cursor past the closing quote of the string. We create a std::string
|
||||||
from the character after the opening quotes (m_begin+1) until the
|
from the character after the opening quotes (m_start+1) until the
|
||||||
character before the closing quotes (hence subtracting 2 characters
|
character before the closing quotes (hence subtracting 2 characters
|
||||||
from the pointer difference of the two pointers).
|
from the pointer difference of the two pointers).
|
||||||
|
|
||||||
|
@ -3251,7 +3296,86 @@ basic_json_parser_59:
|
||||||
*/
|
*/
|
||||||
inline std::string get_string() const
|
inline std::string get_string() const
|
||||||
{
|
{
|
||||||
return std::string(m_start + 1, static_cast<size_t>(m_cursor - m_start - 2));
|
std::string result;
|
||||||
|
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
||||||
|
|
||||||
|
// iterate the result between the quotes
|
||||||
|
for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
|
||||||
|
{
|
||||||
|
// process escaped characters
|
||||||
|
if (*i == '\\')
|
||||||
|
{
|
||||||
|
// read next character
|
||||||
|
++i;
|
||||||
|
|
||||||
|
switch (*i)
|
||||||
|
{
|
||||||
|
// the default escapes
|
||||||
|
case 't':
|
||||||
|
{
|
||||||
|
result += "\t";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'b':
|
||||||
|
{
|
||||||
|
result += "\b";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'f':
|
||||||
|
{
|
||||||
|
result += "\f";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'n':
|
||||||
|
{
|
||||||
|
result += "\n";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'r':
|
||||||
|
{
|
||||||
|
result += "\r";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// characters that are not "un"escsaped
|
||||||
|
case '\\':
|
||||||
|
{
|
||||||
|
result += "\\\\";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '/':
|
||||||
|
{
|
||||||
|
result += "\\/";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '"':
|
||||||
|
{
|
||||||
|
result += "\\\"";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// unicode
|
||||||
|
case 'u':
|
||||||
|
{
|
||||||
|
// get code xxxx from \uxxxx
|
||||||
|
auto codepoint = strtol(i + 1, nullptr, 16);
|
||||||
|
// add unicode character(s)
|
||||||
|
result += to_unicode<char>(codepoint);
|
||||||
|
// skip the next four characters (\uxxxx)
|
||||||
|
i += 4;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// all other characters are just copied to the end of the
|
||||||
|
// string
|
||||||
|
result.append(1, *i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline number_float_t get_number() const
|
inline number_float_t get_number() const
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
#define _NLOHMANN_JSON
|
#define _NLOHMANN_JSON
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <initializer_list>
|
#include <initializer_list>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
@ -22,7 +23,6 @@
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cmath>
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
- ObjectType trick from http://stackoverflow.com/a/9860911
|
- ObjectType trick from http://stackoverflow.com/a/9860911
|
||||||
|
@ -2464,6 +2464,51 @@ class basic_json
|
||||||
|
|
||||||
inline lexer() = default;
|
inline lexer() = default;
|
||||||
|
|
||||||
|
/*!
@brief encode a Unicode code point as a sequence of UTF-8 code units

Every UTF-8 code unit is stored in one element of the returned string.

@tparam CharT  character type of the returned string
@param codepoint  the code point to encode; must be within 0 .. 0x10FFFF
@return a string holding one to four UTF-8 code units

@throw std::out_of_range if @a codepoint is negative or above 0x10FFFF

@note Values in the surrogate range 0xD800 .. 0xDFFF are not rejected; they
      are encoded with the regular 3-byte pattern.
*/
template<typename CharT>
inline static std::basic_string<CharT> to_unicode(const long codepoint)
{
    // use the requested character type; a plain std::string would only
    // convert to the return type for CharT = char
    std::basic_string<CharT> result;

    if (codepoint < 0)
    {
        // negative values are no code points; without this check they
        // would be emitted by the 1-byte branch below
        throw std::out_of_range("code point is invalid");
    }
    else if (codepoint <= 0x7f)
    {
        // 1-byte (ASCII) characters: 0xxxxxxx
        result.append(1, static_cast<CharT>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        // 2-byte characters: 110xxxxx 10xxxxxx
        // the 0xC0 enables the two most significant bits to make this
        // a 2-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else if (codepoint <= 0xffff)
    {
        // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
        // the 0xE0 enables the three most significant bits to make
        // this a 3-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else if (codepoint <= 0x10ffff)
    {
        // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // the 0xF0 enables the four most significant bits to make this
        // a 4-byte UTF-8 character
        result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
        result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
    }
    else
    {
        // beyond the last Unicode code point
        throw std::out_of_range("code point is invalid");
    }

    return result;
}
|
||||||
|
|
||||||
inline static std::string token_type_name(token_type t)
|
inline static std::string token_type_name(token_type t)
|
||||||
{
|
{
|
||||||
switch (t)
|
switch (t)
|
||||||
|
@ -2508,8 +2553,6 @@ class basic_json
|
||||||
with goto jumps.
|
with goto jumps.
|
||||||
|
|
||||||
@return the class of the next token read from the buffer
|
@return the class of the next token read from the buffer
|
||||||
|
|
||||||
@todo Unicode support needs to be checked.
|
|
||||||
*/
|
*/
|
||||||
inline token_type scan()
|
inline token_type scan()
|
||||||
{
|
{
|
||||||
|
@ -2590,7 +2633,7 @@ class basic_json
|
||||||
/*!
|
/*!
|
||||||
The pointer m_start points to the opening quote of the string, and
|
The pointer m_start points to the opening quote of the string, and
|
||||||
m_cursor past the closing quote of the string. We create a std::string
|
m_cursor past the closing quote of the string. We create a std::string
|
||||||
from the character after the opening quotes (m_begin+1) until the
|
from the character after the opening quotes (m_start+1) until the
|
||||||
character before the closing quotes (hence subtracting 2 characters
|
character before the closing quotes (hence subtracting 2 characters
|
||||||
from the pointer difference of the two pointers).
|
from the pointer difference of the two pointers).
|
||||||
|
|
||||||
|
@ -2600,7 +2643,86 @@ class basic_json
|
||||||
*/
|
*/
|
||||||
inline std::string get_string() const
|
inline std::string get_string() const
|
||||||
{
|
{
|
||||||
return std::string(m_start + 1, static_cast<size_t>(m_cursor - m_start - 2));
|
std::string result;
|
||||||
|
result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
|
||||||
|
|
||||||
|
// iterate the result between the quotes
|
||||||
|
for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
|
||||||
|
{
|
||||||
|
// process escaped characters
|
||||||
|
if (*i == '\\')
|
||||||
|
{
|
||||||
|
// read next character
|
||||||
|
++i;
|
||||||
|
|
||||||
|
switch (*i)
|
||||||
|
{
|
||||||
|
// the default escapes
|
||||||
|
case 't':
|
||||||
|
{
|
||||||
|
result += "\t";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'b':
|
||||||
|
{
|
||||||
|
result += "\b";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'f':
|
||||||
|
{
|
||||||
|
result += "\f";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'n':
|
||||||
|
{
|
||||||
|
result += "\n";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'r':
|
||||||
|
{
|
||||||
|
result += "\r";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// characters that are not "un"escsaped
|
||||||
|
case '\\':
|
||||||
|
{
|
||||||
|
result += "\\\\";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '/':
|
||||||
|
{
|
||||||
|
result += "\\/";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case '"':
|
||||||
|
{
|
||||||
|
result += "\\\"";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// unicode
|
||||||
|
case 'u':
|
||||||
|
{
|
||||||
|
// get code xxxx from \uxxxx
|
||||||
|
auto codepoint = strtol(i + 1, nullptr, 16);
|
||||||
|
// add unicode character(s)
|
||||||
|
result += to_unicode<char>(codepoint);
|
||||||
|
// skip the next four characters (\uxxxx)
|
||||||
|
i += 4;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// all other characters are just copied to the end of the
|
||||||
|
// string
|
||||||
|
result.append(1, *i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline number_float_t get_number() const
|
inline number_float_t get_number() const
|
||||||
|
|
|
@ -5577,27 +5577,27 @@ TEST_CASE("parser class")
|
||||||
// solidus
|
// solidus
|
||||||
CHECK(json::parser("\"\\/\"").parse() == json("\\/"));
|
CHECK(json::parser("\"\\/\"").parse() == json("\\/"));
|
||||||
// backspace
|
// backspace
|
||||||
CHECK(json::parser("\"\\b\"").parse() == json("\\b"));
|
CHECK(json::parser("\"\\b\"").parse() == json("\b"));
|
||||||
// formfeed
|
// formfeed
|
||||||
CHECK(json::parser("\"\\f\"").parse() == json("\\f"));
|
CHECK(json::parser("\"\\f\"").parse() == json("\f"));
|
||||||
// newline
|
// newline
|
||||||
CHECK(json::parser("\"\\n\"").parse() == json("\\n"));
|
CHECK(json::parser("\"\\n\"").parse() == json("\n"));
|
||||||
// carriage return
|
// carriage return
|
||||||
CHECK(json::parser("\"\\r\"").parse() == json("\\r"));
|
CHECK(json::parser("\"\\r\"").parse() == json("\r"));
|
||||||
// horizontal tab
|
// horizontal tab
|
||||||
CHECK(json::parser("\"\\t\"").parse() == json("\\t"));
|
CHECK(json::parser("\"\\t\"").parse() == json("\t"));
|
||||||
|
|
||||||
CHECK(json::parser("\"\\u0000\"").parse() == json("\\u0000"));
|
CHECK(json::parser("\"\\u0001\"").parse().get<json::string_t>() == "\x01");
|
||||||
CHECK(json::parser("\"\\u000a\"").parse() == json("\\u000a"));
|
CHECK(json::parser("\"\\u000a\"").parse().get<json::string_t>() == "\n");
|
||||||
CHECK(json::parser("\"\\u00b0\"").parse() == json("\\u00b0"));
|
CHECK(json::parser("\"\\u00b0\"").parse().get<json::string_t>() == "°");
|
||||||
CHECK(json::parser("\"\\u0c00\"").parse() == json("\\u0c00"));
|
CHECK(json::parser("\"\\u0c00\"").parse().get<json::string_t>() == "ఀ");
|
||||||
CHECK(json::parser("\"\\ud000\"").parse() == json("\\ud000"));
|
CHECK(json::parser("\"\\ud000\"").parse().get<json::string_t>() == "퀀");
|
||||||
CHECK(json::parser("\"\\u0000\"").parse() == json("\\u0000"));
|
CHECK(json::parser("\"\\u000E\"").parse().get<json::string_t>() == "\x0E");
|
||||||
CHECK(json::parser("\"\\u000E\"").parse() == json("\\u000E"));
|
CHECK(json::parser("\"\\u00F0\"").parse().get<json::string_t>() == "ð");
|
||||||
CHECK(json::parser("\"\\u00F0\"").parse() == json("\\u00F0"));
|
CHECK(json::parser("\"\\u0100\"").parse().get<json::string_t>() == "Ā");
|
||||||
CHECK(json::parser("\"\\u0100\"").parse() == json("\\u0100"));
|
CHECK(json::parser("\"\\u2000\"").parse().get<json::string_t>() == " ");
|
||||||
CHECK(json::parser("\"\\u2000\"").parse() == json("\\u2000"));
|
CHECK(json::parser("\"\\uFFFF\"").parse().get<json::string_t>() == "");
|
||||||
CHECK(json::parser("\"\\uFFFF\"").parse() == json("\\uFFFF"));
|
CHECK(json::parser("\"\\u20AC\"").parse().get<json::string_t>() == "€");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5848,3 +5848,8 @@ TEST_CASE("parser class")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE()
|
||||||
|
{
|
||||||
|
CHECK(json::parser("\"\\u0049\\u004e\"").parse().get<json::string_t>() == "IN");
|
||||||
|
}
|
Loading…
Reference in a new issue