From bb558d8e6f7a014cd5902817eb773c409908df2b Mon Sep 17 00:00:00 2001 From: Niels Date: Sun, 15 Feb 2015 11:44:49 +0100 Subject: [PATCH] added Unicode support --- .gitignore | 2 + src/json.hpp | 130 +++++++++++++++++++++++++++++++++++++++++++-- src/json.hpp.re2c | 132 ++++++++++++++++++++++++++++++++++++++++++++-- test/unit.cpp | 37 +++++++------ 4 files changed, 277 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index fead35ad..71c7e865 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ utf8_test json_unit + +html diff --git a/src/json.hpp b/src/json.hpp index 077f605e..e86fec11 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -11,6 +11,7 @@ #define _NLOHMANN_JSON #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include /*! - ObjectType trick from http://stackoverflow.com/a/9860911 @@ -2464,6 +2464,51 @@ class basic_json inline lexer() = default; + template + inline static std::basic_string to_unicode(const long codepoint) + { + std::string result; + + if (codepoint <= 0x7f) + { + // 1-byte (ASCII) characters: 0xxxxxxx + result.append(1, static_cast(codepoint)); + } + else if (codepoint <= 0x7ff) + { + // 2-byte characters: 110xxxxx 10xxxxxx + // the 0xC0 enables the two most significant bits to make this + // a 2-byte UTF-8 character + result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else if (codepoint <= 0xffff) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + // the 0xE0 enables the three most significant bits to make + // this a 3-byte UTF-8 character + result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else if (codepoint <= 0x10ffff) + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // the 0xF0 enables the four most significant bits to make this + // a 4-byte UTF-8 character + result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else + { + throw std::out_of_range("code point is invalid"); + } + + return result; + } + inline static std::string token_type_name(token_type t) { switch (t) @@ -3241,7 +3286,7 @@ basic_json_parser_59: /*! The pointer m_start points to the opening quote of the string, and m_cursor past the closing quote of the string. We create a std::string - from the character after the opening quotes (m_begin+1) until the + from the character after the opening quotes (m_start+1) until the character before the closing quotes (hence subtracting 2 characters from the pointer difference of the two pointers). @@ -3251,7 +3296,86 @@ basic_json_parser_59: */ inline std::string get_string() const { - return std::string(m_start + 1, static_cast(m_cursor - m_start - 2)); + std::string result; + result.reserve(static_cast(m_cursor - m_start - 2)); + + // iterate the result between the quotes + for (const char* i = m_start + 1; i < m_cursor - 1; ++i) + { + // process escaped characters + if (*i == '\\') + { + // read next character + ++i; + + switch (*i) + { + // the default escapes + case 't': + { + result += "\t"; + break; + } + case 'b': + { + result += "\b"; + break; + } + case 'f': + { + result += "\f"; + break; + } + case 'n': + { + result += "\n"; + break; + } + case 'r': + { + result += "\r"; + break; + } + + // characters that are not "un"escsaped + case '\\': + { + result += "\\\\"; + break; + } + case '/': + { + result += "\\/"; + break; + } + case '"': + { + result += "\\\""; + break; + } + + // unicode + case 'u': + { + // get code xxxx from \uxxxx + auto codepoint = strtol(i + 1, nullptr, 16); + // add unicode character(s) + result += to_unicode(codepoint); + // skip the next four characters (\uxxxx) + i += 4; + break; + } + } + } + else + { + // all other characters are just copied to the end of the + // string + result.append(1, *i); + } + } + + return result; } inline number_float_t get_number() const diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 3ac3a4c0..c31d4518 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -11,6 +11,7 @@ #define _NLOHMANN_JSON #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include /*! - ObjectType trick from http://stackoverflow.com/a/9860911 @@ -2464,6 +2464,51 @@ class basic_json inline lexer() = default; + template + inline static std::basic_string to_unicode(const long codepoint) + { + std::string result; + + if (codepoint <= 0x7f) + { + // 1-byte (ASCII) characters: 0xxxxxxx + result.append(1, static_cast(codepoint)); + } + else if (codepoint <= 0x7ff) + { + // 2-byte characters: 110xxxxx 10xxxxxx + // the 0xC0 enables the two most significant bits to make this + // a 2-byte UTF-8 character + result.append(1, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else if (codepoint <= 0xffff) + { + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + // the 0xE0 enables the three most significant bits to make + // this a 3-byte UTF-8 character + result.append(1, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else if (codepoint <= 0x10ffff) + { + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // the 0xF0 enables the four most significant bits to make this + // a 4-byte UTF-8 character + result.append(1, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + result.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + result.append(1, static_cast(0x80 | (codepoint & 0x3F))); + } + else + { + throw std::out_of_range("code point is invalid"); + } + + return result; + } + inline static std::string token_type_name(token_type t) { switch (t) @@ -2508,8 +2553,6 @@ class basic_json with goto jumps. @return the class of the next token read from the buffer - - @todo Unicode support needs to be checked. */ inline token_type scan() { @@ -2590,7 +2633,7 @@ class basic_json /*! The pointer m_start points to the opening quote of the string, and m_cursor past the closing quote of the string. We create a std::string - from the character after the opening quotes (m_begin+1) until the + from the character after the opening quotes (m_start+1) until the character before the closing quotes (hence subtracting 2 characters from the pointer difference of the two pointers). @@ -2600,7 +2643,86 @@ class basic_json */ inline std::string get_string() const { - return std::string(m_start + 1, static_cast(m_cursor - m_start - 2)); + std::string result; + result.reserve(static_cast(m_cursor - m_start - 2)); + + // iterate the result between the quotes + for (const char* i = m_start + 1; i < m_cursor - 1; ++i) + { + // process escaped characters + if (*i == '\\') + { + // read next character + ++i; + + switch (*i) + { + // the default escapes + case 't': + { + result += "\t"; + break; + } + case 'b': + { + result += "\b"; + break; + } + case 'f': + { + result += "\f"; + break; + } + case 'n': + { + result += "\n"; + break; + } + case 'r': + { + result += "\r"; + break; + } + + // characters that are not "un"escsaped + case '\\': + { + result += "\\\\"; + break; + } + case '/': + { + result += "\\/"; + break; + } + case '"': + { + result += "\\\""; + break; + } + + // unicode + case 'u': + { + // get code xxxx from \uxxxx + auto codepoint = strtol(i + 1, nullptr, 16); + // add unicode character(s) + result += to_unicode(codepoint); + // skip the next four characters (\uxxxx) + i += 4; + break; + } + } + } + else + { + // all other characters are just copied to the end of the + // string + result.append(1, *i); + } + } + + return result; } inline number_float_t get_number() const diff --git a/test/unit.cpp b/test/unit.cpp index e85cf1f0..86d903fe 100644 --- a/test/unit.cpp +++ b/test/unit.cpp @@ -5577,27 +5577,27 @@ TEST_CASE("parser class") // solidus CHECK(json::parser("\"\\/\"").parse() == json("\\/")); // backspace - CHECK(json::parser("\"\\b\"").parse() == json("\\b")); + CHECK(json::parser("\"\\b\"").parse() == json("\b")); // formfeed - CHECK(json::parser("\"\\f\"").parse() == json("\\f")); + CHECK(json::parser("\"\\f\"").parse() == json("\f")); // newline - CHECK(json::parser("\"\\n\"").parse() == json("\\n")); + CHECK(json::parser("\"\\n\"").parse() == json("\n")); // carriage return - CHECK(json::parser("\"\\r\"").parse() == json("\\r")); + CHECK(json::parser("\"\\r\"").parse() == json("\r")); // horizontal tab - CHECK(json::parser("\"\\t\"").parse() == json("\\t")); + CHECK(json::parser("\"\\t\"").parse() == json("\t")); - CHECK(json::parser("\"\\u0000\"").parse() == json("\\u0000")); - CHECK(json::parser("\"\\u000a\"").parse() == json("\\u000a")); - CHECK(json::parser("\"\\u00b0\"").parse() == json("\\u00b0")); - CHECK(json::parser("\"\\u0c00\"").parse() == json("\\u0c00")); - CHECK(json::parser("\"\\ud000\"").parse() == json("\\ud000")); - CHECK(json::parser("\"\\u0000\"").parse() == json("\\u0000")); - CHECK(json::parser("\"\\u000E\"").parse() == json("\\u000E")); - CHECK(json::parser("\"\\u00F0\"").parse() == json("\\u00F0")); - CHECK(json::parser("\"\\u0100\"").parse() == json("\\u0100")); - CHECK(json::parser("\"\\u2000\"").parse() == json("\\u2000")); - CHECK(json::parser("\"\\uFFFF\"").parse() == json("\\uFFFF")); + CHECK(json::parser("\"\\u0001\"").parse().get() == "\x01"); + CHECK(json::parser("\"\\u000a\"").parse().get() == "\n"); + CHECK(json::parser("\"\\u00b0\"").parse().get() == "°"); + CHECK(json::parser("\"\\u0c00\"").parse().get() == "ఀ"); + CHECK(json::parser("\"\\ud000\"").parse().get() == "퀀"); + CHECK(json::parser("\"\\u000E\"").parse().get() == "\x0E"); + CHECK(json::parser("\"\\u00F0\"").parse().get() == "ð"); + CHECK(json::parser("\"\\u0100\"").parse().get() == "Ā"); + CHECK(json::parser("\"\\u2000\"").parse().get() == " "); + CHECK(json::parser("\"\\uFFFF\"").parse().get() == "￿"); + CHECK(json::parser("\"\\u20AC\"").parse().get() == "€"); } } @@ -5848,3 +5848,8 @@ TEST_CASE("parser class") } } } + +TEST_CASE() +{ + CHECK(json::parser("\"\\u0049\\u004e\"").parse().get() == "IN"); +} \ No newline at end of file