From 222aacc213c0b34d275a119df8e7cabb44993af2 Mon Sep 17 00:00:00 2001
From: Raphael Isemann
Date: Sat, 10 Jan 2015 10:36:30 +0100
Subject: [PATCH] Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism

---
 src/json.cc       | 73 +++++++++++++++++++++++++++++++++++++++++++++++
 src/json.h        |  4 +++
 test/json_unit.cc |  4 +++
 3 files changed, 81 insertions(+)

diff --git a/src/json.cc b/src/json.cc
index 1666b23c..98225167 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
                 result += '\n';
             } else if (currentChar == 'r') {
                 result += '\r';
+            } else if (currentChar == 'u') {
+                pos_++;
+                result += parseUnicodeEscape();
             } else {
                 error("expected one of \\,/,b,f,n,r,t behind backslash.");
             }
@@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
     error("expected '\"'");
 }
 
+/*!
+Encodes a Unicode code point as a UTF-8 byte sequence.
+
+@param codepoint  the code point to encode
+@return the one to four bytes that encode \p codepoint
+@note Values above U+10FFFF are reported through @ref error. Surrogates
+      (U+D800 to U+DFFF) are not rejected and are encoded as three bytes.
+*/
+std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
+    if (codepoint <= 0x7f) {
+        return std::string(1, static_cast<char>(codepoint)); // ASCII is encoded as itself
+    } else if (codepoint <= 0x7ff) {
+        std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
+        result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    } else if (codepoint <= 0xffff) {
+        std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    } else if (codepoint <= 0x10ffff) {
+        std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
+        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    } else {
+        // append the decimal value; "+= codepoint" would append one raw char
+        std::string errorMessage = "Invalid codepoint: ";
+        errorMessage += std::to_string(codepoint);
+        error(errorMessage);
+    }
+}
+
+/*!
+Parses the four hex digits of a JSON unicode escape sequence (\uXXXX).
+
+@return the UTF-8 encoding of the code point named by the four hex digits
+
+@pre The escape introducer \\u was consumed by @ref parseString and pos_ is
+     the position of the first hex digit.
+
+@post pos_ is the position of the last hex digit of the escape sequence.
+      Only one 16-bit unit is decoded; surrogate pairs are not combined.
+*/
+std::string json::parser::parseUnicodeEscape() {
+    const auto startPos = pos_;
+    if (pos_ + 3 >= buffer_.size()) {
+        error("Got end of input while parsing unicode escape sequence \\uXXXX");
+    }
+    std::string hexCode(4, ' ');
+    for(; pos_ < startPos + 4; pos_++) {
+        const char currentChar = buffer_[pos_];
+        if (  (currentChar >= '0' && currentChar <= '9')
+           || (currentChar >= 'a' && currentChar <= 'f')
+           || (currentChar >= 'A' && currentChar <= 'F')) {
+            // all is well, we have a valid hexadecimal char
+            // so we copy that char into our string
+            hexCode[pos_ - startPos] = currentChar;
+        } else {
+            error("Found non-hexadecimal character in unicode escape sequence!");
+        }
+    }
+    pos_--;
+    // cast is safe as 4 hex characters can't represent more than 16 bits
+    return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
+}
+
+
+
 /*!
 This function is called in case a \p "t" is read in the main parse function
 @ref parse.
In the standard, the \p "true" token is the only candidate, so the diff --git a/src/json.h b/src/json.h index 2dd99348..1b5e8fcf 100644 --- a/src/json.h +++ b/src/json.h @@ -418,6 +418,10 @@ class json inline void error(const std::string&) __attribute__((noreturn)); /// parse a quoted string inline std::string parseString(); + /// transforms a unicode codepoint to its UTF-8 representation + inline std::string unicodeToUTF8(unsigned int codepoint); + /// parses a unicode escape sequence + inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); /// parse a Boolean "false" diff --git a/test/json_unit.cc b/test/json_unit.cc index b2fcd65e..fb89a2a1 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1652,6 +1652,10 @@ TEST_CASE("Parser") CHECK(json::parse("\"a\\nz\"") == json("a\nz")); CHECK(json::parse("\"\\n\"") == json("\n")); + // escape unicode characters + CHECK(json::parse("\"\\u002F\"") == json("/")); + CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); + // escaping senseless stuff CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);