Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism

This commit is contained in:
Raphael Isemann 2015-01-10 10:36:30 +01:00
parent 13efc7a02a
commit 222aacc213
3 changed files with 81 additions and 0 deletions

View file

@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
result += '\n';
} else if (currentChar == 'r') {
result += '\r';
} else if (currentChar == 'u') {
pos_++;
result += parseUnicodeEscape();
} else {
error("expected one of \\,/,b,f,n,r,t behind backslash.");
}
@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
error("expected '\"'");
}
std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
// it's just a ASCII compatible codepoint,
// so we just interpret the point as a character
if (codepoint <= 0x7f) {
return std::string(1, static_cast<char>(codepoint));
}
else if (codepoint <= 0x7ff)
{
std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
}
else if (codepoint <= 0xffff)
{
std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
}
else if (codepoint <= 0x1fffff)
{
std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
return result;
} else {
std::string errorMessage = "Invalid codepoint: ";
errorMessage += codepoint;
error(errorMessage);
}
}
/*!
Parses the JSON style unicode escape sequence (\uXXXX).
@return the utf-8 character the escape sequence escaped
@pre An opening quote \p " was read in the main parse function @ref parse.
pos_ is the position after the opening quote.
@post The character after the closing quote \p " is the current character @ref
current_. Whitespace is skipped.
*/
std::string json::parser::parseUnicodeEscape() {
const auto startPos = pos_;
if (pos_ + 3 >= buffer_.size()) {
error("Got end of input while parsing unicode escape sequence \\uXXXX");
}
std::string hexCode(4, ' ');
for(; pos_ < startPos + 4; pos_++) {
char currentChar = buffer_[pos_];
if ( (currentChar >= '0' && currentChar <= '9')
|| (currentChar >= 'a' && currentChar <= 'f')
|| (currentChar >= 'A' && currentChar <= 'F')) {
// all is well, we have valid hexadecimal chars
// so we copy that char into our string
hexCode[pos_ - startPos] = currentChar;
} else {
error("Found non-hexadecimal character in unicode escape sequence!");
}
}
pos_--;
// case is safe as 4 hex characters can't present more than 16 bits
return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
}
/*!
This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the

View file

@ -418,6 +418,10 @@ class json
inline void error(const std::string&) __attribute__((noreturn));
/// parse a quoted string
inline std::string parseString();
/// transforms a unicode codepoint to it's UTF-8 presentation
inline std::string unicodeToUTF8(unsigned int codepoint);
/// parses a unicode escape sequence
inline std::string parseUnicodeEscape();
/// parse a Boolean "true"
inline void parseTrue();
/// parse a Boolean "false"

View file

@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
CHECK(json::parse("\"\\n\"") == json("\n"));
// escape unicode characters
CHECK(json::parse("\"\\u002F\"") == json("/"));
CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
// escaping senseless stuff
CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);