Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism
This commit is contained in:
parent
13efc7a02a
commit
222aacc213
3 changed files with 81 additions and 0 deletions
73
src/json.cc
73
src/json.cc
|
@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
|
|||
result += '\n';
|
||||
} else if (currentChar == 'r') {
|
||||
result += '\r';
|
||||
} else if (currentChar == 'u') {
|
||||
pos_++;
|
||||
result += parseUnicodeEscape();
|
||||
} else {
|
||||
error("expected one of \\,/,b,f,n,r,t behind backslash.");
|
||||
}
|
||||
|
@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
|
|||
error("expected '\"'");
|
||||
}
|
||||
|
||||
std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
|
||||
|
||||
// it's just a ASCII compatible codepoint,
|
||||
// so we just interpret the point as a character
|
||||
if (codepoint <= 0x7f) {
|
||||
return std::string(1, static_cast<char>(codepoint));
|
||||
}
|
||||
else if (codepoint <= 0x7ff)
|
||||
{
|
||||
std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
|
||||
result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||
return result;
|
||||
}
|
||||
else if (codepoint <= 0xffff)
|
||||
{
|
||||
std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
|
||||
result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
|
||||
result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||
return result;
|
||||
}
|
||||
else if (codepoint <= 0x1fffff)
|
||||
{
|
||||
std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
|
||||
result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
|
||||
result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
|
||||
result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||
return result;
|
||||
} else {
|
||||
std::string errorMessage = "Invalid codepoint: ";
|
||||
errorMessage += codepoint;
|
||||
error(errorMessage);
|
||||
}
|
||||
}
|
||||
|
||||
/*!
|
||||
Parses the JSON style unicode escape sequence (\uXXXX).
|
||||
|
||||
@return the utf-8 character the escape sequence escaped
|
||||
|
||||
@pre An opening quote \p " was read in the main parse function @ref parse.
|
||||
pos_ is the position after the opening quote.
|
||||
|
||||
@post The character after the closing quote \p " is the current character @ref
|
||||
current_. Whitespace is skipped.
|
||||
*/
|
||||
std::string json::parser::parseUnicodeEscape() {
|
||||
const auto startPos = pos_;
|
||||
if (pos_ + 3 >= buffer_.size()) {
|
||||
error("Got end of input while parsing unicode escape sequence \\uXXXX");
|
||||
}
|
||||
std::string hexCode(4, ' ');
|
||||
for(; pos_ < startPos + 4; pos_++) {
|
||||
char currentChar = buffer_[pos_];
|
||||
if ( (currentChar >= '0' && currentChar <= '9')
|
||||
|| (currentChar >= 'a' && currentChar <= 'f')
|
||||
|| (currentChar >= 'A' && currentChar <= 'F')) {
|
||||
// all is well, we have valid hexadecimal chars
|
||||
// so we copy that char into our string
|
||||
hexCode[pos_ - startPos] = currentChar;
|
||||
} else {
|
||||
error("Found non-hexadecimal character in unicode escape sequence!");
|
||||
}
|
||||
}
|
||||
pos_--;
|
||||
// case is safe as 4 hex characters can't present more than 16 bits
|
||||
return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*!
|
||||
This function is called in case a \p "t" is read in the main parse function
|
||||
@ref parse. In the standard, the \p "true" token is the only candidate, so the
|
||||
|
|
|
@ -418,6 +418,10 @@ class json
|
|||
inline void error(const std::string&) __attribute__((noreturn));
|
||||
/// parse a quoted string
|
||||
inline std::string parseString();
|
||||
/// transforms a unicode codepoint to its UTF-8 representation
|
||||
inline std::string unicodeToUTF8(unsigned int codepoint);
|
||||
/// parses a unicode escape sequence
|
||||
inline std::string parseUnicodeEscape();
|
||||
/// parse a Boolean "true"
|
||||
inline void parseTrue();
|
||||
/// parse a Boolean "false"
|
||||
|
|
|
@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
|
|||
CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
|
||||
CHECK(json::parse("\"\\n\"") == json("\n"));
|
||||
|
||||
// escape unicode characters
|
||||
CHECK(json::parse("\"\\u002F\"") == json("/"));
|
||||
CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
|
||||
|
||||
// escaping senseless stuff
|
||||
CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
|
||||
CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
|
||||
|
|
Loading…
Reference in a new issue