Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism
This commit is contained in:
parent
13efc7a02a
commit
222aacc213
3 changed files with 81 additions and 0 deletions
73
src/json.cc
73
src/json.cc
|
@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
|
||||||
result += '\n';
|
result += '\n';
|
||||||
} else if (currentChar == 'r') {
|
} else if (currentChar == 'r') {
|
||||||
result += '\r';
|
result += '\r';
|
||||||
|
} else if (currentChar == 'u') {
|
||||||
|
pos_++;
|
||||||
|
result += parseUnicodeEscape();
|
||||||
} else {
|
} else {
|
||||||
error("expected one of \\,/,b,f,n,r,t behind backslash.");
|
error("expected one of \\,/,b,f,n,r,t behind backslash.");
|
||||||
}
|
}
|
||||||
|
@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
|
||||||
error("expected '\"'");
|
error("expected '\"'");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
|
||||||
|
|
||||||
|
// it's just a ASCII compatible codepoint,
|
||||||
|
// so we just interpret the point as a character
|
||||||
|
if (codepoint <= 0x7f) {
|
||||||
|
return std::string(1, static_cast<char>(codepoint));
|
||||||
|
}
|
||||||
|
else if (codepoint <= 0x7ff)
|
||||||
|
{
|
||||||
|
std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
|
||||||
|
result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
else if (codepoint <= 0xffff)
|
||||||
|
{
|
||||||
|
std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
|
||||||
|
result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
|
||||||
|
result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
else if (codepoint <= 0x1fffff)
|
||||||
|
{
|
||||||
|
std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
|
||||||
|
result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
|
||||||
|
result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
|
||||||
|
result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
std::string errorMessage = "Invalid codepoint: ";
|
||||||
|
errorMessage += codepoint;
|
||||||
|
error(errorMessage);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*!
|
||||||
|
Parses the JSON style unicode escape sequence (\uXXXX).
|
||||||
|
|
||||||
|
@return the utf-8 character the escape sequence escaped
|
||||||
|
|
||||||
|
@pre An opening quote \p " was read in the main parse function @ref parse.
|
||||||
|
pos_ is the position after the opening quote.
|
||||||
|
|
||||||
|
@post The character after the closing quote \p " is the current character @ref
|
||||||
|
current_. Whitespace is skipped.
|
||||||
|
*/
|
||||||
|
std::string json::parser::parseUnicodeEscape() {
|
||||||
|
const auto startPos = pos_;
|
||||||
|
if (pos_ + 3 >= buffer_.size()) {
|
||||||
|
error("Got end of input while parsing unicode escape sequence \\uXXXX");
|
||||||
|
}
|
||||||
|
std::string hexCode(4, ' ');
|
||||||
|
for(; pos_ < startPos + 4; pos_++) {
|
||||||
|
char currentChar = buffer_[pos_];
|
||||||
|
if ( (currentChar >= '0' && currentChar <= '9')
|
||||||
|
|| (currentChar >= 'a' && currentChar <= 'f')
|
||||||
|
|| (currentChar >= 'A' && currentChar <= 'F')) {
|
||||||
|
// all is well, we have valid hexadecimal chars
|
||||||
|
// so we copy that char into our string
|
||||||
|
hexCode[pos_ - startPos] = currentChar;
|
||||||
|
} else {
|
||||||
|
error("Found non-hexadecimal character in unicode escape sequence!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pos_--;
|
||||||
|
// case is safe as 4 hex characters can't present more than 16 bits
|
||||||
|
return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
This function is called in case a \p "t" is read in the main parse function
|
This function is called in case a \p "t" is read in the main parse function
|
||||||
@ref parse. In the standard, the \p "true" token is the only candidate, so the
|
@ref parse. In the standard, the \p "true" token is the only candidate, so the
|
||||||
|
|
|
@ -418,6 +418,10 @@ class json
|
||||||
inline void error(const std::string&) __attribute__((noreturn));
|
inline void error(const std::string&) __attribute__((noreturn));
|
||||||
/// parse a quoted string
|
/// parse a quoted string
|
||||||
inline std::string parseString();
|
inline std::string parseString();
|
||||||
|
/// transforms a unicode codepoint to it's UTF-8 presentation
|
||||||
|
inline std::string unicodeToUTF8(unsigned int codepoint);
|
||||||
|
/// parses a unicode escape sequence
|
||||||
|
inline std::string parseUnicodeEscape();
|
||||||
/// parse a Boolean "true"
|
/// parse a Boolean "true"
|
||||||
inline void parseTrue();
|
inline void parseTrue();
|
||||||
/// parse a Boolean "false"
|
/// parse a Boolean "false"
|
||||||
|
|
|
@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
|
||||||
CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
|
CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
|
||||||
CHECK(json::parse("\"\\n\"") == json("\n"));
|
CHECK(json::parse("\"\\n\"") == json("\n"));
|
||||||
|
|
||||||
|
// escape unicode characters
|
||||||
|
CHECK(json::parse("\"\\u002F\"") == json("/"));
|
||||||
|
CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
|
||||||
|
|
||||||
// escaping senseless stuff
|
// escaping senseless stuff
|
||||||
CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
|
CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
|
||||||
CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
|
CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
|
||||||
|
|
Loading…
Reference in a new issue