+
+
\ No newline at end of file
diff --git a/.idea/json.iml b/.idea/json.iml
new file mode 100644
index 00000000..bc2cd874
--- /dev/null
+++ b/.idea/json.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 00000000..6b328020
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 00000000..cd370d1e
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 00000000..94a25f7f
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc0d2c70..9e19f57f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,9 @@ project(json)
# Enable C++11 and set flags for coverage testing
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O0 --coverage -fprofile-arcs -ftest-coverage")
+# Make everything public for testing purposes
+add_definitions(-Dprivate=public)
+
# If not specified, use Debug as build type (necessary for coverage testing)
if( NOT CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Debug CACHE STRING
diff --git a/src/json.cc b/src/json.cc
index 0eef1cc2..177977d7 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2096,23 +2096,30 @@ std::string json::parser::parseString()
// the result of the parse process
std::string result;
- // iterate with pos_ over the whole string
+ // iterate with pos_ over the whole input until we found the end and return
+ // or we exit via error()
for (; pos_ < buffer_.size(); pos_++)
{
char currentChar = buffer_[pos_];
- // uneven amount of backslashes means the user wants to escape something
if (!evenAmountOfBackslashes)
{
+ // uneven amount of backslashes means the user wants to escape
+ // something so we know there is a case such as '\X' or '\\\X' but
+ // we don't know yet what X is.
+ // at this point in the code, the currentChar has the value of X.
+
// slash, backslash and quote are copied as is
- if (currentChar == '/' or currentChar == '\\' or currentChar == '"')
+ if ( currentChar == '/'
+ || currentChar == '\\'
+ || currentChar == '"')
{
result += currentChar;
}
else
{
- // all other characters are replaced by their respective
- // special character
+ // all other characters are replaced by their respective special
+ // character
switch (currentChar)
{
case 't':
@@ -2140,12 +2147,26 @@ std::string json::parser::parseString()
result += '\r';
break;
}
+ case 'u':
+ {
+ // \uXXXX[\uXXXX] is used for escaping unicode, which
+ // has its own subroutine.
+ result += parseUnicodeEscape();
+ // the parsing process has brought us one step behind
+ // the unicode escape sequence:
+ // \uXXXX
+ // ^
+ // we need to go one character back or the parser would
+ // skip the character we are currently pointing at as
+ // the for-loop will increment pos_ after this iteration
+ pos_--;
+ break;
+ }
default:
{
- error("expected one of \\, /, b, f, n, r, t behind backslash.");
+ error("expected one of \\, /, b, f, n, r, t, u behind backslash.");
}
}
- // TODO implement \uXXXX
}
}
else
@@ -2164,7 +2185,7 @@ std::string json::parser::parseString()
}
else if (currentChar != '\\')
{
- // All non-backslash characters are added to the end of the
+ // all non-backslash characters are added to the end of the
// result string. The only backslashes we want in the result
// are the ones that are escaped (which happens above).
result += currentChar;
@@ -2192,6 +2213,191 @@ std::string json::parser::parseString()
error("expected '\"'");
}
+
+
+/*!
+Turns a code point into its UTF-8 representation.
+Only values <= 0x10ffff are valid code points; anything larger is rejected
+by calling error() with a message that contains the offending value.
+
+@return the UTF-8 representation (1 to 4 bytes) of the given code point
+
+@pre none; this method does not read pos_ or buffer_ itself
+
+@post pos_ and buffer_ are not modified by this method itself
+*/
+std::string json::parser::codePointToUTF8(unsigned int codePoint)
+{
+    // the bytes are assembled with bit manipulations: the pattern
+    // '(codePoint >> S) & 0xHH' extracts the bits that go into one byte,
+    // the leading '0xHH |' adds the UTF-8 marker bits of that byte.
+
+    // every static_cast<char> below receives a value that was masked to
+    // at most 8 bits (or range-checked by the enclosing branch).
+
+    // each string is created with its final length (1 to 4 bytes), filled
+    // with the lead byte, and the continuation bytes are overwritten after.
+
+    // NOTE: surrogate values (0xD800-0xDFFF) are not rejected here.
+
+    if (codePoint <= 0x7f)
+    {
+        // an ASCII compatible code point is encoded as a single byte
+        // holding the code point's value, so we return it unchanged
+        // as a one-character string.
+
+        return std::string(1, static_cast<char>(codePoint));
+    }
+    // if true, we need two bytes to encode this as UTF-8
+    else if (codePoint <= 0x7ff)
+    {
+        // the 0xC0 sets the two most significant bits of the lead byte
+        // to mark this as a two-byte UTF-8 character.
+        std::string result(2, static_cast<char>(0xC0 | ((codePoint >> 6) & 0x1F)));
+        result[1] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    // if true, we need three bytes to encode this as UTF-8
+    else if (codePoint <= 0xffff)
+    {
+        // the 0xE0 sets the three most significant bits of the lead byte
+        // to mark this as a three-byte UTF-8 character.
+        std::string result(3, static_cast<char>(0xE0 | ((codePoint >> 12) & 0x0F)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[2] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    // if true, we need the maximum of four bytes to encode this as UTF-8
+    else if (codePoint <= 0x10ffff)
+    {
+        // the 0xF0 sets the four most significant bits of the lead byte
+        // to mark this as a four-byte UTF-8 character.
+        std::string result(4, static_cast<char>(0xF0 | ((codePoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
+        result[2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[3] = static_cast<char>(0x80 | (codePoint & 0x3F));
+        return result;
+    }
+    else
+    {
+        // append the value as decimal text ('+=' with an integer adds one char)
+        std::string errorMessage = "Invalid codePoint: ";
+        errorMessage += std::to_string(codePoint);
+        error(errorMessage);
+    }
+}
+
+/*!
+Parses 4 hexadecimal characters as a number.
+
+@return the value of the number the hexadecimal characters represent
+        (0x0000 to 0xFFFF).
+
+@pre pos_ is pointing to the first of the 4 hexadecimal characters.
+
+@post pos_ is pointing to the character after the 4 hexadecimal characters.
+      If fewer than 4 characters remain or a character is not a hexadecimal
+      digit, error() is called instead and the function does not return.
+*/
+unsigned int json::parser::parse4HexCodePoint()
+{
+    // check if the remaining buffer is long enough to even hold 4 characters
+    if (pos_ + 3 >= buffer_.size())
+    {
+        error("Got end of input while parsing unicode escape sequence \\uXXXX");
+    }
+
+    // the first position behind the 4 characters we are going to consume
+    const auto endPos = pos_ + 4;
+    // the value is accumulated digit by digit, 4 bits per character
+    unsigned int codePoint = 0;
+
+    for (; pos_ < endPos; pos_++)
+    {
+        // no boundary check here as we already checked above
+        const char currentChar = buffer_[pos_];
+        unsigned int digit = 0;
+
+        // translate the hexadecimal character into its numeric value
+        if (currentChar >= '0' && currentChar <= '9')
+            digit = static_cast<unsigned int>(currentChar - '0');
+        else if (currentChar >= 'a' && currentChar <= 'f')
+            digit = static_cast<unsigned int>(currentChar - 'a') + 10;
+        else if (currentChar >= 'A' && currentChar <= 'F')
+            digit = static_cast<unsigned int>(currentChar - 'A') + 10;
+        else
+            error("Found non-hexadecimal character in unicode escape sequence!");
+
+        // make room for the next 4 bits and append the digit
+        codePoint = (codePoint << 4) | digit;
+    }
+    return codePoint;
+}
+
+/*!
+Parses the unicode escape codes as defined in the ECMA-404.
+The escape sequence has two forms:
+1. \uXXXX
+2. \uXXXX\uYYYY
+where X and Y are hexadecimal characters (0-9, a-f, A-F).
+
+Form 1 just contains the unicode code point in the hexadecimal number XXXX.
+Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low
+surrogate is YYYY. A surrogate that is not part of such a pair calls error().
+
+@return the UTF-8 encoding of the code point this escape sequence represents.
+
+@pre pos_ is pointing at the 'u' behind the first backslash.
+
+@post pos_ is pointing at the character behind the last X (or Y in form 2).
+*/
+std::string json::parser::parseUnicodeEscape()
+{
+    // jump to the first hex value
+    pos_++;
+    // parse the first four hex values
+    const unsigned int firstCodePoint = parse4HexCodePoint();
+
+    // a low surrogate is only valid directly behind a high surrogate
+    if (firstCodePoint >= 0xDC00 && firstCodePoint <= 0xDFFF)
+    {
+        error("missing high surrogate");
+    }
+
+    if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF)
+    {
+        // a high surrogate is no code point on its own: it must be followed
+        // by a second escape sequence holding the low surrogate (form 2).
+        // parse4HexCodePoint() left pos_ behind the last X, so the next two
+        // characters have to be the '\' and 'u' of that second sequence.
+        if (pos_ + 1 >= buffer_.size()
+                || buffer_[pos_] != '\\'
+                || buffer_[pos_ + 1] != 'u')
+        {
+            error("missing low surrogate");
+        }
+
+        // jump behind the \u; parse4HexCodePoint() does the boundary check
+        pos_ += 2;
+        const unsigned int secondCodePoint = parse4HexCodePoint();
+        if (secondCodePoint < 0xDC00 || secondCodePoint > 0xDFFF)
+        {
+            error("missing low surrogate");
+        }
+
+        // combine the pair: 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00).
+        // the three constants are folded into a single subtraction:
+        // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
+        const unsigned int finalCodePoint =
+            (firstCodePoint << 10) + secondCodePoint - 0x35FDC00;
+
+        return codePointToUTF8(finalCodePoint);
+    }
+    // we have form 1, so we just interpret the XXXX as a code point
+    return codePointToUTF8(firstCodePoint);
+}
+
+
/*!
This function is called in case a \p "t" is read in the main parse function
@ref parse. In the standard, the \p "true" token is the only candidate, so the
diff --git a/src/json.h b/src/json.h
index 59a57b84..126f96a3 100644
--- a/src/json.h
+++ b/src/json.h
@@ -421,6 +421,12 @@ class json
inline void error(const std::string&) __attribute__((noreturn));
/// parse a quoted string
inline std::string parseString();
+ /// transforms a unicode code point to its UTF-8 representation
+ std::string codePointToUTF8(unsigned int codePoint);
+ /// parses 4 hex characters that represent a unicode code point
+ inline unsigned int parse4HexCodePoint();
+ /// parses \uXXXX[\uXXXX] unicode escape characters
+ inline std::string parseUnicodeEscape();
/// parse a Boolean "true"
inline void parseTrue();
/// parse a Boolean "false"
diff --git a/test/json_unit.cc b/test/json_unit.cc
index a894c0a8..542eb436 100644
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@@ -1672,6 +1672,50 @@ TEST_CASE("Parser")
CHECK_THROWS_AS(json::parse("\""), std::invalid_argument);
}
+ SECTION("unicode_escaping")
+ {
+ // two tests for uppercase and lowercase hex
+
+ // normal forward slash in ASCII range
+ CHECK(json::parse("\"\\u002F\"") == json("/"));
+ CHECK(json::parse("\"\\u002f\"") == json("/"));
+ // german a umlaut
+ CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
+ CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4"));
+ // weird d
+ CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111"));
+ // unicode arrow left
+ CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190"));
+ // pleasing osiris by testing hieroglyph support
+ CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060"));
+ CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060"));
+
+
+ // no hex numbers behind the \u
+ CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument);
+ // missing part of a surrogate pair
+ CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument);
+ // senseless surrogate pair
+ CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument);
+ CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument);
+
+ // test private code point converter function
+ CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFE));
+ CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFF));
+ CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110000), std::invalid_argument);
+ CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110001), std::invalid_argument);
+ }
+
SECTION("boolean")
{
// accept the exact values