diff --git a/.idea/codeStyleSettings.xml b/.idea/codeStyleSettings.xml new file mode 100644 index 00000000..65c39702 --- /dev/null +++ b/.idea/codeStyleSettings.xml @@ -0,0 +1,35 @@ + + + + + + \ No newline at end of file diff --git a/.idea/json.iml b/.idea/json.iml new file mode 100644 index 00000000..bc2cd874 --- /dev/null +++ b/.idea/json.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..6b328020 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..cd370d1e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..94a25f7f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index fc0d2c70..9e19f57f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,9 @@ project(json) # Enable C++11 and set flags for coverage testing SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O0 --coverage -fprofile-arcs -ftest-coverage") +# Make everything public for testing purposes +add_definitions(-Dprivate=public) + # If not specified, use Debug as build type (necessary for coverage testing) if( NOT CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Debug CACHE STRING diff --git a/src/json.cc b/src/json.cc index 0eef1cc2..177977d7 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2096,23 +2096,30 @@ std::string json::parser::parseString() // the result of the parse process std::string result; - // iterate with pos_ over the whole string + // iterate with pos_ over the whole input until we found the end and return + // or we exit via error() for (; pos_ < buffer_.size(); pos_++) { char currentChar = buffer_[pos_]; - // uneven amount of backslashes means the user wants to escape something if (!evenAmountOfBackslashes) { + // uneven amount of backslashes means the user wants to escape + // something so we know there is a case such as '\X' or '\\\X' but + // we don't know yet what X is. + // at this point in the code, the currentChar has the value of X. + // slash, backslash and quote are copied as is - if (currentChar == '/' or currentChar == '\\' or currentChar == '"') + if ( currentChar == '/' + || currentChar == '\\' + || currentChar == '"') { result += currentChar; } else { - // all other characters are replaced by their respective - // special character + // all other characters are replaced by their respective special + // character switch (currentChar) { case 't': @@ -2140,12 +2147,26 @@ std::string json::parser::parseString() result += '\r'; break; } + case 'u': + { + // \uXXXX[\uXXXX] is used for escaping unicode, which + // has it's own subroutine. + result += parseUnicodeEscape(); + // the parsing process has brought us one step behind + // the unicode escape sequence: + // \uXXXX + // ^ + // we need to go one character back or the parser would + // skip the character we are currently pointing at as + // the for-loop will decrement pos_ after this iteration + pos_--; + break; + } default: { - error("expected one of \\, /, b, f, n, r, t behind backslash."); + error("expected one of \\, /, b, f, n, r, t, u behind backslash."); } } - // TODO implement \uXXXX } } else @@ -2164,7 +2185,7 @@ std::string json::parser::parseString() } else if (currentChar != '\\') { - // All non-backslash characters are added to the end of the + // all non-backslash characters are added to the end of the // result string. The only backslashes we want in the result // are the ones that are escaped (which happens above). result += currentChar; @@ -2192,6 +2213,191 @@ std::string json::parser::parseString() error("expected '\"'"); } + + +/*! +Turns a code point into it's UTF-8 representation. +You should only pass numbers < 0x10ffff into this function +(everything else is a invalid code point). + +@return the UTF-8 representation of the given code point + +@pre This method isn't accessing the members of the parser + +@post This method isn't accessing the members of the parser +*/ +std::string json::parser::codePointToUTF8(unsigned int codePoint) +{ + // this method contains a lot of bit manipulations to + // build the bytes for UTF-8. + + // the '(... >> S) & 0xHH'-patterns are used to retrieve + // certain bits from the code points. + + // all static casts in this method have boundary checks + + // we initialize all strings with their final length + // (e.g. 1 to 4 bytes) to save the reallocations. + + + if (codePoint <= 0x7f) + { + // it's just a ASCII compatible codePoint, + // so we just interpret the point as a character + // and return ASCII + + return std::string(1, static_cast(codePoint)); + } + // if true, we need two bytes to encode this as UTF-8 + else if (codePoint <= 0x7ff) + { + // the 0xC0 enables the two most significant two bits + // to make this a two-byte UTF-8 character. + std::string result(2, static_cast(0xC0 | ((codePoint >> 6) & 0x1F))); + result[1] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + // if true, now we need three bytes to encode this as UTF-8 + else if (codePoint <= 0xffff) + { + // the 0xE0 enables the three most significant two bits + // to make this a three-byte UTF-8 character. + std::string result(3, static_cast(0xE0 | ((codePoint >> 12) & 0x0F))); + result[1] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + // if true, we need maximal four bytes to encode this as UTF-8 + else if (codePoint <= 0x10ffff) + { + // the 0xE0 enables the four most significant two bits + // to make this a three-byte UTF-8 character. + std::string result(4, static_cast(0xF0 | ((codePoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codePoint >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + else + { + // Can't be tested without direct access to this private method. + std::string errorMessage = "Invalid codePoint: "; + errorMessage += codePoint; + error(errorMessage); + } +} + +/*! +Parses 4 hexadecimal characters as a number. + +@return the value of the number the hexadecimal characters represent. + +@pre pos_ is pointing to the first of the 4 hexadecimal characters. + +@post pos_ is pointing to the character after the 4 hexadecimal characters. +*/ +unsigned int json::parser::parse4HexCodePoint() +{ + const auto startPos = pos_; + + // check if the remaining buffer is long enough to even hold 4 characters + if (pos_ + 3 >= buffer_.size()) + { + error("Got end of input while parsing unicode escape sequence \\uXXXX"); + } + + // make a string that can hold the pair + std::string hexCode(4, ' '); + + for(; pos_ < startPos + 4; pos_++) + { + // no boundary check here as we already checked above + char currentChar = buffer_[pos_]; + + // check if we have a hexadecimal character + if ( (currentChar >= '0' && currentChar <= '9') + || (currentChar >= 'a' && currentChar <= 'f') + || (currentChar >= 'A' && currentChar <= 'F')) + { + // all is well, we have valid hexadecimal chars + // so we copy that char into our string + hexCode[pos_ - startPos] = currentChar; + } + else + { + error("Found non-hexadecimal character in unicode escape sequence!"); + } + } + // the cast is safe as 4 hex characters can't present more than 16 bits + // the input to stoul was checked to contain only hexadecimal characters + // (see above) + return static_cast(std::stoul(hexCode, nullptr, 16)); +} + +/*! +Parses the unicode escape codes as defined in the ECMA-404. +The escape sequence has two forms: +1. \uXXXX +2. \uXXXX\uYYYY +where X and Y are a hexadecimal character (a-zA-Z0-9). + +Form 1 just contains the unicode code point in the hexadecimal number XXXX. +Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low +surrogate is YYYY. + +@return the UTF-8 character this unicode escape sequence escaped. + +@pre pos_ is pointing at at the 'u' behind the first backslash. + +@post pos_ is pointing at the character behind the last X (or Y in form 2). +*/ +std::string json::parser::parseUnicodeEscape() +{ + // jump to the first hex value + pos_++; + // parse the hex first hex values + unsigned int firstCodePoint = parse4HexCodePoint(); + + + if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF) + { + // we found invalid code points, which means we either have a malformed + // input or we found a high surrogate. + // we can only find out by seeing if the next character also wants to + // encode a unicode character (so, we have the \uXXXX\uXXXX case here). + + // jump behind the next \u + pos_ += 2; + // try to parse the next hex values. + // the method does boundary checking for us, so no need to do that here + unsigned secondCodePoint = parse4HexCodePoint(); + // ok, we have a low surrogate, check if it is a valid one + if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF) + { + // calculate the code point from the pair according to the spec + unsigned int finalCodePoint = + // high surrogate occupies the most significant 22 bits + (firstCodePoint << 10) + // low surrogate occupies the least significant 15 bits + + secondCodePoint + // there is still the 0xD800, 0xDC00 and 0x10000 noise in + // the result + // so we have to substract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + + // we transform the calculated point into UTF-8 + return codePointToUTF8(finalCodePoint); + } + else + error("missing low surrogate"); + + } + // We have Form 1, so we just interpret the XXXX as a code point + return codePointToUTF8(firstCodePoint); +} + + /*! This function is called in case a \p "t" is read in the main parse function @ref parse. In the standard, the \p "true" token is the only candidate, so the diff --git a/src/json.h b/src/json.h index 59a57b84..126f96a3 100644 --- a/src/json.h +++ b/src/json.h @@ -421,6 +421,12 @@ class json inline void error(const std::string&) __attribute__((noreturn)); /// parse a quoted string inline std::string parseString(); + /// transforms a unicode codepoint to it's UTF-8 presentation + std::string codePointToUTF8(unsigned int codePoint); + /// parses 4 hex characters that represent a unicode code point + inline unsigned int parse4HexCodePoint(); + /// parses \uXXXX[\uXXXX] unicode escape characters + inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); /// parse a Boolean "false" diff --git a/test/json_unit.cc b/test/json_unit.cc index a894c0a8..542eb436 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1672,6 +1672,50 @@ TEST_CASE("Parser") CHECK_THROWS_AS(json::parse("\""), std::invalid_argument); } + SECTION("unicode_escaping") + { + // two tests for uppercase and lowercase hex + + // normal forward slash in ASCII range + CHECK(json::parse("\"\\u002F\"") == json("/")); + CHECK(json::parse("\"\\u002f\"") == json("/")); + // german a umlaut + CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); + CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4")); + // weird d + CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111")); + // unicode arrow left + CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190")); + // pleasing osiris by testing hieroglyph support + CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060")); + CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060")); + + + // no hex numbers behind the \u + CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument); + // missing part of a surrogate pair + CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument); + // senseless surrogate pair + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument); + + // test private code point converter function + CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFE)); + CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFF)); + CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110000), std::invalid_argument); + CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110001), std::invalid_argument); + } + SECTION("boolean") { // accept the exact values