From 222aacc213c0b34d275a119df8e7cabb44993af2 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 10:36:30 +0100 Subject: [PATCH 1/6] Quick and dirty implementation for basic multilingual plane in the unicode escape mechanism --- src/json.cc | 73 +++++++++++++++++++++++++++++++++++++++++++++++ src/json.h | 4 +++ test/json_unit.cc | 4 +++ 3 files changed, 81 insertions(+) diff --git a/src/json.cc b/src/json.cc index 1666b23c..98225167 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2073,6 +2073,9 @@ std::string json::parser::parseString() result += '\n'; } else if (currentChar == 'r') { result += '\r'; + } else if (currentChar == 'u') { + pos_++; + result += parseUnicodeEscape(); } else { error("expected one of \\,/,b,f,n,r,t behind backslash."); } @@ -2118,6 +2121,76 @@ std::string json::parser::parseString() error("expected '\"'"); } +std::string json::parser::unicodeToUTF8(unsigned int codepoint) { + + // it's just a ASCII compatible codepoint, + // so we just interpret the point as a character + if (codepoint <= 0x7f) { + return std::string(1, static_cast(codepoint)); + } + else if (codepoint <= 0x7ff) + { + std::string result(2, static_cast(0xc0 | ((codepoint >> 6) & 0x1f))); + result[1] = static_cast(0x80 | (codepoint & 0x3f)); + return result; + } + else if (codepoint <= 0xffff) + { + std::string result(3, static_cast(0xe0 | ((codepoint >> 12) & 0x0f))); + result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); + result[2] = static_cast(0x80 | (codepoint & 0x3f)); + return result; + } + else if (codepoint <= 0x1fffff) + { + std::string result(4, static_cast(0xf0 | ((codepoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3f)); + result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); + result[3] = static_cast(0x80 | (codepoint & 0x3f)); + return result; + } else { + std::string errorMessage = "Invalid codepoint: "; + errorMessage += codepoint; + error(errorMessage); + } +} + +/*! 
+Parses the JSON style unicode escape sequence (\uXXXX). + +@return the utf-8 character the escape sequence escaped + +@pre An opening quote \p " was read in the main parse function @ref parse. + pos_ is the position after the opening quote. + +@post The character after the closing quote \p " is the current character @ref + current_. Whitespace is skipped. +*/ +std::string json::parser::parseUnicodeEscape() { + const auto startPos = pos_; + if (pos_ + 3 >= buffer_.size()) { + error("Got end of input while parsing unicode escape sequence \\uXXXX"); + } + std::string hexCode(4, ' '); + for(; pos_ < startPos + 4; pos_++) { + char currentChar = buffer_[pos_]; + if ( (currentChar >= '0' && currentChar <= '9') + || (currentChar >= 'a' && currentChar <= 'f') + || (currentChar >= 'A' && currentChar <= 'F')) { + // all is well, we have valid hexadecimal chars + // so we copy that char into our string + hexCode[pos_ - startPos] = currentChar; + } else { + error("Found non-hexadecimal character in unicode escape sequence!"); + } + } + pos_--; + // case is safe as 4 hex characters can't present more than 16 bits + return unicodeToUTF8(static_cast(std::stoul(hexCode, nullptr, 16))); +} + + + /*! This function is called in case a \p "t" is read in the main parse function @ref parse. 
In the standard, the \p "true" token is the only candidate, so the diff --git a/src/json.h b/src/json.h index 2dd99348..1b5e8fcf 100644 --- a/src/json.h +++ b/src/json.h @@ -418,6 +418,10 @@ class json inline void error(const std::string&) __attribute__((noreturn)); /// parse a quoted string inline std::string parseString(); + /// transforms a unicode codepoint to it's UTF-8 presentation + inline std::string unicodeToUTF8(unsigned int codepoint); + /// parses a unicode escape sequence + inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); /// parse a Boolean "false" diff --git a/test/json_unit.cc b/test/json_unit.cc index b2fcd65e..fb89a2a1 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1652,6 +1652,10 @@ TEST_CASE("Parser") CHECK(json::parse("\"a\\nz\"") == json("a\nz")); CHECK(json::parse("\"\\n\"") == json("\n")); + // escape unicode characters + CHECK(json::parse("\"\\u002F\"") == json("/")); + CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); + // escaping senseless stuff CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument); From 5a54e46709122427fae296abc04cdab8dacbfc6d Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 16:49:10 +0100 Subject: [PATCH 2/6] Fully implemented the JSON spec --- src/json.cc | 246 +++++++++++++++++++++++++++++++++++----------- src/json.h | 6 +- test/json_unit.cc | 42 +++++++- 3 files changed, 232 insertions(+), 62 deletions(-) diff --git a/src/json.cc b/src/json.cc index 98225167..b664ef15 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2049,40 +2049,61 @@ std::string json::parser::parseString() // the result of the parse process std::string result; - // iterate with pos_ over the whole string - for (; pos_ < buffer_.size(); pos_++) { + // iterate with pos_ over the whole input until we found the end and return + // or we exit via error() + for (; pos_ < buffer_.size(); pos_++) + { char 
currentChar = buffer_[pos_]; - // uneven amount of backslashes means the user wants to escape something - if (!evenAmountOfBackslashes) { + if (!evenAmountOfBackslashes) + { + // uneven amount of backslashes means the user wants to escape something + // so we know there is a case such as '\X' or '\\\X' but we don't + // know yet what X is. + // at this point in the code, the currentChar has the value of X // slash, backslash and quote are copied as is if ( currentChar == '/' || currentChar == '\\' - || currentChar == '"') { + || currentChar == '"') + { result += currentChar; - } else { - // All other characters are replaced by their respective special character - if (currentChar == 't') { - result += '\t'; - } else if (currentChar == 'b') { - result += '\b'; - } else if (currentChar == 'f') { - result += '\f'; - } else if (currentChar == 'n') { - result += '\n'; - } else if (currentChar == 'r') { - result += '\r'; - } else if (currentChar == 'u') { - pos_++; - result += parseUnicodeEscape(); - } else { - error("expected one of \\,/,b,f,n,r,t behind backslash."); - } - // TODO implement \uXXXX } - } else { - if (currentChar == '"') { + else + { + // All other characters are replaced by their respective special character + if (currentChar == 't') + result += '\t'; + else if (currentChar == 'b') + result += '\b'; + else if (currentChar == 'f') + result += '\f'; + else if (currentChar == 'n') + result += '\n'; + else if (currentChar == 'r') + result += '\r'; + else if (currentChar == 'u') + { + // \uXXXX[\uXXXX] is used for escaping unicode, which + // has it's own subroutine. + result += parseUnicodeEscape(); + // the parsing process has brought us one step behind the + // unicode escape sequence: + // \uXXXX + // ^ + // so we need to go one character back or the parser + // would skip the character we are currently pointing at + // (as the for-loop will drecement pos_ after this iteration). 
+ pos_--; + } + else // user did something like \z and we should report a error + error("expected one of \\,/,b,f,n,r,t,u behind backslash."); + } + } + else + { + if (currentChar == '"') + { // currentChar is a quote, so we found the end of the string @@ -2093,7 +2114,9 @@ std::string json::parser::parseString() // bring the result of the parsing process back to the caller return result; - } else if (currentChar != '\\') { + } + else if (currentChar != '\\') + { // all non-backslash characters are added to the end of the result string. // the only backslashes we want in the result are the ones that are escaped (which happens above). result += currentChar; @@ -2121,34 +2144,74 @@ std::string json::parser::parseString() error("expected '\"'"); } -std::string json::parser::unicodeToUTF8(unsigned int codepoint) { - // it's just a ASCII compatible codepoint, - // so we just interpret the point as a character - if (codepoint <= 0x7f) { + +/*! +Turns a code point into it's UTF-8 representation. +You should only pass numbers < 0x10ffff into this function +(everything else is a invalid code point). + +@return the UTF-8 representation of the given codepoint + +@pre This method isn't accessing the members of the parser + +@post This method isn't accessing the members of the parser +*/ +std::string json::parser::codepointToUTF8(unsigned int codepoint) +{ + // this method contains a lot of bit manipulations to + // build the bytes for UTF-8. + + // the '(... >> S) & 0xHH'-patterns are used to retrieve + // certain bits from the code points. + + // all static casts in this method have boundary checks + + // we initialize all strings with their final length + // (e.g. 1 to 4 bytes) to save the reallocations. 
+ + + if (codepoint <= 0x7f) + { + // it's just a ASCII compatible codepoint, + // so we just interpret the point as a character + // and return ASCII + return std::string(1, static_cast(codepoint)); } + // if true, we need two bytes to encode this as UTF-8 else if (codepoint <= 0x7ff) { - std::string result(2, static_cast(0xc0 | ((codepoint >> 6) & 0x1f))); - result[1] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xC0 enables the two most significant two bits + // to make this a two-byte UTF-8 character. + std::string result(2, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); + result[1] = static_cast(0x80 | (codepoint & 0x3F)); return result; } + // if true, now we need three bytes to encode this as UTF-8 else if (codepoint <= 0xffff) { - std::string result(3, static_cast(0xe0 | ((codepoint >> 12) & 0x0f))); - result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); - result[2] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xE0 enables the three most significant two bits + // to make this a three-byte UTF-8 character. + std::string result(3, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); + result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (codepoint & 0x3F)); return result; } - else if (codepoint <= 0x1fffff) + // if true, we need maximal four bytes to encode this as UTF-8 + else if (codepoint <= 0x10ffff) { - std::string result(4, static_cast(0xf0 | ((codepoint >> 18) & 0x07))); - result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3f)); - result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3f)); - result[3] = static_cast(0x80 | (codepoint & 0x3f)); + // the 0xE0 enables the four most significant two bits + // to make this a three-byte UTF-8 character. 
+ std::string result(4, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (codepoint & 0x3F)); return result; - } else { + } + else + { + // Can't be tested without direct access to this private method. std::string errorMessage = "Invalid codepoint: "; errorMessage += codepoint; error(errorMessage); @@ -2156,39 +2219,110 @@ std::string json::parser::unicodeToUTF8(unsigned int codepoint) { } /*! -Parses the JSON style unicode escape sequence (\uXXXX). +Parses 4 hexadecimal characters as a number. -@return the utf-8 character the escape sequence escaped +@return the value of the number the hexadecimal characters represent. -@pre An opening quote \p " was read in the main parse function @ref parse. - pos_ is the position after the opening quote. +@pre pos_ is pointing to the first of the 4 hexadecimal characters. -@post The character after the closing quote \p " is the current character @ref - current_. Whitespace is skipped. +@post pos_ is pointing to the character after the 4 hexadecimal characters. 
*/ -std::string json::parser::parseUnicodeEscape() { +unsigned int json::parser::parse4HexCodepoint() +{ const auto startPos = pos_; - if (pos_ + 3 >= buffer_.size()) { + + // check if the remaining buffer is long enough to even hold 4 characters + if (pos_ + 3 >= buffer_.size()) + { error("Got end of input while parsing unicode escape sequence \\uXXXX"); } + + // make a string that can hold the pair std::string hexCode(4, ' '); - for(; pos_ < startPos + 4; pos_++) { + + for(; pos_ < startPos + 4; pos_++) + { + // no boundary check here as we already checked above char currentChar = buffer_[pos_]; + + // check if we have a hexadecimal character if ( (currentChar >= '0' && currentChar <= '9') || (currentChar >= 'a' && currentChar <= 'f') - || (currentChar >= 'A' && currentChar <= 'F')) { + || (currentChar >= 'A' && currentChar <= 'F')) + { // all is well, we have valid hexadecimal chars // so we copy that char into our string hexCode[pos_ - startPos] = currentChar; - } else { + } + else + { error("Found non-hexadecimal character in unicode escape sequence!"); } } - pos_--; - // case is safe as 4 hex characters can't present more than 16 bits - return unicodeToUTF8(static_cast(std::stoul(hexCode, nullptr, 16))); + // the cast is safe as 4 hex characters can't present more than 16 bits + // the input to stoul was checked to contain only hexadecimal characters (see above) + return static_cast(std::stoul(hexCode, nullptr, 16)); } +/*! +Parses the unicode escape codes as defined in the ECMA-404. +The escape sequence has two forms: +1. \uXXXX +2. \uXXXX\uYYYY +where X and Y are a hexadecimal character (a-zA-Z0-9). + +Form 1 just contains the unicode code point in the hexadecimal number XXXX. +Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY. + +@return the UTF-8 character this unicode escape sequence escaped. + +@pre pos_ is pointing at at the 'u' behind the first backslash. 
+ +@post pos_ is pointing at the character behind the last X (or Y in form 2). +*/ +std::string json::parser::parseUnicodeEscape() +{ + // jump to the first hex value + pos_++; + // parse the hex first hex values + unsigned int firstCodepoint = parse4HexCodepoint(); + + + if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF) + { + // we found invalid code points, which means we either have a malformed input + // or we found a high surrogate. + // we can only find out by seeing if the next character also wants to encode + // a unicode character (so, we have the \uXXXX\uXXXX case here). + + // jump behind the next \u + pos_ += 2; + // try to parse the next hex values. + // the method does boundary checking for us, so no need to do that here + unsigned secondCodepoint = parse4HexCodepoint(); + // ok, we have a low surrogate, check if it is a valid one + if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF) + { + // calculate the final code point from the pair according to the spec + unsigned int finalCodePoint = + // high surrogate occupies the most significant 22 bits + (firstCodepoint << 10) + // low surrogate occupies the least significant 15 bits + + secondCodepoint + // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result + // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + + // we transform the calculated point into UTF-8 + return codepointToUTF8(finalCodePoint); + } + else + error("missing low surrogate"); + + } + // We have Form 1, so we just interpret the XXXX as a code point + return codepointToUTF8(firstCodepoint); +} /*! 
diff --git a/src/json.h b/src/json.h index 1b5e8fcf..90c5ded8 100644 --- a/src/json.h +++ b/src/json.h @@ -419,8 +419,10 @@ class json /// parse a quoted string inline std::string parseString(); /// transforms a unicode codepoint to it's UTF-8 presentation - inline std::string unicodeToUTF8(unsigned int codepoint); - /// parses a unicode escape sequence + inline std::string codepointToUTF8(unsigned int codepoint); + /// parses 4 hex characters that represent a unicode codepoint + inline unsigned int parse4HexCodepoint(); + /// parses \uXXXX[\uXXXX] unicode escape characters inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); diff --git a/test/json_unit.cc b/test/json_unit.cc index fb89a2a1..ab679fbf 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1652,10 +1652,6 @@ TEST_CASE("Parser") CHECK(json::parse("\"a\\nz\"") == json("a\nz")); CHECK(json::parse("\"\\n\"") == json("\n")); - // escape unicode characters - CHECK(json::parse("\"\\u002F\"") == json("/")); - CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); - // escaping senseless stuff CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument); @@ -1665,6 +1661,44 @@ TEST_CASE("Parser") CHECK_THROWS_AS(json::parse("\""), std::invalid_argument); } + SECTION("unicode_escaping") + { + // two tests for uppercase and lowercase hex + + // normal forward slash in ASCII range + CHECK(json::parse("\"\\u002F\"") == json("/")); + CHECK(json::parse("\"\\u002f\"") == json("/")); + // german a umlaut + CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4")); + CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4")); + // weird d + CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111")); + // unicode arrow left + CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190")); + // pleasing osiris by testing hieroglyph support + CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060")); + 
CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060")); + + + // no hex numbers behind the \u + CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument); + // missing part of a surrogate pair + CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument); + // senseless surrogate pair + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument); + } + SECTION("boolean") { // accept the exact values From 1287f03084bdede0484180bb54be73be4349e2f0 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 16:50:39 +0100 Subject: [PATCH 3/6] Code point are two words, and so the "P" should be capital --- src/json.cc | 56 ++++++++++++++++++++++++++--------------------------- src/json.h | 6 +++--- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/json.cc b/src/json.cc index b664ef15..f7da724c 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2151,13 +2151,13 @@ Turns a code point into it's UTF-8 representation. You should only pass numbers < 0x10ffff into this function (everything else is a invalid code point). 
-@return the UTF-8 representation of the given codepoint +@return the UTF-8 representation of the given code point @pre This method isn't accessing the members of the parser @post This method isn't accessing the members of the parser */ -std::string json::parser::codepointToUTF8(unsigned int codepoint) +std::string json::parser::codePointToUTF8(unsigned int codePoint) { // this method contains a lot of bit manipulations to // build the bytes for UTF-8. @@ -2171,49 +2171,49 @@ std::string json::parser::codepointToUTF8(unsigned int codepoint) // (e.g. 1 to 4 bytes) to save the reallocations. - if (codepoint <= 0x7f) + if (codePoint <= 0x7f) { - // it's just a ASCII compatible codepoint, + // it's just a ASCII compatible codePoint, // so we just interpret the point as a character // and return ASCII - return std::string(1, static_cast(codepoint)); + return std::string(1, static_cast(codePoint)); } // if true, we need two bytes to encode this as UTF-8 - else if (codepoint <= 0x7ff) + else if (codePoint <= 0x7ff) { // the 0xC0 enables the two most significant two bits // to make this a two-byte UTF-8 character. - std::string result(2, static_cast(0xC0 | ((codepoint >> 6) & 0x1F))); - result[1] = static_cast(0x80 | (codepoint & 0x3F)); + std::string result(2, static_cast(0xC0 | ((codePoint >> 6) & 0x1F))); + result[1] = static_cast(0x80 | (codePoint & 0x3F)); return result; } // if true, now we need three bytes to encode this as UTF-8 - else if (codepoint <= 0xffff) + else if (codePoint <= 0xffff) { // the 0xE0 enables the three most significant two bits // to make this a three-byte UTF-8 character. 
- std::string result(3, static_cast(0xE0 | ((codepoint >> 12) & 0x0F))); - result[1] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); - result[2] = static_cast(0x80 | (codepoint & 0x3F)); + std::string result(3, static_cast(0xE0 | ((codePoint >> 12) & 0x0F))); + result[1] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (codePoint & 0x3F)); return result; } // if true, we need maximal four bytes to encode this as UTF-8 - else if (codepoint <= 0x10ffff) + else if (codePoint <= 0x10ffff) { // the 0xE0 enables the four most significant two bits // to make this a three-byte UTF-8 character. - std::string result(4, static_cast(0xF0 | ((codepoint >> 18) & 0x07))); - result[1] = static_cast(0x80 | ((codepoint >> 12) & 0x3F)); - result[2] = static_cast(0x80 | ((codepoint >> 6) & 0x3F)); - result[3] = static_cast(0x80 | (codepoint & 0x3F)); + std::string result(4, static_cast(0xF0 | ((codePoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codePoint >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (codePoint & 0x3F)); return result; } else { // Can't be tested without direct access to this private method. - std::string errorMessage = "Invalid codepoint: "; - errorMessage += codepoint; + std::string errorMessage = "Invalid codePoint: "; + errorMessage += codePoint; error(errorMessage); } } @@ -2227,7 +2227,7 @@ Parses 4 hexadecimal characters as a number. @post pos_ is pointing to the character after the 4 hexadecimal characters. 
*/ -unsigned int json::parser::parse4HexCodepoint() +unsigned int json::parser::parse4HexCodePoint() { const auto startPos = pos_; @@ -2285,10 +2285,10 @@ std::string json::parser::parseUnicodeEscape() // jump to the first hex value pos_++; // parse the hex first hex values - unsigned int firstCodepoint = parse4HexCodepoint(); + unsigned int firstCodePoint = parse4HexCodePoint(); - if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF) + if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF) { // we found invalid code points, which means we either have a malformed input // or we found a high surrogate. @@ -2299,29 +2299,29 @@ std::string json::parser::parseUnicodeEscape() pos_ += 2; // try to parse the next hex values. // the method does boundary checking for us, so no need to do that here - unsigned secondCodepoint = parse4HexCodepoint(); + unsigned secondCodePoint = parse4HexCodePoint(); // ok, we have a low surrogate, check if it is a valid one - if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF) + if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF) { // calculate the final code point from the pair according to the spec unsigned int finalCodePoint = // high surrogate occupies the most significant 22 bits - (firstCodepoint << 10) + (firstCodePoint << 10) // low surrogate occupies the least significant 15 bits - + secondCodepoint + + secondCodePoint // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - 0x35FDC00; // we transform the calculated point into UTF-8 - return codepointToUTF8(finalCodePoint); + return codePointToUTF8(finalCodePoint); } else error("missing low surrogate"); } // We have Form 1, so we just interpret the XXXX as a code point - return codepointToUTF8(firstCodepoint); + return codePointToUTF8(firstCodePoint); } diff --git a/src/json.h b/src/json.h index 90c5ded8..2fa0bdf3 100644 --- a/src/json.h +++ b/src/json.h @@ -419,9 +419,9 
@@ class json /// parse a quoted string inline std::string parseString(); /// transforms a unicode codepoint to it's UTF-8 presentation - inline std::string codepointToUTF8(unsigned int codepoint); - /// parses 4 hex characters that represent a unicode codepoint - inline unsigned int parse4HexCodepoint(); + inline std::string codePointToUTF8(unsigned int codePoint); + /// parses 4 hex characters that represent a unicode code point + inline unsigned int parse4HexCodePoint(); /// parses \uXXXX[\uXXXX] unicode escape characters inline std::string parseUnicodeEscape(); /// parse a Boolean "true" From 0fcc414995a89eb2f706f7a2bf57d0112b328d2e Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 18:28:53 +0100 Subject: [PATCH 4/6] More testing and updated CMake to allow calling private functions from the tests --- CMakeLists.txt | 3 +++ test/json_unit.cc | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc0d2c70..9e19f57f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,9 @@ project(json) # Enable C++11 and set flags for coverage testing SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O0 --coverage -fprofile-arcs -ftest-coverage") +# Make everything public for testing purposes +add_definitions(-Dprivate=public) + # If not specified, use Debug as build type (necessary for coverage testing) if( NOT CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Debug CACHE STRING diff --git a/test/json_unit.cc b/test/json_unit.cc index ab679fbf..baad482a 100644 --- a/test/json_unit.cc +++ b/test/json_unit.cc @@ -1697,6 +1697,12 @@ TEST_CASE("Parser") CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument); CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument); + + // test private code point converter function + CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFE)); + 
CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFF)); + CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110000), std::invalid_argument); + CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110001), std::invalid_argument); } SECTION("boolean") From a409ba94888b823e276b5f6f88b063c4b5a41936 Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 18:46:01 +0100 Subject: [PATCH 5/6] Fixed build --- src/json.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json.h b/src/json.h index 2fa0bdf3..c09d5665 100644 --- a/src/json.h +++ b/src/json.h @@ -419,7 +419,7 @@ class json /// parse a quoted string inline std::string parseString(); /// transforms a unicode codepoint to it's UTF-8 presentation - inline std::string codePointToUTF8(unsigned int codePoint); + std::string codePointToUTF8(unsigned int codePoint); /// parses 4 hex characters that represent a unicode code point inline unsigned int parse4HexCodePoint(); /// parses \uXXXX[\uXXXX] unicode escape characters From a866a9d9800d68ecd0260f5ffe7206a1b49b94fb Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Sat, 10 Jan 2015 19:53:13 +0100 Subject: [PATCH 6/6] Reapplied code style fixes --- .idea/codeStyleSettings.xml | 35 ++++++++++++ .idea/json.iml | 8 +++ .idea/misc.xml | 5 ++ .idea/modules.xml | 8 +++ .idea/vcs.xml | 6 ++ src/json.cc | 107 ++++++++++++++++++++++-------------- 6 files changed, 129 insertions(+), 40 deletions(-) create mode 100644 .idea/codeStyleSettings.xml create mode 100644 .idea/json.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/codeStyleSettings.xml b/.idea/codeStyleSettings.xml new file mode 100644 index 00000000..65c39702 --- /dev/null +++ b/.idea/codeStyleSettings.xml @@ -0,0 +1,35 @@ + + + + + + \ No newline at end of file diff --git a/.idea/json.iml b/.idea/json.iml new file mode 100644 index 00000000..bc2cd874 --- /dev/null +++ b/.idea/json.iml @@ -0,0 +1,8 @@ + + + + 
+ + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..6b328020 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,5 @@ + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..cd370d1e --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..94a25f7f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/src/json.cc b/src/json.cc index 5fb80e8c..3b307511 100644 --- a/src/json.cc +++ b/src/json.cc @@ -2060,10 +2060,10 @@ std::string json::parser::parseString() if (!evenAmountOfBackslashes) { - // uneven amount of backslashes means the user wants to escape something - // so we know there is a case such as '\X' or '\\\X' but we don't - // know yet what X is. - // at this point in the code, the currentChar has the value of X + // uneven amount of backslashes means the user wants to escape + // something so we know there is a case such as '\X' or '\\\X' but + // we don't know yet what X is. + // at this point in the code, the currentChar has the value of X. // slash, backslash and quote are copied as is if ( currentChar == '/' @@ -2074,33 +2074,55 @@ std::string json::parser::parseString() } else { - // All other characters are replaced by their respective special character - if (currentChar == 't') - result += '\t'; - else if (currentChar == 'b') - result += '\b'; - else if (currentChar == 'f') - result += '\f'; - else if (currentChar == 'n') - result += '\n'; - else if (currentChar == 'r') - result += '\r'; - else if (currentChar == 'u') + // all other characters are replaced by their respective special + // character + switch (currentChar) { - // \uXXXX[\uXXXX] is used for escaping unicode, which - // has it's own subroutine. 
- result += parseUnicodeEscape(); - // the parsing process has brought us one step behind the - // unicode escape sequence: - // \uXXXX - // ^ - // so we need to go one character back or the parser - // would skip the character we are currently pointing at - // (as the for-loop will drecement pos_ after this iteration). - pos_--; + case 't': + { + result += '\t'; + break; + } + case 'b': + { + result += '\b'; + break; + } + case 'f': + { + result += '\f'; + break; + } + case 'n': + { + result += '\n'; + break; + } + case 'r': + { + result += '\r'; + break; + } + case 'u': + { + // \uXXXX[\uXXXX] is used for escaping unicode, which + // has its own subroutine. + result += parseUnicodeEscape(); + // the parsing process has brought us one step behind + // the unicode escape sequence: + // \uXXXX + // ^ + // we need to go one character back or the parser would + // skip the character we are currently pointing at as + // the for-loop will increment pos_ after this iteration + pos_--; + break; + } + default: + { + error("expected one of \\, /, b, f, n, r, t, u behind backslash."); + } } - else // user did something like \z and we should report a error - error("expected one of \\,/,b,f,n,r,t,u behind backslash."); } } else @@ -2119,8 +2141,9 @@ std::string json::parser::parseString() } else if (currentChar != '\\') { - // all non-backslash characters are added to the end of the result string. - // the only backslashes we want in the result are the ones that are escaped (which happens above). + // all non-backslash characters are added to the end of the + // result string. The only backslashes we want in the result + // are the ones that are escaped (which happens above). 
result += currentChar; } } @@ -2262,7 +2285,8 @@ unsigned int json::parser::parse4HexCodePoint() } } // the cast is safe as 4 hex characters can't present more than 16 bits - // the input to stoul was checked to contain only hexadecimal characters (see above) + // the input to stoul was checked to contain only hexadecimal characters + // (see above) return static_cast(std::stoul(hexCode, nullptr, 16)); } @@ -2274,7 +2298,8 @@ The escape sequence has two forms: where X and Y are a hexadecimal character (a-zA-Z0-9). Form 1 just contains the unicode code point in the hexadecimal number XXXX. -Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY. +Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low +surrogate is YYYY. @return the UTF-8 character this unicode escape sequence escaped. @@ -2292,10 +2317,10 @@ std::string json::parser::parseUnicodeEscape() if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF) { - // we found invalid code points, which means we either have a malformed input - // or we found a high surrogate. - // we can only find out by seeing if the next character also wants to encode - // a unicode character (so, we have the \uXXXX\uXXXX case here). + // we found invalid code points, which means we either have a malformed + // input or we found a high surrogate. + // we can only find out by seeing if the next character also wants to + // encode a unicode character (so, we have the \uXXXX\uXXXX case here). 
// jump behind the next \u pos_ += 2; @@ -2305,14 +2330,16 @@ std::string json::parser::parseUnicodeEscape() // ok, we have a low surrogate, check if it is a valid one if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF) { - // calculate the final code point from the pair according to the spec + // calculate the code point from the pair according to the spec unsigned int finalCodePoint = // high surrogate occupies the most significant 22 bits (firstCodePoint << 10) // low surrogate occupies the least significant 15 bits + secondCodePoint - // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result - // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + // there is still the 0xD800, 0xDC00 and 0x10000 noise in + // the result + // so we have to subtract: + // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00 - 0x35FDC00; // we transform the calculated point into UTF-8