diff --git a/.gitignore b/.gitignore index 206731f0..432d6ac3 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,5 @@ CMakeFiles libjson.a Testing + +.idea \ No newline at end of file diff --git a/.idea/codeStyleSettings.xml b/.idea/codeStyleSettings.xml deleted file mode 100644 index 65c39702..00000000 --- a/.idea/codeStyleSettings.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/json.iml b/.idea/json.iml deleted file mode 100644 index bc2cd874..00000000 --- a/.idea/json.iml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 6b328020..00000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index cd370d1e..00000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 94a25f7f..00000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/header_only/json.h b/header_only/json.h index c8f5047c..13a2b626 100644 --- a/header_only/json.h +++ b/header_only/json.h @@ -418,9 +418,15 @@ class json /// read the next character, stripping whitespace bool next(); /// raise an exception with an error message - inline void error(const std::string&) __attribute__((noreturn)); + inline void error(const std::string&) const __attribute__((noreturn)); /// parse a quoted string inline std::string parseString(); + /// transforms a unicode codepoint to it's UTF-8 presentation + std::string codePointToUTF8(unsigned int codePoint) const; + /// parses 4 hex characters that represent a unicode code point + inline unsigned int parse4HexCodePoint(); + /// parses \uXXXX[\uXXXX] unicode escape characters + inline std::string parseUnicodeEscape(); /// parse a Boolean "true" inline void parseTrue(); /// parse a Boolean "false" @@ -2434,9 +2440,9 @@ json json::parser::parse() // remember position of number's first character const auto _firstpos_ = pos_ - 1; - while (next() and (std::isdigit(current_) || current_ == '.' - || current_ == 'e' || current_ == 'E' - || current_ == '+' || current_ == '-')); + while (next() and (std::isdigit(current_) or current_ == '.' + or current_ == 'e' or current_ == 'E' + or current_ == '+' or current_ == '-')); try { @@ -2512,7 +2518,7 @@ the error message \p msg), and the last read token. @exception std::invalid_argument whenever the function is called */ -void json::parser::error(const std::string& msg) +void json::parser::error(const std::string& msg) const { throw std::invalid_argument("parse error at position " + std::to_string(pos_) + ": " + msg + @@ -2542,14 +2548,19 @@ std::string json::parser::parseString() // the result of the parse process std::string result; - // iterate with pos_ over the whole string + // iterate with pos_ over the whole input until we found the end and return + // or we exit via error() for (; pos_ < buffer_.size(); pos_++) { char currentChar = buffer_[pos_]; - // uneven amount of backslashes means the user wants to escape something - if (!evenAmountOfBackslashes) + if (not evenAmountOfBackslashes) { + // uneven amount of backslashes means the user wants to escape + // something so we know there is a case such as '\X' or '\\\X' but + // we don't know yet what X is. + // at this point in the code, the currentChar has the value of X. + // slash, backslash and quote are copied as is if (currentChar == '/' or currentChar == '\\' or currentChar == '"') { @@ -2557,8 +2568,8 @@ std::string json::parser::parseString() } else { - // all other characters are replaced by their respective - // special character + // all other characters are replaced by their respective special + // character switch (currentChar) { case 't': @@ -2586,12 +2597,26 @@ std::string json::parser::parseString() result += '\r'; break; } + case 'u': + { + // \uXXXX[\uXXXX] is used for escaping unicode, which + // has it's own subroutine. + result += parseUnicodeEscape(); + // the parsing process has brought us one step behind + // the unicode escape sequence: + // \uXXXX + // ^ + // we need to go one character back or the parser would + // skip the character we are currently pointing at as + // the for-loop will decrement pos_ after this iteration + pos_--; + break; + } default: { - error("expected one of \\, /, b, f, n, r, t behind backslash."); + error("expected one of \\, /, b, f, n, r, t, u behind backslash."); } } - // TODO implement \uXXXX } } else @@ -2610,7 +2635,7 @@ std::string json::parser::parseString() } else if (currentChar != '\\') { - // All non-backslash characters are added to the end of the + // all non-backslash characters are added to the end of the // result string. The only backslashes we want in the result // are the ones that are escaped (which happens above). result += currentChar; @@ -2638,6 +2663,187 @@ std::string json::parser::parseString() error("expected '\"'"); } + + +/*! +Turns a code point into it's UTF-8 representation. +You should only pass numbers < 0x10ffff into this function +(everything else is a invalid code point). + +@return the UTF-8 representation of the given code point +*/ +std::string json::parser::codePointToUTF8(unsigned int codePoint) const +{ + // this method contains a lot of bit manipulations to + // build the bytes for UTF-8. + + // the '(... >> S) & 0xHH'-patterns are used to retrieve + // certain bits from the code points. + + // all static casts in this method have boundary checks + + // we initialize all strings with their final length + // (e.g. 1 to 4 bytes) to save the reallocations. + + if (codePoint <= 0x7f) + { + // it's just a ASCII compatible codePoint, + // so we just interpret the point as a character + // and return ASCII + + return std::string(1, static_cast(codePoint)); + } + // if true, we need two bytes to encode this as UTF-8 + else if (codePoint <= 0x7ff) + { + // the 0xC0 enables the two most significant two bits + // to make this a two-byte UTF-8 character. + std::string result(2, static_cast(0xC0 | ((codePoint >> 6) & 0x1F))); + result[1] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + // if true, now we need three bytes to encode this as UTF-8 + else if (codePoint <= 0xffff) + { + // the 0xE0 enables the three most significant two bits + // to make this a three-byte UTF-8 character. + std::string result(3, static_cast(0xE0 | ((codePoint >> 12) & 0x0F))); + result[1] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[2] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + // if true, we need maximal four bytes to encode this as UTF-8 + else if (codePoint <= 0x10ffff) + { + // the 0xE0 enables the four most significant two bits + // to make this a three-byte UTF-8 character. + std::string result(4, static_cast(0xF0 | ((codePoint >> 18) & 0x07))); + result[1] = static_cast(0x80 | ((codePoint >> 12) & 0x3F)); + result[2] = static_cast(0x80 | ((codePoint >> 6) & 0x3F)); + result[3] = static_cast(0x80 | (codePoint & 0x3F)); + return result; + } + else + { + // Can't be tested without direct access to this private method. + std::string errorMessage = "Invalid codePoint: "; + errorMessage += codePoint; + error(errorMessage); + } +} + +/*! +Parses 4 hexadecimal characters as a number. + +@return the value of the number the hexadecimal characters represent. + +@pre pos_ is pointing to the first of the 4 hexadecimal characters. + +@post pos_ is pointing to the character after the 4 hexadecimal characters. +*/ +unsigned int json::parser::parse4HexCodePoint() +{ + const auto startPos = pos_; + + // check if the remaining buffer is long enough to even hold 4 characters + if (pos_ + 3 >= buffer_.size()) + { + error("Got end of input while parsing unicode escape sequence \\uXXXX"); + } + + // make a string that can hold the pair + std::string hexCode(4, ' '); + + for (; pos_ < startPos + 4; pos_++) + { + // no boundary check here as we already checked above + char currentChar = buffer_[pos_]; + + // check if we have a hexadecimal character + if ((currentChar >= '0' and currentChar <= '9') + or (currentChar >= 'a' and currentChar <= 'f') + or (currentChar >= 'A' and currentChar <= 'F')) + { + // all is well, we have valid hexadecimal chars + // so we copy that char into our string + hexCode[pos_ - startPos] = currentChar; + } + else + { + error("Found non-hexadecimal character in unicode escape sequence!"); + } + } + // the cast is safe as 4 hex characters can't present more than 16 bits + // the input to stoul was checked to contain only hexadecimal characters + // (see above) + return static_cast(std::stoul(hexCode, nullptr, 16)); +} + +/*! +Parses the unicode escape codes as defined in the ECMA-404. +The escape sequence has two forms: +1. \uXXXX +2. \uXXXX\uYYYY +where X and Y are a hexadecimal character (a-zA-Z0-9). + +Form 1 just contains the unicode code point in the hexadecimal number XXXX. +Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low +surrogate is YYYY. + +@return the UTF-8 character this unicode escape sequence escaped. + +@pre pos_ is pointing at at the 'u' behind the first backslash. + +@post pos_ is pointing at the character behind the last X (or Y in form 2). +*/ +std::string json::parser::parseUnicodeEscape() +{ + // jump to the first hex value + pos_++; + // parse the hex first hex values + unsigned int firstCodePoint = parse4HexCodePoint(); + + if (firstCodePoint >= 0xD800 and firstCodePoint <= 0xDBFF) + { + // we found invalid code points, which means we either have a malformed + // input or we found a high surrogate. + // we can only find out by seeing if the next character also wants to + // encode a unicode character (so, we have the \uXXXX\uXXXX case here). + + // jump behind the next \u + pos_ += 2; + // try to parse the next hex values. + // the method does boundary checking for us, so no need to do that here + unsigned secondCodePoint = parse4HexCodePoint(); + // ok, we have a low surrogate, check if it is a valid one + if (secondCodePoint >= 0xDC00 and secondCodePoint <= 0xDFFF) + { + // calculate the code point from the pair according to the spec + unsigned int finalCodePoint = + // high surrogate occupies the most significant 22 bits + (firstCodePoint << 10) + // low surrogate occupies the least significant 15 bits + + secondCodePoint + // there is still the 0xD800, 0xDC00 and 0x10000 noise in + // the result + // so we have to substract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + + // we transform the calculated point into UTF-8 + return codePointToUTF8(finalCodePoint); + } + else + { + error("missing low surrogate"); + } + + } + // We have Form 1, so we just interpret the XXXX as a code point + return codePointToUTF8(firstCodePoint); +} + + /*! This function is called in case a \p "t" is read in the main parse function @ref parse. In the standard, the \p "true" token is the only candidate, so the diff --git a/src/json.cc b/src/json.cc index 177977d7..31d709cb 100644 --- a/src/json.cc +++ b/src/json.cc @@ -1988,9 +1988,9 @@ json json::parser::parse() // remember position of number's first character const auto _firstpos_ = pos_ - 1; - while (next() and (std::isdigit(current_) || current_ == '.' - || current_ == 'e' || current_ == 'E' - || current_ == '+' || current_ == '-')); + while (next() and (std::isdigit(current_) or current_ == '.' + or current_ == 'e' or current_ == 'E' + or current_ == '+' or current_ == '-')); try { @@ -2066,7 +2066,7 @@ the error message \p msg), and the last read token. @exception std::invalid_argument whenever the function is called */ -void json::parser::error(const std::string& msg) +void json::parser::error(const std::string& msg) const { throw std::invalid_argument("parse error at position " + std::to_string(pos_) + ": " + msg + @@ -2102,7 +2102,7 @@ std::string json::parser::parseString() { char currentChar = buffer_[pos_]; - if (!evenAmountOfBackslashes) + if (not evenAmountOfBackslashes) { // uneven amount of backslashes means the user wants to escape // something so we know there is a case such as '\X' or '\\\X' but @@ -2110,9 +2110,7 @@ std::string json::parser::parseString() // at this point in the code, the currentChar has the value of X. // slash, backslash and quote are copied as is - if ( currentChar == '/' - || currentChar == '\\' - || currentChar == '"') + if (currentChar == '/' or currentChar == '\\' or currentChar == '"') { result += currentChar; } @@ -2221,12 +2219,8 @@ You should only pass numbers < 0x10ffff into this function (everything else is a invalid code point). @return the UTF-8 representation of the given code point - -@pre This method isn't accessing the members of the parser - -@post This method isn't accessing the members of the parser */ -std::string json::parser::codePointToUTF8(unsigned int codePoint) +std::string json::parser::codePointToUTF8(unsigned int codePoint) const { // this method contains a lot of bit manipulations to // build the bytes for UTF-8. @@ -2239,7 +2233,6 @@ std::string json::parser::codePointToUTF8(unsigned int codePoint) // we initialize all strings with their final length // (e.g. 1 to 4 bytes) to save the reallocations. - if (codePoint <= 0x7f) { // it's just a ASCII compatible codePoint, @@ -2309,15 +2302,15 @@ unsigned int json::parser::parse4HexCodePoint() // make a string that can hold the pair std::string hexCode(4, ' '); - for(; pos_ < startPos + 4; pos_++) + for (; pos_ < startPos + 4; pos_++) { // no boundary check here as we already checked above char currentChar = buffer_[pos_]; // check if we have a hexadecimal character - if ( (currentChar >= '0' && currentChar <= '9') - || (currentChar >= 'a' && currentChar <= 'f') - || (currentChar >= 'A' && currentChar <= 'F')) + if ((currentChar >= '0' and currentChar <= '9') + or (currentChar >= 'a' and currentChar <= 'f') + or (currentChar >= 'A' and currentChar <= 'F')) { // all is well, we have valid hexadecimal chars // so we copy that char into our string @@ -2358,8 +2351,7 @@ std::string json::parser::parseUnicodeEscape() // parse the hex first hex values unsigned int firstCodePoint = parse4HexCodePoint(); - - if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF) + if (firstCodePoint >= 0xD800 and firstCodePoint <= 0xDBFF) { // we found invalid code points, which means we either have a malformed // input or we found a high surrogate. @@ -2372,25 +2364,27 @@ std::string json::parser::parseUnicodeEscape() // the method does boundary checking for us, so no need to do that here unsigned secondCodePoint = parse4HexCodePoint(); // ok, we have a low surrogate, check if it is a valid one - if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF) + if (secondCodePoint >= 0xDC00 and secondCodePoint <= 0xDFFF) { // calculate the code point from the pair according to the spec unsigned int finalCodePoint = - // high surrogate occupies the most significant 22 bits - (firstCodePoint << 10) - // low surrogate occupies the least significant 15 bits - + secondCodePoint - // there is still the 0xD800, 0xDC00 and 0x10000 noise in - // the result - // so we have to substract with: - // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 - - 0x35FDC00; + // high surrogate occupies the most significant 22 bits + (firstCodePoint << 10) + // low surrogate occupies the least significant 15 bits + + secondCodePoint + // there is still the 0xD800, 0xDC00 and 0x10000 noise in + // the result + // so we have to substract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; // we transform the calculated point into UTF-8 return codePointToUTF8(finalCodePoint); } else + { error("missing low surrogate"); + } } // We have Form 1, so we just interpret the XXXX as a code point diff --git a/src/json.h b/src/json.h index 126f96a3..cb158bcc 100644 --- a/src/json.h +++ b/src/json.h @@ -418,11 +418,11 @@ class json /// read the next character, stripping whitespace bool next(); /// raise an exception with an error message - inline void error(const std::string&) __attribute__((noreturn)); + inline void error(const std::string&) const __attribute__((noreturn)); /// parse a quoted string inline std::string parseString(); /// transforms a unicode codepoint to it's UTF-8 presentation - std::string codePointToUTF8(unsigned int codePoint); + std::string codePointToUTF8(unsigned int codePoint) const; /// parses 4 hex characters that represent a unicode code point inline unsigned int parse4HexCodePoint(); /// parses \uXXXX[\uXXXX] unicode escape characters