From 222aacc213c0b34d275a119df8e7cabb44993af2 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 10:36:30 +0100
Subject: [PATCH 1/6] Quick and dirty implementation for basic multilingual
 plane in the unicode escape mechanism

---
 src/json.cc       | 73 +++++++++++++++++++++++++++++++++++++++++++++++
 src/json.h        |  4 +++
 test/json_unit.cc |  4 +++
 3 files changed, 81 insertions(+)
diff --git a/src/json.cc b/src/json.cc
index 1666b23c..98225167 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2073,6 +2073,9 @@ std::string json::parser::parseString()
                     result += '\n';
                 } else if (currentChar == 'r') {
                     result += '\r';
+                } else if (currentChar == 'u') {
+                    pos_++;
+                    result += parseUnicodeEscape();
                 } else {
                     error("expected one of \\,/,b,f,n,r,t behind backslash.");
                 }
@@ -2118,6 +2121,76 @@ std::string json::parser::parseString()
     error("expected '\"'");
 }
 
+std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
+
+    // it's just a ASCII compatible codepoint,
+    // so we just interpret the point as a character
+    if (codepoint <= 0x7f) {
+        return std::string(1, static_cast<char>(codepoint));
+    }
+    else if (codepoint <= 0x7ff)
+    {
+        std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
+        result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    }
+    else if (codepoint <= 0xffff)
+    {
+        std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    }
+    else if (codepoint <= 0x1fffff)
+    {
+        std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
+        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
+        result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        return result;
+    } else {
+        std::string errorMessage = "Invalid codepoint: ";
+        errorMessage += codepoint;
+        error(errorMessage);
+    }
+}
+
+/*!
+Parses the JSON style unicode escape sequence (\uXXXX).
+
+@return the utf-8 character the escape sequence escaped
+
+@pre  An opening quote \p " was read in the main parse function @ref parse.
+      pos_ is the position after the opening quote.
+
+@post The character after the closing quote \p " is the current character @ref
+      current_. Whitespace is skipped.
+*/
+std::string json::parser::parseUnicodeEscape() {
+    const auto startPos = pos_;
+    if (pos_ + 3 >= buffer_.size()) {
+        error("Got end of input while parsing unicode escape sequence \\uXXXX");
+    }
+    std::string hexCode(4, ' ');
+    for(; pos_ < startPos + 4; pos_++) {
+        char currentChar = buffer_[pos_];
+        if (   (currentChar >= '0' && currentChar <= '9')
+            || (currentChar >= 'a' && currentChar <= 'f')
+            || (currentChar >= 'A' && currentChar <= 'F')) {
+            // all is well, we have valid hexadecimal chars
+            // so we copy that char into our string
+            hexCode[pos_ - startPos] = currentChar;
+        } else {
+            error("Found non-hexadecimal character in unicode escape sequence!");
+        }
+    }
+    pos_--;
+    // case is safe as 4 hex characters can't present more than 16 bits
+    return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
+}
+
+
+
 /*!
 This function is called in case a \p "t" is read in the main parse function
 @ref parse. In the standard, the \p "true" token is the only candidate, so the
diff --git a/src/json.h b/src/json.h
index 2dd99348..1b5e8fcf 100644
--- a/src/json.h
+++ b/src/json.h
@@ -418,6 +418,10 @@ class json
         inline void error(const std::string&) __attribute__((noreturn));
         /// parse a quoted string
         inline std::string parseString();
+        /// transforms a unicode codepoint to it's UTF-8 presentation
+        inline std::string unicodeToUTF8(unsigned int codepoint);
+        /// parses a unicode escape sequence
+        inline std::string parseUnicodeEscape();
         /// parse a Boolean "true"
         inline void parseTrue();
         /// parse a Boolean "false"
diff --git a/test/json_unit.cc b/test/json_unit.cc
index b2fcd65e..fb89a2a1 100644
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@@ -1652,6 +1652,10 @@ TEST_CASE("Parser")
         CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
         CHECK(json::parse("\"\\n\"") == json("\n"));
 
+        // escape unicode characters
+        CHECK(json::parse("\"\\u002F\"") == json("/"));
+        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
+
         // escaping senseless stuff
         CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
         CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);

From 5a54e46709122427fae296abc04cdab8dacbfc6d Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 16:49:10 +0100
Subject: [PATCH 2/6] Fully implemented the JSON spec

---
 src/json.cc       | 246 +++++++++++++++++++++++++++++++++++-----------
 src/json.h        |   6 +-
 test/json_unit.cc |  42 +++++++-
 3 files changed, 232 insertions(+), 62 deletions(-)

diff --git a/src/json.cc b/src/json.cc
index 98225167..b664ef15 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2049,40 +2049,61 @@ std::string json::parser::parseString()
     // the result of the parse process
     std::string result;
 
-    // iterate with pos_ over the whole string
-    for (; pos_ < buffer_.size(); pos_++) {
+    // iterate with pos_ over the whole input until we found the end and return
+    // or we exit via error()
+    for (; pos_ < buffer_.size(); pos_++)
+    {
         char currentChar = buffer_[pos_];
 
-        // uneven amount of backslashes means the user wants to escape something
-        if (!evenAmountOfBackslashes) {
+        if (!evenAmountOfBackslashes)
+        {
+            // uneven amount of backslashes means the user wants to escape something
+            // so we know there is a case such as '\X' or '\\\X' but we don't
+            // know yet what X is.
+            // at this point in the code, the currentChar has the value of X
 
             // slash, backslash and quote are copied as is
             if (   currentChar == '/'
                 || currentChar == '\\'
-                || currentChar == '"') {
+                || currentChar == '"')
+            {
                 result += currentChar;
-            } else {
-                // All other characters are replaced by their respective special character
-                if (currentChar == 't') {
-                    result += '\t';
-                } else if (currentChar == 'b') {
-                    result += '\b';
-                } else if (currentChar == 'f') {
-                    result += '\f';
-                } else if (currentChar == 'n') {
-                    result += '\n';
-                } else if (currentChar == 'r') {
-                    result += '\r';
-                } else if (currentChar == 'u') {
-                    pos_++;
-                    result += parseUnicodeEscape();
-                } else {
-                    error("expected one of \\,/,b,f,n,r,t behind backslash.");
-                }
-                // TODO implement \uXXXX
             }
-        } else {
-            if (currentChar == '"') {
+            else
+            {
+                // All other characters are replaced by their respective special character
+                if (currentChar == 't')
+                    result += '\t';
+                else if (currentChar == 'b')
+                    result += '\b';
+                else if (currentChar == 'f')
+                    result += '\f';
+                else if (currentChar == 'n')
+                    result += '\n';
+                else if (currentChar == 'r')
+                    result += '\r';
+                else if (currentChar == 'u')
+                {
+                    // \uXXXX[\uXXXX] is used for escaping unicode, which
+                    // has its own subroutine.
+                    result += parseUnicodeEscape();
+                    // the parsing process has brought us one step behind the
+                    // unicode escape sequence:
+                    // \uXXXX
+                    //       ^
+                    // so we need to go one character back or the parser
+                    // would skip the character we are currently pointing at
+                    // (as the for-loop will increment pos_ after this iteration).
+                    pos_--;
+                }
+                else // user did something like \z and we should report a error
+                    error("expected one of \\,/,b,f,n,r,t,u behind backslash.");
+            }
+        }
+        else
+        {
+            if (currentChar == '"')
+            {
                 // currentChar is a quote, so we found the end of the string
 
 
@@ -2093,7 +2114,9 @@ std::string json::parser::parseString()
 
                 // bring the result of the parsing process back to the caller
                 return result;
-            } else if (currentChar != '\\') {
+            }
+            else if (currentChar != '\\')
+            {
                 // all non-backslash characters are added to the end of the result string.
                 // the only backslashes we want in the result are the ones that are escaped (which happens above).
                 result += currentChar;
@@ -2121,34 +2144,74 @@ std::string json::parser::parseString()
     error("expected '\"'");
 }
 
-std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
 
-    // it's just a ASCII compatible codepoint,
-    // so we just interpret the point as a character
-    if (codepoint <= 0x7f) {
+
+/*!
+Turns a code point into it's UTF-8 representation.
+You should only pass numbers < 0x10ffff into this function
+(everything else is a invalid code point).
+
+@return the UTF-8 representation of the given codepoint
+
+@pre  This method isn't accessing the members of the parser
+
+@post This method isn't accessing the members of the parser
+*/
+std::string json::parser::codepointToUTF8(unsigned int codepoint)
+{
+    // this method contains a lot of bit manipulations to
+    // build the bytes for UTF-8.
+
+    // the '(... >> S) & 0xHH'-patterns are used to retrieve
+    // certain bits from the code points.
+
+    // all static casts in this method have boundary checks
+
+    // we initialize all strings with their final length
+    // (e.g. 1 to 4 bytes) to save the reallocations.
+
+
+    if (codepoint <= 0x7f)
+    {
+        // it's just a ASCII compatible codepoint,
+        // so we just interpret the point as a character
+        // and return ASCII
+
         return std::string(1, static_cast<char>(codepoint));
     }
+    // if true, we need two bytes to encode this as UTF-8
     else if (codepoint <= 0x7ff)
     {
-        std::string result(2, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
-        result[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        // the 0xC0 enables the two most significant two bits
+        // to make this a two-byte UTF-8 character.
+        std::string result(2, static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
+        result[1] = static_cast<char>(0x80 | (codepoint & 0x3F));
         return result;
     }
+    // if true, now we need three bytes to encode this as UTF-8
     else if (codepoint <= 0xffff)
     {
-        std::string result(3, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
-        result[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        // the 0xE0 enables the three most significant two bits
+        // to make this a three-byte UTF-8 character.
+        std::string result(3, static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
+        result[2] = static_cast<char>(0x80 | (codepoint & 0x3F));
         return result;
     }
-    else if (codepoint <= 0x1fffff)
+    // if true, we need maximal four bytes to encode this as UTF-8
+    else if (codepoint <= 0x10ffff)
     {
-        std::string result(4, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
-        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
-        result[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
+        // the 0xE0 enables the four most significant two bits
+        // to make this a three-byte UTF-8 character.
+        std::string result(4, static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
+        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
+        result[3] = static_cast<char>(0x80 | (codepoint & 0x3F));
         return result;
-    } else {
+    }
+    else
+    {
+        // Can't be tested without direct access to this private method.
         std::string errorMessage = "Invalid codepoint: ";
         errorMessage += codepoint;
         error(errorMessage);
@@ -2156,39 +2219,110 @@ std::string json::parser::unicodeToUTF8(unsigned int codepoint) {
 }
 
 /*!
-Parses the JSON style unicode escape sequence (\uXXXX).
+Parses 4 hexadecimal characters as a number.
 
-@return the utf-8 character the escape sequence escaped
+@return the value of the number the hexadecimal characters represent.
 
-@pre  An opening quote \p " was read in the main parse function @ref parse.
-      pos_ is the position after the opening quote.
+@pre  pos_ is pointing to the first of the 4 hexadecimal characters.
 
-@post The character after the closing quote \p " is the current character @ref
-      current_. Whitespace is skipped.
+@post pos_ is pointing to the character after the 4 hexadecimal characters.
 */
-std::string json::parser::parseUnicodeEscape() {
+unsigned int json::parser::parse4HexCodepoint()
+{
     const auto startPos = pos_;
-    if (pos_ + 3 >= buffer_.size()) {
+
+    // check if the  remaining buffer is long enough to even hold 4 characters
+    if (pos_ + 3 >= buffer_.size())
+    {
         error("Got end of input while parsing unicode escape sequence \\uXXXX");
     }
+
+    // make a string that can hold the four hex digits
     std::string hexCode(4, ' ');
-    for(; pos_ < startPos + 4; pos_++) {
+
+    for(; pos_ < startPos + 4; pos_++)
+    {
+        // no boundary check here as we already checked above
         char currentChar = buffer_[pos_];
+
+        // check if we have a hexadecimal character
         if (   (currentChar >= '0' && currentChar <= '9')
             || (currentChar >= 'a' && currentChar <= 'f')
-            || (currentChar >= 'A' && currentChar <= 'F')) {
+            || (currentChar >= 'A' && currentChar <= 'F'))
+        {
             // all is well, we have valid hexadecimal chars
             // so we copy that char into our string
             hexCode[pos_ - startPos] = currentChar;
-        } else {
+        }
+        else
+        {
             error("Found non-hexadecimal character in unicode escape sequence!");
         }
     }
-    pos_--;
-    // case is safe as 4 hex characters can't present more than 16 bits
-    return unicodeToUTF8(static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16)));
+    // the cast is safe as 4 hex characters can't represent more than 16 bits
+    // the input to stoul was checked to contain only hexadecimal characters (see above)
+    return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
 }
 
+/*!
+Parses the unicode escape codes as defined in the ECMA-404.
+The escape sequence has two forms:
+1. \uXXXX
+2. \uXXXX\uYYYY
+where each X and Y is a hexadecimal digit (0-9, a-f, A-F).
+
+Form 1 just contains the unicode code point in the hexadecimal number XXXX.
+Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY.
+
+@return the UTF-8 character this unicode escape sequence escaped.
+
+@pre  pos_ is pointing at the 'u' directly after the first backslash.
+
+@post pos_ is pointing at the character behind the last X (or Y in form 2).
+*/
+std::string json::parser::parseUnicodeEscape()
+{
+    // jump to the first hex value
+    pos_++;
+    // parse the hex first hex values
+    unsigned int firstCodepoint = parse4HexCodepoint();
+
+
+    if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF)
+    {
+        // we found invalid code points, which means we either have a malformed input
+        // or we found a high surrogate.
+        // we can only find out by seeing if the next character also wants to encode
+        // a unicode character (so, we have the \uXXXX\uXXXX case here).
+
+        // jump behind the next \u
+        pos_ += 2;
+        // try to parse the next hex values.
+        // the method does boundary checking for us, so no need to do that here
+        unsigned secondCodepoint = parse4HexCodepoint();
+        // ok, we have a low surrogate, check if it is a valid one
+        if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF)
+        {
+            // calculate the final code point from the pair according to the spec
+            unsigned int finalCodePoint =
+                    // high surrogate occupies the most significant 22 bits
+                    (firstCodepoint << 10)
+                    // low surrogate occupies the least significant 15 bits
+                    + secondCodepoint
+                    // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result
+                    // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                    - 0x35FDC00;
+
+            // we transform the calculated point into UTF-8
+            return codepointToUTF8(finalCodePoint);
+        }
+        else
+            error("missing low surrogate");
+
+    }
+    // We have Form 1, so we just interpret the XXXX as a code point
+    return codepointToUTF8(firstCodepoint);
+}
 
 
 /*!
diff --git a/src/json.h b/src/json.h
index 1b5e8fcf..90c5ded8 100644
--- a/src/json.h
+++ b/src/json.h
@@ -419,8 +419,10 @@ class json
         /// parse a quoted string
         inline std::string parseString();
         /// transforms a unicode codepoint to it's UTF-8 presentation
-        inline std::string unicodeToUTF8(unsigned int codepoint);
-        /// parses a unicode escape sequence
+        inline std::string codepointToUTF8(unsigned int codepoint);
+        /// parses 4 hex characters that represent a unicode codepoint
+        inline unsigned int parse4HexCodepoint();
+        /// parses \uXXXX[\uXXXX] unicode escape characters
         inline std::string parseUnicodeEscape();
         /// parse a Boolean "true"
         inline void parseTrue();
diff --git a/test/json_unit.cc b/test/json_unit.cc
index fb89a2a1..ab679fbf 100644
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@@ -1652,10 +1652,6 @@ TEST_CASE("Parser")
         CHECK(json::parse("\"a\\nz\"") == json("a\nz"));
         CHECK(json::parse("\"\\n\"") == json("\n"));
 
-        // escape unicode characters
-        CHECK(json::parse("\"\\u002F\"") == json("/"));
-        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
-
         // escaping senseless stuff
         CHECK_THROWS_AS(json::parse("\"\\z\""), std::invalid_argument);
         CHECK_THROWS_AS(json::parse("\"\\ \""), std::invalid_argument);
@@ -1665,6 +1661,44 @@ TEST_CASE("Parser")
         CHECK_THROWS_AS(json::parse("\""), std::invalid_argument);
     }
 
+    SECTION("unicode_escaping")
+    {
+        // two tests for uppercase and lowercase hex
+
+        // normal forward slash in ASCII range
+        CHECK(json::parse("\"\\u002F\"") == json("/"));
+        CHECK(json::parse("\"\\u002f\"") == json("/"));
+        // german a umlaut
+        CHECK(json::parse("\"\\u00E4\"") == json(u8"\u00E4"));
+        CHECK(json::parse("\"\\u00e4\"") == json(u8"\u00E4"));
+        // weird d
+        CHECK(json::parse("\"\\u0111\"") == json(u8"\u0111"));
+        // unicode arrow left
+        CHECK(json::parse("\"\\u2190\"") == json(u8"\u2190"));
+        // pleasing osiris by testing hieroglyph support
+        CHECK(json::parse("\"\\uD80C\\uDC60\"") == json(u8"\U00013060"));
+        CHECK(json::parse("\"\\ud80C\\udc60\"") == json(u8"\U00013060"));
+
+
+        // no hex numbers behind the \u
+        CHECK_THROWS_AS(json::parse("\"\\uD80v\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80 A\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD8v\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uDv\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uv\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\u\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\u\\u\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"a\\uD80vAz\""), std::invalid_argument);
+        // missing part of a surrogate pair
+        CHECK_THROWS_AS(json::parse("\"bla \\uD80C bla\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80C bla bla\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"bla bla \\uD80C bla bla\""), std::invalid_argument);
+        // senseless surrogate pair
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument);
+    }
+
     SECTION("boolean")
     {
         // accept the exact values

From 1287f03084bdede0484180bb54be73be4349e2f0 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 16:50:39 +0100
Subject: [PATCH 3/6] "Code point" is two words, so the "P" should be
 capitalized

---
 src/json.cc | 56 ++++++++++++++++++++++++++---------------------------
 src/json.h  |  6 +++---
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/json.cc b/src/json.cc
index b664ef15..f7da724c 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2151,13 +2151,13 @@ Turns a code point into it's UTF-8 representation.
 You should only pass numbers < 0x10ffff into this function
 (everything else is a invalid code point).
 
-@return the UTF-8 representation of the given codepoint
+@return the UTF-8 representation of the given code point
 
 @pre  This method isn't accessing the members of the parser
 
 @post This method isn't accessing the members of the parser
 */
-std::string json::parser::codepointToUTF8(unsigned int codepoint)
+std::string json::parser::codePointToUTF8(unsigned int codePoint)
 {
     // this method contains a lot of bit manipulations to
     // build the bytes for UTF-8.
@@ -2171,49 +2171,49 @@ std::string json::parser::codepointToUTF8(unsigned int codepoint)
     // (e.g. 1 to 4 bytes) to save the reallocations.
 
 
-    if (codepoint <= 0x7f)
+    if (codePoint <= 0x7f)
     {
-        // it's just a ASCII compatible codepoint,
+        // it's just a ASCII compatible codePoint,
         // so we just interpret the point as a character
         // and return ASCII
 
-        return std::string(1, static_cast<char>(codepoint));
+        return std::string(1, static_cast<char>(codePoint));
     }
     // if true, we need two bytes to encode this as UTF-8
-    else if (codepoint <= 0x7ff)
+    else if (codePoint <= 0x7ff)
     {
         // the 0xC0 enables the two most significant two bits
         // to make this a two-byte UTF-8 character.
-        std::string result(2, static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F)));
-        result[1] = static_cast<char>(0x80 | (codepoint & 0x3F));
+        std::string result(2, static_cast<char>(0xC0 | ((codePoint >> 6) & 0x1F)));
+        result[1] = static_cast<char>(0x80 | (codePoint & 0x3F));
         return result;
     }
     // if true, now we need three bytes to encode this as UTF-8
-    else if (codepoint <= 0xffff)
+    else if (codePoint <= 0xffff)
     {
         // the 0xE0 enables the three most significant two bits
         // to make this a three-byte UTF-8 character.
-        std::string result(3, static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F)));
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
-        result[2] = static_cast<char>(0x80 | (codepoint & 0x3F));
+        std::string result(3, static_cast<char>(0xE0 | ((codePoint >> 12) & 0x0F)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[2] = static_cast<char>(0x80 | (codePoint & 0x3F));
         return result;
     }
     // if true, we need maximal four bytes to encode this as UTF-8
-    else if (codepoint <= 0x10ffff)
+    else if (codePoint <= 0x10ffff)
     {
         // the 0xE0 enables the four most significant two bits
         // to make this a three-byte UTF-8 character.
-        std::string result(4, static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07)));
-        result[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
-        result[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
-        result[3] = static_cast<char>(0x80 | (codepoint & 0x3F));
+        std::string result(4, static_cast<char>(0xF0 | ((codePoint >> 18) & 0x07)));
+        result[1] = static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
+        result[2] = static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
+        result[3] = static_cast<char>(0x80 | (codePoint & 0x3F));
         return result;
     }
     else
     {
         // Can't be tested without direct access to this private method.
-        std::string errorMessage = "Invalid codepoint: ";
-        errorMessage += codepoint;
+        std::string errorMessage = "Invalid codePoint: ";
+        errorMessage += std::to_string(codePoint); // append the number itself; += on an integer would append a single char
         error(errorMessage);
     }
 }
@@ -2227,7 +2227,7 @@ Parses 4 hexadecimal characters as a number.
 
 @post pos_ is pointing to the character after the 4 hexadecimal characters.
 */
-unsigned int json::parser::parse4HexCodepoint()
+unsigned int json::parser::parse4HexCodePoint()
 {
     const auto startPos = pos_;
 
@@ -2285,10 +2285,10 @@ std::string json::parser::parseUnicodeEscape()
     // jump to the first hex value
     pos_++;
     // parse the hex first hex values
-    unsigned int firstCodepoint = parse4HexCodepoint();
+    unsigned int firstCodePoint = parse4HexCodePoint();
 
 
-    if (firstCodepoint >= 0xD800 && firstCodepoint <= 0xDBFF)
+    if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF)
     {
         // we found invalid code points, which means we either have a malformed input
         // or we found a high surrogate.
@@ -2299,29 +2299,29 @@ std::string json::parser::parseUnicodeEscape()
         pos_ += 2;
         // try to parse the next hex values.
         // the method does boundary checking for us, so no need to do that here
-        unsigned secondCodepoint = parse4HexCodepoint();
+        unsigned secondCodePoint = parse4HexCodePoint();
         // ok, we have a low surrogate, check if it is a valid one
-        if (secondCodepoint >= 0xDC00 && secondCodepoint <= 0xDFFF)
+        if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF)
         {
             // calculate the final code point from the pair according to the spec
             unsigned int finalCodePoint =
                     // high surrogate occupies the most significant 22 bits
-                    (firstCodepoint << 10)
+                    (firstCodePoint << 10)
                     // low surrogate occupies the least significant 15 bits
-                    + secondCodepoint
+                    + secondCodePoint
                     // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result
                     // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
                     - 0x35FDC00;
 
             // we transform the calculated point into UTF-8
-            return codepointToUTF8(finalCodePoint);
+            return codePointToUTF8(finalCodePoint);
         }
         else
             error("missing low surrogate");
 
     }
     // We have Form 1, so we just interpret the XXXX as a code point
-    return codepointToUTF8(firstCodepoint);
+    return codePointToUTF8(firstCodePoint);
 }
 
 
diff --git a/src/json.h b/src/json.h
index 90c5ded8..2fa0bdf3 100644
--- a/src/json.h
+++ b/src/json.h
@@ -419,9 +419,9 @@ class json
         /// parse a quoted string
         inline std::string parseString();
         /// transforms a unicode codepoint to it's UTF-8 presentation
-        inline std::string codepointToUTF8(unsigned int codepoint);
-        /// parses 4 hex characters that represent a unicode codepoint
-        inline unsigned int parse4HexCodepoint();
+        inline std::string codePointToUTF8(unsigned int codePoint);
+        /// parses 4 hex characters that represent a unicode code point
+        inline unsigned int parse4HexCodePoint();
         /// parses \uXXXX[\uXXXX] unicode escape characters
         inline std::string parseUnicodeEscape();
         /// parse a Boolean "true"

From 0fcc414995a89eb2f706f7a2bf57d0112b328d2e Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 18:28:53 +0100
Subject: [PATCH 4/6] More testing and updated CMake to allow calling private
 functions from the tests

---
 CMakeLists.txt    | 3 +++
 test/json_unit.cc | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc0d2c70..9e19f57f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,9 @@ project(json)
 # Enable C++11 and set flags for coverage testing
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g -O0 --coverage -fprofile-arcs -ftest-coverage")
 
+# Make everything public for testing purposes
+add_definitions(-Dprivate=public)
+
 # If not specified, use Debug as build type (necessary for coverage testing)
 if( NOT CMAKE_BUILD_TYPE )
   set( CMAKE_BUILD_TYPE Debug CACHE STRING
diff --git a/test/json_unit.cc b/test/json_unit.cc
index ab679fbf..baad482a 100644
--- a/test/json_unit.cc
+++ b/test/json_unit.cc
@@ -1697,6 +1697,12 @@ TEST_CASE("Parser")
         CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument);
         CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument);
         CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument);
+
+        // test private code point converter function
+        CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFE));
+        CHECK_NOTHROW(json::parser("").codePointToUTF8(0x10FFFF));
+        CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110000), std::invalid_argument);
+        CHECK_THROWS_AS(json::parser("").codePointToUTF8(0x110001), std::invalid_argument);
     }
 
     SECTION("boolean")

From a409ba94888b823e276b5f6f88b063c4b5a41936 Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 18:46:01 +0100
Subject: [PATCH 5/6] Fixed build

---
 src/json.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/json.h b/src/json.h
index 2fa0bdf3..c09d5665 100644
--- a/src/json.h
+++ b/src/json.h
@@ -419,7 +419,7 @@ class json
         /// parse a quoted string
         inline std::string parseString();
         /// transforms a unicode codepoint to it's UTF-8 presentation
-        inline std::string codePointToUTF8(unsigned int codePoint);
+        std::string codePointToUTF8(unsigned int codePoint);
         /// parses 4 hex characters that represent a unicode code point
         inline unsigned int parse4HexCodePoint();
         /// parses \uXXXX[\uXXXX] unicode escape characters

From a866a9d9800d68ecd0260f5ffe7206a1b49b94fb Mon Sep 17 00:00:00 2001
From: Raphael Isemann <teemperor@gmail.com>
Date: Sat, 10 Jan 2015 19:53:13 +0100
Subject: [PATCH 6/6] Reapplied code style fixes

---
 .idea/codeStyleSettings.xml |  35 ++++++++++++
 .idea/json.iml              |   8 +++
 .idea/misc.xml              |   5 ++
 .idea/modules.xml           |   8 +++
 .idea/vcs.xml               |   6 ++
 src/json.cc                 | 107 ++++++++++++++++++++++--------------
 6 files changed, 129 insertions(+), 40 deletions(-)
 create mode 100644 .idea/codeStyleSettings.xml
 create mode 100644 .idea/json.iml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml

diff --git a/.idea/codeStyleSettings.xml b/.idea/codeStyleSettings.xml
new file mode 100644
index 00000000..65c39702
--- /dev/null
+++ b/.idea/codeStyleSettings.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectCodeStyleSettingsManager">
+    <option name="PER_PROJECT_SETTINGS">
+      <value>
+        <Objective-C-extensions>
+          <option name="GENERATE_INSTANCE_VARIABLES_FOR_PROPERTIES" value="ASK" />
+          <option name="RELEASE_STYLE" value="IVAR" />
+          <file>
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Import" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Macro" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Typedef" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Struct" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Enum" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="ClassPredef" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Constant" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Global" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="FunctionPredecl" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Function" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Class" />
+          </file>
+          <class>
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Property" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="Synthesize" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="InitMethod" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="StaticMethod" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="InstanceMethod" />
+            <option name="com.jetbrains.objc.util.OCDeclarationKind" value="DeallocMethod" />
+          </class>
+        </Objective-C-extensions>
+      </value>
+    </option>
+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default (1)" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/json.iml b/.idea/json.iml
new file mode 100644
index 00000000..bc2cd874
--- /dev/null
+++ b/.idea/json.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="CPP_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 00000000..6b328020
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
+  <component name="ProjectRootManager" version="2" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 00000000..cd370d1e
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/json.iml" filepath="$PROJECT_DIR$/.idea/json.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 00000000..94a25f7f
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/src/json.cc b/src/json.cc
index 5fb80e8c..3b307511 100644
--- a/src/json.cc
+++ b/src/json.cc
@@ -2060,10 +2060,10 @@ std::string json::parser::parseString()
 
         if (!evenAmountOfBackslashes)
         {
-            // uneven amount of backslashes means the user wants to escape something
-            // so we know there is a case such as '\X' or '\\\X' but we don't
-            // know yet what X is.
-            // at this point in the code, the currentChar has the value of X
+            // uneven amount of backslashes means the user wants to escape
+            // something so we know there is a case such as '\X' or '\\\X' but
+            // we don't know yet what X is.
+            // at this point in the code, the currentChar has the value of X.
 
             // slash, backslash and quote are copied as is
             if (   currentChar == '/'
@@ -2074,33 +2074,55 @@ std::string json::parser::parseString()
             }
             else
             {
-                // All other characters are replaced by their respective special character
-                if (currentChar == 't')
-                    result += '\t';
-                else if (currentChar == 'b')
-                    result += '\b';
-                else if (currentChar == 'f')
-                    result += '\f';
-                else if (currentChar == 'n')
-                    result += '\n';
-                else if (currentChar == 'r')
-                    result += '\r';
-                else if (currentChar == 'u')
+                // all other characters are replaced by their respective special
+                // character
+                switch (currentChar)
                 {
-                    // \uXXXX[\uXXXX] is used for escaping unicode, which
-                    // has it's own subroutine.
-                    result += parseUnicodeEscape();
-                    // the parsing process has brought us one step behind the
-                    // unicode escape sequence:
-                    // \uXXXX
-                    //       ^
-                    // so we need to go one character back or the parser
-                    // would skip the character we are currently pointing at
-                    // (as the for-loop will drecement pos_ after this iteration).
-                    pos_--;
+                    case 't':
+                    {
+                        result += '\t';
+                        break;
+                    }
+                    case 'b':
+                    {
+                        result += '\b';
+                        break;
+                    }
+                    case 'f':
+                    {
+                        result += '\f';
+                        break;
+                    }
+                    case 'n':
+                    {
+                        result += '\n';
+                        break;
+                    }
+                    case 'r':
+                    {
+                        result += '\r';
+                        break;
+                    }
+                    case 'u':
+                    {
+                        // \uXXXX[\uXXXX] is used for escaping unicode, which
+                        // has its own subroutine.
+                        result += parseUnicodeEscape();
+                        // the parsing process has brought us one step behind
+                        // the unicode escape sequence:
+                        // \uXXXX
+                        //       ^
+                        // we need to go one character back or the parser would
+                        // skip the character we are currently pointing at as
+                        // the for-loop will increment pos_ after this iteration
+                        pos_--;
+                        break;
+                    }
+                    default:
+                    {
+                        error("expected one of \\, /, b, f, n, r, t, u behind backslash.");
+                    }
                 }
-                else // user did something like \z and we should report a error
-                    error("expected one of \\,/,b,f,n,r,t,u behind backslash.");
             }
         }
         else
@@ -2119,8 +2141,9 @@ std::string json::parser::parseString()
             }
             else if (currentChar != '\\')
             {
-                // all non-backslash characters are added to the end of the result string.
-                // the only backslashes we want in the result are the ones that are escaped (which happens above).
+                // all non-backslash characters are added to the end of the
+                // result string. The only backslashes we want in the result
+                // are the ones that are escaped (which happens above).
                 result += currentChar;
             }
         }
@@ -2262,7 +2285,8 @@ unsigned int json::parser::parse4HexCodePoint()
         }
     }
     // the cast is safe as 4 hex characters can't present more than 16 bits
-    // the input to stoul was checked to contain only hexadecimal characters (see above)
+    // the input to stoul was checked to contain only hexadecimal characters
+    // (see above)
     return static_cast<unsigned int>(std::stoul(hexCode, nullptr, 16));
 }
 
@@ -2274,7 +2298,8 @@ The escape sequence has two forms:
 where X and Y are a hexadecimal character (a-zA-Z0-9).
 
 Form 1 just contains the unicode code point in the hexadecimal number XXXX.
-Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low surrogate is YYYY.
+Form 2 is encoding a UTF-16 surrogate pair. The high surrogate is XXXX, the low
+surrogate is YYYY.
 
 @return the UTF-8 character this unicode escape sequence escaped.
 
@@ -2292,10 +2317,10 @@ std::string json::parser::parseUnicodeEscape()
 
     if (firstCodePoint >= 0xD800 && firstCodePoint <= 0xDBFF)
     {
-        // we found invalid code points, which means we either have a malformed input
-        // or we found a high surrogate.
-        // we can only find out by seeing if the next character also wants to encode
-        // a unicode character (so, we have the \uXXXX\uXXXX case here).
+        // we found invalid code points, which means we either have a malformed
+        // input or we found a high surrogate.
+        // we can only find out by seeing if the next character also wants to
+        // encode a unicode character (so, we have the \uXXXX\uXXXX case here).
 
         // jump behind the next \u
         pos_ += 2;
@@ -2305,14 +2330,16 @@ std::string json::parser::parseUnicodeEscape()
         // ok, we have a low surrogate, check if it is a valid one
         if (secondCodePoint >= 0xDC00 && secondCodePoint <= 0xDFFF)
         {
-            // calculate the final code point from the pair according to the spec
+            // calculate the code point from the pair according to the spec
             unsigned int finalCodePoint =
                     // high surrogate occupies the most significant 22 bits
                     (firstCodePoint << 10)
                     // low surrogate occupies the least significant 15 bits
                     + secondCodePoint
-                    // there is still the 0xD800, 0xDC00 and 0x10000 noise in the result
-                    // so we have to substract with (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
+                    // there is still the 0xD800, 0xDC00 and 0x10000 noise in
+                    // the result
+                    // so we have to subtract:
+                    // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
                     - 0x35FDC00;
 
             // we transform the calculated point into UTF-8