From f1f72403cdce3555b3e128442c81b9e7392ff075 Mon Sep 17 00:00:00 2001
From: Niels <niels.lohmann@gmail.com>
Date: Sun, 15 Feb 2015 16:56:54 +0100
Subject: [PATCH] some unicode magic

---
 src/json.hpp      | 51 +++++++++++++++++++++++++++++++++++++++--------
 src/json.hpp.re2c | 51 +++++++++++++++++++++++++++++++++++++++--------
 test/unit.cpp     | 15 +++++++++-----
 3 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/src/json.hpp b/src/json.hpp
index 55dca29d..1a09464b 100644
--- a/src/json.hpp
+++ b/src/json.hpp
@@ -2497,14 +2497,37 @@ class basic_json
 
         @param codepoint  the code point (must be in [0x0, 0x10ffff]
         @return string representation of the code point
-        @exception std::out_of_range  if code point is >0x10ffff
+        @exception std::out_of_range if code point is >0x10ffff
+        @exception std::invalid_argument if the low surrogate is invalid
 
         @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
         */
-        inline static string_t to_unicode(const size_t codepoint)
+        inline static string_t to_unicode(const size_t codepoint1, size_t codepoint2 = 0)
         {
             string_t result;
 
+            // calculate the codepoint from the given code points
+            size_t codepoint = codepoint1;
+            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
+            {
+                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
+                {
+                    codepoint =
+                        // high surrogate carries the upper 10 payload bits, so shift it by 10
+                        (codepoint1 << 10)
+                        // low surrogate carries the lower 10 payload bits
+                        + codepoint2
+                        // the sum still contains the surrogate bases 0xD800 and 0xDC00
+                        // and lacks the 0x10000 offset, so we have to subtract:
+                        // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
+                        - 0x35FDC00;
+                }
+                else
+                {
+                    throw std::invalid_argument("missing or wrong low surrogate");
+                }
+            }
+
             if (codepoint <= 0x7f)
             {
                 // 1-byte characters: 0xxxxxxx (ASCI)
@@ -3394,12 +3417,24 @@ basic_json_parser_59:
                         // unicode
                         case 'u':
                         {
-                            // get code xxxx from \uxxxx
-                            auto codepoint = std::strtoul(i + 1, nullptr, 16);
-                            // add unicode character(s)
-                            result += to_unicode(codepoint);
-                            // skip the next four characters (\uxxxx)
-                            i += 4;
+                            // get code xxxx from uxxxx
+                            auto codepoint = std::strtoul(std::string(i + 1, 4).c_str(), nullptr, 16);
+
+                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
+                            {
+                                // get code yyyy from uxxxx\uyyyy
+                                auto codepoint2 = std::strtoul(std::string(i + 7, 4).c_str(), nullptr, 16);
+                                result += to_unicode(codepoint, codepoint2);
+                                // skip the next 11 characters (xxxx\uyyyy)
+                                i += 11;
+                            }
+                            else
+                            {
+                                // add unicode character(s)
+                                result += to_unicode(codepoint);
+                                // skip the next four characters (xxxx)
+                                i += 4;
+                            }
                             break;
                         }
                     }
diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c
index 25c5177f..f02111da 100644
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
@@ -2497,14 +2497,37 @@ class basic_json
 
         @param codepoint  the code point (must be in [0x0, 0x10ffff]
         @return string representation of the code point
-        @exception std::out_of_range  if code point is >0x10ffff
+        @exception std::out_of_range if code point is >0x10ffff
+        @exception std::invalid_argument if the low surrogate is invalid
 
         @see <http://en.wikipedia.org/wiki/UTF-8#Sample_code>
         */
-        inline static string_t to_unicode(const size_t codepoint)
+        inline static string_t to_unicode(const size_t codepoint1, size_t codepoint2 = 0)
         {
             string_t result;
 
+            // calculate the codepoint from the given code points
+            size_t codepoint = codepoint1;
+            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
+            {
+                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
+                {
+                    codepoint =
+                        // high surrogate carries the upper 10 payload bits, so shift it by 10
+                        (codepoint1 << 10)
+                        // low surrogate carries the lower 10 payload bits
+                        + codepoint2
+                        // the sum still contains the surrogate bases 0xD800 and 0xDC00
+                        // and lacks the 0x10000 offset, so we have to subtract:
+                        // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
+                        - 0x35FDC00;
+                }
+                else
+                {
+                    throw std::invalid_argument("missing or wrong low surrogate");
+                }
+            }
+
             if (codepoint <= 0x7f)
             {
                 // 1-byte characters: 0xxxxxxx (ASCI)
@@ -2743,12 +2766,24 @@ class basic_json
                         // unicode
                         case 'u':
                         {
-                            // get code xxxx from \uxxxx
-                            auto codepoint = std::strtoul(i + 1, nullptr, 16);
-                            // add unicode character(s)
-                            result += to_unicode(codepoint);
-                            // skip the next four characters (\uxxxx)
-                            i += 4;
+                            // get code xxxx from uxxxx
+                            auto codepoint = std::strtoul(std::string(i + 1, 4).c_str(), nullptr, 16);
+
+                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
+                            {
+                                // get code yyyy from uxxxx\uyyyy
+                                auto codepoint2 = std::strtoul(std::string(i + 7, 4).c_str(), nullptr, 16);
+                                result += to_unicode(codepoint, codepoint2);
+                                // skip the next 11 characters (xxxx\uyyyy)
+                                i += 11;
+                            }
+                            else
+                            {
+                                // add unicode character(s)
+                                result += to_unicode(codepoint);
+                                // skip the next four characters (xxxx)
+                                i += 4;
+                            }
                             break;
                         }
                     }
diff --git a/test/unit.cpp b/test/unit.cpp
index e819e5fd..78d3fd32 100644
--- a/test/unit.cpp
+++ b/test/unit.cpp
@@ -5645,6 +5645,9 @@ TEST_CASE("parser class")
                 CHECK(json::parser("\"\\u2000\"").parse().get<json::string_t>() == " ");
                 CHECK(json::parser("\"\\uFFFF\"").parse().get<json::string_t>() == "");
                 CHECK(json::parser("\"\\u20AC\"").parse().get<json::string_t>() == "€");
+
+                CHECK(json::parse("\"\\ud80c\\udc60\"").get<json::string_t>() == u8"\U00013060");
+                CHECK(json::parse("\"\\ud83c\\udf1e\"").get<json::string_t>() == "🌞");
             }
         }
 
@@ -5893,10 +5896,12 @@ TEST_CASE("parser class")
                 }
             }
         }
+
+        // missing part of a surrogate pair
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\""), std::invalid_argument);
+        // invalid surrogate pair
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument);
+        CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument);
     }
 }
-
-TEST_CASE()
-{
-    CHECK(json::parser("\"\\u0049\\u004e\"").parse().get<json::string_t>() == "IN");
-}
\ No newline at end of file