diff --git a/src/json.hpp b/src/json.hpp index 55dca29d..1a09464b 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2497,14 +2497,37 @@ class basic_json @param codepoint the code point (must be in [0x0, 0x10ffff] @return string representation of the code point - @exception std::out_of_range if code point is >0x10ffff + @exception std::out_of_range if code point is >0x10ffff + @exception std::invalid_argument if the low surrogate is invalid @see */ - inline static string_t to_unicode(const size_t codepoint) + inline static string_t to_unicode(const size_t codepoint1, size_t codepoint2 = 0) { string_t result; + // calculate the codepoint from the given code points + size_t codepoint = codepoint1; + if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF) + { + if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF) + { + codepoint = + // high surrogate occupies the most significant 22 bits + (codepoint1 << 10) + // low surrogate occupies the least significant 15 bits + + codepoint2 + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to substract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + } + else + { + throw std::invalid_argument("missing or wrong low surrogate"); + } + } + if (codepoint <= 0x7f) { // 1-byte characters: 0xxxxxxx (ASCI) @@ -3394,12 +3417,24 @@ basic_json_parser_59: // unicode case 'u': { - // get code xxxx from \uxxxx - auto codepoint = std::strtoul(i + 1, nullptr, 16); - // add unicode character(s) - result += to_unicode(codepoint); - // skip the next four characters (\uxxxx) - i += 4; + // get code xxxx from uxxxx + auto codepoint = std::strtoul(std::string(i + 1, 4).c_str(), nullptr, 16); + + if (codepoint >= 0xD800 and codepoint <= 0xDBFF) + { + // get code yyyy from uxxxx\uyyyy + auto codepoint2 = std::strtoul(std::string(i + 7, 4).c_str(), nullptr, 16); + result += to_unicode(codepoint, codepoint2); + // skip the next 11 characters (xxxx\uyyyy) + i += 11; + } + else + { + // add unicode character(s) + result += to_unicode(codepoint); + // skip the next four characters (xxxx) + i += 4; + } break; } } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 25c5177f..f02111da 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -2497,14 +2497,37 @@ class basic_json @param codepoint the code point (must be in [0x0, 0x10ffff] @return string representation of the code point - @exception std::out_of_range if code point is >0x10ffff + @exception std::out_of_range if code point is >0x10ffff + @exception std::invalid_argument if the low surrogate is invalid @see */ - inline static string_t to_unicode(const size_t codepoint) + inline static string_t to_unicode(const size_t codepoint1, size_t codepoint2 = 0) { string_t result; + // calculate the codepoint from the given code points + size_t codepoint = codepoint1; + if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF) + { + if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF) + { + codepoint = + // high surrogate occupies the most significant 22 bits + (codepoint1 << 10) + // low surrogate occupies the least significant 15 bits + + codepoint2 + // there is still the 0xD800, 0xDC00 and 0x10000 noise + // in the result so we have to substract with: + // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 + - 0x35FDC00; + } + else + { + throw std::invalid_argument("missing or wrong low surrogate"); + } + } + if (codepoint <= 0x7f) { // 1-byte characters: 0xxxxxxx (ASCI) @@ -2743,12 +2766,24 @@ class basic_json // unicode case 'u': { - // get code xxxx from \uxxxx - auto codepoint = std::strtoul(i + 1, nullptr, 16); - // add unicode character(s) - result += to_unicode(codepoint); - // skip the next four characters (\uxxxx) - i += 4; + // get code xxxx from uxxxx + auto codepoint = std::strtoul(std::string(i + 1, 4).c_str(), nullptr, 16); + + if (codepoint >= 0xD800 and codepoint <= 0xDBFF) + { + // get code yyyy from uxxxx\uyyyy + auto codepoint2 = std::strtoul(std::string(i + 7, 4).c_str(), nullptr, 16); + result += to_unicode(codepoint, codepoint2); + // skip the next 11 characters (xxxx\uyyyy) + i += 11; + } + else + { + // add unicode character(s) + result += to_unicode(codepoint); + // skip the next four characters (xxxx) + i += 4; + } break; } } diff --git a/test/unit.cpp b/test/unit.cpp index e819e5fd..78d3fd32 100644 --- a/test/unit.cpp +++ b/test/unit.cpp @@ -5645,6 +5645,9 @@ TEST_CASE("parser class") CHECK(json::parser("\"\\u2000\"").parse().get() == " "); CHECK(json::parser("\"\\uFFFF\"").parse().get() == "￿"); CHECK(json::parser("\"\\u20AC\"").parse().get() == "€"); + + CHECK(json::parse("\"\\ud80c\\udc60\"").get() == u8"\U00013060"); + CHECK(json::parse("\"\\ud83c\\udf1e\"").get() == "🌞"); } } @@ -5893,10 +5896,12 @@ TEST_CASE("parser class") } } } + + // missing part of a surrogate pair + CHECK_THROWS_AS(json::parse("\"\\uD80C\""), std::invalid_argument); + // invalid surrogate pair + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), std::invalid_argument); + CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), std::invalid_argument); } } - -TEST_CASE() -{ - CHECK(json::parser("\"\\u0049\\u004e\"").parse().get() == "IN"); -} \ No newline at end of file