From eb06d0531a7334017e17fbfb5db7e6c31d40cf7c Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 1 Apr 2018 19:12:36 +0200 Subject: [PATCH 1/5] :construction: added input adapter for wide strings #1031 --- .../nlohmann/detail/input/input_adapters.hpp | 182 ++++++++++++++++++ single_include/nlohmann/json.hpp | 182 ++++++++++++++++++ test/Makefile | 3 +- test/src/unit-wstring.cpp | 58 ++++++ 4 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 test/src/unit-wstring.cpp diff --git a/include/nlohmann/detail/input/input_adapters.hpp b/include/nlohmann/detail/input/input_adapters.hpp index ef66948d..eb358432 100644 --- a/include/nlohmann/detail/input/input_adapters.hpp +++ b/include/nlohmann/detail/input/input_adapters.hpp @@ -165,6 +165,179 @@ class input_buffer_adapter : public input_adapter_protocol const char* start; }; +template +class wide_string_input_adapter : public input_adapter_protocol +{ + private: + using char_t = typename WideStringType::value_type; + + public: + wide_string_input_adapter(const WideStringType& w) : str(w) {} + + std::char_traits::int_type get_character() noexcept override + { + // unget_character() was called previously: return the last character + if (next_unget) + { + next_unget = false; + return last_char; + } + + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + if (sizeof(char_t) == 2) + { + fill_buffer_utf16(); + } + else + { + fill_buffer_utf32(); + } + + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index == 0); + } + + // use buffer + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index < utf8_bytes_filled); + return (last_char = utf8_bytes[utf8_bytes_index++]); + } + + void unget_character() noexcept override + { + next_unget = true; + } + + private: + void fill_buffer_utf16() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6)); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc or wc >= 0xE000) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12)); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else + { + if (current_wchar < str.size()) + { + const char_t wc2 = str[current_wchar++]; + const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); + utf8_bytes[0] = 0xf0 | (charcode >> 18); + utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (charcode & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + ++current_wchar; + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + } + + void fill_buffer_utf32() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07); + utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + + private: + /// the wstring to process + const WideStringType& str; + + /// index of the current wchar in str + std::size_t current_wchar = 0; + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; + + /// the last character (returned after unget_character() is called) + std::char_traits::int_type last_char = 0; + /// whether get_character() should return last_char + bool next_unget = false; +}; + class input_adapter { public: @@ -178,6 +351,15 @@ class input_adapter input_adapter(std::istream&& i) : ia(std::make_shared(i)) {} + input_adapter(const std::wstring& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u16string& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u32string& ws) + : ia(std::make_shared>(ws)) {} + /// input adapter for buffer template +class wide_string_input_adapter : public input_adapter_protocol +{ + private: + using char_t = typename WideStringType::value_type; + + public: + wide_string_input_adapter(const WideStringType& w) : str(w) {} + + std::char_traits::int_type get_character() noexcept override + { + // unget_character() was called previously: return the last character + if (next_unget) + { + next_unget = false; + return last_char; + } + + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + if (sizeof(char_t) == 2) + { + fill_buffer_utf16(); + } + else + { + fill_buffer_utf32(); + } + + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index == 0); + } + + // use buffer + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index < utf8_bytes_filled); + return (last_char = utf8_bytes[utf8_bytes_index++]); + } + + void unget_character() noexcept override + { + next_unget = true; + } + + private: + void fill_buffer_utf16() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6)); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc or wc >= 0xE000) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12)); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else + { + if (current_wchar < str.size()) + { + const char_t wc2 = str[current_wchar++]; + const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); + utf8_bytes[0] = 0xf0 | (charcode >> 18); + utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (charcode & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + ++current_wchar; + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + } + + void fill_buffer_utf32() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07); + utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + + private: + /// the wstring to process + const WideStringType& str; + + /// index of the current wchar in str + std::size_t current_wchar = 0; + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; + + /// the last character (returned after unget_character() is called) + std::char_traits::int_type last_char = 0; + /// whether get_character() should return last_char + bool next_unget = false; +}; + class input_adapter { public: @@ -1751,6 +1924,15 @@ class input_adapter input_adapter(std::istream&& i) : ia(std::make_shared(i)) {} + input_adapter(const std::wstring& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u16string& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u32string& ws) + : ia(std::make_shared>(ws)) {} + /// input adapter for buffer template. +Copyright (c) 2013-2018 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "catch.hpp" + +#include + +using nlohmann::json; + + +TEST_CASE("wide strings") +{ + SECTION("std::wstring") + { + std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } + + SECTION("std::u16string") + { + std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } + + SECTION("std::u32string") + { + std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } +} From ab89ae4e5010f896fc41d749e1ff2e4cec6c7019 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 2 Apr 2018 11:34:36 +0200 Subject: [PATCH 2/5] :hammer: trying to make tests run with MSVC #1031 --- test/src/unit-wstring.cpp | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/test/src/unit-wstring.cpp b/test/src/unit-wstring.cpp index 6b720c99..186ab4c7 100644 --- a/test/src/unit-wstring.cpp +++ b/test/src/unit-wstring.cpp @@ -32,27 +32,45 @@ SOFTWARE. using nlohmann::json; +// from https://www.reddit.com/r/cpp/comments/75gohf/i_just_found_a_use_for_the_poop_emoji_in_c/ +constexpr bool compiler_supports_utf8(); +constexpr bool compiler_supports_utf8() +{ + return (static_cast("💩"[0]) == 0xF0) and + (static_cast("💩"[1]) == 0x9F) and + (static_cast("💩"[2]) == 0x92) and + (static_cast("💩"[3]) == 0xA9); +} TEST_CASE("wide strings") { SECTION("std::wstring") { - std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]"; - json j = json::parse(w); - CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + if (compiler_supports_utf8()) + { + std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } } SECTION("std::u16string") { - std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]"; - json j = json::parse(w); - CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + if (compiler_supports_utf8()) + { + std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } } SECTION("std::u32string") { - std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]"; - json j = json::parse(w); - CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + if (compiler_supports_utf8()) + { + std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } } } From 727dd4664bf458925b7f76541e7effbd17017391 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 2 Apr 2018 12:27:07 +0200 Subject: [PATCH 3/5] :hammer: trying to make tests run with MSVC #1031 --- test/src/unit-wstring.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/test/src/unit-wstring.cpp b/test/src/unit-wstring.cpp index 186ab4c7..a9f2c194 100644 --- a/test/src/unit-wstring.cpp +++ b/test/src/unit-wstring.cpp @@ -32,21 +32,29 @@ SOFTWARE. using nlohmann::json; -// from https://www.reddit.com/r/cpp/comments/75gohf/i_just_found_a_use_for_the_poop_emoji_in_c/ -constexpr bool compiler_supports_utf8(); -constexpr bool compiler_supports_utf8() +bool wstring_is_utf16(); +bool wstring_is_utf16() { - return (static_cast("💩"[0]) == 0xF0) and - (static_cast("💩"[1]) == 0x9F) and - (static_cast("💩"[2]) == 0x92) and - (static_cast("💩"[3]) == 0xA9); + return (std::wstring(L"💩") == std::wstring(L"\U0001F4A9")); +} + +bool u16string_is_utf16(); +bool u16string_is_utf16() +{ + return (std::u16string(u"💩") == std::u16string(u"\U0001F4A9")); +} + +bool u32string_is_utf32(); +bool u32string_is_utf32() +{ + return (std::u32string(U"💩") == std::u32string(U"\U0001F4A9")); } TEST_CASE("wide strings") { SECTION("std::wstring") { - if (compiler_supports_utf8()) + if (wstring_is_utf16()) { std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]"; json j = json::parse(w); @@ -56,7 +64,7 @@ TEST_CASE("wide strings") SECTION("std::u16string") { - if (compiler_supports_utf8()) + if (u16string_is_utf16()) { std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]"; json j = json::parse(w); @@ -66,7 +74,7 @@ TEST_CASE("wide strings") SECTION("std::u32string") { - if (compiler_supports_utf8()) + if (u32string_is_utf32()) { std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]"; json j = json::parse(w); From 16c5bfeaad759dabac9fd54a34c9077cf9621e7f Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 2 Apr 2018 13:01:56 +0200 Subject: [PATCH 4/5] :ok_hand: fixed compiler warnings #1031 --- include/nlohmann/detail/input/input_adapters.hpp | 11 ++++------- single_include/nlohmann/json.hpp | 11 ++++------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/nlohmann/detail/input/input_adapters.hpp b/include/nlohmann/detail/input/input_adapters.hpp index eb358432..5332a63f 100644 --- a/include/nlohmann/detail/input/input_adapters.hpp +++ b/include/nlohmann/detail/input/input_adapters.hpp @@ -168,9 +168,6 @@ class input_buffer_adapter : public input_adapter_protocol template class wide_string_input_adapter : public input_adapter_protocol { - private: - using char_t = typename WideStringType::value_type; - public: wide_string_input_adapter(const WideStringType& w) : str(w) {} @@ -186,7 +183,7 @@ class wide_string_input_adapter : public input_adapter_protocol // check if buffer needs to be filled if (utf8_bytes_index == utf8_bytes_filled) { - if (sizeof(char_t) == 2) + if (sizeof(typename WideStringType::value_type) == 2) { fill_buffer_utf16(); } @@ -223,7 +220,7 @@ class wide_string_input_adapter : public input_adapter_protocol else { // get the current character - const char_t wc = str[current_wchar++]; + const int wc = static_cast(str[current_wchar++]); // UTF-16 to UTF-8 encoding if (wc < 0x80) @@ -248,7 +245,7 @@ class wide_string_input_adapter : public input_adapter_protocol { if (current_wchar < str.size()) { - const char_t wc2 = str[current_wchar++]; + const int wc2 = static_cast(str[current_wchar++]); const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); utf8_bytes[0] = 0xf0 | (charcode >> 18); utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); @@ -279,7 +276,7 @@ class wide_string_input_adapter : public input_adapter_protocol else { // get the current character - const char_t wc = str[current_wchar++]; + const int wc = static_cast(str[current_wchar++]); // UTF-32 to UTF-8 encoding if (wc < 0x80) diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp index 33a07189..97c83d39 100644 --- a/single_include/nlohmann/json.hpp +++ b/single_include/nlohmann/json.hpp @@ -1741,9 +1741,6 @@ class input_buffer_adapter : public input_adapter_protocol template class wide_string_input_adapter : public input_adapter_protocol { - private: - using char_t = typename WideStringType::value_type; - public: wide_string_input_adapter(const WideStringType& w) : str(w) {} @@ -1759,7 +1756,7 @@ class wide_string_input_adapter : public input_adapter_protocol // check if buffer needs to be filled if (utf8_bytes_index == utf8_bytes_filled) { - if (sizeof(char_t) == 2) + if (sizeof(typename WideStringType::value_type) == 2) { fill_buffer_utf16(); } @@ -1796,7 +1793,7 @@ class wide_string_input_adapter : public input_adapter_protocol else { // get the current character - const char_t wc = str[current_wchar++]; + const int wc = static_cast(str[current_wchar++]); // UTF-16 to UTF-8 encoding if (wc < 0x80) @@ -1821,7 +1818,7 @@ class wide_string_input_adapter : public input_adapter_protocol { if (current_wchar < str.size()) { - const char_t wc2 = str[current_wchar++]; + const int wc2 = static_cast(str[current_wchar++]); const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); utf8_bytes[0] = 0xf0 | (charcode >> 18); utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); @@ -1852,7 +1849,7 @@ class wide_string_input_adapter : public input_adapter_protocol else { // get the current character - const char_t wc = str[current_wchar++]; + const int wc = static_cast(str[current_wchar++]); // UTF-32 to UTF-8 encoding if (wc < 0x80) From 6678eb2b4a1eab6c555c90a4d4a95907df1e909c Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 2 Apr 2018 15:38:49 +0200 Subject: [PATCH 5/5] :white_check_mark: improved test coverage #1031 --- test/src/unit-wstring.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/test/src/unit-wstring.cpp b/test/src/unit-wstring.cpp index a9f2c194..37cf42db 100644 --- a/test/src/unit-wstring.cpp +++ b/test/src/unit-wstring.cpp @@ -62,6 +62,15 @@ TEST_CASE("wide strings") } } + SECTION("invalid std::wstring") + { + if (wstring_is_utf16()) + { + std::wstring w = L"\"\xDBFF"; + CHECK_THROWS_AS(json::parse(w), json::parse_error&); + } + } + SECTION("std::u16string") { if (u16string_is_utf16()) @@ -72,6 +81,15 @@ TEST_CASE("wide strings") } } + SECTION("invalid std::u16string") + { + if (wstring_is_utf16()) + { + std::u16string w = u"\"\xDBFF"; + CHECK_THROWS_AS(json::parse(w), json::parse_error&); + } + } + SECTION("std::u32string") { if (u32string_is_utf32()) @@ -81,4 +99,13 @@ TEST_CASE("wide strings") CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); } } + + SECTION("invalid std::u32string") + { + if (u32string_is_utf32()) + { + std::u32string w = U"\"\x110000"; + CHECK_THROWS_AS(json::parse(w), json::parse_error&); + } + } }