From eb06d0531a7334017e17fbfb5db7e6c31d40cf7c Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 1 Apr 2018 19:12:36 +0200 Subject: [PATCH] :construction: added input adapter for wide strings #1031 --- .../nlohmann/detail/input/input_adapters.hpp | 182 ++++++++++++++++++ single_include/nlohmann/json.hpp | 182 ++++++++++++++++++ test/Makefile | 3 +- test/src/unit-wstring.cpp | 58 ++++++ 4 files changed, 424 insertions(+), 1 deletion(-) create mode 100644 test/src/unit-wstring.cpp diff --git a/include/nlohmann/detail/input/input_adapters.hpp b/include/nlohmann/detail/input/input_adapters.hpp index ef66948d..eb358432 100644 --- a/include/nlohmann/detail/input/input_adapters.hpp +++ b/include/nlohmann/detail/input/input_adapters.hpp @@ -165,6 +165,179 @@ class input_buffer_adapter : public input_adapter_protocol const char* start; }; +template +class wide_string_input_adapter : public input_adapter_protocol +{ + private: + using char_t = typename WideStringType::value_type; + + public: + wide_string_input_adapter(const WideStringType& w) : str(w) {} + + std::char_traits::int_type get_character() noexcept override + { + // unget_character() was called previously: return the last character + if (next_unget) + { + next_unget = false; + return last_char; + } + + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + if (sizeof(char_t) == 2) + { + fill_buffer_utf16(); + } + else + { + fill_buffer_utf32(); + } + + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index == 0); + } + + // use buffer + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index < utf8_bytes_filled); + return (last_char = utf8_bytes[utf8_bytes_index++]); + } + + void unget_character() noexcept override + { + next_unget = true; + } + + private: + void fill_buffer_utf16() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6)); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc or wc >= 0xE000) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12)); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else + { + if (current_wchar < str.size()) + { + const char_t wc2 = str[current_wchar++]; + const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); + utf8_bytes[0] = 0xf0 | (charcode >> 18); + utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (charcode & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + ++current_wchar; + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + } + + void fill_buffer_utf32() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07); + utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + + private: + /// the wstring to process + const WideStringType& str; + + /// index of the current wchar in str + std::size_t current_wchar = 0; + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; + + /// the last character (returned after unget_character() is called) + std::char_traits::int_type last_char = 0; + /// whether get_character() should return last_char + bool next_unget = false; +}; + class input_adapter { public: @@ -178,6 +351,15 @@ class input_adapter input_adapter(std::istream&& i) : ia(std::make_shared(i)) {} + input_adapter(const std::wstring& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u16string& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u32string& ws) + : ia(std::make_shared>(ws)) {} + /// input adapter for buffer template +class wide_string_input_adapter : public input_adapter_protocol +{ + private: + using char_t = typename WideStringType::value_type; + + public: + wide_string_input_adapter(const WideStringType& w) : str(w) {} + + std::char_traits::int_type get_character() noexcept override + { + // unget_character() was called previously: return the last character + if (next_unget) + { + next_unget = false; + return last_char; + } + + // check if buffer needs to be filled + if (utf8_bytes_index == utf8_bytes_filled) + { + if (sizeof(char_t) == 2) + { + fill_buffer_utf16(); + } + else + { + fill_buffer_utf32(); + } + + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index == 0); + } + + // use buffer + assert(utf8_bytes_filled > 0); + assert(utf8_bytes_index < utf8_bytes_filled); + return (last_char = utf8_bytes[utf8_bytes_index++]); + } + + void unget_character() noexcept override + { + next_unget = true; + } + + private: + void fill_buffer_utf16() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-16 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6)); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (0xD800 > wc or wc >= 0xE000) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12)); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else + { + if (current_wchar < str.size()) + { + const char_t wc2 = str[current_wchar++]; + const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF)); + utf8_bytes[0] = 0xf0 | (charcode >> 18); + utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (charcode & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + ++current_wchar; + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + } + + void fill_buffer_utf32() + { + utf8_bytes_index = 0; + + if (current_wchar == str.size()) + { + utf8_bytes[0] = std::char_traits::eof(); + utf8_bytes_filled = 1; + } + else + { + // get the current character + const char_t wc = str[current_wchar++]; + + // UTF-32 to UTF-8 encoding + if (wc < 0x80) + { + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + else if (wc <= 0x7FF) + { + utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F); + utf8_bytes[1] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 2; + } + else if (wc <= 0xFFFF) + { + utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F); + utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[2] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 3; + } + else if (wc <= 0x10FFFF) + { + utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07); + utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F); + utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F); + utf8_bytes[3] = 0x80 | (wc & 0x3F); + utf8_bytes_filled = 4; + } + else + { + // unknown character + utf8_bytes[0] = wc; + utf8_bytes_filled = 1; + } + } + } + + private: + /// the wstring to process + const WideStringType& str; + + /// index of the current wchar in str + std::size_t current_wchar = 0; + + /// a buffer for UTF-8 bytes + std::array::int_type, 4> utf8_bytes = {{0, 0, 0, 0}}; + + /// index to the utf8_codes array for the next valid byte + std::size_t utf8_bytes_index = 0; + /// number of valid bytes in the utf8_codes array + std::size_t utf8_bytes_filled = 0; + + /// the last character (returned after unget_character() is called) + std::char_traits::int_type last_char = 0; + /// whether get_character() should return last_char + bool next_unget = false; +}; + class input_adapter { public: @@ -1751,6 +1924,15 @@ class input_adapter input_adapter(std::istream&& i) : ia(std::make_shared(i)) {} + input_adapter(const std::wstring& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u16string& ws) + : ia(std::make_shared>(ws)) {} + + input_adapter(const std::u32string& ws) + : ia(std::make_shared>(ws)) {} + /// input adapter for buffer template. +Copyright (c) 2013-2018 Niels Lohmann . + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "catch.hpp" + +#include + +using nlohmann::json; + + +TEST_CASE("wide strings") +{ + SECTION("std::wstring") + { + std::wstring w = L"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } + + SECTION("std::u16string") + { + std::u16string w = u"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } + + SECTION("std::u32string") + { + std::u32string w = U"[12.2,\"Ⴥaäö💤🧢\"]"; + json j = json::parse(w); + CHECK(j.dump() == "[12.2,\"Ⴥaäö💤🧢\"]"); + } +}