🚧 added input adapter for wide strings #1031
This commit is contained in:
parent
4efa8cdb4c
commit
eb06d0531a
4 changed files with 424 additions and 1 deletions
|
@ -165,6 +165,179 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
const char* start;
|
||||
};
|
||||
|
||||
template<typename WideStringType>
|
||||
class wide_string_input_adapter : public input_adapter_protocol
|
||||
{
|
||||
private:
|
||||
using char_t = typename WideStringType::value_type;
|
||||
|
||||
public:
|
||||
wide_string_input_adapter(const WideStringType& w) : str(w) {}
|
||||
|
||||
std::char_traits<char>::int_type get_character() noexcept override
|
||||
{
|
||||
// unget_character() was called previously: return the last character
|
||||
if (next_unget)
|
||||
{
|
||||
next_unget = false;
|
||||
return last_char;
|
||||
}
|
||||
|
||||
// check if buffer needs to be filled
|
||||
if (utf8_bytes_index == utf8_bytes_filled)
|
||||
{
|
||||
if (sizeof(char_t) == 2)
|
||||
{
|
||||
fill_buffer_utf16();
|
||||
}
|
||||
else
|
||||
{
|
||||
fill_buffer_utf32();
|
||||
}
|
||||
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index == 0);
|
||||
}
|
||||
|
||||
// use buffer
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index < utf8_bytes_filled);
|
||||
return (last_char = utf8_bytes[utf8_bytes_index++]);
|
||||
}
|
||||
|
||||
void unget_character() noexcept override
|
||||
{
|
||||
next_unget = true;
|
||||
}
|
||||
|
||||
private:
|
||||
void fill_buffer_utf16()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-16 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6));
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (0xD800 > wc or wc >= 0xE000)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12));
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (current_wchar < str.size())
|
||||
{
|
||||
const char_t wc2 = str[current_wchar++];
|
||||
const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
|
||||
utf8_bytes[0] = 0xf0 | (charcode >> 18);
|
||||
utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (charcode & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
++current_wchar;
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill_buffer_utf32()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-32 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (wc <= 0xFFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else if (wc <= 0x10FFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/// the wstring to process
|
||||
const WideStringType& str;
|
||||
|
||||
/// index of the current wchar in str
|
||||
std::size_t current_wchar = 0;
|
||||
|
||||
/// a buffer for UTF-8 bytes
|
||||
std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
|
||||
|
||||
/// index to the utf8_codes array for the next valid byte
|
||||
std::size_t utf8_bytes_index = 0;
|
||||
/// number of valid bytes in the utf8_codes array
|
||||
std::size_t utf8_bytes_filled = 0;
|
||||
|
||||
/// the last character (returned after unget_character() is called)
|
||||
std::char_traits<char>::int_type last_char = 0;
|
||||
/// whether get_character() should return last_char
|
||||
bool next_unget = false;
|
||||
};
|
||||
|
||||
class input_adapter
|
||||
{
|
||||
public:
|
||||
|
@ -178,6 +351,15 @@ class input_adapter
|
|||
input_adapter(std::istream&& i)
|
||||
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||
|
||||
input_adapter(const std::wstring& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
|
||||
|
||||
input_adapter(const std::u16string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
|
||||
|
||||
input_adapter(const std::u32string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
|
||||
|
||||
/// input adapter for buffer
|
||||
template<typename CharT,
|
||||
typename std::enable_if<
|
||||
|
|
|
@ -1738,6 +1738,179 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
const char* start;
|
||||
};
|
||||
|
||||
template<typename WideStringType>
|
||||
class wide_string_input_adapter : public input_adapter_protocol
|
||||
{
|
||||
private:
|
||||
using char_t = typename WideStringType::value_type;
|
||||
|
||||
public:
|
||||
wide_string_input_adapter(const WideStringType& w) : str(w) {}
|
||||
|
||||
std::char_traits<char>::int_type get_character() noexcept override
|
||||
{
|
||||
// unget_character() was called previously: return the last character
|
||||
if (next_unget)
|
||||
{
|
||||
next_unget = false;
|
||||
return last_char;
|
||||
}
|
||||
|
||||
// check if buffer needs to be filled
|
||||
if (utf8_bytes_index == utf8_bytes_filled)
|
||||
{
|
||||
if (sizeof(char_t) == 2)
|
||||
{
|
||||
fill_buffer_utf16();
|
||||
}
|
||||
else
|
||||
{
|
||||
fill_buffer_utf32();
|
||||
}
|
||||
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index == 0);
|
||||
}
|
||||
|
||||
// use buffer
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index < utf8_bytes_filled);
|
||||
return (last_char = utf8_bytes[utf8_bytes_index++]);
|
||||
}
|
||||
|
||||
void unget_character() noexcept override
|
||||
{
|
||||
next_unget = true;
|
||||
}
|
||||
|
||||
private:
|
||||
void fill_buffer_utf16()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-16 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6));
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (0xD800 > wc or wc >= 0xE000)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12));
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (current_wchar < str.size())
|
||||
{
|
||||
const char_t wc2 = str[current_wchar++];
|
||||
const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
|
||||
utf8_bytes[0] = 0xf0 | (charcode >> 18);
|
||||
utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (charcode & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
++current_wchar;
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill_buffer_utf32()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-32 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (wc <= 0xFFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else if (wc <= 0x10FFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/// the wstring to process
|
||||
const WideStringType& str;
|
||||
|
||||
/// index of the current wchar in str
|
||||
std::size_t current_wchar = 0;
|
||||
|
||||
/// a buffer for UTF-8 bytes
|
||||
std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
|
||||
|
||||
/// index to the utf8_codes array for the next valid byte
|
||||
std::size_t utf8_bytes_index = 0;
|
||||
/// number of valid bytes in the utf8_codes array
|
||||
std::size_t utf8_bytes_filled = 0;
|
||||
|
||||
/// the last character (returned after unget_character() is called)
|
||||
std::char_traits<char>::int_type last_char = 0;
|
||||
/// whether get_character() should return last_char
|
||||
bool next_unget = false;
|
||||
};
|
||||
|
||||
class input_adapter
|
||||
{
|
||||
public:
|
||||
|
@ -1751,6 +1924,15 @@ class input_adapter
|
|||
input_adapter(std::istream&& i)
|
||||
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||
|
||||
input_adapter(const std::wstring& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
|
||||
|
||||
input_adapter(const std::u16string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
|
||||
|
||||
input_adapter(const std::u32string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
|
||||
|
||||
/// input adapter for buffer
|
||||
template<typename CharT,
|
||||
typename std::enable_if<
|
||||
|
|
|
@ -42,7 +42,8 @@ SOURCES = src/unit.cpp \
|
|||
src/unit-serialization.cpp \
|
||||
src/unit-testsuites.cpp \
|
||||
src/unit-ubjson.cpp \
|
||||
src/unit-unicode.cpp
|
||||
src/unit-unicode.cpp \
|
||||
src/unit-wstring.cpp
|
||||
|
||||
OBJECTS = $(SOURCES:.cpp=.o)
|
||||
|
||||
|
|
58
test/src/unit-wstring.cpp
Normal file
58
test/src/unit-wstring.cpp
Normal file
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
__ _____ _____ _____
|
||||
__| | __| | | | JSON for Modern C++ (test suite)
|
||||
| | |__ | | | | | | version 3.1.2
|
||||
|_____|_____|_____|_|___| https://github.com/nlohmann/json
|
||||
|
||||
Licensed under the MIT License <http://opensource.org/licenses/MIT>.
|
||||
Copyright (c) 2013-2018 Niels Lohmann <http://nlohmann.me>.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "catch.hpp"
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
using nlohmann::json;
|
||||
|
||||
|
||||
// Parsing the same document from each wide string flavour must yield the
// same UTF-8 serialization; the text mixes 1-, 2-, 3- and 4-byte code points.
TEST_CASE("wide strings")
{
    // UTF-8 dump expected from every section below
    const std::string expected_dump = "[12.2,\"Ⴥaäö💤🧢\"]";

    SECTION("std::wstring")
    {
        const std::wstring wide_input = L"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(wide_input);
        CHECK(parsed.dump() == expected_dump);
    }

    SECTION("std::u16string")
    {
        const std::u16string utf16_input = u"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(utf16_input);
        CHECK(parsed.dump() == expected_dump);
    }

    SECTION("std::u32string")
    {
        const std::u32string utf32_input = U"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(utf32_input);
        CHECK(parsed.dump() == expected_dump);
    }
}
|
Loading…
Reference in a new issue