🚧 added input adapter for wide strings #1031
This commit is contained in:
parent
4efa8cdb4c
commit
eb06d0531a
4 changed files with 424 additions and 1 deletions
|
@ -165,6 +165,179 @@ class input_buffer_adapter : public input_adapter_protocol
|
||||||
const char* start;
|
const char* start;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename WideStringType>
|
||||||
|
class wide_string_input_adapter : public input_adapter_protocol
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
using char_t = typename WideStringType::value_type;
|
||||||
|
|
||||||
|
public:
|
||||||
|
wide_string_input_adapter(const WideStringType& w) : str(w) {}
|
||||||
|
|
||||||
|
std::char_traits<char>::int_type get_character() noexcept override
|
||||||
|
{
|
||||||
|
// unget_character() was called previously: return the last character
|
||||||
|
if (next_unget)
|
||||||
|
{
|
||||||
|
next_unget = false;
|
||||||
|
return last_char;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if buffer needs to be filled
|
||||||
|
if (utf8_bytes_index == utf8_bytes_filled)
|
||||||
|
{
|
||||||
|
if (sizeof(char_t) == 2)
|
||||||
|
{
|
||||||
|
fill_buffer_utf16();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fill_buffer_utf32();
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(utf8_bytes_filled > 0);
|
||||||
|
assert(utf8_bytes_index == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// use buffer
|
||||||
|
assert(utf8_bytes_filled > 0);
|
||||||
|
assert(utf8_bytes_index < utf8_bytes_filled);
|
||||||
|
return (last_char = utf8_bytes[utf8_bytes_index++]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void unget_character() noexcept override
|
||||||
|
{
|
||||||
|
next_unget = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void fill_buffer_utf16()
|
||||||
|
{
|
||||||
|
utf8_bytes_index = 0;
|
||||||
|
|
||||||
|
if (current_wchar == str.size())
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// get the current character
|
||||||
|
const char_t wc = str[current_wchar++];
|
||||||
|
|
||||||
|
// UTF-16 to UTF-8 encoding
|
||||||
|
if (wc < 0x80)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x7FF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xC0 | ((wc >> 6));
|
||||||
|
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 2;
|
||||||
|
}
|
||||||
|
else if (0xD800 > wc or wc >= 0xE000)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xE0 | ((wc >> 12));
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (current_wchar < str.size())
|
||||||
|
{
|
||||||
|
const char_t wc2 = str[current_wchar++];
|
||||||
|
const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
|
||||||
|
utf8_bytes[0] = 0xf0 | (charcode >> 18);
|
||||||
|
utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
|
||||||
|
utf8_bytes[3] = 0x80 | (charcode & 0x3F);
|
||||||
|
utf8_bytes_filled = 4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// unknown character
|
||||||
|
++current_wchar;
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void fill_buffer_utf32()
|
||||||
|
{
|
||||||
|
utf8_bytes_index = 0;
|
||||||
|
|
||||||
|
if (current_wchar == str.size())
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// get the current character
|
||||||
|
const char_t wc = str[current_wchar++];
|
||||||
|
|
||||||
|
// UTF-32 to UTF-8 encoding
|
||||||
|
if (wc < 0x80)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x7FF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
|
||||||
|
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 2;
|
||||||
|
}
|
||||||
|
else if (wc <= 0xFFFF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 3;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x10FFFF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[3] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// unknown character
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// the wstring to process
|
||||||
|
const WideStringType& str;
|
||||||
|
|
||||||
|
/// index of the current wchar in str
|
||||||
|
std::size_t current_wchar = 0;
|
||||||
|
|
||||||
|
/// a buffer for UTF-8 bytes
|
||||||
|
std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
|
||||||
|
|
||||||
|
/// index to the utf8_codes array for the next valid byte
|
||||||
|
std::size_t utf8_bytes_index = 0;
|
||||||
|
/// number of valid bytes in the utf8_codes array
|
||||||
|
std::size_t utf8_bytes_filled = 0;
|
||||||
|
|
||||||
|
/// the last character (returned after unget_character() is called)
|
||||||
|
std::char_traits<char>::int_type last_char = 0;
|
||||||
|
/// whether get_character() should return last_char
|
||||||
|
bool next_unget = false;
|
||||||
|
};
|
||||||
|
|
||||||
class input_adapter
|
class input_adapter
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -178,6 +351,15 @@ class input_adapter
|
||||||
input_adapter(std::istream&& i)
|
input_adapter(std::istream&& i)
|
||||||
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::wstring: wraps @a ws in a
/// wide_string_input_adapter<std::wstring> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::wstring& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::u16string: wraps @a ws in a
/// wide_string_input_adapter<std::u16string> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::u16string& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::u32string: wraps @a ws in a
/// wide_string_input_adapter<std::u32string> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::u32string& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
|
||||||
|
|
||||||
/// input adapter for buffer
|
/// input adapter for buffer
|
||||||
template<typename CharT,
|
template<typename CharT,
|
||||||
typename std::enable_if<
|
typename std::enable_if<
|
||||||
|
|
|
@ -1738,6 +1738,179 @@ class input_buffer_adapter : public input_adapter_protocol
|
||||||
const char* start;
|
const char* start;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename WideStringType>
|
||||||
|
class wide_string_input_adapter : public input_adapter_protocol
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
using char_t = typename WideStringType::value_type;
|
||||||
|
|
||||||
|
public:
|
||||||
|
wide_string_input_adapter(const WideStringType& w) : str(w) {}
|
||||||
|
|
||||||
|
std::char_traits<char>::int_type get_character() noexcept override
|
||||||
|
{
|
||||||
|
// unget_character() was called previously: return the last character
|
||||||
|
if (next_unget)
|
||||||
|
{
|
||||||
|
next_unget = false;
|
||||||
|
return last_char;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if buffer needs to be filled
|
||||||
|
if (utf8_bytes_index == utf8_bytes_filled)
|
||||||
|
{
|
||||||
|
if (sizeof(char_t) == 2)
|
||||||
|
{
|
||||||
|
fill_buffer_utf16();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
fill_buffer_utf32();
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(utf8_bytes_filled > 0);
|
||||||
|
assert(utf8_bytes_index == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// use buffer
|
||||||
|
assert(utf8_bytes_filled > 0);
|
||||||
|
assert(utf8_bytes_index < utf8_bytes_filled);
|
||||||
|
return (last_char = utf8_bytes[utf8_bytes_index++]);
|
||||||
|
}
|
||||||
|
|
||||||
|
void unget_character() noexcept override
|
||||||
|
{
|
||||||
|
next_unget = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void fill_buffer_utf16()
|
||||||
|
{
|
||||||
|
utf8_bytes_index = 0;
|
||||||
|
|
||||||
|
if (current_wchar == str.size())
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// get the current character
|
||||||
|
const char_t wc = str[current_wchar++];
|
||||||
|
|
||||||
|
// UTF-16 to UTF-8 encoding
|
||||||
|
if (wc < 0x80)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x7FF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xC0 | ((wc >> 6));
|
||||||
|
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 2;
|
||||||
|
}
|
||||||
|
else if (0xD800 > wc or wc >= 0xE000)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xE0 | ((wc >> 12));
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (current_wchar < str.size())
|
||||||
|
{
|
||||||
|
const char_t wc2 = str[current_wchar++];
|
||||||
|
const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
|
||||||
|
utf8_bytes[0] = 0xf0 | (charcode >> 18);
|
||||||
|
utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
|
||||||
|
utf8_bytes[3] = 0x80 | (charcode & 0x3F);
|
||||||
|
utf8_bytes_filled = 4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// unknown character
|
||||||
|
++current_wchar;
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void fill_buffer_utf32()
|
||||||
|
{
|
||||||
|
utf8_bytes_index = 0;
|
||||||
|
|
||||||
|
if (current_wchar == str.size())
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// get the current character
|
||||||
|
const char_t wc = str[current_wchar++];
|
||||||
|
|
||||||
|
// UTF-32 to UTF-8 encoding
|
||||||
|
if (wc < 0x80)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x7FF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
|
||||||
|
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 2;
|
||||||
|
}
|
||||||
|
else if (wc <= 0xFFFF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 3;
|
||||||
|
}
|
||||||
|
else if (wc <= 0x10FFFF)
|
||||||
|
{
|
||||||
|
utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
|
||||||
|
utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
|
||||||
|
utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
|
||||||
|
utf8_bytes[3] = 0x80 | (wc & 0x3F);
|
||||||
|
utf8_bytes_filled = 4;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// unknown character
|
||||||
|
utf8_bytes[0] = wc;
|
||||||
|
utf8_bytes_filled = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// the wstring to process
|
||||||
|
const WideStringType& str;
|
||||||
|
|
||||||
|
/// index of the current wchar in str
|
||||||
|
std::size_t current_wchar = 0;
|
||||||
|
|
||||||
|
/// a buffer for UTF-8 bytes
|
||||||
|
std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
|
||||||
|
|
||||||
|
/// index to the utf8_codes array for the next valid byte
|
||||||
|
std::size_t utf8_bytes_index = 0;
|
||||||
|
/// number of valid bytes in the utf8_codes array
|
||||||
|
std::size_t utf8_bytes_filled = 0;
|
||||||
|
|
||||||
|
/// the last character (returned after unget_character() is called)
|
||||||
|
std::char_traits<char>::int_type last_char = 0;
|
||||||
|
/// whether get_character() should return last_char
|
||||||
|
bool next_unget = false;
|
||||||
|
};
|
||||||
|
|
||||||
class input_adapter
|
class input_adapter
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
@ -1751,6 +1924,15 @@ class input_adapter
|
||||||
input_adapter(std::istream&& i)
|
input_adapter(std::istream&& i)
|
||||||
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::wstring: wraps @a ws in a
/// wide_string_input_adapter<std::wstring> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::wstring& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::u16string: wraps @a ws in a
/// wide_string_input_adapter<std::u16string> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::u16string& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
|
||||||
|
|
||||||
|
/// input adapter for std::u32string: wraps @a ws in a
/// wide_string_input_adapter<std::u32string> held by @a ia.
/// That adapter stores a reference to @a ws, not a copy.
/// NOTE(review): presumably @a ws must outlive this adapter -- confirm.
input_adapter(const std::u32string& ws)
|
||||||
|
: ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
|
||||||
|
|
||||||
/// input adapter for buffer
|
/// input adapter for buffer
|
||||||
template<typename CharT,
|
template<typename CharT,
|
||||||
typename std::enable_if<
|
typename std::enable_if<
|
||||||
|
|
|
@ -42,7 +42,8 @@ SOURCES = src/unit.cpp \
|
||||||
src/unit-serialization.cpp \
|
src/unit-serialization.cpp \
|
||||||
src/unit-testsuites.cpp \
|
src/unit-testsuites.cpp \
|
||||||
src/unit-ubjson.cpp \
|
src/unit-ubjson.cpp \
|
||||||
src/unit-unicode.cpp
|
src/unit-unicode.cpp \
|
||||||
|
src/unit-wstring.cpp
|
||||||
|
|
||||||
OBJECTS = $(SOURCES:.cpp=.o)
|
OBJECTS = $(SOURCES:.cpp=.o)
|
||||||
|
|
||||||
|
|
58
test/src/unit-wstring.cpp
Normal file
58
test/src/unit-wstring.cpp
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
/*
|
||||||
|
__ _____ _____ _____
|
||||||
|
__| | __| | | | JSON for Modern C++ (test suite)
|
||||||
|
| | |__ | | | | | | version 3.1.2
|
||||||
|
|_____|_____|_____|_|___| https://github.com/nlohmann/json
|
||||||
|
|
||||||
|
Licensed under the MIT License <http://opensource.org/licenses/MIT>.
|
||||||
|
Copyright (c) 2013-2018 Niels Lohmann <http://nlohmann.me>.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "catch.hpp"
|
||||||
|
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
|
using nlohmann::json;
|
||||||
|
|
||||||
|
|
||||||
|
// Parses the same JSON document from each supported wide string type and
// checks that dumping the result yields the expected UTF-8 text.
TEST_CASE("wide strings")
{
    // UTF-8 serialization every section is expected to produce
    const std::string expected_dump = "[12.2,\"Ⴥaäö💤🧢\"]";

    SECTION("std::wstring")
    {
        std::wstring wide_input = L"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(wide_input);
        CHECK(parsed.dump() == expected_dump);
    }

    SECTION("std::u16string")
    {
        std::u16string utf16_input = u"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(utf16_input);
        CHECK(parsed.dump() == expected_dump);
    }

    SECTION("std::u32string")
    {
        std::u32string utf32_input = U"[12.2,\"Ⴥaäö💤🧢\"]";
        const json parsed = json::parse(utf32_input);
        CHECK(parsed.dump() == expected_dump);
    }
}
|
Loading…
Reference in a new issue