🚧 added input adapter for wide strings #1031
This commit is contained in:
		
							parent
							
								
									4efa8cdb4c
								
							
						
					
					
						commit
						eb06d0531a
					
				
					 4 changed files with 424 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -1738,6 +1738,179 @@ class input_buffer_adapter : public input_adapter_protocol
 | 
			
		|||
    const char* start;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template<typename WideStringType>
 | 
			
		||||
class wide_string_input_adapter : public input_adapter_protocol
 | 
			
		||||
{
 | 
			
		||||
  private:
 | 
			
		||||
    using char_t = typename WideStringType::value_type;
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
    wide_string_input_adapter(const WideStringType& w) : str(w) {}
 | 
			
		||||
 | 
			
		||||
    std::char_traits<char>::int_type get_character() noexcept override
 | 
			
		||||
    {
 | 
			
		||||
        // unget_character() was called previously: return the last character
 | 
			
		||||
        if (next_unget)
 | 
			
		||||
        {
 | 
			
		||||
            next_unget = false;
 | 
			
		||||
            return last_char;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // check if buffer needs to be filled
 | 
			
		||||
        if (utf8_bytes_index == utf8_bytes_filled)
 | 
			
		||||
        {
 | 
			
		||||
            if (sizeof(char_t) == 2)
 | 
			
		||||
            {
 | 
			
		||||
                fill_buffer_utf16();
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                fill_buffer_utf32();
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            assert(utf8_bytes_filled > 0);
 | 
			
		||||
            assert(utf8_bytes_index == 0);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // use buffer
 | 
			
		||||
        assert(utf8_bytes_filled > 0);
 | 
			
		||||
        assert(utf8_bytes_index < utf8_bytes_filled);
 | 
			
		||||
        return (last_char = utf8_bytes[utf8_bytes_index++]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void unget_character() noexcept override
 | 
			
		||||
    {
 | 
			
		||||
        next_unget = true;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  private:
 | 
			
		||||
    void fill_buffer_utf16()
 | 
			
		||||
    {
 | 
			
		||||
        utf8_bytes_index = 0;
 | 
			
		||||
 | 
			
		||||
        if (current_wchar == str.size())
 | 
			
		||||
        {
 | 
			
		||||
            utf8_bytes[0] = std::char_traits<char>::eof();
 | 
			
		||||
            utf8_bytes_filled = 1;
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
            // get the current character
 | 
			
		||||
            const char_t wc = str[current_wchar++];
 | 
			
		||||
 | 
			
		||||
            // UTF-16 to UTF-8 encoding
 | 
			
		||||
            if (wc < 0x80)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = wc;
 | 
			
		||||
                utf8_bytes_filled = 1;
 | 
			
		||||
            }
 | 
			
		||||
            else if (wc <= 0x7FF)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = 0xC0 | ((wc >> 6));
 | 
			
		||||
                utf8_bytes[1] = 0x80 | (wc & 0x3F);
 | 
			
		||||
                utf8_bytes_filled = 2;
 | 
			
		||||
            }
 | 
			
		||||
            else if (0xD800 > wc or wc >= 0xE000)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = 0xE0 | ((wc >> 12));
 | 
			
		||||
                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
 | 
			
		||||
                utf8_bytes[2] = 0x80 | (wc & 0x3F);
 | 
			
		||||
                utf8_bytes_filled = 3;
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                if (current_wchar < str.size())
 | 
			
		||||
                {
 | 
			
		||||
                    const char_t wc2 = str[current_wchar++];
 | 
			
		||||
                    const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
 | 
			
		||||
                    utf8_bytes[0] = 0xf0 | (charcode >> 18);
 | 
			
		||||
                    utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
 | 
			
		||||
                    utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
 | 
			
		||||
                    utf8_bytes[3] = 0x80 | (charcode & 0x3F);
 | 
			
		||||
                    utf8_bytes_filled = 4;
 | 
			
		||||
                }
 | 
			
		||||
                else
 | 
			
		||||
                {
 | 
			
		||||
                    // unknown character
 | 
			
		||||
                    ++current_wchar;
 | 
			
		||||
                    utf8_bytes[0] = wc;
 | 
			
		||||
                    utf8_bytes_filled = 1;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    void fill_buffer_utf32()
 | 
			
		||||
    {
 | 
			
		||||
        utf8_bytes_index = 0;
 | 
			
		||||
 | 
			
		||||
        if (current_wchar == str.size())
 | 
			
		||||
        {
 | 
			
		||||
            utf8_bytes[0] = std::char_traits<char>::eof();
 | 
			
		||||
            utf8_bytes_filled = 1;
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
            // get the current character
 | 
			
		||||
            const char_t wc = str[current_wchar++];
 | 
			
		||||
 | 
			
		||||
            // UTF-32 to UTF-8 encoding
 | 
			
		||||
            if (wc < 0x80)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = wc;
 | 
			
		||||
                utf8_bytes_filled = 1;
 | 
			
		||||
            }
 | 
			
		||||
            else if (wc <= 0x7FF)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
 | 
			
		||||
                utf8_bytes[1] = 0x80 | (wc & 0x3F);
 | 
			
		||||
                utf8_bytes_filled = 2;
 | 
			
		||||
            }
 | 
			
		||||
            else if (wc <= 0xFFFF)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
 | 
			
		||||
                utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
 | 
			
		||||
                utf8_bytes[2] = 0x80 | (wc & 0x3F);
 | 
			
		||||
                utf8_bytes_filled = 3;
 | 
			
		||||
            }
 | 
			
		||||
            else if (wc <= 0x10FFFF)
 | 
			
		||||
            {
 | 
			
		||||
                utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
 | 
			
		||||
                utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
 | 
			
		||||
                utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
 | 
			
		||||
                utf8_bytes[3] = 0x80 | (wc & 0x3F);
 | 
			
		||||
                utf8_bytes_filled = 4;
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                // unknown character
 | 
			
		||||
                utf8_bytes[0] = wc;
 | 
			
		||||
                utf8_bytes_filled = 1;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  private:
 | 
			
		||||
    /// the wstring to process
 | 
			
		||||
    const WideStringType& str;
 | 
			
		||||
 | 
			
		||||
    /// index of the current wchar in str
 | 
			
		||||
    std::size_t current_wchar = 0;
 | 
			
		||||
 | 
			
		||||
    /// a buffer for UTF-8 bytes
 | 
			
		||||
    std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
 | 
			
		||||
 | 
			
		||||
    /// index to the utf8_codes array for the next valid byte
 | 
			
		||||
    std::size_t utf8_bytes_index = 0;
 | 
			
		||||
    /// number of valid bytes in the utf8_codes array
 | 
			
		||||
    std::size_t utf8_bytes_filled = 0;
 | 
			
		||||
 | 
			
		||||
    /// the last character (returned after unget_character() is called)
 | 
			
		||||
    std::char_traits<char>::int_type last_char = 0;
 | 
			
		||||
    /// whether get_character() should return last_char
 | 
			
		||||
    bool next_unget = false;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class input_adapter
 | 
			
		||||
{
 | 
			
		||||
  public:
 | 
			
		||||
| 
						 | 
				
			
			@ -1751,6 +1924,15 @@ class input_adapter
 | 
			
		|||
    input_adapter(std::istream&& i)
 | 
			
		||||
        : ia(std::make_shared<input_stream_adapter>(i)) {}
 | 
			
		||||
 | 
			
		||||
    /// input adapter for std::wstring; wide_string_input_adapter keeps a
    /// reference to ws (no copy is made), so ws must outlive the adapter
    input_adapter(const std::wstring& ws)
 | 
			
		||||
        : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
 | 
			
		||||
 | 
			
		||||
    /// input adapter for std::u16string; wide_string_input_adapter keeps a
    /// reference to ws (no copy is made), so ws must outlive the adapter
    input_adapter(const std::u16string& ws)
 | 
			
		||||
        : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
 | 
			
		||||
 | 
			
		||||
    /// input adapter for std::u32string; wide_string_input_adapter keeps a
    /// reference to ws (no copy is made), so ws must outlive the adapter
    input_adapter(const std::u32string& ws)
 | 
			
		||||
        : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
 | 
			
		||||
 | 
			
		||||
    /// input adapter for buffer
 | 
			
		||||
    template<typename CharT,
 | 
			
		||||
             typename std::enable_if<
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue