Simplify get_token_string, unnecessary buffering, handle Byte Order Mark
This commit is contained in:
parent
0c0851dbea
commit
90adf6ec20
1 changed files with 59 additions and 117 deletions
176
src/json.hpp
176
src/json.hpp
|
|
@ -1398,119 +1398,60 @@ constexpr T static_const<T>::value;
|
||||||
struct input_adapter_protocol
|
struct input_adapter_protocol
|
||||||
{
|
{
|
||||||
virtual int get_character() = 0;
|
virtual int get_character() = 0;
|
||||||
virtual std::string read(std::size_t offset, std::size_t length) = 0;
|
|
||||||
virtual ~input_adapter_protocol() = default;
|
virtual ~input_adapter_protocol() = default;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// a type to simplify interfaces
|
/// a type to simplify interfaces
|
||||||
using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
|
using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
|
||||||
|
|
||||||
/// input adapter for cached stream input
|
|
||||||
template<std::size_t BufferSize>
|
/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input.
|
||||||
class cached_input_stream_adapter : public input_adapter_protocol
|
class input_stream_adapter : public input_adapter_protocol
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit cached_input_stream_adapter(std::istream& i)
|
explicit input_stream_adapter(std::istream& i)
|
||||||
: is(i), start_position(is.tellg())
|
: is(i)
|
||||||
{
|
{
|
||||||
fill_buffer();
|
// Ignore Byte Order Mark at start of input
|
||||||
|
int c;
|
||||||
// skip byte order mark
|
if (( c = get_character() ) == 0xEF )
|
||||||
if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF')
|
{
|
||||||
{
|
if (( c = get_character() ) == 0xBB )
|
||||||
buffer_pos += 3;
|
{
|
||||||
processed_chars += 3;
|
if (( c = get_character() ) == 0xBF )
|
||||||
}
|
{
|
||||||
|
return; // Ignore BOM
|
||||||
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
|
{
|
||||||
|
is.unget();
|
||||||
|
}
|
||||||
|
is.putback( '\xBB' );
|
||||||
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
|
{
|
||||||
|
is.unget();
|
||||||
|
}
|
||||||
|
is.putback( '\xEF' );
|
||||||
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
|
{
|
||||||
|
is.unget(); // Not BOM. Process as usual.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
~cached_input_stream_adapter() override
|
~input_stream_adapter() override {}
|
||||||
{
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
|
||||||
// We initially read a lot of characters into the buffer, and we may
|
|
||||||
// not have processed all of them. Therefore, we need to "rewind" the
|
|
||||||
// stream after the last processed char.
|
|
||||||
is.seekg(start_position);
|
|
||||||
is.ignore(static_cast<std::streamsize>(processed_chars));
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_character() override
|
int get_character() override
|
||||||
{
|
{
|
||||||
// check if refilling is necessary and possible
|
int c = is.get();
|
||||||
if (buffer_pos == fill_size and not eof)
|
return c == std::char_traits<char>::eof() ? c : ( c & 0xFF );
|
||||||
{
|
|
||||||
fill_buffer();
|
|
||||||
|
|
||||||
// check and remember that filling did not yield new input
|
|
||||||
if (fill_size == 0)
|
|
||||||
{
|
|
||||||
eof = true;
|
|
||||||
return std::char_traits<char>::eof();
|
|
||||||
}
|
|
||||||
|
|
||||||
// the buffer is ready
|
|
||||||
buffer_pos = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
++processed_chars;
|
|
||||||
assert(buffer_pos < buffer.size());
|
|
||||||
return buffer[buffer_pos++] & 0xFF;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string read(std::size_t offset, std::size_t length) override
|
|
||||||
{
|
|
||||||
// create buffer
|
|
||||||
std::string result(length, '\0');
|
|
||||||
|
|
||||||
// save stream position
|
|
||||||
const auto current_pos = is.tellg();
|
|
||||||
// save stream flags
|
|
||||||
const auto flags = is.rdstate();
|
|
||||||
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
|
||||||
// set stream position
|
|
||||||
is.seekg(static_cast<std::streamoff>(offset));
|
|
||||||
// read bytes
|
|
||||||
is.read(&result[0], static_cast<std::streamsize>(length));
|
|
||||||
|
|
||||||
// reset stream position
|
|
||||||
is.seekg(current_pos);
|
|
||||||
// reset stream flags
|
|
||||||
is.setstate(flags);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void fill_buffer()
|
|
||||||
{
|
|
||||||
// fill
|
|
||||||
is.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
|
|
||||||
// store number of bytes in the buffer
|
|
||||||
fill_size = static_cast<size_t>(is.gcount());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// the associated input stream
|
/// the associated input stream
|
||||||
std::istream& is;
|
std::istream& is;
|
||||||
|
|
||||||
/// chars returned via get_character()
|
|
||||||
std::size_t processed_chars = 0;
|
|
||||||
/// chars processed in the current buffer
|
|
||||||
std::size_t buffer_pos = 0;
|
|
||||||
|
|
||||||
/// whether stream reached eof
|
|
||||||
bool eof = false;
|
|
||||||
/// how many chars have been copied to the buffer by last (re)fill
|
|
||||||
std::size_t fill_size = 0;
|
|
||||||
|
|
||||||
/// position of the stream when we started
|
|
||||||
const std::streampos start_position;
|
|
||||||
|
|
||||||
/// internal buffer
|
|
||||||
std::array<char, BufferSize> buffer{{}};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// input adapter for buffer input
|
/// input adapter for buffer input
|
||||||
|
|
@ -1541,13 +1482,6 @@ class input_buffer_adapter : public input_adapter_protocol
|
||||||
return std::char_traits<char>::eof();
|
return std::char_traits<char>::eof();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string read(std::size_t offset, std::size_t length) override
|
|
||||||
{
|
|
||||||
// avoid reading too many characters
|
|
||||||
const auto max_length = static_cast<size_t>(limit - start);
|
|
||||||
return std::string(start + offset, (std::min)(length, max_length - offset));
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// pointer to the current character
|
/// pointer to the current character
|
||||||
const char* cursor;
|
const char* cursor;
|
||||||
|
|
@ -1564,11 +1498,11 @@ class input_adapter
|
||||||
|
|
||||||
/// input adapter for input stream
|
/// input adapter for input stream
|
||||||
input_adapter(std::istream& i)
|
input_adapter(std::istream& i)
|
||||||
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
/// input adapter for input stream
|
/// input adapter for input stream
|
||||||
input_adapter(std::istream&& i)
|
input_adapter(std::istream&& i)
|
||||||
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
/// input adapter for buffer
|
/// input adapter for buffer
|
||||||
template<typename CharT,
|
template<typename CharT,
|
||||||
|
|
@ -2624,8 +2558,7 @@ scan_number_any2:
|
||||||
scan_number_done:
|
scan_number_done:
|
||||||
// unget the character after the number (we only read it to know that
|
// unget the character after the number (we only read it to know that
|
||||||
// we are done scanning a number)
|
// we are done scanning a number)
|
||||||
--chars_read;
|
unget();
|
||||||
next_unget = true;
|
|
||||||
|
|
||||||
// terminate token
|
// terminate token
|
||||||
add('\0');
|
add('\0');
|
||||||
|
|
@ -2702,19 +2635,31 @@ scan_number_done:
|
||||||
// input management
|
// input management
|
||||||
/////////////////////
|
/////////////////////
|
||||||
|
|
||||||
/// reset yytext
|
/// reset yytext; current character is beginning of token
|
||||||
void reset() noexcept
|
void reset() noexcept
|
||||||
{
|
{
|
||||||
yylen = 0;
|
yylen = 0;
|
||||||
start_pos = chars_read - 1;
|
start_pos = chars_read - 1;
|
||||||
|
token_string = static_cast<char>( current );
|
||||||
}
|
}
|
||||||
|
|
||||||
/// get a character from the input
|
/// get a character from the input
|
||||||
int get()
|
int get()
|
||||||
{
|
{
|
||||||
++chars_read;
|
++chars_read;
|
||||||
return next_unget ? (next_unget = false, current)
|
int c = next_unget ? (next_unget = false, current)
|
||||||
: (current = ia->get_character());
|
: (current = ia->get_character());
|
||||||
|
token_string += static_cast<char>( c );
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// unget current character (return it again on next get)
|
||||||
|
void unget()
|
||||||
|
{
|
||||||
|
--chars_read;
|
||||||
|
next_unget = true;
|
||||||
|
if (token_string.size() > 0)
|
||||||
|
token_string.resize( token_string.size() - 1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
/// add a character to yytext
|
/// add a character to yytext
|
||||||
|
|
@ -2774,19 +2719,14 @@ scan_number_done:
|
||||||
/// return the last read token (for errors only)
|
/// return the last read token (for errors only)
|
||||||
std::string get_token_string() const
|
std::string get_token_string() const
|
||||||
{
|
{
|
||||||
// get the raw byte sequence of the last token
|
|
||||||
std::string s = ia->read(start_pos, chars_read - start_pos);
|
|
||||||
|
|
||||||
// escape control characters
|
// escape control characters
|
||||||
std::string result;
|
std::string result;
|
||||||
for (auto c : s)
|
for (auto c : token_string)
|
||||||
{
|
{
|
||||||
if (c == '\0' or c == std::char_traits<char>::eof())
|
if ( c == std::char_traits<char>::eof() ) {
|
||||||
{
|
continue;
|
||||||
// ignore EOF
|
}
|
||||||
continue;
|
else if ('\x00' <= c and c <= '\x1f')
|
||||||
}
|
|
||||||
else if ('\x00' <= c and c <= '\x1f')
|
|
||||||
{
|
{
|
||||||
// escape control characters
|
// escape control characters
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
|
|
@ -2892,6 +2832,8 @@ scan_number_done:
|
||||||
std::size_t chars_read = 0;
|
std::size_t chars_read = 0;
|
||||||
/// the start position of the current token
|
/// the start position of the current token
|
||||||
std::size_t start_pos = 0;
|
std::size_t start_pos = 0;
|
||||||
|
/// raw input token string (for error messages)
|
||||||
|
std::string token_string = "";
|
||||||
|
|
||||||
/// buffer for variable-length tokens (numbers, strings)
|
/// buffer for variable-length tokens (numbers, strings)
|
||||||
std::vector<char> yytext = std::vector<char>(1024, '\0');
|
std::vector<char> yytext = std::vector<char>(1024, '\0');
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue