From 90adf6ec20c581055d748acc1b99b61af58365a7 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 2 Oct 2017 08:50:15 -0700 Subject: [PATCH] Simplify get_token_string, unnecessary buffering, handle Byte Order Mark --- src/json.hpp | 176 +++++++++++++++++---------------------------------- 1 file changed, 59 insertions(+), 117 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 24773823..942acb87 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1398,119 +1398,60 @@ constexpr T static_const::value; struct input_adapter_protocol { virtual int get_character() = 0; - virtual std::string read(std::size_t offset, std::size_t length) = 0; virtual ~input_adapter_protocol() = default; }; /// a type to simplify interfaces using input_adapter_t = std::shared_ptr; -/// input adapter for cached stream input -template -class cached_input_stream_adapter : public input_adapter_protocol + +/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input. +class input_stream_adapter : public input_adapter_protocol { public: - explicit cached_input_stream_adapter(std::istream& i) - : is(i), start_position(is.tellg()) + explicit input_stream_adapter(std::istream& i) + : is(i) { - fill_buffer(); - - // skip byte order mark - if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF') - { - buffer_pos += 3; - processed_chars += 3; - } + // Ignore Byte Order Mark at start of input + int c; + if (( c = get_character() ) == 0xEF ) + { + if (( c = get_character() ) == 0xBB ) + { + if (( c = get_character() ) == 0xBF ) + { + return; // Ignore BOM + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xBB' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xEF' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); // Not BOM. Process as usual. + } } - ~cached_input_stream_adapter() override - { - // clear stream flags - is.clear(); - // We initially read a lot of characters into the buffer, and we may - // not have processed all of them. Therefore, we need to "rewind" the - // stream after the last processed char. - is.seekg(start_position); - is.ignore(static_cast(processed_chars)); - // clear stream flags - is.clear(); - } + ~input_stream_adapter() override {} int get_character() override { - // check if refilling is necessary and possible - if (buffer_pos == fill_size and not eof) - { - fill_buffer(); - - // check and remember that filling did not yield new input - if (fill_size == 0) - { - eof = true; - return std::char_traits::eof(); - } - - // the buffer is ready - buffer_pos = 0; - } - - ++processed_chars; - assert(buffer_pos < buffer.size()); - return buffer[buffer_pos++] & 0xFF; - } - - std::string read(std::size_t offset, std::size_t length) override - { - // create buffer - std::string result(length, '\0'); - - // save stream position - const auto current_pos = is.tellg(); - // save stream flags - const auto flags = is.rdstate(); - - // clear stream flags - is.clear(); - // set stream position - is.seekg(static_cast(offset)); - // read bytes - is.read(&result[0], static_cast(length)); - - // reset stream position - is.seekg(current_pos); - // reset stream flags - is.setstate(flags); - - return result; + int c = is.get(); + return c == std::char_traits::eof() ? c : ( c & 0xFF ); } private: - void fill_buffer() - { - // fill - is.read(buffer.data(), static_cast(buffer.size())); - // store number of bytes in the buffer - fill_size = static_cast(is.gcount()); - } /// the associated input stream std::istream& is; - - /// chars returned via get_character() - std::size_t processed_chars = 0; - /// chars processed in the current buffer - std::size_t buffer_pos = 0; - - /// whether stream reached eof - bool eof = false; - /// how many chars have been copied to the buffer by last (re)fill - std::size_t fill_size = 0; - - /// position of the stream when we started - const std::streampos start_position; - - /// internal buffer - std::array buffer{{}}; }; /// input adapter for buffer input @@ -1541,13 +1482,6 @@ class input_buffer_adapter : public input_adapter_protocol return std::char_traits::eof(); } - std::string read(std::size_t offset, std::size_t length) override - { - // avoid reading too many characters - const auto max_length = static_cast(limit - start); - return std::string(start + offset, (std::min)(length, max_length - offset)); - } - private: /// pointer to the current character const char* cursor; @@ -1564,11 +1498,11 @@ class input_adapter /// input adapter for input stream input_adapter(std::istream& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for input stream input_adapter(std::istream&& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for buffer template( current ); } /// get a character from the input int get() { ++chars_read; - return next_unget ? (next_unget = false, current) - : (current = ia->get_character()); + int c = next_unget ? (next_unget = false, current) + : (current = ia->get_character()); + token_string += static_cast( c ); + return c; } + /// unget current character (return it again on next get) + void unget() + { + --chars_read; + next_unget = true; + if (token_string.size() > 0) + token_string.resize( token_string.size() - 1 ); + } + /// add a character to yytext void add(int c) { @@ -2774,19 +2719,14 @@ scan_number_done: /// return the last read token (for errors only) std::string get_token_string() const { - // get the raw byte sequence of the last token - std::string s = ia->read(start_pos, chars_read - start_pos); - // escape control characters std::string result; - for (auto c : s) + for (auto c : token_string) { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if ( c == std::char_traits::eof() ) { + continue; + } + else if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -2892,6 +2832,8 @@ scan_number_done: std::size_t chars_read = 0; /// the start position of the current token std::size_t start_pos = 0; + /// raw input token string (for error messages) + std::string token_string = ""; /// buffer for variable-length tokens (numbers, strings) std::vector yytext = std::vector(1024, '\0');