Merge pull request #764 from pjkundert/develop-simplify-istream

Simplified istream handing #367
2017-10-22 08:52:28 +02:00 · 2017-10-22 08:52:28 +02:00 · 3094640446
commit 3094640446
parent 33df3250c3 ef40673acb
7 changed files with 341 additions and 164 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,6 @@
 json_unit
 json_benchmarks
 json_benchmarks_simple
 fuzz-testing
 *.dSYM
--- a/benchmarks/Makefile
+++ b/benchmarks/Makefile
@ -1,11 +1,21 @@
 all: json_benchmarks
 	./json_benchmarks
-json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons
+#
 # Build/run json.hpp benchmarks, eg. CXX=g++-7 make
 # 
 # The existing json_benchmarks did not allow optimization under some compilers
 # 
 all: json_benchmarks json_benchmarks_simple number_jsons
 	bash -c 'time ./json_benchmarks'
 	bash -c 'time ./json_benchmarks_simple'
 json_benchmarks: src/benchmarks.cpp ../src/json.hpp
 	$(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@
 json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp
 	$(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@
 number_jsons:
 	(test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py)
 clean:
-	rm -f json_benchmarks files/numbers/*.json
+	rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json
--- a/benchmarks/src/benchmarks.cpp
+++ b/benchmarks/src/benchmarks.cpp
@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx,
 {
    // using string streams for benchmarking to factor-out cold-cache disk
    // access.
 #if defined( FROMFILE )
    std::ifstream istr;
    {
        istr.open( in_path, std::ifstream::in );
        // read the stream once
        json j;
        istr >> j;
        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
 #else
    std::stringstream istr;
    {
        // read file into string stream
@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx,
        // read the stream once
        json j;
-        j << istr;
+        istr >> j;
        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
 #endif
    switch (mode)
    {
@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx,
                istr.clear();
                istr.seekg(0);
                json j;
-                j << istr;
+                istr >> j;
            }
            break;
@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx,
        {
            // create JSON value from input
            json j;
-            j << istr;
+            istr >> j;
            std::stringstream ostr;
            ctx.reset_timer();
--- a/benchmarks/src/benchmarks_simple.cpp
+++ b/benchmarks/src/benchmarks_simple.cpp
@ -0,0 +1,158 @@
 // 
 // benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance
 // 
 //     For some reason, the complexity of benchmarks.cpp doesn't allow
 // the compiler to optimize code using json.hpp effectively.  The
 // exact same tests, with the use of benchpress and cxxopts produces
 // much faster code, at least under g++.
 // 
 #include <fstream>
 #include <iostream>
 #include <chrono>
 #include <list>
 #include <tuple>
 #include <json.hpp>
 using json = nlohmann::json;
 enum class EMode { input, output, indent };
 static double bench(const EMode mode, size_t iters, const std::string& in_path )
 {
    // using string streams for benchmarking to factor-out cold-cache disk
    // access.  Define FROMFILE to use file I/O instead.
 #if defined( FROMFILE )
    std::ifstream istr;
    {
        istr.open( in_path, std::ifstream::in );
        // read the stream once
        json j;
        istr >> j;
        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
 #else
    std::stringstream istr;
    {
        // read file into string stream
        std::ifstream input_file(in_path);
        istr << input_file.rdbuf();
        input_file.close();
        // read the stream once
        json j;
        istr >> j;
        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
 #endif
    double tps = 0;
    switch (mode)
    {
        // benchmarking input
        case EMode::input:
        {
 	    auto start = std::chrono::system_clock::now();
            for (size_t i = 0; i < iters; ++i)
            {
                // clear flags and rewind
                istr.clear();
                istr.seekg(0);
                json j;
                istr >> j;
            }
 	    auto ended = std::chrono::system_clock::now();
 	    tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
            break;
        }
        // benchmarking output
        case EMode::output:
        case EMode::indent:
        {
            // create JSON value from input
            json j;
            istr >> j;
            std::stringstream ostr;
 	    auto start = std::chrono::system_clock::now();
            for (size_t i = 0; i < iters; ++i)
            {
                if (mode == EMode::indent)
                {
                    ostr << j;
                }
                else
                {
                    ostr << std::setw(4) << j;
                }
                // reset data
                ostr.str(std::string());
            }
 	    auto ended = std::chrono::system_clock::now();
 	    tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
            break;
        }
    }
    return tps;
 }
 template <typename T>
 struct average {
    T _sum { 0 };
    size_t _count { 0 };
    T operator+=( const T &val_ ) { _sum += val_; +_count++; return val_; }
    operator T() { return _sum / _count; }
 };
 // Execute each test approximately enough times to get near 1
 // transaction per second, and compute the average; a single aggregate
 // number that gives a performance metric representing both parsing
 // and output.
 int main( int, char ** )
 {
    std::list<std::tuple<std::string, EMode, size_t, std::string>> tests {
 	{ "parse jeopardy.json",	EMode::input,   2, "files/jeopardy/jeopardy.json" },
 	{ "parse canada.json",		EMode::input,  30, "files/nativejson-benchmark/canada.json" },
 	{ "parse citm_catalog.json",	EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" },
 	{ "parse twitter.json",		EMode::input, 225, "files/nativejson-benchmark/twitter.json" },
 	{ "parse floats.json",		EMode::input,   5, "files/numbers/floats.json" },
 	{ "parse signed_ints.json",	EMode::input,   6, "files/numbers/signed_ints.json" },
 	{ "parse unsigned_ints.json",	EMode::input,   6, "files/numbers/unsigned_ints.json" },
 	{ "dump jeopardy.json",		EMode::output,  5, "files/jeopardy/jeopardy.json" },
 	{ "dump jeopardy.json w/ind.",	EMode::indent,  5, "files/jeopardy/jeopardy.json" },
 	{ "dump floats.json",		EMode::output,  2, "files/numbers/floats.json" },
 	{ "dump signed_ints.json",	EMode::output, 20, "files/numbers/signed_ints.json" },
    };
    average<double> avg;
    for ( auto t : tests ) {
 	std::string name, path;
 	EMode mode;
 	size_t iters;
 	std::tie(name, mode, iters, path) = t;
 	auto tps = bench( mode, iters, path );
 	avg += tps;
 	std::cout
 	    << std::left 
 	    << std::setw( 30 ) << name
 	    << std::right	    
 	    << " x " 	<< std::setw(  3 ) << iters
 	    << std::left
 	    << " == " 	<< std::setw( 10 ) << tps
 	    << std::right
 	    << " TPS, "	<< std::setw(  8 ) << std::round( tps * 1e6 / iters )
 	    << " ms/op"
 	    << std::endl;
    }
    std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl;
    std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl;
    return 0;
 }
--- a/src/json.hpp
+++ b/src/json.hpp
@ -1394,123 +1394,97 @@ constexpr T static_const<T>::value;
 // input adapters //
 ////////////////////
-/// abstract input adapter interface
+/*!
@brief abstract input adapter interface
 Produces a stream of std::char_traits<char>::int_type characters from a
 std::istream, a buffer, or some other input type.  Accepts the return of exactly
 one non-EOF character for future input.  The int_type characters returned
 consist of all valid char values as positive values (typically unsigned char),
 plus an EOF value outside that range, specified by the value of the function
 std::char_traits<char>::eof().  This value is typically -1, but could be any
 arbitrary value which is not a valid char value.
@return Typically [0,255] plus std::char_traits<char>::eof().
 */
 struct input_adapter_protocol
 {
-    virtual int get_character() = 0;
+    virtual std::char_traits<char>::int_type get_character() = 0;
-    virtual std::string read(std::size_t offset, std::size_t length) = 0;
+    virtual void unget_character() = 0; // restore the last non-eof() character to input
    virtual ~input_adapter_protocol() = default;
 };
 /// a type to simplify interfaces
 using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
-/// input adapter for cached stream input
+/// input adapter for a (caching) istream.  Ignores a UFT Byte Order Mark at
-template<std::size_t BufferSize>
+/// beginning of input.  Does not support changing the underlying std::streambuf
-class cached_input_stream_adapter : public input_adapter_protocol
+/// in mid-input.  Maintains underlying std::istream and std::streambuf to
 /// support subsequent use of standard std::istream operations to process any
 /// input characters following those used in parsing the JSON input.  Clears the
 /// std::istream flags; any input errors (eg. EOF) will be detected by the first
 /// subsequent call for input from the std::istream.
 class input_stream_adapter : public input_adapter_protocol
 {
  public:
-    explicit cached_input_stream_adapter(std::istream& i)
+    ~input_stream_adapter() override
        : is(i), start_position(is.tellg())
    {
-        fill_buffer();
+        // clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags
        // skip byte order mark
        if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF')
        {
            buffer_pos += 3;
            processed_chars += 3;
        }
    }
    ~cached_input_stream_adapter() override
    {
        // clear stream flags
        is.clear();
        // We initially read a lot of characters into the buffer, and we may
        // not have processed all of them. Therefore, we need to "rewind" the
        // stream after the last processed char.
        is.seekg(start_position);
        is.ignore(static_cast<std::streamsize>(processed_chars));
        // clear stream flags
        is.clear();
    }
-
+    explicit input_stream_adapter(std::istream& i)
-    int get_character() override
+        : is(i)
        , sb(*i.rdbuf())
    {
-        // check if refilling is necessary and possible
+        // Ignore Byte Order Mark at start of input
-        if (buffer_pos == fill_size and not eof)
+        std::char_traits<char>::int_type c;
        if (( c = get_character() ) == 0xEF )
        {
-            fill_buffer();
+            if (( c = get_character() ) == 0xBB )
            // check and remember that filling did not yield new input
            if (fill_size == 0)
            {
-                eof = true;
+                if (( c = get_character() ) == 0xBF )
-                return std::char_traits<char>::eof();
+                {
                    return; // Ignore BOM
                }
                else if ( c != std::char_traits<char>::eof() )
                {
                    is.unget();
                }
                is.putback( '\xBB' );
            }
-
+            else if ( c != std::char_traits<char>::eof() )
-            // the buffer is ready
+            {
-            buffer_pos = 0;
+                is.unget();
            }
            is.putback( '\xEF' );
        }
        else if ( c != std::char_traits<char>::eof() )
        {
            is.unget(); // Not BOM.  Process as usual.
        }
        ++processed_chars;
        assert(buffer_pos < buffer.size());
        return buffer[buffer_pos++] & 0xFF;
    }
-    std::string read(std::size_t offset, std::size_t length) override
+    // delete because of pointer members
    input_stream_adapter(const input_stream_adapter&) = delete;
    input_stream_adapter& operator=(input_stream_adapter&) = delete;
    // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
    // ensure that std::char_traits<char>::eof() and the character 0xff do not
    // end up as the same value, eg. 0xffffffff.
    std::char_traits<char>::int_type get_character() override
    {
-        // create buffer
+        return sb.sbumpc();
        std::string result(length, '\0');
        // save stream position
        const auto current_pos = is.tellg();
        // save stream flags
        const auto flags = is.rdstate();
        // clear stream flags
        is.clear();
        // set stream position
        is.seekg(static_cast<std::streamoff>(offset));
        // read bytes
        is.read(&result[0], static_cast<std::streamsize>(length));
        // reset stream position
        is.seekg(current_pos);
        // reset stream flags
        is.setstate(flags);
        return result;
    }
    void unget_character() override
    {
        sb.sungetc(); // Avoided for performance: is.unget();
    }
  private:
    void fill_buffer()
    {
        // fill
        is.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
        // store number of bytes in the buffer
        fill_size = static_cast<size_t>(is.gcount());
    }
    /// the associated input stream
    std::istream& is;
-
+    std::streambuf &sb;
    /// chars returned via get_character()
    std::size_t processed_chars = 0;
    /// chars processed in the current buffer
    std::size_t buffer_pos = 0;
    /// whether stream reached eof
    bool eof = false;
    /// how many chars have been copied to the buffer by last (re)fill
    std::size_t fill_size = 0;
    /// position of the stream when we started
    const std::streampos start_position;
    /// internal buffer
    std::array<char, BufferSize> buffer{{}};
 };
 /// input adapter for buffer input
@ -1531,21 +1505,22 @@ class input_buffer_adapter : public input_adapter_protocol
    input_buffer_adapter(const input_buffer_adapter&) = delete;
    input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
-    int get_character() noexcept override
+    std::char_traits<char>::int_type get_character() noexcept override
    {
        if (JSON_LIKELY(cursor < limit))
        {
-            return *(cursor++) & 0xFF;
+            return std::char_traits<char>::to_int_type(*(cursor++));
        }
        return std::char_traits<char>::eof();
    }
-    std::string read(std::size_t offset, std::size_t length) override
+    void unget_character() noexcept override
    {
-        // avoid reading too many characters
+        if (JSON_LIKELY(cursor > start))
-        const auto max_length = static_cast<std::size_t>(limit - start);
+        {
-        return std::string(start + offset, (std::min)(length, max_length - offset));
+            --cursor;
        }
    }
  private:
@ -1564,11 +1539,11 @@ class input_adapter
    /// input adapter for input stream
    input_adapter(std::istream& i)
-        : ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
+        : ia(std::make_shared<input_stream_adapter>(i)) {}
    /// input adapter for input stream
    input_adapter(std::istream&& i)
-        : ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
+        : ia(std::make_shared<input_stream_adapter>(i)) {}
    /// input adapter for buffer
    template<typename CharT,
@ -1845,9 +1820,9 @@ class lexer
    @brief scan a string literal
    This function scans a string according to Sect. 7 of RFC 7159. While
-    scanning, bytes are escaped and copied into buffer yytext. Then the
+    scanning, bytes are escaped and copied into buffer yytext. Then the function
-    function returns successfully, yytext is null-terminated and yylen
+    returns successfully, yytext is *not* null-terminated (as it may contain \0
-    contains the number of bytes in the string.
+    bytes), and yytext.size() is the number of bytes in the string.
    @return token_type::value_string if string could be successfully scanned,
            token_type::parse_error otherwise
@ -1878,9 +1853,6 @@ class lexer
                // closing quote
                case '\"':
                {
                    // terminate yytext
                    add('\0');
                    --yylen;
                    return token_type::value_string;
                }
@ -2624,12 +2596,7 @@ scan_number_any2:
 scan_number_done:
        // unget the character after the number (we only read it to know that
        // we are done scanning a number)
-        --chars_read;
+        unget();
        next_unget = true;
        // terminate token
        add('\0');
        --yylen;
        char* endptr = nullptr;
        errno = 0;
@ -2640,7 +2607,7 @@ scan_number_done:
            const auto x = std::strtoull(yytext.data(), &endptr, 10);
            // we checked the number format before
-            assert(endptr == yytext.data() + yylen);
+            assert(endptr == yytext.data() + yytext.size());
            if (errno == 0)
            {
@ -2656,7 +2623,7 @@ scan_number_done:
            const auto x = std::strtoll(yytext.data(), &endptr, 10);
            // we checked the number format before
-            assert(endptr == yytext.data() + yylen);
+            assert(endptr == yytext.data() + yytext.size());
            if (errno == 0)
            {
@ -2673,7 +2640,7 @@ scan_number_done:
        strtof(value_float, yytext.data(), &endptr);
        // we checked the number format before
-        assert(endptr == yytext.data() + yylen);
+        assert(endptr == yytext.data() + yytext.size());
        return token_type::value_float;
    }
@ -2702,32 +2669,51 @@ scan_number_done:
    // input management
    /////////////////////
-    /// reset yytext
+    /// reset yytext; current character is beginning of token
    void reset() noexcept
    {
-        yylen = 0;
+        yytext.clear();
-        start_pos = chars_read - 1;
+        token_string.clear();
        token_string.push_back(std::char_traits<char>::to_char_type(current));
    }
-    /// get a character from the input
+    /*
-    int get()
+    @brief get next character from the input
    This function provides the interface to the used input adapter. It does
    not throw in case the input reached EOF, but returns a
    `std::char_traits<char>::eof()` in that case.  Stores the scanned characters
    for use in error messages.
    @return character read from the input
    */
    std::char_traits<char>::int_type get()
    {
        ++chars_read;
-        return next_unget ? (next_unget = false, current)
+        current = ia->get_character();
-               : (current = ia->get_character());
+        if (JSON_LIKELY( current != std::char_traits<char>::eof()))
        {
            token_string.push_back(std::char_traits<char>::to_char_type(current));
        }
        return current;
    }
    /// unget current character (return it again on next get)
    void unget()
    {
        --chars_read;
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            ia->unget_character();
            assert(token_string.size() != 0);
            token_string.pop_back();
        }
    }
    /// add a character to yytext
    void add(int c)
    {
-        // resize yytext if necessary; this condition is deemed unlikely,
+        yytext.push_back(std::char_traits<char>::to_char_type(c));
        // because we start with a 1024-byte buffer
        if (JSON_UNLIKELY((yylen + 1 > yytext.capacity())))
        {
            yytext.resize(2 * yytext.capacity(), '\0');
        }
        assert(yylen < yytext.size());
        yytext[yylen++] = static_cast<char>(c);
    }
  public:
@ -2753,12 +2739,10 @@ scan_number_done:
        return value_float;
    }
-    /// return string value
+    /// return current string value (implicitly resets the token; useful only once)
-    const std::string get_string()
+    std::string move_string()
    {
-        // yytext cannot be returned as char*, because it may contain a null
+        return std::move( yytext );
        // byte (parsed as "\u0000")
        return std::string(yytext.data(), yylen);
    }
    /////////////////////
@ -2771,22 +2755,16 @@ scan_number_done:
        return chars_read;
    }
-    /// return the last read token (for errors only)
+    /// return the last read token (for errors only).  Will never contain EOF
    /// (an arbitrary value that is not a valid char value, often -1), because
    /// 255 may legitimately occur.  May contain NUL, which should be escaped.
    std::string get_token_string() const
    {
        // get the raw byte sequence of the last token
        std::string s = ia->read(start_pos, chars_read - start_pos);
        // escape control characters
        std::string result;
-        for (auto c : s)
+        for (auto c : token_string)
        {
-            if (c == '\0' or c == std::char_traits<char>::eof())
+            if ('\x00' <= c and c <= '\x1f')
            {
                // ignore EOF
                continue;
            }
            else if ('\x00' <= c and c <= '\x1f')
            {
                // escape control characters
                std::stringstream ss;
@ -2883,20 +2861,16 @@ scan_number_done:
    detail::input_adapter_t ia = nullptr;
    /// the current character
-    int current = std::char_traits<char>::eof();
+    std::char_traits<char>::int_type current = std::char_traits<char>::eof();
    /// whether get() should return the last character again
    bool next_unget = false;
    /// the number of characters read
    std::size_t chars_read = 0;
-    /// the start position of the current token
+
-    std::size_t start_pos = 0;
+    /// raw input token string (for error messages)
    std::vector<char> token_string { };
    /// buffer for variable-length tokens (numbers, strings)
-    std::vector<char> yytext = std::vector<char>(1024, '\0');
+    std::string yytext { };
    /// current index in yytext
    std::size_t yylen = 0;
    /// a description of occurred lexer errors
    const char* error_message = "";
@ -3073,7 +3047,7 @@ class parser
                    {
                        return;
                    }
-                    key = m_lexer.get_string();
+                    key = m_lexer.move_string();
                    bool keep_tag = false;
                    if (keep)
@ -3219,7 +3193,7 @@ class parser
            case token_type::value_string:
            {
                result.m_type = value_t::string;
-                result.m_value = m_lexer.get_string();
+                result.m_value = m_lexer.move_string();
                break;
            }
@ -5221,7 +5195,7 @@ class binary_reader
    @brief get next character from the input
    This function provides the interface to the used input adapter. It does
-    not throw in case the input reached EOF, but returns
+    not throw in case the input reached EOF, but returns a -'ve valued
    `std::char_traits<char>::eof()` in that case.
    @return character read from the input
--- a/test/src/unit-class_parser.cpp
+++ b/test/src/unit-class_parser.cpp
@ -215,7 +215,7 @@ TEST_CASE("parser class")
                    std::string s = "\"1\"";
                    s[1] = '\0';
                    CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&);
-                    CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'");
+                    CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"<U+0000>'");
                }
            }
--- a/test/src/unit-regression.cpp
+++ b/test/src/unit-regression.cpp
@ -1233,4 +1233,24 @@ TEST_CASE("regression tests")
                              "[json.exception.type_error.302] type must be array, but is null");
        }
    }
    SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.")
    {
 	SECTION("example 1")
 	{
 	    std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" );
 	    json j1, j2, j3;
 	    i1_2_3 >> j1;
 	    i1_2_3 >> j2;
 	    i1_2_3 >> j3;
 	    std::map<std::string,std::string> m1 = j1;
 	    std::map<std::string,std::string> m2 = j2;
 	    int i3 = j3;
 	    CHECK( m1 == ( std::map<std::string,std::string> {{ "first",  "one" }} ));
 	    CHECK( m2 == ( std::map<std::string,std::string> {{ "second", "two" }} ));
 	    CHECK( i3 == 3 );
 	}
    }
 }