From 90adf6ec20c581055d748acc1b99b61af58365a7 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 2 Oct 2017 08:50:15 -0700 Subject: [PATCH 01/17] Simplify get_token_string, unnecessary buffering, handle Byte Order Mark --- src/json.hpp | 176 +++++++++++++++++---------------------------------- 1 file changed, 59 insertions(+), 117 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 24773823..942acb87 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1398,119 +1398,60 @@ constexpr T static_const::value; struct input_adapter_protocol { virtual int get_character() = 0; - virtual std::string read(std::size_t offset, std::size_t length) = 0; virtual ~input_adapter_protocol() = default; }; /// a type to simplify interfaces using input_adapter_t = std::shared_ptr; -/// input adapter for cached stream input -template -class cached_input_stream_adapter : public input_adapter_protocol + +/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input. +class input_stream_adapter : public input_adapter_protocol { public: - explicit cached_input_stream_adapter(std::istream& i) - : is(i), start_position(is.tellg()) + explicit input_stream_adapter(std::istream& i) + : is(i) { - fill_buffer(); - - // skip byte order mark - if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF') - { - buffer_pos += 3; - processed_chars += 3; - } + // Ignore Byte Order Mark at start of input + int c; + if (( c = get_character() ) == 0xEF ) + { + if (( c = get_character() ) == 0xBB ) + { + if (( c = get_character() ) == 0xBF ) + { + return; // Ignore BOM + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xBB' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xEF' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); // Not BOM. Process as usual. + } } - ~cached_input_stream_adapter() override - { - // clear stream flags - is.clear(); - // We initially read a lot of characters into the buffer, and we may - // not have processed all of them. Therefore, we need to "rewind" the - // stream after the last processed char. - is.seekg(start_position); - is.ignore(static_cast(processed_chars)); - // clear stream flags - is.clear(); - } + ~input_stream_adapter() override {} int get_character() override { - // check if refilling is necessary and possible - if (buffer_pos == fill_size and not eof) - { - fill_buffer(); - - // check and remember that filling did not yield new input - if (fill_size == 0) - { - eof = true; - return std::char_traits::eof(); - } - - // the buffer is ready - buffer_pos = 0; - } - - ++processed_chars; - assert(buffer_pos < buffer.size()); - return buffer[buffer_pos++] & 0xFF; - } - - std::string read(std::size_t offset, std::size_t length) override - { - // create buffer - std::string result(length, '\0'); - - // save stream position - const auto current_pos = is.tellg(); - // save stream flags - const auto flags = is.rdstate(); - - // clear stream flags - is.clear(); - // set stream position - is.seekg(static_cast(offset)); - // read bytes - is.read(&result[0], static_cast(length)); - - // reset stream position - is.seekg(current_pos); - // reset stream flags - is.setstate(flags); - - return result; + int c = is.get(); + return c == std::char_traits::eof() ? c : ( c & 0xFF ); } private: - void fill_buffer() - { - // fill - is.read(buffer.data(), static_cast(buffer.size())); - // store number of bytes in the buffer - fill_size = static_cast(is.gcount()); - } /// the associated input stream std::istream& is; - - /// chars returned via get_character() - std::size_t processed_chars = 0; - /// chars processed in the current buffer - std::size_t buffer_pos = 0; - - /// whether stream reached eof - bool eof = false; - /// how many chars have been copied to the buffer by last (re)fill - std::size_t fill_size = 0; - - /// position of the stream when we started - const std::streampos start_position; - - /// internal buffer - std::array buffer{{}}; }; /// input adapter for buffer input @@ -1541,13 +1482,6 @@ class input_buffer_adapter : public input_adapter_protocol return std::char_traits::eof(); } - std::string read(std::size_t offset, std::size_t length) override - { - // avoid reading too many characters - const auto max_length = static_cast(limit - start); - return std::string(start + offset, (std::min)(length, max_length - offset)); - } - private: /// pointer to the current character const char* cursor; @@ -1564,11 +1498,11 @@ class input_adapter /// input adapter for input stream input_adapter(std::istream& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for input stream input_adapter(std::istream&& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for buffer template( current ); } /// get a character from the input int get() { ++chars_read; - return next_unget ? (next_unget = false, current) - : (current = ia->get_character()); + int c = next_unget ? (next_unget = false, current) + : (current = ia->get_character()); + token_string += static_cast( c ); + return c; } + /// unget current character (return it again on next get) + void unget() + { + --chars_read; + next_unget = true; + if (token_string.size() > 0) + token_string.resize( token_string.size() - 1 ); + } + /// add a character to yytext void add(int c) { @@ -2774,19 +2719,14 @@ scan_number_done: /// return the last read token (for errors only) std::string get_token_string() const { - // get the raw byte sequence of the last token - std::string s = ia->read(start_pos, chars_read - start_pos); - // escape control characters std::string result; - for (auto c : s) + for (auto c : token_string) { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if ( c == std::char_traits::eof() ) { + continue; + } + else if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -2892,6 +2832,8 @@ scan_number_done: std::size_t chars_read = 0; /// the start position of the current token std::size_t start_pos = 0; + /// raw input token string (for error messages) + std::string token_string = ""; /// buffer for variable-length tokens (numbers, strings) std::vector yytext = std::vector(1024, '\0'); From f585fe4eece44413982900f3b1b724cb7eabe541 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 2 Oct 2017 14:17:23 -0700 Subject: [PATCH 02/17] Test to confirm parsing of multiple JSON records in a istream #367 --- test/src/unit-regression.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 0c0d148d..2d30a52c 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -1233,4 +1233,24 @@ TEST_CASE("regression tests") "[json.exception.type_error.302] type must be array, but is null"); } } + + SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.") + { + SECTION("example 1") + { + std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" ); + json j1, j2, j3; + i1_2_3 >> j1; + i1_2_3 >> j2; + i1_2_3 >> j3; + + std::map m1 = j1; + std::map m2 = j2; + int i3 = j3; + + CHECK( m1 == ( std::map {{ "first", "one" }} )); + CHECK( m2 == ( std::map {{ "second", "two" }} )); + CHECK( i3 == 3 ); + } + } } From 12efeadc2eead552c5155352b84cd4dd899e169c Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Tue, 3 Oct 2017 08:49:39 -0700 Subject: [PATCH 03/17] Further simplify istream handling; use native unget --- src/json.hpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 942acb87..160fa4eb 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1398,6 +1398,7 @@ constexpr T static_const::value; struct input_adapter_protocol { virtual int get_character() = 0; + virtual void unget_character() = 0; virtual ~input_adapter_protocol() = default; }; @@ -1448,6 +1449,10 @@ class input_stream_adapter : public input_adapter_protocol return c == std::char_traits::eof() ? c : ( c & 0xFF ); } + void unget_character() override + { + is.unget(); + } private: /// the associated input stream @@ -1482,6 +1487,14 @@ class input_buffer_adapter : public input_adapter_protocol return std::char_traits::eof(); } + void unget_character() noexcept override + { + if (JSON_LIKELY(cursor > 0)) + { + --cursor; + } + } + private: /// pointer to the current character const char* cursor; @@ -2647,8 +2660,8 @@ scan_number_done: int get() { ++chars_read; - int c = next_unget ? (next_unget = false, current) - : (current = ia->get_character()); + + int c = current = ia->get_character(); token_string += static_cast( c ); return c; } @@ -2657,7 +2670,7 @@ scan_number_done: void unget() { --chars_read; - next_unget = true; + if (token_string.size() > 0) token_string.resize( token_string.size() - 1 ); } @@ -2825,9 +2838,6 @@ scan_number_done: /// the current character int current = std::char_traits::eof(); - /// whether get() should return the last character again - bool next_unget = false; - /// the number of characters read std::size_t chars_read = 0; /// the start position of the current token From 14ca1f6f09e8da48e1191a527bba20ec86709c83 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Tue, 3 Oct 2017 14:38:38 -0700 Subject: [PATCH 04/17] Restore istream performance #764 o Use std::streambuf I/O instead of std::istream; does not maintain (unused) istream flags. o Further simplify get/unget handling. o Restore original handling of NUL in input stream; ignored during token_string escaping. --- src/json.hpp | 111 +++++++++++++++++++++++++++------------------------ 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 160fa4eb..516f2bbb 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1410,48 +1410,51 @@ using input_adapter_t = std::shared_ptr; class input_stream_adapter : public input_adapter_protocol { public: + ~input_stream_adapter() override + { + // clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags + is.clear(); + } explicit input_stream_adapter(std::istream& i) : is(i) { - // Ignore Byte Order Mark at start of input - int c; - if (( c = get_character() ) == 0xEF ) - { - if (( c = get_character() ) == 0xBB ) - { - if (( c = get_character() ) == 0xBF ) - { - return; // Ignore BOM - } - else if ( c != std::char_traits::eof() ) - { - is.unget(); - } - is.putback( '\xBB' ); - } - else if ( c != std::char_traits::eof() ) - { - is.unget(); - } - is.putback( '\xEF' ); - } - else if ( c != std::char_traits::eof() ) - { - is.unget(); // Not BOM. Process as usual. - } + // Ignore Byte Order Mark at start of input + int c; + if (( c = get_character() ) == 0xEF ) + { + if (( c = get_character() ) == 0xBB ) + { + if (( c = get_character() ) == 0xBF ) + { + return; // Ignore BOM + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xBB' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xEF' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); // Not BOM. Process as usual. + } } - ~input_stream_adapter() override {} - int get_character() override { - int c = is.get(); - return c == std::char_traits::eof() ? c : ( c & 0xFF ); + int c = is.rdbuf()->sbumpc(); // Avoided for performance: int c = is.get(); + return c == std::char_traits::eof() ? c : ( c & 0xFF ); } void unget_character() override { - is.unget(); + is.rdbuf()->sungetc(); // Avoided for performance: is.unget(); } private: @@ -1489,10 +1492,10 @@ class input_buffer_adapter : public input_adapter_protocol void unget_character() noexcept override { - if (JSON_LIKELY(cursor > 0)) - { - --cursor; - } + if (JSON_LIKELY(cursor > start)) + { + --cursor; + } } private: @@ -2571,7 +2574,7 @@ scan_number_any2: scan_number_done: // unget the character after the number (we only read it to know that // we are done scanning a number) - unget(); + unget(); // terminate token add('\0'); @@ -2652,29 +2655,31 @@ scan_number_done: void reset() noexcept { yylen = 0; - start_pos = chars_read - 1; - token_string = static_cast( current ); + token_string.clear(); + token_string.push_back(static_cast(current)); } /// get a character from the input int get() { ++chars_read; - - int c = current = ia->get_character(); - token_string += static_cast( c ); - return c; + int c = current = ia->get_character(); + token_string.push_back(static_cast(c)); + return c; } /// unget current character (return it again on next get) void unget() { --chars_read; - - if (token_string.size() > 0) - token_string.resize( token_string.size() - 1 ); + if (JSON_LIKELY(current != std::char_traits::eof())) + { + ia->unget_character(); + } + if (! token_string.empty()) + token_string.pop_back(); } - + /// add a character to yytext void add(int c) { @@ -2736,10 +2741,12 @@ scan_number_done: std::string result; for (auto c : token_string) { - if ( c == std::char_traits::eof() ) { - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if (c == '\0' or c == std::char_traits::eof()) + { + // ignore EOF + continue; + } + else if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -2840,10 +2847,8 @@ scan_number_done: /// the number of characters read std::size_t chars_read = 0; - /// the start position of the current token - std::size_t start_pos = 0; /// raw input token string (for error messages) - std::string token_string = ""; + std::vector token_string = std::vector(); /// buffer for variable-length tokens (numbers, strings) std::vector yytext = std::vector(1024, '\0'); From 7c523338c512eee9d0c039bb90edbe64bd3d139e Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 08:33:35 -0700 Subject: [PATCH 05/17] Remove unnnecessary NUL termination of yytext (as it may contain NULs) --- src/json.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 516f2bbb..1231354c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1828,9 +1828,6 @@ class lexer // closing quote case '\"': { - // terminate yytext - add('\0'); - --yylen; return token_type::value_string; } @@ -2576,10 +2573,6 @@ scan_number_done: // we are done scanning a number) unget(); - // terminate token - add('\0'); - --yylen; - char* endptr = nullptr; errno = 0; From 97a388802d9e4175f53a4322e0fa3180b43a1acf Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 08:40:32 -0700 Subject: [PATCH 06/17] Improve performance by constructing yytext as a std::string o Return its contents when necessary. In many cases, this avoids construction of multiple copies of the yytext token. Exceeds performance of current develop branch. --- src/json.hpp | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 1231354c..48c5aa86 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1795,9 +1795,9 @@ class lexer @brief scan a string literal This function scans a string according to Sect. 7 of RFC 7159. While - scanning, bytes are escaped and copied into buffer yytext. Then the - function returns successfully, yytext is null-terminated and yylen - contains the number of bytes in the string. + scanning, bytes are escaped and copied into buffer yytext. Then the function + returns successfully, yytext is *not* null-terminated (as it may contain \0 + bytes), and yytext.size() is the number of bytes in the string. @return token_type::value_string if string could be successfully scanned, token_type::parse_error otherwise @@ -2582,7 +2582,7 @@ scan_number_done: const auto x = std::strtoull(yytext.data(), &endptr, 10); // we checked the number format before - assert(endptr == yytext.data() + yylen); + assert(endptr == yytext.data() + yytext.size()); if (errno == 0) { @@ -2598,7 +2598,7 @@ scan_number_done: const auto x = std::strtoll(yytext.data(), &endptr, 10); // we checked the number format before - assert(endptr == yytext.data() + yylen); + assert(endptr == yytext.data() + yytext.size()); if (errno == 0) { @@ -2615,7 +2615,7 @@ scan_number_done: strtof(value_float, yytext.data(), &endptr); // we checked the number format before - assert(endptr == yytext.data() + yylen); + assert(endptr == yytext.data() + yytext.size()); return token_type::value_float; } @@ -2647,7 +2647,7 @@ scan_number_done: /// reset yytext; current character is beginning of token void reset() noexcept { - yylen = 0; + yytext.clear(); token_string.clear(); token_string.push_back(static_cast(current)); } @@ -2676,14 +2676,7 @@ scan_number_done: /// add a character to yytext void add(int c) { - // resize yytext if necessary; this condition is deemed unlikely, - // because we start with a 1024-byte buffer - if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) - { - yytext.resize(2 * yytext.capacity(), '\0'); - } - assert(yylen < yytext.size()); - yytext[yylen++] = static_cast(c); + yytext.push_back(static_cast(c)); } public: @@ -2712,9 +2705,7 @@ scan_number_done: /// return string value const std::string get_string() { - // yytext cannot be returned as char*, because it may contain a null - // byte (parsed as "\u0000") - return std::string(yytext.data(), yylen); + return std::move( yytext ); } ///////////////////// @@ -2844,9 +2835,7 @@ scan_number_done: std::vector token_string = std::vector(); /// buffer for variable-length tokens (numbers, strings) - std::vector yytext = std::vector(1024, '\0'); - /// current index in yytext - std::size_t yylen = 0; + std::string yytext = ""; /// a description of occurred lexer errors const char* error_message = ""; From e0d890cc23045dfb7e2e1af160cc35437dfeff82 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 10:21:55 -0700 Subject: [PATCH 07/17] Corrected unnnecessary const restriction on returned std::string --- src/json.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 48c5aa86..4f996457 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2656,9 +2656,9 @@ scan_number_done: int get() { ++chars_read; - int c = current = ia->get_character(); - token_string.push_back(static_cast(c)); - return c; + current = ia->get_character(); + token_string.push_back(static_cast(current)); + return current; } /// unget current character (return it again on next get) @@ -2703,7 +2703,7 @@ scan_number_done: } /// return string value - const std::string get_string() + std::string get_string() { return std::move( yytext ); } From 8665e25942a74a3503a76bd59952dd42fbe7ba61 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 10:47:52 -0700 Subject: [PATCH 08/17] Rename get_string to move_string to imply side-effect --- src/json.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 4f996457..2d95f18d 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2702,8 +2702,8 @@ scan_number_done: return value_float; } - /// return string value - std::string get_string() + /// return current string value (implicitly resets the token; useful only once) + std::string move_string() { return std::move( yytext ); } @@ -3004,7 +3004,7 @@ class parser { return; } - key = m_lexer.get_string(); + key = m_lexer.move_string(); bool keep_tag = false; if (keep) @@ -3142,7 +3142,7 @@ class parser case token_type::value_string: { result.m_type = value_t::string; - result.m_value = m_lexer.get_string(); + result.m_value = m_lexer.move_string(); break; } From 546e148b24c8b58a6e3bed8aadb8f081f7059f98 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 11:31:10 -0700 Subject: [PATCH 09/17] Further performance improvements, and corrections in get_token_string o An (-'ve valued, typically -1) EOF must never be allowed in token_string, as it be converted to 255 -- a legitimate value. o Comparing against a specific eof() (-1, typically) is more costly than detecting +'ve/-'ve. Since EOF is the only non-positive value allowed we can use the simpler test. o Removed unnecessary test for token_string size, as it is already tested in the method, and must never occur in correct code; used an assert instead. --- src/json.hpp | 45 ++++++++++++++++++++-------------- test/src/unit-class_parser.cpp | 2 +- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 2d95f18d..ed5f1c1d 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1397,8 +1397,8 @@ constexpr T static_const::value; /// abstract input adapter interface struct input_adapter_protocol { - virtual int get_character() = 0; - virtual void unget_character() = 0; + virtual int get_character() = 0; // returns characters in range [0,255], or eof() (a -'ve value) + virtual void unget_character() = 0; // restore the last non-eof() character to input virtual ~input_adapter_protocol() = default; }; @@ -1449,7 +1449,7 @@ class input_stream_adapter : public input_adapter_protocol int get_character() override { int c = is.rdbuf()->sbumpc(); // Avoided for performance: int c = is.get(); - return c == std::char_traits::eof() ? c : ( c & 0xFF ); + return c < 0 ? c : ( c & 0xFF ); // faster than == std::char_traits::eof() } void unget_character() override @@ -2652,12 +2652,24 @@ scan_number_done: token_string.push_back(static_cast(current)); } - /// get a character from the input + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a -'ve valued + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ int get() { ++chars_read; current = ia->get_character(); - token_string.push_back(static_cast(current)); + if (JSON_LIKELY(current >= 0)) // faster than: != std::char_traits::eof())) + { + token_string.push_back(static_cast(current)); + } return current; } @@ -2665,12 +2677,12 @@ scan_number_done: void unget() { --chars_read; - if (JSON_LIKELY(current != std::char_traits::eof())) + if (JSON_LIKELY(current >= 0)) // faster than: != std::char_traits::eof())) { ia->unget_character(); - } - if (! token_string.empty()) + assert(token_string.size() != 0); token_string.pop_back(); + } } /// add a character to yytext @@ -2718,19 +2730,16 @@ scan_number_done: return chars_read; } - /// return the last read token (for errors only) + /// return the last read token (for errors only). Will never contain EOF + /// (a -'ve value), because 255 may legitimately occur. May contain NUL, which + /// should be escaped. std::string get_token_string() const { // escape control characters std::string result; for (auto c : token_string) { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -5144,7 +5153,7 @@ class binary_reader @brief get next character from the input This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns + not throw in case the input reached EOF, but returns a -'ve valued `std::char_traits::eof()` in that case. @return character read from the input @@ -5448,14 +5457,14 @@ class binary_reader { if (expect_eof) { - if (JSON_UNLIKELY(current != std::char_traits::eof())) + if (JSON_UNLIKELY(current >= 0 )) // faster than: != std::char_traits::eof())) { JSON_THROW(parse_error::create(110, chars_read, "expected end of input")); } } else { - if (JSON_UNLIKELY(current == std::char_traits::eof())) + if (JSON_UNLIKELY(current < 0)) // faster than: == std::char_traits::eof())) { JSON_THROW(parse_error::create(110, chars_read, "unexpected end of input")); } diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index 08d4d6ef..9afa7d26 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -215,7 +215,7 @@ TEST_CASE("parser class") std::string s = "\"1\""; s[1] = '\0'; CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&); - CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); + CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); } } From f775922ca8d94c2cdf602ab9ffba546f60b79a49 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 4 Oct 2017 15:01:10 -0700 Subject: [PATCH 10/17] Specify initializers for yytest, token_string using initializer-lists o We can retain -Weffc++ and specify default initializers by using initializer lists. The risks are low (of additional non-conformat compilers), because there is already one other such initialization used in the code-base. --- src/json.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index ed5f1c1d..bf07d1c8 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2833,29 +2833,30 @@ scan_number_done: private: /// input adapter - detail::input_adapter_t ia = nullptr; + detail::input_adapter_t ia { nullptr }; /// the current character - int current = std::char_traits::eof(); + int current { std::char_traits::eof() }; /// the number of characters read - std::size_t chars_read = 0; + std::size_t chars_read { 0 }; + /// raw input token string (for error messages) - std::vector token_string = std::vector(); + std::vector token_string { }; /// buffer for variable-length tokens (numbers, strings) - std::string yytext = ""; + std::string yytext { }; /// a description of occurred lexer errors - const char* error_message = ""; + const char* error_message { "" }; // number values - number_integer_t value_integer = 0; - number_unsigned_t value_unsigned = 0; - number_float_t value_float = 0; + number_integer_t value_integer { 0 }; + number_unsigned_t value_unsigned { 0 }; + number_float_t value_float { 0 }; /// the decimal point - const char decimal_point_char = '.'; + const char decimal_point_char { '.' }; }; /*! From 184dab60e6440092674d92e855cdcd2d51e175e0 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Thu, 5 Oct 2017 14:13:55 -0700 Subject: [PATCH 11/17] Accelerate access to underlying std::istream streambuf --- src/json.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index bf07d1c8..8f28051a 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1417,6 +1417,7 @@ class input_stream_adapter : public input_adapter_protocol } explicit input_stream_adapter(std::istream& i) : is(i) + , sb(i.rdbuf()) { // Ignore Byte Order Mark at start of input int c; @@ -1448,18 +1449,19 @@ class input_stream_adapter : public input_adapter_protocol int get_character() override { - int c = is.rdbuf()->sbumpc(); // Avoided for performance: int c = is.get(); + int c = sb->sbumpc(); // Avoided for performance: int c = is.get(); return c < 0 ? c : ( c & 0xFF ); // faster than == std::char_traits::eof() } void unget_character() override { - is.rdbuf()->sungetc(); // Avoided for performance: is.unget(); + sb->sungetc(); // Avoided for performance: is.unget(); } private: /// the associated input stream std::istream& is; + std::streambuf *sb; }; /// input adapter for buffer input From 1b43a45bec8057fba315c3eccd91d567cd32a273 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Thu, 5 Oct 2017 15:37:03 -0700 Subject: [PATCH 12/17] Implement correct handling of std::streambuf int_type, eof() o Make no assumptions about eof(), other than that it is somewhere outside of the valid range of char_type. --- src/json.hpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 8f28051a..5fc467da 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1397,7 +1397,7 @@ constexpr T static_const::value; /// abstract input adapter interface struct input_adapter_protocol { - virtual int get_character() = 0; // returns characters in range [0,255], or eof() (a -'ve value) + virtual int get_character() = 0; // returns characters in range [0,255], or eof() virtual void unget_character() = 0; // restore the last non-eof() character to input virtual ~input_adapter_protocol() = default; }; @@ -1447,10 +1447,13 @@ class input_stream_adapter : public input_adapter_protocol } } + // delete because of pointer members + input_stream_adapter(const input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&) = delete; + int get_character() override { - int c = sb->sbumpc(); // Avoided for performance: int c = is.get(); - return c < 0 ? c : ( c & 0xFF ); // faster than == std::char_traits::eof() + return reinterpret_cast( sb->sbumpc() ); } void unget_character() override @@ -1486,10 +1489,10 @@ class input_buffer_adapter : public input_adapter_protocol { if (JSON_LIKELY(cursor < limit)) { - return *(cursor++) & 0xFF; + return reinterpret_cast(std::char_traits::to_int_type(*(cursor++))); } - return std::char_traits::eof(); + return reinterpret_cast(std::char_traits::eof()); } void unget_character() noexcept override @@ -2668,9 +2671,9 @@ scan_number_done: { ++chars_read; current = ia->get_character(); - if (JSON_LIKELY(current >= 0)) // faster than: != std::char_traits::eof())) + if (JSON_LIKELY( current != std::char_traits::eof())) { - token_string.push_back(static_cast(current)); + token_string.push_back(std::char_traits::to_char_type(current)); } return current; } @@ -2679,7 +2682,7 @@ scan_number_done: void unget() { --chars_read; - if (JSON_LIKELY(current >= 0)) // faster than: != std::char_traits::eof())) + if (JSON_LIKELY(current != std::char_traits::eof())) { ia->unget_character(); assert(token_string.size() != 0); @@ -2690,7 +2693,7 @@ scan_number_done: /// add a character to yytext void add(int c) { - yytext.push_back(static_cast(c)); + yytext.push_back(std::char_traits::to_char_type(c)); } public: @@ -5460,14 +5463,14 @@ class binary_reader { if (expect_eof) { - if (JSON_UNLIKELY(current >= 0 )) // faster than: != std::char_traits::eof())) + if (JSON_UNLIKELY(current != std::char_traits::eof())) { JSON_THROW(parse_error::create(110, chars_read, "expected end of input")); } } else { - if (JSON_UNLIKELY(current < 0)) // faster than: == std::char_traits::eof())) + if (JSON_UNLIKELY(current == std::char_traits::eof())) { JSON_THROW(parse_error::create(110, chars_read, "unexpected end of input")); } From 5e480b56d843f3dd6869c2f7bbaeec1788076ac5 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Fri, 6 Oct 2017 07:37:49 -0700 Subject: [PATCH 13/17] Further simplify character type handling --- src/json.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 5fc467da..e3231f7d 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1451,9 +1451,12 @@ class input_stream_adapter : public input_adapter_protocol input_stream_adapter(const input_stream_adapter&) = delete; input_stream_adapter& operator=(input_stream_adapter&) = delete; + // std::istream/std::streambuf use std::char_traits::to_int_type, to + // ensure that std::char_traits::eof() and the character 0xff do not + // end up as the same value, eg. 0xffffffff. int get_character() override { - return reinterpret_cast( sb->sbumpc() ); + return sb->sbumpc(); } void unget_character() override @@ -1489,10 +1492,10 @@ class input_buffer_adapter : public input_adapter_protocol { if (JSON_LIKELY(cursor < limit)) { - return reinterpret_cast(std::char_traits::to_int_type(*(cursor++))); + return std::char_traits::to_int_type(*(cursor++)); } - return reinterpret_cast(std::char_traits::eof()); + return std::char_traits::eof(); } void unget_character() noexcept override From 45e1e3d48ad29f1301c9ecc6966c1e240e6b0168 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Fri, 6 Oct 2017 07:53:31 -0700 Subject: [PATCH 14/17] Revert some unnecessary member initializer changes. --- src/json.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index e3231f7d..50d23d95 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2841,13 +2841,13 @@ scan_number_done: private: /// input adapter - detail::input_adapter_t ia { nullptr }; + detail::input_adapter_t ia = nullptr; /// the current character - int current { std::char_traits::eof() }; + int current = std::char_traits::eof(); /// the number of characters read - std::size_t chars_read { 0 }; + std::size_t chars_read = 0; /// raw input token string (for error messages) std::vector token_string { }; @@ -2856,15 +2856,15 @@ scan_number_done: std::string yytext { }; /// a description of occurred lexer errors - const char* error_message { "" }; + const char* error_message = ""; // number values - number_integer_t value_integer { 0 }; - number_unsigned_t value_unsigned { 0 }; - number_float_t value_float { 0 }; + number_integer_t value_integer = 0; + number_unsigned_t value_unsigned = 0; + number_float_t value_float = 0; /// the decimal point - const char decimal_point_char { '.' }; + const char decimal_point_char = '.'; }; /*! From 23440eb86ea2b5bcc4321cf39c4072296f04590b Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Fri, 6 Oct 2017 12:27:53 -0700 Subject: [PATCH 15/17] Remove outdated commentary about the value of eof(), retain input type o We assume the same character int_type as the unerlying std::istream o There are no assumptions on the value of eof(), other than that it will not be a valid unsigned char value. o To retain performance, we do not allow swapping out the underlying std::streambuf during our use of the std::istream for parsing. --- src/json.hpp | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 50d23d95..750ae51a 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1394,10 +1394,22 @@ constexpr T static_const::value; // input adapters // //////////////////// -/// abstract input adapter interface +/*! +@brief abstract input adapter interface + +Produces a stream of std::char_traits::int_type characters from a +std::istream, a buffer, or some other input type. Accepts the return of exactly +one non-EOF character for future input. The int_type characters returned +consist of all valid char values as positive values (typically unsigned char), +plus an EOF value outside that range, specified by the value of the function +std::char_traits::eof(). This value is typically -1, but could be any +arbitrary value which is not a valid char value. + +@return Typically [0,255] plus std::char_traits::eof(). +*/ struct input_adapter_protocol { - virtual int get_character() = 0; // returns characters in range [0,255], or eof() + virtual std::char_traits::int_type get_character() = 0; virtual void unget_character() = 0; // restore the last non-eof() character to input virtual ~input_adapter_protocol() = default; }; @@ -1405,8 +1417,13 @@ struct input_adapter_protocol /// a type to simplify interfaces using input_adapter_t = std::shared_ptr; - -/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input. +/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at +/// beginning of input. Does not support changing the underlying std::streambuf +/// in mid-input. Maintains underlying std::istream and std::streambuf to +/// support subsequent use of standard std::istream operations to process any +/// input characters following those used in parsing the JSON input. Clears the +/// std::istream flags; any input errors (eg. EOF) will be detected by the first +/// subsequent call for input from the std::istream. class input_stream_adapter : public input_adapter_protocol { public: @@ -1417,10 +1434,10 @@ class input_stream_adapter : public input_adapter_protocol } explicit input_stream_adapter(std::istream& i) : is(i) - , sb(i.rdbuf()) + , sb(*i.rdbuf()) { // Ignore Byte Order Mark at start of input - int c; + std::char_traits::int_type c; if (( c = get_character() ) == 0xEF ) { if (( c = get_character() ) == 0xBB ) @@ -1454,20 +1471,20 @@ class input_stream_adapter : public input_adapter_protocol // std::istream/std::streambuf use std::char_traits::to_int_type, to // ensure that std::char_traits::eof() and the character 0xff do not // end up as the same value, eg. 0xffffffff. - int get_character() override + std::char_traits::int_type get_character() override { - return sb->sbumpc(); + return sb.sbumpc(); } void unget_character() override { - sb->sungetc(); // Avoided for performance: is.unget(); + sb.sungetc(); // Avoided for performance: is.unget(); } private: /// the associated input stream std::istream& is; - std::streambuf *sb; + std::streambuf &sb; }; /// input adapter for buffer input @@ -1488,7 +1505,7 @@ class input_buffer_adapter : public input_adapter_protocol input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter& operator=(input_buffer_adapter&) = delete; - int get_character() noexcept override + std::char_traits::int_type get_character() noexcept override { if (JSON_LIKELY(cursor < limit)) { @@ -2664,13 +2681,13 @@ scan_number_done: @brief get next character from the input This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns a -'ve valued + not throw in case the input reached EOF, but returns a `std::char_traits::eof()` in that case. Stores the scanned characters for use in error messages. @return character read from the input */ - int get() + std::char_traits::int_type get() { ++chars_read; current = ia->get_character(); @@ -2739,8 +2756,8 @@ scan_number_done: } /// return the last read token (for errors only). Will never contain EOF - /// (a -'ve value), because 255 may legitimately occur. May contain NUL, which - /// should be escaped. + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. std::string get_token_string() const { // escape control characters @@ -2844,7 +2861,7 @@ scan_number_done: detail::input_adapter_t ia = nullptr; /// the current character - int current = std::char_traits::eof(); + std::char_traits::int_type current = std::char_traits::eof(); /// the number of characters read std::size_t chars_read = 0; From 0b803d0a5ff48687558c85e7ceb5c2cf70ea29af Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Sat, 7 Oct 2017 15:50:19 -0700 Subject: [PATCH 16/17] Simplify the json/src/benchmarks.cpp to allow more optimal code gen. o For some unknown reason, the complexity of the benchmark platform prevented some C++ compilers from generating optimal code, properly reflective of the real performance in actual deployment. o Added the json_benchmarks_simple target, which performs the same suite of tests as json_benchmarks. o Simplified the benchmark platform, and emit an "Average" TPS (Transactions Per Second) value reflective of aggregate parse/output performance. --- .gitignore | 1 + benchmarks/Makefile | 18 ++- benchmarks/src/benchmarks.cpp | 20 +++- benchmarks/src/benchmarks_simple.cpp | 158 +++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 7 deletions(-) create mode 100644 benchmarks/src/benchmarks_simple.cpp diff --git a/.gitignore b/.gitignore index 5b2bc0fa..8157f1a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ json_unit json_benchmarks +json_benchmarks_simple fuzz-testing *.dSYM diff --git a/benchmarks/Makefile b/benchmarks/Makefile index 0e4068c4..ef2de8a3 100644 --- a/benchmarks/Makefile +++ b/benchmarks/Makefile @@ -1,11 +1,21 @@ -all: json_benchmarks - ./json_benchmarks -json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons +# +# Build/run json.hpp benchmarks, eg. CXX=g++-7 make +# +# The existing json_benchmarks did not allow optimization under some compilers +# +all: json_benchmarks json_benchmarks_simple number_jsons + bash -c 'time ./json_benchmarks' + bash -c 'time ./json_benchmarks_simple' + +json_benchmarks: src/benchmarks.cpp ../src/json.hpp $(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@ +json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp + $(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@ + number_jsons: (test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py) clean: - rm -f json_benchmarks files/numbers/*.json + rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json diff --git a/benchmarks/src/benchmarks.cpp b/benchmarks/src/benchmarks.cpp index 55a4e478..a76c3783 100644 --- a/benchmarks/src/benchmarks.cpp +++ b/benchmarks/src/benchmarks.cpp @@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx, { // using string streams for benchmarking to factor-out cold-cache disk // access. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else std::stringstream istr; { // read file into string stream @@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx, // read the stream once json j; - j << istr; + istr >> j; // clear flags and rewind istr.clear(); istr.seekg(0); } +#endif switch (mode) { @@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx, istr.clear(); istr.seekg(0); json j; - j << istr; + istr >> j; } break; @@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx, { // create JSON value from input json j; - j << istr; + istr >> j; std::stringstream ostr; ctx.reset_timer(); diff --git a/benchmarks/src/benchmarks_simple.cpp b/benchmarks/src/benchmarks_simple.cpp new file mode 100644 index 00000000..4fad680a --- /dev/null +++ b/benchmarks/src/benchmarks_simple.cpp @@ -0,0 +1,158 @@ +// +// benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance +// +// For some reason, the complexity of benchmarks.cpp doesn't allow +// the compiler to optimize code using json.hpp effectively. The +// exact same tests, with the use of benchpress and cxxopts produces +// much faster code, at least under g++. +// +#include +#include +#include +#include +#include + +#include + +using json = nlohmann::json; + +enum class EMode { input, output, indent }; + +static double bench(const EMode mode, size_t iters, const std::string& in_path ) +{ + // using string streams for benchmarking to factor-out cold-cache disk + // access. Define FROMFILE to use file I/O instead. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else + std::stringstream istr; + { + // read file into string stream + std::ifstream input_file(in_path); + istr << input_file.rdbuf(); + input_file.close(); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#endif + double tps = 0; + switch (mode) + { + // benchmarking input + case EMode::input: + { + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + // clear flags and rewind + istr.clear(); + istr.seekg(0); + json j; + istr >> j; + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + break; + } + + // benchmarking output + case EMode::output: + case EMode::indent: + { + // create JSON value from input + json j; + istr >> j; + std::stringstream ostr; + + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + if (mode == EMode::indent) + { + ostr << j; + } + else + { + ostr << std::setw(4) << j; + } + + // reset data + ostr.str(std::string()); + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + + break; + } + } + return tps; +} + +template +struct average { + T _sum { 0 }; + size_t _count { 0 }; + T operator+=( const T &val_ ) { _sum += val_; +_count++; return val_; } + operator T() { return _sum / _count; } +}; + +// Execute each test approximately enough times to get near 1 +// transaction per second, and compute the average; a single aggregate +// number that gives a performance metric representing both parsing +// and output. + +int main( int, char ** ) +{ + std::list> tests { + { "parse jeopardy.json", EMode::input, 2, "files/jeopardy/jeopardy.json" }, + { "parse canada.json", EMode::input, 30, "files/nativejson-benchmark/canada.json" }, + { "parse citm_catalog.json", EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" }, + { "parse twitter.json", EMode::input, 225, "files/nativejson-benchmark/twitter.json" }, + { "parse floats.json", EMode::input, 5, "files/numbers/floats.json" }, + { "parse signed_ints.json", EMode::input, 6, "files/numbers/signed_ints.json" }, + { "parse unsigned_ints.json", EMode::input, 6, "files/numbers/unsigned_ints.json" }, + { "dump jeopardy.json", EMode::output, 5, "files/jeopardy/jeopardy.json" }, + { "dump jeopardy.json w/ind.", EMode::indent, 5, "files/jeopardy/jeopardy.json" }, + { "dump floats.json", EMode::output, 2, "files/numbers/floats.json" }, + { "dump signed_ints.json", EMode::output, 20, "files/numbers/signed_ints.json" }, + }; + + average avg; + for ( auto t : tests ) { + std::string name, path; + EMode mode; + size_t iters; + std::tie(name, mode, iters, path) = t; + auto tps = bench( mode, iters, path ); + avg += tps; + std::cout + << std::left + << std::setw( 30 ) << name + << std::right + << " x " << std::setw( 3 ) << iters + << std::left + << " == " << std::setw( 10 ) << tps + << std::right + << " TPS, " << std::setw( 8 ) << std::round( tps * 1e6 / iters ) + << " ms/op" + << std::endl; + } + std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl; + std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl; + return 0; +} From a8cc7a1bc8e8addda9504e8510ff95a68d86e4b3 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Mon, 16 Oct 2017 08:06:10 -0700 Subject: [PATCH 17/17] Consistently use std::char_traits int_type-->char conversion intrinsics --- src/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/json.hpp b/src/json.hpp index 750ae51a..1fa90456 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -2674,7 +2674,7 @@ scan_number_done: { yytext.clear(); token_string.clear(); - token_string.push_back(static_cast(current)); + token_string.push_back(std::char_traits::to_char_type(current)); } /*