diff --git a/.gitignore b/.gitignore index 5b2bc0fa..8157f1a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ json_unit json_benchmarks +json_benchmarks_simple fuzz-testing *.dSYM diff --git a/benchmarks/Makefile b/benchmarks/Makefile index 0e4068c4..ef2de8a3 100644 --- a/benchmarks/Makefile +++ b/benchmarks/Makefile @@ -1,11 +1,21 @@ -all: json_benchmarks - ./json_benchmarks -json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons +# +# Build/run json.hpp benchmarks, eg. CXX=g++-7 make +# +# The existing json_benchmarks did not allow optimization under some compilers +# +all: json_benchmarks json_benchmarks_simple number_jsons + bash -c 'time ./json_benchmarks' + bash -c 'time ./json_benchmarks_simple' + +json_benchmarks: src/benchmarks.cpp ../src/json.hpp $(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@ +json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp + $(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@ + number_jsons: (test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py) clean: - rm -f json_benchmarks files/numbers/*.json + rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json diff --git a/benchmarks/src/benchmarks.cpp b/benchmarks/src/benchmarks.cpp index 55a4e478..a76c3783 100644 --- a/benchmarks/src/benchmarks.cpp +++ b/benchmarks/src/benchmarks.cpp @@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx, { // using string streams for benchmarking to factor-out cold-cache disk // access. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else std::stringstream istr; { // read file into string stream @@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx, // read the stream once json j; - j << istr; + istr >> j; // clear flags and rewind istr.clear(); istr.seekg(0); } +#endif switch (mode) { @@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx, istr.clear(); istr.seekg(0); json j; - j << istr; + istr >> j; } break; @@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx, { // create JSON value from input json j; - j << istr; + istr >> j; std::stringstream ostr; ctx.reset_timer(); diff --git a/benchmarks/src/benchmarks_simple.cpp b/benchmarks/src/benchmarks_simple.cpp new file mode 100644 index 00000000..4fad680a --- /dev/null +++ b/benchmarks/src/benchmarks_simple.cpp @@ -0,0 +1,158 @@ +// +// benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance +// +// For some reason, the complexity of benchmarks.cpp doesn't allow +// the compiler to optimize code using json.hpp effectively. The +// exact same tests, with the use of benchpress and cxxopts produces +// much faster code, at least under g++. +// +#include +#include +#include +#include +#include + +#include + +using json = nlohmann::json; + +enum class EMode { input, output, indent }; + +static double bench(const EMode mode, size_t iters, const std::string& in_path ) +{ + // using string streams for benchmarking to factor-out cold-cache disk + // access. Define FROMFILE to use file I/O instead. +#if defined( FROMFILE ) + std::ifstream istr; + { + istr.open( in_path, std::ifstream::in ); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#else + std::stringstream istr; + { + // read file into string stream + std::ifstream input_file(in_path); + istr << input_file.rdbuf(); + input_file.close(); + + // read the stream once + json j; + istr >> j; + // clear flags and rewind + istr.clear(); + istr.seekg(0); + } +#endif + double tps = 0; + switch (mode) + { + // benchmarking input + case EMode::input: + { + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + // clear flags and rewind + istr.clear(); + istr.seekg(0); + json j; + istr >> j; + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + break; + } + + // benchmarking output + case EMode::output: + case EMode::indent: + { + // create JSON value from input + json j; + istr >> j; + std::stringstream ostr; + + auto start = std::chrono::system_clock::now(); + for (size_t i = 0; i < iters; ++i) + { + if (mode == EMode::indent) + { + ostr << j; + } + else + { + ostr << std::setw(4) << j; + } + + // reset data + ostr.str(std::string()); + } + auto ended = std::chrono::system_clock::now(); + tps = 1.0 / std::chrono::duration( ended - start ).count(); + + break; + } + } + return tps; +} + +template +struct average { + T _sum { 0 }; + size_t _count { 0 }; + T operator+=( const T &val_ ) { _sum += val_; +_count++; return val_; } + operator T() { return _sum / _count; } +}; + +// Execute each test approximately enough times to get near 1 +// transaction per second, and compute the average; a single aggregate +// number that gives a performance metric representing both parsing +// and output. + +int main( int, char ** ) +{ + std::list> tests { + { "parse jeopardy.json", EMode::input, 2, "files/jeopardy/jeopardy.json" }, + { "parse canada.json", EMode::input, 30, "files/nativejson-benchmark/canada.json" }, + { "parse citm_catalog.json", EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" }, + { "parse twitter.json", EMode::input, 225, "files/nativejson-benchmark/twitter.json" }, + { "parse floats.json", EMode::input, 5, "files/numbers/floats.json" }, + { "parse signed_ints.json", EMode::input, 6, "files/numbers/signed_ints.json" }, + { "parse unsigned_ints.json", EMode::input, 6, "files/numbers/unsigned_ints.json" }, + { "dump jeopardy.json", EMode::output, 5, "files/jeopardy/jeopardy.json" }, + { "dump jeopardy.json w/ind.", EMode::indent, 5, "files/jeopardy/jeopardy.json" }, + { "dump floats.json", EMode::output, 2, "files/numbers/floats.json" }, + { "dump signed_ints.json", EMode::output, 20, "files/numbers/signed_ints.json" }, + }; + + average avg; + for ( auto t : tests ) { + std::string name, path; + EMode mode; + size_t iters; + std::tie(name, mode, iters, path) = t; + auto tps = bench( mode, iters, path ); + avg += tps; + std::cout + << std::left + << std::setw( 30 ) << name + << std::right + << " x " << std::setw( 3 ) << iters + << std::left + << " == " << std::setw( 10 ) << tps + << std::right + << " TPS, " << std::setw( 8 ) << std::round( tps * 1e6 / iters ) + << " ms/op" + << std::endl; + } + std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl; + std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl; + return 0; +} diff --git a/src/json.hpp b/src/json.hpp index 589f0447..47d51737 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -1394,123 +1394,97 @@ constexpr T static_const::value; // input adapters // //////////////////// -/// abstract input adapter interface +/*! +@brief abstract input adapter interface + +Produces a stream of std::char_traits::int_type characters from a +std::istream, a buffer, or some other input type. Accepts the return of exactly +one non-EOF character for future input. The int_type characters returned +consist of all valid char values as positive values (typically unsigned char), +plus an EOF value outside that range, specified by the value of the function +std::char_traits::eof(). This value is typically -1, but could be any +arbitrary value which is not a valid char value. + +@return Typically [0,255] plus std::char_traits::eof(). +*/ struct input_adapter_protocol { - virtual int get_character() = 0; - virtual std::string read(std::size_t offset, std::size_t length) = 0; + virtual std::char_traits::int_type get_character() = 0; + virtual void unget_character() = 0; // restore the last non-eof() character to input virtual ~input_adapter_protocol() = default; }; /// a type to simplify interfaces using input_adapter_t = std::shared_ptr; -/// input adapter for cached stream input -template -class cached_input_stream_adapter : public input_adapter_protocol +/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at +/// beginning of input. Does not support changing the underlying std::streambuf +/// in mid-input. Maintains underlying std::istream and std::streambuf to +/// support subsequent use of standard std::istream operations to process any +/// input characters following those used in parsing the JSON input. Clears the +/// std::istream flags; any input errors (eg. EOF) will be detected by the first +/// subsequent call for input from the std::istream. +class input_stream_adapter : public input_adapter_protocol { public: - explicit cached_input_stream_adapter(std::istream& i) - : is(i), start_position(is.tellg()) + ~input_stream_adapter() override { - fill_buffer(); - - // skip byte order mark - if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF') - { - buffer_pos += 3; - processed_chars += 3; - } - } - - ~cached_input_stream_adapter() override - { - // clear stream flags - is.clear(); - // We initially read a lot of characters into the buffer, and we may - // not have processed all of them. Therefore, we need to "rewind" the - // stream after the last processed char. - is.seekg(start_position); - is.ignore(static_cast(processed_chars)); - // clear stream flags + // clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags is.clear(); } - - int get_character() override + explicit input_stream_adapter(std::istream& i) + : is(i) + , sb(*i.rdbuf()) { - // check if refilling is necessary and possible - if (buffer_pos == fill_size and not eof) + // Ignore Byte Order Mark at start of input + std::char_traits::int_type c; + if (( c = get_character() ) == 0xEF ) { - fill_buffer(); - - // check and remember that filling did not yield new input - if (fill_size == 0) + if (( c = get_character() ) == 0xBB ) { - eof = true; - return std::char_traits::eof(); + if (( c = get_character() ) == 0xBF ) + { + return; // Ignore BOM + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xBB' ); } - - // the buffer is ready - buffer_pos = 0; + else if ( c != std::char_traits::eof() ) + { + is.unget(); + } + is.putback( '\xEF' ); + } + else if ( c != std::char_traits::eof() ) + { + is.unget(); // Not BOM. Process as usual. } - - ++processed_chars; - assert(buffer_pos < buffer.size()); - return buffer[buffer_pos++] & 0xFF; } - std::string read(std::size_t offset, std::size_t length) override + // delete because of pointer members + input_stream_adapter(const input_stream_adapter&) = delete; + input_stream_adapter& operator=(input_stream_adapter&) = delete; + + // std::istream/std::streambuf use std::char_traits::to_int_type, to + // ensure that std::char_traits::eof() and the character 0xff do not + // end up as the same value, eg. 0xffffffff. + std::char_traits::int_type get_character() override { - // create buffer - std::string result(length, '\0'); - - // save stream position - const auto current_pos = is.tellg(); - // save stream flags - const auto flags = is.rdstate(); - - // clear stream flags - is.clear(); - // set stream position - is.seekg(static_cast(offset)); - // read bytes - is.read(&result[0], static_cast(length)); - - // reset stream position - is.seekg(current_pos); - // reset stream flags - is.setstate(flags); - - return result; + return sb.sbumpc(); } + void unget_character() override + { + sb.sungetc(); // Avoided for performance: is.unget(); + } private: - void fill_buffer() - { - // fill - is.read(buffer.data(), static_cast(buffer.size())); - // store number of bytes in the buffer - fill_size = static_cast(is.gcount()); - } /// the associated input stream std::istream& is; - - /// chars returned via get_character() - std::size_t processed_chars = 0; - /// chars processed in the current buffer - std::size_t buffer_pos = 0; - - /// whether stream reached eof - bool eof = false; - /// how many chars have been copied to the buffer by last (re)fill - std::size_t fill_size = 0; - - /// position of the stream when we started - const std::streampos start_position; - - /// internal buffer - std::array buffer{{}}; + std::streambuf &sb; }; /// input adapter for buffer input @@ -1531,21 +1505,22 @@ class input_buffer_adapter : public input_adapter_protocol input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter& operator=(input_buffer_adapter&) = delete; - int get_character() noexcept override + std::char_traits::int_type get_character() noexcept override { if (JSON_LIKELY(cursor < limit)) { - return *(cursor++) & 0xFF; + return std::char_traits::to_int_type(*(cursor++)); } return std::char_traits::eof(); } - std::string read(std::size_t offset, std::size_t length) override + void unget_character() noexcept override { - // avoid reading too many characters - const auto max_length = static_cast(limit - start); - return std::string(start + offset, (std::min)(length, max_length - offset)); + if (JSON_LIKELY(cursor > start)) + { + --cursor; + } } private: @@ -1564,11 +1539,11 @@ class input_adapter /// input adapter for input stream input_adapter(std::istream& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for input stream input_adapter(std::istream&& i) - : ia(std::make_shared>(i)) {} + : ia(std::make_shared(i)) {} /// input adapter for buffer template::to_char_type(current)); } - /// get a character from the input - int get() + /* + @brief get next character from the input + + This function provides the interface to the used input adapter. It does + not throw in case the input reached EOF, but returns a + `std::char_traits::eof()` in that case. Stores the scanned characters + for use in error messages. + + @return character read from the input + */ + std::char_traits::int_type get() { ++chars_read; - return next_unget ? (next_unget = false, current) - : (current = ia->get_character()); + current = ia->get_character(); + if (JSON_LIKELY( current != std::char_traits::eof())) + { + token_string.push_back(std::char_traits::to_char_type(current)); + } + return current; } + /// unget current character (return it again on next get) + void unget() + { + --chars_read; + if (JSON_LIKELY(current != std::char_traits::eof())) + { + ia->unget_character(); + assert(token_string.size() != 0); + token_string.pop_back(); + } + } + /// add a character to yytext void add(int c) { - // resize yytext if necessary; this condition is deemed unlikely, - // because we start with a 1024-byte buffer - if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) - { - yytext.resize(2 * yytext.capacity(), '\0'); - } - assert(yylen < yytext.size()); - yytext[yylen++] = static_cast(c); + yytext.push_back(std::char_traits::to_char_type(c)); } public: @@ -2753,12 +2739,10 @@ scan_number_done: return value_float; } - /// return string value - const std::string get_string() + /// return current string value (implicitly resets the token; useful only once) + std::string move_string() { - // yytext cannot be returned as char*, because it may contain a null - // byte (parsed as "\u0000") - return std::string(yytext.data(), yylen); + return std::move( yytext ); } ///////////////////// @@ -2771,22 +2755,16 @@ scan_number_done: return chars_read; } - /// return the last read token (for errors only) + /// return the last read token (for errors only). Will never contain EOF + /// (an arbitrary value that is not a valid char value, often -1), because + /// 255 may legitimately occur. May contain NUL, which should be escaped. std::string get_token_string() const { - // get the raw byte sequence of the last token - std::string s = ia->read(start_pos, chars_read - start_pos); - // escape control characters std::string result; - for (auto c : s) + for (auto c : token_string) { - if (c == '\0' or c == std::char_traits::eof()) - { - // ignore EOF - continue; - } - else if ('\x00' <= c and c <= '\x1f') + if ('\x00' <= c and c <= '\x1f') { // escape control characters std::stringstream ss; @@ -2883,20 +2861,16 @@ scan_number_done: detail::input_adapter_t ia = nullptr; /// the current character - int current = std::char_traits::eof(); - - /// whether get() should return the last character again - bool next_unget = false; + std::char_traits::int_type current = std::char_traits::eof(); /// the number of characters read std::size_t chars_read = 0; - /// the start position of the current token - std::size_t start_pos = 0; + + /// raw input token string (for error messages) + std::vector token_string { }; /// buffer for variable-length tokens (numbers, strings) - std::vector yytext = std::vector(1024, '\0'); - /// current index in yytext - std::size_t yylen = 0; + std::string yytext { }; /// a description of occurred lexer errors const char* error_message = ""; @@ -3073,7 +3047,7 @@ class parser { return; } - key = m_lexer.get_string(); + key = m_lexer.move_string(); bool keep_tag = false; if (keep) @@ -3219,7 +3193,7 @@ class parser case token_type::value_string: { result.m_type = value_t::string; - result.m_value = m_lexer.get_string(); + result.m_value = m_lexer.move_string(); break; } @@ -5221,7 +5195,7 @@ class binary_reader @brief get next character from the input This function provides the interface to the used input adapter. It does - not throw in case the input reached EOF, but returns + not throw in case the input reached EOF, but returns a -'ve valued `std::char_traits::eof()` in that case. @return character read from the input diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index 08d4d6ef..9afa7d26 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -215,7 +215,7 @@ TEST_CASE("parser class") std::string s = "\"1\""; s[1] = '\0'; CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&); - CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); + CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); } } diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 0c0d148d..2d30a52c 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -1233,4 +1233,24 @@ TEST_CASE("regression tests") "[json.exception.type_error.302] type must be array, but is null"); } } + + SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.") + { + SECTION("example 1") + { + std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" ); + json j1, j2, j3; + i1_2_3 >> j1; + i1_2_3 >> j2; + i1_2_3 >> j3; + + std::map m1 = j1; + std::map m2 = j2; + int i3 = j3; + + CHECK( m1 == ( std::map {{ "first", "one" }} )); + CHECK( m2 == ( std::map {{ "second", "two" }} )); + CHECK( i3 == 3 ); + } + } }