Merge pull request #764 from pjkundert/develop-simplify-istream

Simplified istream handling #367
This commit is contained in:
Niels Lohmann 2017-10-22 08:52:28 +02:00 committed by GitHub
commit 3094640446
7 changed files with 341 additions and 164 deletions

1
.gitignore vendored
View file

@ -1,5 +1,6 @@
json_unit json_unit
json_benchmarks json_benchmarks
json_benchmarks_simple
fuzz-testing fuzz-testing
*.dSYM *.dSYM

View file

@ -1,11 +1,21 @@
all: json_benchmarks
./json_benchmarks
json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons #
# Build/run json.hpp benchmarks, eg. CXX=g++-7 make
#
# The existing json_benchmarks did not allow optimization under some compilers
#
all: json_benchmarks json_benchmarks_simple number_jsons
bash -c 'time ./json_benchmarks'
bash -c 'time ./json_benchmarks_simple'
json_benchmarks: src/benchmarks.cpp ../src/json.hpp
$(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@ $(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@
json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp
$(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@
number_jsons: number_jsons:
(test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py) (test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py)
clean: clean:
rm -f json_benchmarks files/numbers/*.json rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json

View file

@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx,
{ {
// using string streams for benchmarking to factor-out cold-cache disk // using string streams for benchmarking to factor-out cold-cache disk
// access. // access.
#if defined( FROMFILE )
std::ifstream istr;
{
istr.open( in_path, std::ifstream::in );
// read the stream once
json j;
istr >> j;
// clear flags and rewind
istr.clear();
istr.seekg(0);
}
#else
std::stringstream istr; std::stringstream istr;
{ {
// read file into string stream // read file into string stream
@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx,
// read the stream once // read the stream once
json j; json j;
j << istr; istr >> j;
// clear flags and rewind // clear flags and rewind
istr.clear(); istr.clear();
istr.seekg(0); istr.seekg(0);
} }
#endif
switch (mode) switch (mode)
{ {
@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx,
istr.clear(); istr.clear();
istr.seekg(0); istr.seekg(0);
json j; json j;
j << istr; istr >> j;
} }
break; break;
@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx,
{ {
// create JSON value from input // create JSON value from input
json j; json j;
j << istr; istr >> j;
std::stringstream ostr; std::stringstream ostr;
ctx.reset_timer(); ctx.reset_timer();

View file

@ -0,0 +1,158 @@
//
// benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance
//
// For some reason, the complexity of benchmarks.cpp doesn't allow
// the compiler to optimize code using json.hpp effectively. The
// exact same tests, without the use of benchpress and cxxopts, produce
// much faster code, at least under g++.
//
#include <chrono>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <tuple>
#include <json.hpp>
using json = nlohmann::json;
enum class EMode { input, output, indent };
// Run one benchmark and report its throughput.
//
// mode     what to measure: EMode::input times parsing, EMode::output times
//          compact serialization, EMode::indent times serialization with an
//          indentation width of 4.
// iters    number of parse/dump operations performed inside the timed region.
// in_path  path of the JSON document to benchmark with.
//
// Returns 1.0 divided by the elapsed wall time (in seconds) of the whole
// batch of `iters` operations, i.e. "batches per second".
static double bench(const EMode mode, size_t iters, const std::string& in_path )
{
    // steady_clock is monotonic, so a wall-clock adjustment during the run
    // cannot distort (or negate) the measured interval.
    using bench_clock = std::chrono::steady_clock;

    // using string streams for benchmarking to factor-out cold-cache disk
    // access.  Define FROMFILE to use file I/O instead.
#if defined( FROMFILE )
    std::ifstream istr;
    {
        istr.open( in_path, std::ifstream::in );

        // read the stream once (warm-up, untimed)
        json j;
        istr >> j;

        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
#else
    std::stringstream istr;
    {
        // read file into string stream
        std::ifstream input_file(in_path);
        istr << input_file.rdbuf();
        input_file.close();

        // read the stream once (warm-up, untimed)
        json j;
        istr >> j;

        // clear flags and rewind
        istr.clear();
        istr.seekg(0);
    }
#endif

    double tps = 0;
    switch (mode)
    {
        // benchmarking input
        case EMode::input:
        {
            const auto start = bench_clock::now();
            for (size_t i = 0; i < iters; ++i)
            {
                // clear flags and rewind so every iteration parses the
                // complete document again
                istr.clear();
                istr.seekg(0);
                json j;
                istr >> j;
            }
            const auto ended = bench_clock::now();
            tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
            break;
        }

        // benchmarking output
        case EMode::output:
        case EMode::indent:
        {
            // create JSON value from input (outside the timed region)
            json j;
            istr >> j;
            std::stringstream ostr;

            const auto start = bench_clock::now();
            for (size_t i = 0; i < iters; ++i)
            {
                if (mode == EMode::indent)
                {
                    // a non-zero stream width requests indented output
                    ostr << std::setw(4) << j;
                }
                else
                {
                    ostr << j;
                }

                // reset data so the buffer does not grow across iterations
                ostr.str(std::string());
            }
            const auto ended = bench_clock::now();
            tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
            break;
        }
    }
    return tps;
}
// Running arithmetic mean of values of type T.
//
// Samples are added with operator+=; converting the object to T yields the
// mean of all samples added so far.
template <typename T>
struct average {
    T _sum { 0 };           // sum of all samples added so far
    size_t _count { 0 };    // number of samples added so far

    // Add one sample.  Returns the sample that was just added (not the
    // accumulator), so the expression can be used where the value is needed.
    T operator+=( const T &val_ )
    {
        _sum += val_;
        ++_count;
        return val_;
    }

    // Mean of the samples added so far; T{0} when no sample has been added.
    operator T() const
    {
        if ( _count == 0 )
        {
            return T { 0 };
        }
        // Divide in T: dividing by the raw size_t would convert a negative
        // integral sum to unsigned first and produce a wrong result.
        return _sum / static_cast<T>( _count );
    }
};
// Each test's iteration count is chosen so that one batch takes roughly
// one second, i.e. close to 1 transaction per second.  The average over
// all tests is a single aggregate number that gives a performance metric
// representing both parsing and output.
int main( int, char ** )
{
std::list<std::tuple<std::string, EMode, size_t, std::string>> tests {
{ "parse jeopardy.json", EMode::input, 2, "files/jeopardy/jeopardy.json" },
{ "parse canada.json", EMode::input, 30, "files/nativejson-benchmark/canada.json" },
{ "parse citm_catalog.json", EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" },
{ "parse twitter.json", EMode::input, 225, "files/nativejson-benchmark/twitter.json" },
{ "parse floats.json", EMode::input, 5, "files/numbers/floats.json" },
{ "parse signed_ints.json", EMode::input, 6, "files/numbers/signed_ints.json" },
{ "parse unsigned_ints.json", EMode::input, 6, "files/numbers/unsigned_ints.json" },
{ "dump jeopardy.json", EMode::output, 5, "files/jeopardy/jeopardy.json" },
{ "dump jeopardy.json w/ind.", EMode::indent, 5, "files/jeopardy/jeopardy.json" },
{ "dump floats.json", EMode::output, 2, "files/numbers/floats.json" },
{ "dump signed_ints.json", EMode::output, 20, "files/numbers/signed_ints.json" },
};
average<double> avg;
for ( auto t : tests ) {
std::string name, path;
EMode mode;
size_t iters;
std::tie(name, mode, iters, path) = t;
auto tps = bench( mode, iters, path );
avg += tps;
std::cout
<< std::left
<< std::setw( 30 ) << name
<< std::right
<< " x " << std::setw( 3 ) << iters
<< std::left
<< " == " << std::setw( 10 ) << tps
<< std::right
<< " TPS, " << std::setw( 8 ) << std::round( tps * 1e6 / iters )
<< " ms/op"
<< std::endl;
}
std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl;
std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl;
return 0;
}

View file

@ -1394,123 +1394,97 @@ constexpr T static_const<T>::value;
// input adapters // // input adapters //
//////////////////// ////////////////////
/// abstract input adapter interface /*!
@brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly
one non-EOF character for future input. The int_type characters returned
consist of all valid char values as positive values (typically unsigned char),
plus an EOF value outside that range, specified by the value of the function
std::char_traits<char>::eof(). This value is typically -1, but could be any
arbitrary value which is not a valid char value.
@return Typically [0,255] plus std::char_traits<char>::eof().
*/
struct input_adapter_protocol struct input_adapter_protocol
{ {
virtual int get_character() = 0; virtual std::char_traits<char>::int_type get_character() = 0;
virtual std::string read(std::size_t offset, std::size_t length) = 0; virtual void unget_character() = 0; // restore the last non-eof() character to input
virtual ~input_adapter_protocol() = default; virtual ~input_adapter_protocol() = default;
}; };
/// a type to simplify interfaces /// a type to simplify interfaces
using input_adapter_t = std::shared_ptr<input_adapter_protocol>; using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
/// input adapter for cached stream input /// input adapter for a (caching) istream. Ignores a UTF Byte Order Mark at
template<std::size_t BufferSize> /// beginning of input. Does not support changing the underlying std::streambuf
class cached_input_stream_adapter : public input_adapter_protocol /// in mid-input. Maintains underlying std::istream and std::streambuf to
/// support subsequent use of standard std::istream operations to process any
/// input characters following those used in parsing the JSON input. Clears the
/// std::istream flags; any input errors (eg. EOF) will be detected by the first
/// subsequent call for input from the std::istream.
class input_stream_adapter : public input_adapter_protocol
{ {
public: public:
explicit cached_input_stream_adapter(std::istream& i) ~input_stream_adapter() override
: is(i), start_position(is.tellg())
{ {
fill_buffer(); // clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags
// skip byte order mark
if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF')
{
buffer_pos += 3;
processed_chars += 3;
}
}
~cached_input_stream_adapter() override
{
// clear stream flags
is.clear();
// We initially read a lot of characters into the buffer, and we may
// not have processed all of them. Therefore, we need to "rewind" the
// stream after the last processed char.
is.seekg(start_position);
is.ignore(static_cast<std::streamsize>(processed_chars));
// clear stream flags
is.clear(); is.clear();
} }
explicit input_stream_adapter(std::istream& i)
int get_character() override : is(i)
, sb(*i.rdbuf())
{ {
// check if refilling is necessary and possible // Ignore Byte Order Mark at start of input
if (buffer_pos == fill_size and not eof) std::char_traits<char>::int_type c;
if (( c = get_character() ) == 0xEF )
{ {
fill_buffer(); if (( c = get_character() ) == 0xBB )
// check and remember that filling did not yield new input
if (fill_size == 0)
{ {
eof = true; if (( c = get_character() ) == 0xBF )
return std::char_traits<char>::eof(); {
return; // Ignore BOM
}
else if ( c != std::char_traits<char>::eof() )
{
is.unget();
}
is.putback( '\xBB' );
}
else if ( c != std::char_traits<char>::eof() )
{
is.unget();
}
is.putback( '\xEF' );
}
else if ( c != std::char_traits<char>::eof() )
{
is.unget(); // Not BOM. Process as usual.
}
} }
// the buffer is ready // delete because of pointer members
buffer_pos = 0; input_stream_adapter(const input_stream_adapter&) = delete;
} input_stream_adapter& operator=(input_stream_adapter&) = delete;
++processed_chars; // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
assert(buffer_pos < buffer.size()); // ensure that std::char_traits<char>::eof() and the character 0xff do not
return buffer[buffer_pos++] & 0xFF; // end up as the same value, eg. 0xffffffff.
} std::char_traits<char>::int_type get_character() override
std::string read(std::size_t offset, std::size_t length) override
{ {
// create buffer return sb.sbumpc();
std::string result(length, '\0');
// save stream position
const auto current_pos = is.tellg();
// save stream flags
const auto flags = is.rdstate();
// clear stream flags
is.clear();
// set stream position
is.seekg(static_cast<std::streamoff>(offset));
// read bytes
is.read(&result[0], static_cast<std::streamsize>(length));
// reset stream position
is.seekg(current_pos);
// reset stream flags
is.setstate(flags);
return result;
} }
void unget_character() override
{
sb.sungetc(); // Avoided for performance: is.unget();
}
private: private:
void fill_buffer()
{
// fill
is.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
// store number of bytes in the buffer
fill_size = static_cast<size_t>(is.gcount());
}
/// the associated input stream /// the associated input stream
std::istream& is; std::istream& is;
std::streambuf &sb;
/// chars returned via get_character()
std::size_t processed_chars = 0;
/// chars processed in the current buffer
std::size_t buffer_pos = 0;
/// whether stream reached eof
bool eof = false;
/// how many chars have been copied to the buffer by last (re)fill
std::size_t fill_size = 0;
/// position of the stream when we started
const std::streampos start_position;
/// internal buffer
std::array<char, BufferSize> buffer{{}};
}; };
/// input adapter for buffer input /// input adapter for buffer input
@ -1531,21 +1505,22 @@ class input_buffer_adapter : public input_adapter_protocol
input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter(const input_buffer_adapter&) = delete;
input_buffer_adapter& operator=(input_buffer_adapter&) = delete; input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
int get_character() noexcept override std::char_traits<char>::int_type get_character() noexcept override
{ {
if (JSON_LIKELY(cursor < limit)) if (JSON_LIKELY(cursor < limit))
{ {
return *(cursor++) & 0xFF; return std::char_traits<char>::to_int_type(*(cursor++));
} }
return std::char_traits<char>::eof(); return std::char_traits<char>::eof();
} }
std::string read(std::size_t offset, std::size_t length) override void unget_character() noexcept override
{ {
// avoid reading too many characters if (JSON_LIKELY(cursor > start))
const auto max_length = static_cast<std::size_t>(limit - start); {
return std::string(start + offset, (std::min)(length, max_length - offset)); --cursor;
}
} }
private: private:
@ -1564,11 +1539,11 @@ class input_adapter
/// input adapter for input stream /// input adapter for input stream
input_adapter(std::istream& i) input_adapter(std::istream& i)
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {} : ia(std::make_shared<input_stream_adapter>(i)) {}
/// input adapter for input stream /// input adapter for input stream
input_adapter(std::istream&& i) input_adapter(std::istream&& i)
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {} : ia(std::make_shared<input_stream_adapter>(i)) {}
/// input adapter for buffer /// input adapter for buffer
template<typename CharT, template<typename CharT,
@ -1845,9 +1820,9 @@ class lexer
@brief scan a string literal @brief scan a string literal
This function scans a string according to Sect. 7 of RFC 7159. While This function scans a string according to Sect. 7 of RFC 7159. While
scanning, bytes are escaped and copied into buffer yytext. Then the scanning, bytes are escaped and copied into buffer yytext. Then the function
function returns successfully, yytext is null-terminated and yylen returns successfully, yytext is *not* null-terminated (as it may contain \0
contains the number of bytes in the string. bytes), and yytext.size() is the number of bytes in the string.
@return token_type::value_string if string could be successfully scanned, @return token_type::value_string if string could be successfully scanned,
token_type::parse_error otherwise token_type::parse_error otherwise
@ -1878,9 +1853,6 @@ class lexer
// closing quote // closing quote
case '\"': case '\"':
{ {
// terminate yytext
add('\0');
--yylen;
return token_type::value_string; return token_type::value_string;
} }
@ -2624,12 +2596,7 @@ scan_number_any2:
scan_number_done: scan_number_done:
// unget the character after the number (we only read it to know that // unget the character after the number (we only read it to know that
// we are done scanning a number) // we are done scanning a number)
--chars_read; unget();
next_unget = true;
// terminate token
add('\0');
--yylen;
char* endptr = nullptr; char* endptr = nullptr;
errno = 0; errno = 0;
@ -2640,7 +2607,7 @@ scan_number_done:
const auto x = std::strtoull(yytext.data(), &endptr, 10); const auto x = std::strtoull(yytext.data(), &endptr, 10);
// we checked the number format before // we checked the number format before
assert(endptr == yytext.data() + yylen); assert(endptr == yytext.data() + yytext.size());
if (errno == 0) if (errno == 0)
{ {
@ -2656,7 +2623,7 @@ scan_number_done:
const auto x = std::strtoll(yytext.data(), &endptr, 10); const auto x = std::strtoll(yytext.data(), &endptr, 10);
// we checked the number format before // we checked the number format before
assert(endptr == yytext.data() + yylen); assert(endptr == yytext.data() + yytext.size());
if (errno == 0) if (errno == 0)
{ {
@ -2673,7 +2640,7 @@ scan_number_done:
strtof(value_float, yytext.data(), &endptr); strtof(value_float, yytext.data(), &endptr);
// we checked the number format before // we checked the number format before
assert(endptr == yytext.data() + yylen); assert(endptr == yytext.data() + yytext.size());
return token_type::value_float; return token_type::value_float;
} }
@ -2702,32 +2669,51 @@ scan_number_done:
// input management // input management
///////////////////// /////////////////////
/// reset yytext /// reset yytext; current character is beginning of token
void reset() noexcept void reset() noexcept
{ {
yylen = 0; yytext.clear();
start_pos = chars_read - 1; token_string.clear();
token_string.push_back(std::char_traits<char>::to_char_type(current));
} }
/// get a character from the input /*
int get() @brief get next character from the input
This function provides the interface to the used input adapter. It does
not throw in case the input reached EOF, but returns a
`std::char_traits<char>::eof()` in that case. Stores the scanned characters
for use in error messages.
@return character read from the input
*/
std::char_traits<char>::int_type get()
{ {
++chars_read; ++chars_read;
return next_unget ? (next_unget = false, current) current = ia->get_character();
: (current = ia->get_character()); if (JSON_LIKELY( current != std::char_traits<char>::eof()))
{
token_string.push_back(std::char_traits<char>::to_char_type(current));
}
return current;
}
/// unget current character (return it again on next get)
void unget()
{
--chars_read;
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{
ia->unget_character();
assert(token_string.size() != 0);
token_string.pop_back();
}
} }
/// add a character to yytext /// add a character to yytext
void add(int c) void add(int c)
{ {
// resize yytext if necessary; this condition is deemed unlikely, yytext.push_back(std::char_traits<char>::to_char_type(c));
// because we start with a 1024-byte buffer
if (JSON_UNLIKELY((yylen + 1 > yytext.capacity())))
{
yytext.resize(2 * yytext.capacity(), '\0');
}
assert(yylen < yytext.size());
yytext[yylen++] = static_cast<char>(c);
} }
public: public:
@ -2753,12 +2739,10 @@ scan_number_done:
return value_float; return value_float;
} }
/// return string value /// return current string value (implicitly resets the token; useful only once)
const std::string get_string() std::string move_string()
{ {
// yytext cannot be returned as char*, because it may contain a null return std::move( yytext );
// byte (parsed as "\u0000")
return std::string(yytext.data(), yylen);
} }
///////////////////// /////////////////////
@ -2771,22 +2755,16 @@ scan_number_done:
return chars_read; return chars_read;
} }
/// return the last read token (for errors only) /// return the last read token (for errors only). Will never contain EOF
/// (an arbitrary value that is not a valid char value, often -1), because
/// 255 may legitimately occur. May contain NUL, which should be escaped.
std::string get_token_string() const std::string get_token_string() const
{ {
// get the raw byte sequence of the last token
std::string s = ia->read(start_pos, chars_read - start_pos);
// escape control characters // escape control characters
std::string result; std::string result;
for (auto c : s) for (auto c : token_string)
{ {
if (c == '\0' or c == std::char_traits<char>::eof()) if ('\x00' <= c and c <= '\x1f')
{
// ignore EOF
continue;
}
else if ('\x00' <= c and c <= '\x1f')
{ {
// escape control characters // escape control characters
std::stringstream ss; std::stringstream ss;
@ -2883,20 +2861,16 @@ scan_number_done:
detail::input_adapter_t ia = nullptr; detail::input_adapter_t ia = nullptr;
/// the current character /// the current character
int current = std::char_traits<char>::eof(); std::char_traits<char>::int_type current = std::char_traits<char>::eof();
/// whether get() should return the last character again
bool next_unget = false;
/// the number of characters read /// the number of characters read
std::size_t chars_read = 0; std::size_t chars_read = 0;
/// the start position of the current token
std::size_t start_pos = 0; /// raw input token string (for error messages)
std::vector<char> token_string { };
/// buffer for variable-length tokens (numbers, strings) /// buffer for variable-length tokens (numbers, strings)
std::vector<char> yytext = std::vector<char>(1024, '\0'); std::string yytext { };
/// current index in yytext
std::size_t yylen = 0;
/// a description of occurred lexer errors /// a description of occurred lexer errors
const char* error_message = ""; const char* error_message = "";
@ -3073,7 +3047,7 @@ class parser
{ {
return; return;
} }
key = m_lexer.get_string(); key = m_lexer.move_string();
bool keep_tag = false; bool keep_tag = false;
if (keep) if (keep)
@ -3219,7 +3193,7 @@ class parser
case token_type::value_string: case token_type::value_string:
{ {
result.m_type = value_t::string; result.m_type = value_t::string;
result.m_value = m_lexer.get_string(); result.m_value = m_lexer.move_string();
break; break;
} }
@ -5221,7 +5195,7 @@ class binary_reader
@brief get next character from the input @brief get next character from the input
This function provides the interface to the used input adapter. It does This function provides the interface to the used input adapter. It does
not throw in case the input reached EOF, but returns not throw in case the input reached EOF, but returns a -'ve valued
`std::char_traits<char>::eof()` in that case. `std::char_traits<char>::eof()` in that case.
@return character read from the input @return character read from the input

View file

@ -215,7 +215,7 @@ TEST_CASE("parser class")
std::string s = "\"1\""; std::string s = "\"1\"";
s[1] = '\0'; s[1] = '\0';
CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&); CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&);
CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'"); CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"<U+0000>'");
} }
} }

View file

@ -1233,4 +1233,24 @@ TEST_CASE("regression tests")
"[json.exception.type_error.302] type must be array, but is null"); "[json.exception.type_error.302] type must be array, but is null");
} }
} }
SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.")
{
SECTION("example 1")
{
std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" );
json j1, j2, j3;
i1_2_3 >> j1;
i1_2_3 >> j2;
i1_2_3 >> j3;
std::map<std::string,std::string> m1 = j1;
std::map<std::string,std::string> m2 = j2;
int i3 = j3;
CHECK( m1 == ( std::map<std::string,std::string> {{ "first", "one" }} ));
CHECK( m2 == ( std::map<std::string,std::string> {{ "second", "two" }} ));
CHECK( i3 == 3 );
}
}
} }