Merge pull request #764 from pjkundert/develop-simplify-istream
Simplified istream handing #367
This commit is contained in:
commit
3094640446
7 changed files with 341 additions and 164 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,5 +1,6 @@
|
||||||
json_unit
|
json_unit
|
||||||
json_benchmarks
|
json_benchmarks
|
||||||
|
json_benchmarks_simple
|
||||||
fuzz-testing
|
fuzz-testing
|
||||||
|
|
||||||
*.dSYM
|
*.dSYM
|
||||||
|
|
|
@ -1,11 +1,21 @@
|
||||||
all: json_benchmarks
|
|
||||||
./json_benchmarks
|
|
||||||
|
|
||||||
json_benchmarks: src/benchmarks.cpp ../src/json.hpp number_jsons
|
#
|
||||||
|
# Build/run json.hpp benchmarks, eg. CXX=g++-7 make
|
||||||
|
#
|
||||||
|
# The existing json_benchmarks did not allow optimization under some compilers
|
||||||
|
#
|
||||||
|
all: json_benchmarks json_benchmarks_simple number_jsons
|
||||||
|
bash -c 'time ./json_benchmarks'
|
||||||
|
bash -c 'time ./json_benchmarks_simple'
|
||||||
|
|
||||||
|
json_benchmarks: src/benchmarks.cpp ../src/json.hpp
|
||||||
$(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@
|
$(CXX) -std=c++11 -pthread $(CXXFLAGS) -DNDEBUG -O3 -flto -I thirdparty/benchpress -I thirdparty/cxxopts -I../src src/benchmarks.cpp $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
json_benchmarks_simple: src/benchmarks_simple.cpp ../src/json.hpp
|
||||||
|
$(CXX) -std=c++11 $(CXXFLAGS) -DNDEBUG -O3 -flto -I../src $(<) $(LDFLAGS) -o $@
|
||||||
|
|
||||||
number_jsons:
|
number_jsons:
|
||||||
(test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py)
|
(test -e files/numbers/floats.json -a -e files/numbers/signed_ints.json -a -e files/numbers/unsigned_ints.json) || (cd files/numbers ; python generate.py)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f json_benchmarks files/numbers/*.json
|
rm -f json_benchmarks json_benchmarks_simple files/numbers/*.json
|
||||||
|
|
|
@ -34,6 +34,19 @@ static void bench(benchpress::context& ctx,
|
||||||
{
|
{
|
||||||
// using string streams for benchmarking to factor-out cold-cache disk
|
// using string streams for benchmarking to factor-out cold-cache disk
|
||||||
// access.
|
// access.
|
||||||
|
#if defined( FROMFILE )
|
||||||
|
std::ifstream istr;
|
||||||
|
{
|
||||||
|
istr.open( in_path, std::ifstream::in );
|
||||||
|
|
||||||
|
// read the stream once
|
||||||
|
json j;
|
||||||
|
istr >> j;
|
||||||
|
// clear flags and rewind
|
||||||
|
istr.clear();
|
||||||
|
istr.seekg(0);
|
||||||
|
}
|
||||||
|
#else
|
||||||
std::stringstream istr;
|
std::stringstream istr;
|
||||||
{
|
{
|
||||||
// read file into string stream
|
// read file into string stream
|
||||||
|
@ -43,11 +56,12 @@ static void bench(benchpress::context& ctx,
|
||||||
|
|
||||||
// read the stream once
|
// read the stream once
|
||||||
json j;
|
json j;
|
||||||
j << istr;
|
istr >> j;
|
||||||
// clear flags and rewind
|
// clear flags and rewind
|
||||||
istr.clear();
|
istr.clear();
|
||||||
istr.seekg(0);
|
istr.seekg(0);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
switch (mode)
|
switch (mode)
|
||||||
{
|
{
|
||||||
|
@ -62,7 +76,7 @@ static void bench(benchpress::context& ctx,
|
||||||
istr.clear();
|
istr.clear();
|
||||||
istr.seekg(0);
|
istr.seekg(0);
|
||||||
json j;
|
json j;
|
||||||
j << istr;
|
istr >> j;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
@ -74,7 +88,7 @@ static void bench(benchpress::context& ctx,
|
||||||
{
|
{
|
||||||
// create JSON value from input
|
// create JSON value from input
|
||||||
json j;
|
json j;
|
||||||
j << istr;
|
istr >> j;
|
||||||
std::stringstream ostr;
|
std::stringstream ostr;
|
||||||
|
|
||||||
ctx.reset_timer();
|
ctx.reset_timer();
|
||||||
|
|
158
benchmarks/src/benchmarks_simple.cpp
Normal file
158
benchmarks/src/benchmarks_simple.cpp
Normal file
|
@ -0,0 +1,158 @@
|
||||||
|
//
|
||||||
|
// benchmarks_simple.cpp -- a less complex version of benchmarks.cpp, that better reflects actual performance
|
||||||
|
//
|
||||||
|
// For some reason, the complexity of benchmarks.cpp doesn't allow
|
||||||
|
// the compiler to optimize code using json.hpp effectively. The
|
||||||
|
// exact same tests, with the use of benchpress and cxxopts produces
|
||||||
|
// much faster code, at least under g++.
|
||||||
|
//
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <chrono>
|
||||||
|
#include <list>
|
||||||
|
#include <tuple>
|
||||||
|
|
||||||
|
#include <json.hpp>
|
||||||
|
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
enum class EMode { input, output, indent };
|
||||||
|
|
||||||
|
static double bench(const EMode mode, size_t iters, const std::string& in_path )
|
||||||
|
{
|
||||||
|
// using string streams for benchmarking to factor-out cold-cache disk
|
||||||
|
// access. Define FROMFILE to use file I/O instead.
|
||||||
|
#if defined( FROMFILE )
|
||||||
|
std::ifstream istr;
|
||||||
|
{
|
||||||
|
istr.open( in_path, std::ifstream::in );
|
||||||
|
|
||||||
|
// read the stream once
|
||||||
|
json j;
|
||||||
|
istr >> j;
|
||||||
|
// clear flags and rewind
|
||||||
|
istr.clear();
|
||||||
|
istr.seekg(0);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
std::stringstream istr;
|
||||||
|
{
|
||||||
|
// read file into string stream
|
||||||
|
std::ifstream input_file(in_path);
|
||||||
|
istr << input_file.rdbuf();
|
||||||
|
input_file.close();
|
||||||
|
|
||||||
|
// read the stream once
|
||||||
|
json j;
|
||||||
|
istr >> j;
|
||||||
|
// clear flags and rewind
|
||||||
|
istr.clear();
|
||||||
|
istr.seekg(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
double tps = 0;
|
||||||
|
switch (mode)
|
||||||
|
{
|
||||||
|
// benchmarking input
|
||||||
|
case EMode::input:
|
||||||
|
{
|
||||||
|
auto start = std::chrono::system_clock::now();
|
||||||
|
for (size_t i = 0; i < iters; ++i)
|
||||||
|
{
|
||||||
|
// clear flags and rewind
|
||||||
|
istr.clear();
|
||||||
|
istr.seekg(0);
|
||||||
|
json j;
|
||||||
|
istr >> j;
|
||||||
|
}
|
||||||
|
auto ended = std::chrono::system_clock::now();
|
||||||
|
tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// benchmarking output
|
||||||
|
case EMode::output:
|
||||||
|
case EMode::indent:
|
||||||
|
{
|
||||||
|
// create JSON value from input
|
||||||
|
json j;
|
||||||
|
istr >> j;
|
||||||
|
std::stringstream ostr;
|
||||||
|
|
||||||
|
auto start = std::chrono::system_clock::now();
|
||||||
|
for (size_t i = 0; i < iters; ++i)
|
||||||
|
{
|
||||||
|
if (mode == EMode::indent)
|
||||||
|
{
|
||||||
|
ostr << j;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ostr << std::setw(4) << j;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset data
|
||||||
|
ostr.str(std::string());
|
||||||
|
}
|
||||||
|
auto ended = std::chrono::system_clock::now();
|
||||||
|
tps = 1.0 / std::chrono::duration<double>( ended - start ).count();
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tps;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
struct average {
|
||||||
|
T _sum { 0 };
|
||||||
|
size_t _count { 0 };
|
||||||
|
T operator+=( const T &val_ ) { _sum += val_; +_count++; return val_; }
|
||||||
|
operator T() { return _sum / _count; }
|
||||||
|
};
|
||||||
|
|
||||||
|
// Execute each test approximately enough times to get near 1
|
||||||
|
// transaction per second, and compute the average; a single aggregate
|
||||||
|
// number that gives a performance metric representing both parsing
|
||||||
|
// and output.
|
||||||
|
|
||||||
|
int main( int, char ** )
|
||||||
|
{
|
||||||
|
std::list<std::tuple<std::string, EMode, size_t, std::string>> tests {
|
||||||
|
{ "parse jeopardy.json", EMode::input, 2, "files/jeopardy/jeopardy.json" },
|
||||||
|
{ "parse canada.json", EMode::input, 30, "files/nativejson-benchmark/canada.json" },
|
||||||
|
{ "parse citm_catalog.json", EMode::input, 120, "files/nativejson-benchmark/citm_catalog.json" },
|
||||||
|
{ "parse twitter.json", EMode::input, 225, "files/nativejson-benchmark/twitter.json" },
|
||||||
|
{ "parse floats.json", EMode::input, 5, "files/numbers/floats.json" },
|
||||||
|
{ "parse signed_ints.json", EMode::input, 6, "files/numbers/signed_ints.json" },
|
||||||
|
{ "parse unsigned_ints.json", EMode::input, 6, "files/numbers/unsigned_ints.json" },
|
||||||
|
{ "dump jeopardy.json", EMode::output, 5, "files/jeopardy/jeopardy.json" },
|
||||||
|
{ "dump jeopardy.json w/ind.", EMode::indent, 5, "files/jeopardy/jeopardy.json" },
|
||||||
|
{ "dump floats.json", EMode::output, 2, "files/numbers/floats.json" },
|
||||||
|
{ "dump signed_ints.json", EMode::output, 20, "files/numbers/signed_ints.json" },
|
||||||
|
};
|
||||||
|
|
||||||
|
average<double> avg;
|
||||||
|
for ( auto t : tests ) {
|
||||||
|
std::string name, path;
|
||||||
|
EMode mode;
|
||||||
|
size_t iters;
|
||||||
|
std::tie(name, mode, iters, path) = t;
|
||||||
|
auto tps = bench( mode, iters, path );
|
||||||
|
avg += tps;
|
||||||
|
std::cout
|
||||||
|
<< std::left
|
||||||
|
<< std::setw( 30 ) << name
|
||||||
|
<< std::right
|
||||||
|
<< " x " << std::setw( 3 ) << iters
|
||||||
|
<< std::left
|
||||||
|
<< " == " << std::setw( 10 ) << tps
|
||||||
|
<< std::right
|
||||||
|
<< " TPS, " << std::setw( 8 ) << std::round( tps * 1e6 / iters )
|
||||||
|
<< " ms/op"
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
std::cout << std::setw( 40 ) << "" << std::string( 10, '-' ) << std::endl;
|
||||||
|
std::cout << std::setw( 40 ) << "" << std::setw( 10 ) << std::left << avg << " TPS Average" << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
286
src/json.hpp
286
src/json.hpp
|
@ -1394,123 +1394,97 @@ constexpr T static_const<T>::value;
|
||||||
// input adapters //
|
// input adapters //
|
||||||
////////////////////
|
////////////////////
|
||||||
|
|
||||||
/// abstract input adapter interface
|
/*!
|
||||||
|
@brief abstract input adapter interface
|
||||||
|
|
||||||
|
Produces a stream of std::char_traits<char>::int_type characters from a
|
||||||
|
std::istream, a buffer, or some other input type. Accepts the return of exactly
|
||||||
|
one non-EOF character for future input. The int_type characters returned
|
||||||
|
consist of all valid char values as positive values (typically unsigned char),
|
||||||
|
plus an EOF value outside that range, specified by the value of the function
|
||||||
|
std::char_traits<char>::eof(). This value is typically -1, but could be any
|
||||||
|
arbitrary value which is not a valid char value.
|
||||||
|
|
||||||
|
@return Typically [0,255] plus std::char_traits<char>::eof().
|
||||||
|
*/
|
||||||
struct input_adapter_protocol
|
struct input_adapter_protocol
|
||||||
{
|
{
|
||||||
virtual int get_character() = 0;
|
virtual std::char_traits<char>::int_type get_character() = 0;
|
||||||
virtual std::string read(std::size_t offset, std::size_t length) = 0;
|
virtual void unget_character() = 0; // restore the last non-eof() character to input
|
||||||
virtual ~input_adapter_protocol() = default;
|
virtual ~input_adapter_protocol() = default;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// a type to simplify interfaces
|
/// a type to simplify interfaces
|
||||||
using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
|
using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
|
||||||
|
|
||||||
/// input adapter for cached stream input
|
/// input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at
|
||||||
template<std::size_t BufferSize>
|
/// beginning of input. Does not support changing the underlying std::streambuf
|
||||||
class cached_input_stream_adapter : public input_adapter_protocol
|
/// in mid-input. Maintains underlying std::istream and std::streambuf to
|
||||||
|
/// support subsequent use of standard std::istream operations to process any
|
||||||
|
/// input characters following those used in parsing the JSON input. Clears the
|
||||||
|
/// std::istream flags; any input errors (eg. EOF) will be detected by the first
|
||||||
|
/// subsequent call for input from the std::istream.
|
||||||
|
class input_stream_adapter : public input_adapter_protocol
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
explicit cached_input_stream_adapter(std::istream& i)
|
~input_stream_adapter() override
|
||||||
: is(i), start_position(is.tellg())
|
|
||||||
{
|
{
|
||||||
fill_buffer();
|
// clear stream flags; we use underlying streambuf I/O, do not maintain ifstream flags
|
||||||
|
|
||||||
// skip byte order mark
|
|
||||||
if (fill_size >= 3 and buffer[0] == '\xEF' and buffer[1] == '\xBB' and buffer[2] == '\xBF')
|
|
||||||
{
|
|
||||||
buffer_pos += 3;
|
|
||||||
processed_chars += 3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
~cached_input_stream_adapter() override
|
|
||||||
{
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
|
||||||
// We initially read a lot of characters into the buffer, and we may
|
|
||||||
// not have processed all of them. Therefore, we need to "rewind" the
|
|
||||||
// stream after the last processed char.
|
|
||||||
is.seekg(start_position);
|
|
||||||
is.ignore(static_cast<std::streamsize>(processed_chars));
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
is.clear();
|
||||||
}
|
}
|
||||||
|
explicit input_stream_adapter(std::istream& i)
|
||||||
int get_character() override
|
: is(i)
|
||||||
|
, sb(*i.rdbuf())
|
||||||
{
|
{
|
||||||
// check if refilling is necessary and possible
|
// Ignore Byte Order Mark at start of input
|
||||||
if (buffer_pos == fill_size and not eof)
|
std::char_traits<char>::int_type c;
|
||||||
|
if (( c = get_character() ) == 0xEF )
|
||||||
{
|
{
|
||||||
fill_buffer();
|
if (( c = get_character() ) == 0xBB )
|
||||||
|
|
||||||
// check and remember that filling did not yield new input
|
|
||||||
if (fill_size == 0)
|
|
||||||
{
|
{
|
||||||
eof = true;
|
if (( c = get_character() ) == 0xBF )
|
||||||
return std::char_traits<char>::eof();
|
{
|
||||||
|
return; // Ignore BOM
|
||||||
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
|
{
|
||||||
|
is.unget();
|
||||||
|
}
|
||||||
|
is.putback( '\xBB' );
|
||||||
}
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
// the buffer is ready
|
{
|
||||||
buffer_pos = 0;
|
is.unget();
|
||||||
|
}
|
||||||
|
is.putback( '\xEF' );
|
||||||
|
}
|
||||||
|
else if ( c != std::char_traits<char>::eof() )
|
||||||
|
{
|
||||||
|
is.unget(); // Not BOM. Process as usual.
|
||||||
}
|
}
|
||||||
|
|
||||||
++processed_chars;
|
|
||||||
assert(buffer_pos < buffer.size());
|
|
||||||
return buffer[buffer_pos++] & 0xFF;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string read(std::size_t offset, std::size_t length) override
|
// delete because of pointer members
|
||||||
|
input_stream_adapter(const input_stream_adapter&) = delete;
|
||||||
|
input_stream_adapter& operator=(input_stream_adapter&) = delete;
|
||||||
|
|
||||||
|
// std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
|
||||||
|
// ensure that std::char_traits<char>::eof() and the character 0xff do not
|
||||||
|
// end up as the same value, eg. 0xffffffff.
|
||||||
|
std::char_traits<char>::int_type get_character() override
|
||||||
{
|
{
|
||||||
// create buffer
|
return sb.sbumpc();
|
||||||
std::string result(length, '\0');
|
|
||||||
|
|
||||||
// save stream position
|
|
||||||
const auto current_pos = is.tellg();
|
|
||||||
// save stream flags
|
|
||||||
const auto flags = is.rdstate();
|
|
||||||
|
|
||||||
// clear stream flags
|
|
||||||
is.clear();
|
|
||||||
// set stream position
|
|
||||||
is.seekg(static_cast<std::streamoff>(offset));
|
|
||||||
// read bytes
|
|
||||||
is.read(&result[0], static_cast<std::streamsize>(length));
|
|
||||||
|
|
||||||
// reset stream position
|
|
||||||
is.seekg(current_pos);
|
|
||||||
// reset stream flags
|
|
||||||
is.setstate(flags);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void unget_character() override
|
||||||
|
{
|
||||||
|
sb.sungetc(); // Avoided for performance: is.unget();
|
||||||
|
}
|
||||||
private:
|
private:
|
||||||
void fill_buffer()
|
|
||||||
{
|
|
||||||
// fill
|
|
||||||
is.read(buffer.data(), static_cast<std::streamsize>(buffer.size()));
|
|
||||||
// store number of bytes in the buffer
|
|
||||||
fill_size = static_cast<size_t>(is.gcount());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// the associated input stream
|
/// the associated input stream
|
||||||
std::istream& is;
|
std::istream& is;
|
||||||
|
std::streambuf &sb;
|
||||||
/// chars returned via get_character()
|
|
||||||
std::size_t processed_chars = 0;
|
|
||||||
/// chars processed in the current buffer
|
|
||||||
std::size_t buffer_pos = 0;
|
|
||||||
|
|
||||||
/// whether stream reached eof
|
|
||||||
bool eof = false;
|
|
||||||
/// how many chars have been copied to the buffer by last (re)fill
|
|
||||||
std::size_t fill_size = 0;
|
|
||||||
|
|
||||||
/// position of the stream when we started
|
|
||||||
const std::streampos start_position;
|
|
||||||
|
|
||||||
/// internal buffer
|
|
||||||
std::array<char, BufferSize> buffer{{}};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/// input adapter for buffer input
|
/// input adapter for buffer input
|
||||||
|
@ -1531,21 +1505,22 @@ class input_buffer_adapter : public input_adapter_protocol
|
||||||
input_buffer_adapter(const input_buffer_adapter&) = delete;
|
input_buffer_adapter(const input_buffer_adapter&) = delete;
|
||||||
input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
|
input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
|
||||||
|
|
||||||
int get_character() noexcept override
|
std::char_traits<char>::int_type get_character() noexcept override
|
||||||
{
|
{
|
||||||
if (JSON_LIKELY(cursor < limit))
|
if (JSON_LIKELY(cursor < limit))
|
||||||
{
|
{
|
||||||
return *(cursor++) & 0xFF;
|
return std::char_traits<char>::to_int_type(*(cursor++));
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::char_traits<char>::eof();
|
return std::char_traits<char>::eof();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string read(std::size_t offset, std::size_t length) override
|
void unget_character() noexcept override
|
||||||
{
|
{
|
||||||
// avoid reading too many characters
|
if (JSON_LIKELY(cursor > start))
|
||||||
const auto max_length = static_cast<std::size_t>(limit - start);
|
{
|
||||||
return std::string(start + offset, (std::min)(length, max_length - offset));
|
--cursor;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -1564,11 +1539,11 @@ class input_adapter
|
||||||
|
|
||||||
/// input adapter for input stream
|
/// input adapter for input stream
|
||||||
input_adapter(std::istream& i)
|
input_adapter(std::istream& i)
|
||||||
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
/// input adapter for input stream
|
/// input adapter for input stream
|
||||||
input_adapter(std::istream&& i)
|
input_adapter(std::istream&& i)
|
||||||
: ia(std::make_shared<cached_input_stream_adapter<16384>>(i)) {}
|
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||||
|
|
||||||
/// input adapter for buffer
|
/// input adapter for buffer
|
||||||
template<typename CharT,
|
template<typename CharT,
|
||||||
|
@ -1845,9 +1820,9 @@ class lexer
|
||||||
@brief scan a string literal
|
@brief scan a string literal
|
||||||
|
|
||||||
This function scans a string according to Sect. 7 of RFC 7159. While
|
This function scans a string according to Sect. 7 of RFC 7159. While
|
||||||
scanning, bytes are escaped and copied into buffer yytext. Then the
|
scanning, bytes are escaped and copied into buffer yytext. Then the function
|
||||||
function returns successfully, yytext is null-terminated and yylen
|
returns successfully, yytext is *not* null-terminated (as it may contain \0
|
||||||
contains the number of bytes in the string.
|
bytes), and yytext.size() is the number of bytes in the string.
|
||||||
|
|
||||||
@return token_type::value_string if string could be successfully scanned,
|
@return token_type::value_string if string could be successfully scanned,
|
||||||
token_type::parse_error otherwise
|
token_type::parse_error otherwise
|
||||||
|
@ -1878,9 +1853,6 @@ class lexer
|
||||||
// closing quote
|
// closing quote
|
||||||
case '\"':
|
case '\"':
|
||||||
{
|
{
|
||||||
// terminate yytext
|
|
||||||
add('\0');
|
|
||||||
--yylen;
|
|
||||||
return token_type::value_string;
|
return token_type::value_string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2624,12 +2596,7 @@ scan_number_any2:
|
||||||
scan_number_done:
|
scan_number_done:
|
||||||
// unget the character after the number (we only read it to know that
|
// unget the character after the number (we only read it to know that
|
||||||
// we are done scanning a number)
|
// we are done scanning a number)
|
||||||
--chars_read;
|
unget();
|
||||||
next_unget = true;
|
|
||||||
|
|
||||||
// terminate token
|
|
||||||
add('\0');
|
|
||||||
--yylen;
|
|
||||||
|
|
||||||
char* endptr = nullptr;
|
char* endptr = nullptr;
|
||||||
errno = 0;
|
errno = 0;
|
||||||
|
@ -2640,7 +2607,7 @@ scan_number_done:
|
||||||
const auto x = std::strtoull(yytext.data(), &endptr, 10);
|
const auto x = std::strtoull(yytext.data(), &endptr, 10);
|
||||||
|
|
||||||
// we checked the number format before
|
// we checked the number format before
|
||||||
assert(endptr == yytext.data() + yylen);
|
assert(endptr == yytext.data() + yytext.size());
|
||||||
|
|
||||||
if (errno == 0)
|
if (errno == 0)
|
||||||
{
|
{
|
||||||
|
@ -2656,7 +2623,7 @@ scan_number_done:
|
||||||
const auto x = std::strtoll(yytext.data(), &endptr, 10);
|
const auto x = std::strtoll(yytext.data(), &endptr, 10);
|
||||||
|
|
||||||
// we checked the number format before
|
// we checked the number format before
|
||||||
assert(endptr == yytext.data() + yylen);
|
assert(endptr == yytext.data() + yytext.size());
|
||||||
|
|
||||||
if (errno == 0)
|
if (errno == 0)
|
||||||
{
|
{
|
||||||
|
@ -2673,7 +2640,7 @@ scan_number_done:
|
||||||
strtof(value_float, yytext.data(), &endptr);
|
strtof(value_float, yytext.data(), &endptr);
|
||||||
|
|
||||||
// we checked the number format before
|
// we checked the number format before
|
||||||
assert(endptr == yytext.data() + yylen);
|
assert(endptr == yytext.data() + yytext.size());
|
||||||
|
|
||||||
return token_type::value_float;
|
return token_type::value_float;
|
||||||
}
|
}
|
||||||
|
@ -2702,32 +2669,51 @@ scan_number_done:
|
||||||
// input management
|
// input management
|
||||||
/////////////////////
|
/////////////////////
|
||||||
|
|
||||||
/// reset yytext
|
/// reset yytext; current character is beginning of token
|
||||||
void reset() noexcept
|
void reset() noexcept
|
||||||
{
|
{
|
||||||
yylen = 0;
|
yytext.clear();
|
||||||
start_pos = chars_read - 1;
|
token_string.clear();
|
||||||
|
token_string.push_back(std::char_traits<char>::to_char_type(current));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// get a character from the input
|
/*
|
||||||
int get()
|
@brief get next character from the input
|
||||||
|
|
||||||
|
This function provides the interface to the used input adapter. It does
|
||||||
|
not throw in case the input reached EOF, but returns a
|
||||||
|
`std::char_traits<char>::eof()` in that case. Stores the scanned characters
|
||||||
|
for use in error messages.
|
||||||
|
|
||||||
|
@return character read from the input
|
||||||
|
*/
|
||||||
|
std::char_traits<char>::int_type get()
|
||||||
{
|
{
|
||||||
++chars_read;
|
++chars_read;
|
||||||
return next_unget ? (next_unget = false, current)
|
current = ia->get_character();
|
||||||
: (current = ia->get_character());
|
if (JSON_LIKELY( current != std::char_traits<char>::eof()))
|
||||||
|
{
|
||||||
|
token_string.push_back(std::char_traits<char>::to_char_type(current));
|
||||||
|
}
|
||||||
|
return current;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// unget current character (return it again on next get)
|
||||||
|
void unget()
|
||||||
|
{
|
||||||
|
--chars_read;
|
||||||
|
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
|
||||||
|
{
|
||||||
|
ia->unget_character();
|
||||||
|
assert(token_string.size() != 0);
|
||||||
|
token_string.pop_back();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// add a character to yytext
|
/// add a character to yytext
|
||||||
void add(int c)
|
void add(int c)
|
||||||
{
|
{
|
||||||
// resize yytext if necessary; this condition is deemed unlikely,
|
yytext.push_back(std::char_traits<char>::to_char_type(c));
|
||||||
// because we start with a 1024-byte buffer
|
|
||||||
if (JSON_UNLIKELY((yylen + 1 > yytext.capacity())))
|
|
||||||
{
|
|
||||||
yytext.resize(2 * yytext.capacity(), '\0');
|
|
||||||
}
|
|
||||||
assert(yylen < yytext.size());
|
|
||||||
yytext[yylen++] = static_cast<char>(c);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -2753,12 +2739,10 @@ scan_number_done:
|
||||||
return value_float;
|
return value_float;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// return string value
|
/// return current string value (implicitly resets the token; useful only once)
|
||||||
const std::string get_string()
|
std::string move_string()
|
||||||
{
|
{
|
||||||
// yytext cannot be returned as char*, because it may contain a null
|
return std::move( yytext );
|
||||||
// byte (parsed as "\u0000")
|
|
||||||
return std::string(yytext.data(), yylen);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////
|
/////////////////////
|
||||||
|
@ -2771,22 +2755,16 @@ scan_number_done:
|
||||||
return chars_read;
|
return chars_read;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// return the last read token (for errors only)
|
/// return the last read token (for errors only). Will never contain EOF
|
||||||
|
/// (an arbitrary value that is not a valid char value, often -1), because
|
||||||
|
/// 255 may legitimately occur. May contain NUL, which should be escaped.
|
||||||
std::string get_token_string() const
|
std::string get_token_string() const
|
||||||
{
|
{
|
||||||
// get the raw byte sequence of the last token
|
|
||||||
std::string s = ia->read(start_pos, chars_read - start_pos);
|
|
||||||
|
|
||||||
// escape control characters
|
// escape control characters
|
||||||
std::string result;
|
std::string result;
|
||||||
for (auto c : s)
|
for (auto c : token_string)
|
||||||
{
|
{
|
||||||
if (c == '\0' or c == std::char_traits<char>::eof())
|
if ('\x00' <= c and c <= '\x1f')
|
||||||
{
|
|
||||||
// ignore EOF
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if ('\x00' <= c and c <= '\x1f')
|
|
||||||
{
|
{
|
||||||
// escape control characters
|
// escape control characters
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
|
@ -2883,20 +2861,16 @@ scan_number_done:
|
||||||
detail::input_adapter_t ia = nullptr;
|
detail::input_adapter_t ia = nullptr;
|
||||||
|
|
||||||
/// the current character
|
/// the current character
|
||||||
int current = std::char_traits<char>::eof();
|
std::char_traits<char>::int_type current = std::char_traits<char>::eof();
|
||||||
|
|
||||||
/// whether get() should return the last character again
|
|
||||||
bool next_unget = false;
|
|
||||||
|
|
||||||
/// the number of characters read
|
/// the number of characters read
|
||||||
std::size_t chars_read = 0;
|
std::size_t chars_read = 0;
|
||||||
/// the start position of the current token
|
|
||||||
std::size_t start_pos = 0;
|
/// raw input token string (for error messages)
|
||||||
|
std::vector<char> token_string { };
|
||||||
|
|
||||||
/// buffer for variable-length tokens (numbers, strings)
|
/// buffer for variable-length tokens (numbers, strings)
|
||||||
std::vector<char> yytext = std::vector<char>(1024, '\0');
|
std::string yytext { };
|
||||||
/// current index in yytext
|
|
||||||
std::size_t yylen = 0;
|
|
||||||
|
|
||||||
/// a description of occurred lexer errors
|
/// a description of occurred lexer errors
|
||||||
const char* error_message = "";
|
const char* error_message = "";
|
||||||
|
@ -3073,7 +3047,7 @@ class parser
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
key = m_lexer.get_string();
|
key = m_lexer.move_string();
|
||||||
|
|
||||||
bool keep_tag = false;
|
bool keep_tag = false;
|
||||||
if (keep)
|
if (keep)
|
||||||
|
@ -3219,7 +3193,7 @@ class parser
|
||||||
case token_type::value_string:
|
case token_type::value_string:
|
||||||
{
|
{
|
||||||
result.m_type = value_t::string;
|
result.m_type = value_t::string;
|
||||||
result.m_value = m_lexer.get_string();
|
result.m_value = m_lexer.move_string();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -5221,7 +5195,7 @@ class binary_reader
|
||||||
@brief get next character from the input
|
@brief get next character from the input
|
||||||
|
|
||||||
This function provides the interface to the used input adapter. It does
|
This function provides the interface to the used input adapter. It does
|
||||||
not throw in case the input reached EOF, but returns
|
not throw in case the input reached EOF, but returns a -'ve valued
|
||||||
`std::char_traits<char>::eof()` in that case.
|
`std::char_traits<char>::eof()` in that case.
|
||||||
|
|
||||||
@return character read from the input
|
@return character read from the input
|
||||||
|
|
|
@ -215,7 +215,7 @@ TEST_CASE("parser class")
|
||||||
std::string s = "\"1\"";
|
std::string s = "\"1\"";
|
||||||
s[1] = '\0';
|
s[1] = '\0';
|
||||||
CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&);
|
CHECK_THROWS_AS(json::parse(s.begin(), s.end()), json::parse_error&);
|
||||||
CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"'");
|
CHECK_THROWS_WITH(json::parse(s.begin(), s.end()), "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character must be escaped; last read: '\"<U+0000>'");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1233,4 +1233,24 @@ TEST_CASE("regression tests")
|
||||||
"[json.exception.type_error.302] type must be array, but is null");
|
"[json.exception.type_error.302] type must be array, but is null");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SECTION("issue #367 - Behavior of operator>> should more closely resemble that of built-in overloads.")
|
||||||
|
{
|
||||||
|
SECTION("example 1")
|
||||||
|
{
|
||||||
|
std::istringstream i1_2_3( "{\"first\": \"one\" }{\"second\": \"two\"}3" );
|
||||||
|
json j1, j2, j3;
|
||||||
|
i1_2_3 >> j1;
|
||||||
|
i1_2_3 >> j2;
|
||||||
|
i1_2_3 >> j3;
|
||||||
|
|
||||||
|
std::map<std::string,std::string> m1 = j1;
|
||||||
|
std::map<std::string,std::string> m2 = j2;
|
||||||
|
int i3 = j3;
|
||||||
|
|
||||||
|
CHECK( m1 == ( std::map<std::string,std::string> {{ "first", "one" }} ));
|
||||||
|
CHECK( m2 == ( std::map<std::string,std::string> {{ "second", "two" }} ));
|
||||||
|
CHECK( i3 == 3 );
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue