🔨 removing unget_character() function from input adapters #834

This commit is contained in:
Niels Lohmann 2018-04-02 21:10:48 +02:00
parent ba6edd5634
commit aa89c5e048
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
4 changed files with 168 additions and 142 deletions

View file

@ -31,19 +31,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface @brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly std::istream, a buffer, or some other input type. Accepts the return of
one non-EOF character for future input. The int_type characters returned exactly one non-EOF character for future input. The int_type characters
consist of all valid char values as positive values (typically unsigned char), returned consist of all valid char values as positive values (typically
plus an EOF value outside that range, specified by the value of the function unsigned char), plus an EOF value outside that range, specified by the value
std::char_traits<char>::eof(). This value is typically -1, but could be any of the function std::char_traits<char>::eof(). This value is typically -1, but
arbitrary value which is not a valid char value. could be any arbitrary value which is not a valid char value.
*/ */
struct input_adapter_protocol struct input_adapter_protocol
{ {
/// get a character [0,255] or std::char_traits<char>::eof(). /// get a character [0,255] or std::char_traits<char>::eof().
virtual std::char_traits<char>::int_type get_character() = 0; virtual std::char_traits<char>::int_type get_character() = 0;
/// restore the last non-eof() character to input
virtual void unget_character() = 0;
virtual ~input_adapter_protocol() = default; virtual ~input_adapter_protocol() = default;
}; };
@ -71,34 +69,7 @@ class input_stream_adapter : public input_adapter_protocol
explicit input_stream_adapter(std::istream& i) explicit input_stream_adapter(std::istream& i)
: is(i), sb(*i.rdbuf()) : is(i), sb(*i.rdbuf())
{ {}
// skip byte order mark
std::char_traits<char>::int_type c;
if ((c = get_character()) == 0xEF)
{
if ((c = get_character()) == 0xBB)
{
if ((c = get_character()) == 0xBF)
{
return; // Ignore BOM
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xBB');
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xEF');
}
else if (c != std::char_traits<char>::eof())
{
is.unget(); // no byte order mark; process as usual
}
}
// delete because of pointer members // delete because of pointer members
input_stream_adapter(const input_stream_adapter&) = delete; input_stream_adapter(const input_stream_adapter&) = delete;
@ -112,11 +83,6 @@ class input_stream_adapter : public input_adapter_protocol
return sb.sbumpc(); return sb.sbumpc();
} }
void unget_character() override
{
sb.sungetc(); // is.unget() avoided for performance
}
private: private:
/// the associated input stream /// the associated input stream
std::istream& is; std::istream& is;
@ -128,14 +94,8 @@ class input_buffer_adapter : public input_adapter_protocol
{ {
public: public:
input_buffer_adapter(const char* b, const std::size_t l) input_buffer_adapter(const char* b, const std::size_t l)
: cursor(b), limit(b + l), start(b) : cursor(b), limit(b + l)
{ {}
// skip byte order mark
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
{
cursor += 3;
}
}
// delete because of pointer members // delete because of pointer members
input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -151,21 +111,11 @@ class input_buffer_adapter : public input_adapter_protocol
return std::char_traits<char>::eof(); return std::char_traits<char>::eof();
} }
void unget_character() noexcept override
{
if (JSON_LIKELY(cursor > start))
{
--cursor;
}
}
private: private:
/// pointer to the current character /// pointer to the current character
const char* cursor; const char* cursor;
/// pointer past the last character /// pointer past the last character
const char* limit; const char* limit;
/// pointer to the first character
const char* start;
}; };
class input_adapter class input_adapter

View file

@ -1081,7 +1081,16 @@ scan_number_done:
std::char_traits<char>::int_type get() std::char_traits<char>::int_type get()
{ {
++chars_read; ++chars_read;
current = ia->get_character(); if (next_unget)
{
// just reset the next_unget variable and work with current
next_unget = false;
}
else
{
current = ia->get_character();
}
if (JSON_LIKELY(current != std::char_traits<char>::eof())) if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{ {
token_string.push_back(std::char_traits<char>::to_char_type(current)); token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -1089,13 +1098,20 @@ scan_number_done:
return current; return current;
} }
/// unget current character (return it again on next get) /*!
@brief unget current character (read it again on next get)
We implement unget by setting variable next_unget to true. The input is not
changed - we just simulate ungetting by modifying chars_read and
token_string. The next call to get() will behave as if the ungotten
character were read again.
*/
void unget() void unget()
{ {
next_unget = true;
--chars_read; --chars_read;
if (JSON_LIKELY(current != std::char_traits<char>::eof())) if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{ {
ia->unget_character();
assert(token_string.size() != 0); assert(token_string.size() != 0);
token_string.pop_back(); token_string.pop_back();
} }
@ -1183,8 +1199,43 @@ scan_number_done:
// actual scanner // actual scanner
///////////////////// /////////////////////
/*!
@brief skip the UTF-8 byte order mark
@return true iff there is no BOM or the correct BOM has been skipped
*/
bool skip_bom()
{
if (get() == 0xEF)
{
if (get() == 0xBB and get() == 0xBF)
{
// we completely parsed the BOM
return true;
}
else
{
// after reading 0xEF, an unexpected character followed
return false;
}
}
else
{
// the first character is not the beginning of the BOM; unget it to
// process it later
unget();
return true;
}
}
token_type scan() token_type scan()
{ {
// initially, skip the BOM
if (chars_read == 0 and not skip_bom())
{
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
return token_type::parse_error;
}
// read next character and ignore whitespace // read next character and ignore whitespace
do do
{ {
@ -1254,6 +1305,9 @@ scan_number_done:
/// the current character /// the current character
std::char_traits<char>::int_type current = std::char_traits<char>::eof(); std::char_traits<char>::int_type current = std::char_traits<char>::eof();
/// whether the next get() call should just return current
bool next_unget = false;
/// the number of characters read /// the number of characters read
std::size_t chars_read = 0; std::size_t chars_read = 0;

View file

@ -1604,19 +1604,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface @brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly std::istream, a buffer, or some other input type. Accepts the return of
one non-EOF character for future input. The int_type characters returned exactly one non-EOF character for future input. The int_type characters
consist of all valid char values as positive values (typically unsigned char), returned consist of all valid char values as positive values (typically
plus an EOF value outside that range, specified by the value of the function unsigned char), plus an EOF value outside that range, specified by the value
std::char_traits<char>::eof(). This value is typically -1, but could be any of the function std::char_traits<char>::eof(). This value is typically -1, but
arbitrary value which is not a valid char value. could be any arbitrary value which is not a valid char value.
*/ */
struct input_adapter_protocol struct input_adapter_protocol
{ {
/// get a character [0,255] or std::char_traits<char>::eof(). /// get a character [0,255] or std::char_traits<char>::eof().
virtual std::char_traits<char>::int_type get_character() = 0; virtual std::char_traits<char>::int_type get_character() = 0;
/// restore the last non-eof() character to input
virtual void unget_character() = 0;
virtual ~input_adapter_protocol() = default; virtual ~input_adapter_protocol() = default;
}; };
@ -1644,34 +1642,7 @@ class input_stream_adapter : public input_adapter_protocol
explicit input_stream_adapter(std::istream& i) explicit input_stream_adapter(std::istream& i)
: is(i), sb(*i.rdbuf()) : is(i), sb(*i.rdbuf())
{ {}
// skip byte order mark
std::char_traits<char>::int_type c;
if ((c = get_character()) == 0xEF)
{
if ((c = get_character()) == 0xBB)
{
if ((c = get_character()) == 0xBF)
{
return; // Ignore BOM
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xBB');
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xEF');
}
else if (c != std::char_traits<char>::eof())
{
is.unget(); // no byte order mark; process as usual
}
}
// delete because of pointer members // delete because of pointer members
input_stream_adapter(const input_stream_adapter&) = delete; input_stream_adapter(const input_stream_adapter&) = delete;
@ -1685,11 +1656,6 @@ class input_stream_adapter : public input_adapter_protocol
return sb.sbumpc(); return sb.sbumpc();
} }
void unget_character() override
{
sb.sungetc(); // is.unget() avoided for performance
}
private: private:
/// the associated input stream /// the associated input stream
std::istream& is; std::istream& is;
@ -1701,14 +1667,8 @@ class input_buffer_adapter : public input_adapter_protocol
{ {
public: public:
input_buffer_adapter(const char* b, const std::size_t l) input_buffer_adapter(const char* b, const std::size_t l)
: cursor(b), limit(b + l), start(b) : cursor(b), limit(b + l)
{ {}
// skip byte order mark
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
{
cursor += 3;
}
}
// delete because of pointer members // delete because of pointer members
input_buffer_adapter(const input_buffer_adapter&) = delete; input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -1724,21 +1684,11 @@ class input_buffer_adapter : public input_adapter_protocol
return std::char_traits<char>::eof(); return std::char_traits<char>::eof();
} }
void unget_character() noexcept override
{
if (JSON_LIKELY(cursor > start))
{
--cursor;
}
}
private: private:
/// pointer to the current character /// pointer to the current character
const char* cursor; const char* cursor;
/// pointer past the last character /// pointer past the last character
const char* limit; const char* limit;
/// pointer to the first character
const char* start;
}; };
class input_adapter class input_adapter
@ -2923,7 +2873,16 @@ scan_number_done:
std::char_traits<char>::int_type get() std::char_traits<char>::int_type get()
{ {
++chars_read; ++chars_read;
current = ia->get_character(); if (next_unget)
{
// just reset the next_unget variable and work with current
next_unget = false;
}
else
{
current = ia->get_character();
}
if (JSON_LIKELY(current != std::char_traits<char>::eof())) if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{ {
token_string.push_back(std::char_traits<char>::to_char_type(current)); token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -2931,13 +2890,20 @@ scan_number_done:
return current; return current;
} }
/// unget current character (return it again on next get) /*!
@brief unget current character (read it again on next get)
We implement unget by setting variable next_unget to true. The input is not
changed - we just simulate ungetting by modifying chars_read and
token_string. The next call to get() will behave as if the ungotten
character were read again.
*/
void unget() void unget()
{ {
next_unget = true;
--chars_read; --chars_read;
if (JSON_LIKELY(current != std::char_traits<char>::eof())) if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{ {
ia->unget_character();
assert(token_string.size() != 0); assert(token_string.size() != 0);
token_string.pop_back(); token_string.pop_back();
} }
@ -3025,8 +2991,43 @@ scan_number_done:
// actual scanner // actual scanner
///////////////////// /////////////////////
/*!
@brief skip the UTF-8 byte order mark
@return true iff there is no BOM or the correct BOM has been skipped
*/
bool skip_bom()
{
if (get() == 0xEF)
{
if (get() == 0xBB and get() == 0xBF)
{
// we completely parsed the BOM
return true;
}
else
{
// after reading 0xEF, an unexpected character followed
return false;
}
}
else
{
// the first character is not the beginning of the BOM; unget it to
// process it later
unget();
return true;
}
}
token_type scan() token_type scan()
{ {
// initially, skip the BOM
if (chars_read == 0 and not skip_bom())
{
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
return token_type::parse_error;
}
// read next character and ignore whitespace // read next character and ignore whitespace
do do
{ {
@ -3096,6 +3097,9 @@ scan_number_done:
/// the current character /// the current character
std::char_traits<char>::int_type current = std::char_traits<char>::eof(); std::char_traits<char>::int_type current = std::char_traits<char>::eof();
/// whether the next get() call should just return current
bool next_unget = false;
/// the number of characters read /// the number of characters read
std::size_t chars_read = 0; std::size_t chars_read = 0;

View file

@ -798,18 +798,18 @@ TEST_CASE("deserialization")
{ {
CHECK_THROWS_AS(json::parse(bom), json::parse_error&); CHECK_THROWS_AS(json::parse(bom), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom), CHECK_THROWS_WITH(json::parse(bom),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_AS(json::parse(std::istringstream(bom)), json::parse_error&); CHECK_THROWS_AS(json::parse(std::istringstream(bom)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)), CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
SaxEventLogger l; SaxEventLogger l;
CHECK(not json::sax_parse(bom, &l)); CHECK(not json::sax_parse(bom, &l));
CHECK(l.events.size() == 1); CHECK(l.events.size() == 1);
CHECK(l.events == std::vector<std::string>( CHECK(l.events == std::vector<std::string>(
{ {
"parse_error(1)" "parse_error(4)"
})); }));
} }
@ -836,12 +836,12 @@ TEST_CASE("deserialization")
SECTION("2 byte of BOM") SECTION("2 byte of BOM")
{ {
CHECK_THROWS_AS(json::parse(bom.substr(0, 2)), json::parse_error&); CHECK_THROWS_AS(json::parse(bom.substr(0, 2)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom), CHECK_THROWS_WITH(json::parse(bom.substr(0, 2)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 2))), json::parse_error&); CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 2))), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)), CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 2))),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
SaxEventLogger l1, l2; SaxEventLogger l1, l2;
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 2)), &l1)); CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 2)), &l1));
@ -849,24 +849,24 @@ TEST_CASE("deserialization")
CHECK(l1.events.size() == 1); CHECK(l1.events.size() == 1);
CHECK(l1.events == std::vector<std::string>( CHECK(l1.events == std::vector<std::string>(
{ {
"parse_error(1)" "parse_error(3)"
})); }));
CHECK(l2.events.size() == 1); CHECK(l2.events.size() == 1);
CHECK(l2.events == std::vector<std::string>( CHECK(l2.events == std::vector<std::string>(
{ {
"parse_error(1)" "parse_error(3)"
})); }));
} }
SECTION("1 byte of BOM") SECTION("1 byte of BOM")
{ {
CHECK_THROWS_AS(json::parse(bom.substr(0, 1)), json::parse_error&); CHECK_THROWS_AS(json::parse(bom.substr(0, 1)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom), CHECK_THROWS_WITH(json::parse(bom.substr(0, 1)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 1))), json::parse_error&); CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 1))), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)), CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 1))),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal"); "[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
SaxEventLogger l1, l2; SaxEventLogger l1, l2;
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 1)), &l1)); CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 1)), &l1));
@ -874,12 +874,12 @@ TEST_CASE("deserialization")
CHECK(l1.events.size() == 1); CHECK(l1.events.size() == 1);
CHECK(l1.events == std::vector<std::string>( CHECK(l1.events == std::vector<std::string>(
{ {
"parse_error(1)" "parse_error(2)"
})); }));
CHECK(l2.events.size() == 1); CHECK(l2.events.size() == 1);
CHECK(l2.events == std::vector<std::string>( CHECK(l2.events == std::vector<std::string>(
{ {
"parse_error(1)" "parse_error(2)"
})); }));
} }
@ -926,10 +926,28 @@ TEST_CASE("deserialization")
SaxEventLogger l; SaxEventLogger l;
CHECK(not json::sax_parse(s + "null", &l)); CHECK(not json::sax_parse(s + "null", &l));
CHECK(l.events.size() == 1); CHECK(l.events.size() == 1);
CHECK(l.events == std::vector<std::string>(
if (i0 != 0)
{ {
"parse_error(1)" CHECK(l.events == std::vector<std::string>(
})); {
"parse_error(1)"
}));
}
else if (i1 != 0)
{
CHECK(l.events == std::vector<std::string>(
{
"parse_error(2)"
}));
}
else
{
CHECK(l.events == std::vector<std::string>(
{
"parse_error(3)"
}));
}
} }
} }
} }