🔨 removing unget_character() function from input adapters #834

This commit is contained in:
Niels Lohmann 2018-04-02 21:10:48 +02:00
parent ba6edd5634
commit aa89c5e048
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
4 changed files with 168 additions and 142 deletions

View file

@ -31,19 +31,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly
one non-EOF character for future input. The int_type characters returned
consist of all valid char values as positive values (typically unsigned char),
plus an EOF value outside that range, specified by the value of the function
std::char_traits<char>::eof(). This value is typically -1, but could be any
arbitrary value which is not a valid char value.
std::istream, a buffer, or some other input type. Accepts the return of
exactly one non-EOF character for future input. The int_type characters
returned consist of all valid char values as positive values (typically
unsigned char), plus an EOF value outside that range, specified by the value
of the function std::char_traits<char>::eof(). This value is typically -1, but
could be any arbitrary value which is not a valid char value.
*/
struct input_adapter_protocol
{
/// get a character [0,255] or std::char_traits<char>::eof().
virtual std::char_traits<char>::int_type get_character() = 0;
/// restore the last non-eof() character to input
virtual void unget_character() = 0;
virtual ~input_adapter_protocol() = default;
};
@ -71,34 +69,7 @@ class input_stream_adapter : public input_adapter_protocol
explicit input_stream_adapter(std::istream& i)
: is(i), sb(*i.rdbuf())
{
// skip byte order mark
std::char_traits<char>::int_type c;
if ((c = get_character()) == 0xEF)
{
if ((c = get_character()) == 0xBB)
{
if ((c = get_character()) == 0xBF)
{
return; // Ignore BOM
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xBB');
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xEF');
}
else if (c != std::char_traits<char>::eof())
{
is.unget(); // no byte order mark; process as usual
}
}
{}
// delete because of pointer members
input_stream_adapter(const input_stream_adapter&) = delete;
@ -112,11 +83,6 @@ class input_stream_adapter : public input_adapter_protocol
return sb.sbumpc();
}
void unget_character() override
{
sb.sungetc(); // is.unget() avoided for performance
}
private:
/// the associated input stream
std::istream& is;
@ -128,14 +94,8 @@ class input_buffer_adapter : public input_adapter_protocol
{
public:
input_buffer_adapter(const char* b, const std::size_t l)
: cursor(b), limit(b + l), start(b)
{
// skip byte order mark
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
{
cursor += 3;
}
}
: cursor(b), limit(b + l)
{}
// delete because of pointer members
input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -151,21 +111,11 @@ class input_buffer_adapter : public input_adapter_protocol
return std::char_traits<char>::eof();
}
void unget_character() noexcept override
{
if (JSON_LIKELY(cursor > start))
{
--cursor;
}
}
private:
/// pointer to the current character
const char* cursor;
/// pointer past the last character
const char* limit;
/// pointer to the first character
const char* start;
};
class input_adapter

View file

@ -1081,7 +1081,16 @@ scan_number_done:
std::char_traits<char>::int_type get()
{
++chars_read;
current = ia->get_character();
if (next_unget)
{
// just reset the next_unget variable and work with current
next_unget = false;
}
else
{
current = ia->get_character();
}
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{
token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -1089,13 +1098,20 @@ scan_number_done:
return current;
}
/// unget current character (return it again on next get)
/*!
@brief unget current character (read it again on next get)
We implement unget by setting variable next_unget to true. The input is not
changed - we just simulate ungetting by modifying chars_read and
token_string. The next call to get() will behave as if the unget character
is read again.
*/
void unget()
{
next_unget = true;
--chars_read;
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{
ia->unget_character();
assert(token_string.size() != 0);
token_string.pop_back();
}
@ -1183,8 +1199,43 @@ scan_number_done:
// actual scanner
/////////////////////
/*!
@brief skip the UTF-8 byte order mark
@return true iff there is no BOM or the correct BOM has been skipped
*/
bool skip_bom()
{
if (get() == 0xEF)
{
if (get() == 0xBB and get() == 0xBF)
{
// we completely parsed the BOM
return true;
}
else
{
// after reading 0xEF, an unexpected character followed
return false;
}
}
else
{
// the first character is not the beginning of the BOM; unget it to
// process is later
unget();
return true;
}
}
token_type scan()
{
// initially, skip the BOM
if (chars_read == 0 and not skip_bom())
{
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
return token_type::parse_error;
}
// read next character and ignore whitespace
do
{
@ -1254,6 +1305,9 @@ scan_number_done:
/// the current character
std::char_traits<char>::int_type current = std::char_traits<char>::eof();
/// whether the next get() call should just return current
bool next_unget = false;
/// the number of characters read
std::size_t chars_read = 0;

View file

@ -1604,19 +1604,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface
Produces a stream of std::char_traits<char>::int_type characters from a
std::istream, a buffer, or some other input type. Accepts the return of exactly
one non-EOF character for future input. The int_type characters returned
consist of all valid char values as positive values (typically unsigned char),
plus an EOF value outside that range, specified by the value of the function
std::char_traits<char>::eof(). This value is typically -1, but could be any
arbitrary value which is not a valid char value.
std::istream, a buffer, or some other input type. Accepts the return of
exactly one non-EOF character for future input. The int_type characters
returned consist of all valid char values as positive values (typically
unsigned char), plus an EOF value outside that range, specified by the value
of the function std::char_traits<char>::eof(). This value is typically -1, but
could be any arbitrary value which is not a valid char value.
*/
struct input_adapter_protocol
{
/// get a character [0,255] or std::char_traits<char>::eof().
virtual std::char_traits<char>::int_type get_character() = 0;
/// restore the last non-eof() character to input
virtual void unget_character() = 0;
virtual ~input_adapter_protocol() = default;
};
@ -1644,34 +1642,7 @@ class input_stream_adapter : public input_adapter_protocol
explicit input_stream_adapter(std::istream& i)
: is(i), sb(*i.rdbuf())
{
// skip byte order mark
std::char_traits<char>::int_type c;
if ((c = get_character()) == 0xEF)
{
if ((c = get_character()) == 0xBB)
{
if ((c = get_character()) == 0xBF)
{
return; // Ignore BOM
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xBB');
}
else if (c != std::char_traits<char>::eof())
{
is.unget();
}
is.putback('\xEF');
}
else if (c != std::char_traits<char>::eof())
{
is.unget(); // no byte order mark; process as usual
}
}
{}
// delete because of pointer members
input_stream_adapter(const input_stream_adapter&) = delete;
@ -1685,11 +1656,6 @@ class input_stream_adapter : public input_adapter_protocol
return sb.sbumpc();
}
void unget_character() override
{
sb.sungetc(); // is.unget() avoided for performance
}
private:
/// the associated input stream
std::istream& is;
@ -1701,14 +1667,8 @@ class input_buffer_adapter : public input_adapter_protocol
{
public:
input_buffer_adapter(const char* b, const std::size_t l)
: cursor(b), limit(b + l), start(b)
{
// skip byte order mark
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
{
cursor += 3;
}
}
: cursor(b), limit(b + l)
{}
// delete because of pointer members
input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -1724,21 +1684,11 @@ class input_buffer_adapter : public input_adapter_protocol
return std::char_traits<char>::eof();
}
void unget_character() noexcept override
{
if (JSON_LIKELY(cursor > start))
{
--cursor;
}
}
private:
/// pointer to the current character
const char* cursor;
/// pointer past the last character
const char* limit;
/// pointer to the first character
const char* start;
};
class input_adapter
@ -2923,7 +2873,16 @@ scan_number_done:
std::char_traits<char>::int_type get()
{
++chars_read;
current = ia->get_character();
if (next_unget)
{
// just reset the next_unget variable and work with current
next_unget = false;
}
else
{
current = ia->get_character();
}
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{
token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -2931,13 +2890,20 @@ scan_number_done:
return current;
}
/// unget current character (return it again on next get)
/*!
@brief unget current character (read it again on next get)
We implement unget by setting variable next_unget to true. The input is not
changed - we just simulate ungetting by modifying chars_read and
token_string. The next call to get() will behave as if the unget character
is read again.
*/
void unget()
{
next_unget = true;
--chars_read;
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
{
ia->unget_character();
assert(token_string.size() != 0);
token_string.pop_back();
}
@ -3025,8 +2991,43 @@ scan_number_done:
// actual scanner
/////////////////////
/*!
@brief skip the UTF-8 byte order mark
@return true iff there is no BOM or the correct BOM has been skipped
*/
bool skip_bom()
{
if (get() == 0xEF)
{
if (get() == 0xBB and get() == 0xBF)
{
// we completely parsed the BOM
return true;
}
else
{
// after reading 0xEF, an unexpected character followed
return false;
}
}
else
{
// the first character is not the beginning of the BOM; unget it to
// process is later
unget();
return true;
}
}
token_type scan()
{
// initially, skip the BOM
if (chars_read == 0 and not skip_bom())
{
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
return token_type::parse_error;
}
// read next character and ignore whitespace
do
{
@ -3096,6 +3097,9 @@ scan_number_done:
/// the current character
std::char_traits<char>::int_type current = std::char_traits<char>::eof();
/// whether the next get() call should just return current
bool next_unget = false;
/// the number of characters read
std::size_t chars_read = 0;

View file

@ -798,18 +798,18 @@ TEST_CASE("deserialization")
{
CHECK_THROWS_AS(json::parse(bom), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
"[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_AS(json::parse(std::istringstream(bom)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
"[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
SaxEventLogger l;
CHECK(not json::sax_parse(bom, &l));
CHECK(l.events.size() == 1);
CHECK(l.events == std::vector<std::string>(
{
"parse_error(1)"
"parse_error(4)"
}));
}
@ -836,12 +836,12 @@ TEST_CASE("deserialization")
SECTION("2 byte of BOM")
{
CHECK_THROWS_AS(json::parse(bom.substr(0, 2)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_WITH(json::parse(bom.substr(0, 2)),
"[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 2))), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 2))),
"[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
SaxEventLogger l1, l2;
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 2)), &l1));
@ -849,24 +849,24 @@ TEST_CASE("deserialization")
CHECK(l1.events.size() == 1);
CHECK(l1.events == std::vector<std::string>(
{
"parse_error(1)"
"parse_error(3)"
}));
CHECK(l2.events.size() == 1);
CHECK(l2.events == std::vector<std::string>(
{
"parse_error(1)"
"parse_error(3)"
}));
}
SECTION("1 byte of BOM")
{
CHECK_THROWS_AS(json::parse(bom.substr(0, 1)), json::parse_error&);
CHECK_THROWS_WITH(json::parse(bom),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_WITH(json::parse(bom.substr(0, 1)),
"[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 1))), json::parse_error&);
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 1))),
"[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
SaxEventLogger l1, l2;
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 1)), &l1));
@ -874,12 +874,12 @@ TEST_CASE("deserialization")
CHECK(l1.events.size() == 1);
CHECK(l1.events == std::vector<std::string>(
{
"parse_error(1)"
"parse_error(2)"
}));
CHECK(l2.events.size() == 1);
CHECK(l2.events == std::vector<std::string>(
{
"parse_error(1)"
"parse_error(2)"
}));
}
@ -926,10 +926,28 @@ TEST_CASE("deserialization")
SaxEventLogger l;
CHECK(not json::sax_parse(s + "null", &l));
CHECK(l.events.size() == 1);
CHECK(l.events == std::vector<std::string>(
if (i0 != 0)
{
"parse_error(1)"
}));
CHECK(l.events == std::vector<std::string>(
{
"parse_error(1)"
}));
}
else if (i1 != 0)
{
CHECK(l.events == std::vector<std::string>(
{
"parse_error(2)"
}));
}
else
{
CHECK(l.events == std::vector<std::string>(
{
"parse_error(3)"
}));
}
}
}
}