🔨 removing unget_character() function from input adapters #834
This commit is contained in:
parent
ba6edd5634
commit
aa89c5e048
4 changed files with 168 additions and 142 deletions
|
@ -31,19 +31,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
|
|||
@brief abstract input adapter interface
|
||||
|
||||
Produces a stream of std::char_traits<char>::int_type characters from a
|
||||
std::istream, a buffer, or some other input type. Accepts the return of exactly
|
||||
one non-EOF character for future input. The int_type characters returned
|
||||
consist of all valid char values as positive values (typically unsigned char),
|
||||
plus an EOF value outside that range, specified by the value of the function
|
||||
std::char_traits<char>::eof(). This value is typically -1, but could be any
|
||||
arbitrary value which is not a valid char value.
|
||||
std::istream, a buffer, or some other input type. Accepts the return of
|
||||
exactly one non-EOF character for future input. The int_type characters
|
||||
returned consist of all valid char values as positive values (typically
|
||||
unsigned char), plus an EOF value outside that range, specified by the value
|
||||
of the function std::char_traits<char>::eof(). This value is typically -1, but
|
||||
could be any arbitrary value which is not a valid char value.
|
||||
*/
|
||||
struct input_adapter_protocol
|
||||
{
|
||||
/// get a character [0,255] or std::char_traits<char>::eof().
|
||||
virtual std::char_traits<char>::int_type get_character() = 0;
|
||||
/// restore the last non-eof() character to input
|
||||
virtual void unget_character() = 0;
|
||||
virtual ~input_adapter_protocol() = default;
|
||||
};
|
||||
|
||||
|
@ -71,34 +69,7 @@ class input_stream_adapter : public input_adapter_protocol
|
|||
|
||||
explicit input_stream_adapter(std::istream& i)
|
||||
: is(i), sb(*i.rdbuf())
|
||||
{
|
||||
// skip byte order mark
|
||||
std::char_traits<char>::int_type c;
|
||||
if ((c = get_character()) == 0xEF)
|
||||
{
|
||||
if ((c = get_character()) == 0xBB)
|
||||
{
|
||||
if ((c = get_character()) == 0xBF)
|
||||
{
|
||||
return; // Ignore BOM
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget();
|
||||
}
|
||||
is.putback('\xBB');
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget();
|
||||
}
|
||||
is.putback('\xEF');
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget(); // no byte order mark; process as usual
|
||||
}
|
||||
}
|
||||
{}
|
||||
|
||||
// delete because of pointer members
|
||||
input_stream_adapter(const input_stream_adapter&) = delete;
|
||||
|
@ -112,11 +83,6 @@ class input_stream_adapter : public input_adapter_protocol
|
|||
return sb.sbumpc();
|
||||
}
|
||||
|
||||
void unget_character() override
|
||||
{
|
||||
sb.sungetc(); // is.unget() avoided for performance
|
||||
}
|
||||
|
||||
private:
|
||||
/// the associated input stream
|
||||
std::istream& is;
|
||||
|
@ -128,14 +94,8 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
{
|
||||
public:
|
||||
input_buffer_adapter(const char* b, const std::size_t l)
|
||||
: cursor(b), limit(b + l), start(b)
|
||||
{
|
||||
// skip byte order mark
|
||||
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
|
||||
{
|
||||
cursor += 3;
|
||||
}
|
||||
}
|
||||
: cursor(b), limit(b + l)
|
||||
{}
|
||||
|
||||
// delete because of pointer members
|
||||
input_buffer_adapter(const input_buffer_adapter&) = delete;
|
||||
|
@ -151,21 +111,11 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
return std::char_traits<char>::eof();
|
||||
}
|
||||
|
||||
void unget_character() noexcept override
|
||||
{
|
||||
if (JSON_LIKELY(cursor > start))
|
||||
{
|
||||
--cursor;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/// pointer to the current character
|
||||
const char* cursor;
|
||||
/// pointer past the last character
|
||||
const char* limit;
|
||||
/// pointer to the first character
|
||||
const char* start;
|
||||
};
|
||||
|
||||
class input_adapter
|
||||
|
|
|
@ -1081,7 +1081,16 @@ scan_number_done:
|
|||
std::char_traits<char>::int_type get()
|
||||
{
|
||||
++chars_read;
|
||||
if (next_unget)
|
||||
{
|
||||
// just reset the next_unget variable and work with current
|
||||
next_unget = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
current = ia->get_character();
|
||||
}
|
||||
|
||||
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
|
||||
{
|
||||
token_string.push_back(std::char_traits<char>::to_char_type(current));
|
||||
|
@ -1089,13 +1098,20 @@ scan_number_done:
|
|||
return current;
|
||||
}
|
||||
|
||||
/// unget current character (return it again on next get)
|
||||
/*!
|
||||
@brief unget current character (read it again on next get)
|
||||
|
||||
We implement unget by setting variable next_unget to true. The input is not
|
||||
changed - we just simulate ungetting by modifying chars_read and
|
||||
token_string. The next call to get() will behave as if the unget character
|
||||
is read again.
|
||||
*/
|
||||
void unget()
|
||||
{
|
||||
next_unget = true;
|
||||
--chars_read;
|
||||
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
|
||||
{
|
||||
ia->unget_character();
|
||||
assert(token_string.size() != 0);
|
||||
token_string.pop_back();
|
||||
}
|
||||
|
@ -1183,8 +1199,43 @@ scan_number_done:
|
|||
// actual scanner
|
||||
/////////////////////
|
||||
|
||||
/*!
|
||||
@brief skip the UTF-8 byte order mark (0xEF 0xBB 0xBF)
|
||||
@return true iff there is no BOM or the correct BOM has been skipped; false if 0xEF is not followed by 0xBB 0xBF
|
||||
*/
|
||||
bool skip_bom()
|
||||
{
|
||||
if (get() == 0xEF)
|
||||
{
|
||||
if (get() == 0xBB and get() == 0xBF)
|
||||
{
|
||||
// we completely parsed the BOM
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// after reading 0xEF, an unexpected character followed; the bytes read so far are not ungotten
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// the first character is not the beginning of the BOM; unget it to
|
||||
// process it later
|
||||
unget();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
token_type scan()
|
||||
{
|
||||
// initially, skip the BOM
|
||||
if (chars_read == 0 and not skip_bom())
|
||||
{
|
||||
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
|
||||
return token_type::parse_error;
|
||||
}
|
||||
|
||||
// read next character and ignore whitespace
|
||||
do
|
||||
{
|
||||
|
@ -1254,6 +1305,9 @@ scan_number_done:
|
|||
/// the current character
|
||||
std::char_traits<char>::int_type current = std::char_traits<char>::eof();
|
||||
|
||||
/// whether the next get() call should just return current
|
||||
bool next_unget = false;
|
||||
|
||||
/// the number of characters read
|
||||
std::size_t chars_read = 0;
|
||||
|
||||
|
|
|
@ -1604,19 +1604,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
|
|||
@brief abstract input adapter interface
|
||||
|
||||
Produces a stream of std::char_traits<char>::int_type characters from a
|
||||
std::istream, a buffer, or some other input type. Accepts the return of exactly
|
||||
one non-EOF character for future input. The int_type characters returned
|
||||
consist of all valid char values as positive values (typically unsigned char),
|
||||
plus an EOF value outside that range, specified by the value of the function
|
||||
std::char_traits<char>::eof(). This value is typically -1, but could be any
|
||||
arbitrary value which is not a valid char value.
|
||||
std::istream, a buffer, or some other input type. Accepts the return of
|
||||
exactly one non-EOF character for future input. The int_type characters
|
||||
returned consist of all valid char values as positive values (typically
|
||||
unsigned char), plus an EOF value outside that range, specified by the value
|
||||
of the function std::char_traits<char>::eof(). This value is typically -1, but
|
||||
could be any arbitrary value which is not a valid char value.
|
||||
*/
|
||||
struct input_adapter_protocol
|
||||
{
|
||||
/// get a character [0,255] or std::char_traits<char>::eof().
|
||||
virtual std::char_traits<char>::int_type get_character() = 0;
|
||||
/// restore the last non-eof() character to input
|
||||
virtual void unget_character() = 0;
|
||||
virtual ~input_adapter_protocol() = default;
|
||||
};
|
||||
|
||||
|
@ -1644,34 +1642,7 @@ class input_stream_adapter : public input_adapter_protocol
|
|||
|
||||
explicit input_stream_adapter(std::istream& i)
|
||||
: is(i), sb(*i.rdbuf())
|
||||
{
|
||||
// skip byte order mark
|
||||
std::char_traits<char>::int_type c;
|
||||
if ((c = get_character()) == 0xEF)
|
||||
{
|
||||
if ((c = get_character()) == 0xBB)
|
||||
{
|
||||
if ((c = get_character()) == 0xBF)
|
||||
{
|
||||
return; // Ignore BOM
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget();
|
||||
}
|
||||
is.putback('\xBB');
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget();
|
||||
}
|
||||
is.putback('\xEF');
|
||||
}
|
||||
else if (c != std::char_traits<char>::eof())
|
||||
{
|
||||
is.unget(); // no byte order mark; process as usual
|
||||
}
|
||||
}
|
||||
{}
|
||||
|
||||
// delete because of pointer members
|
||||
input_stream_adapter(const input_stream_adapter&) = delete;
|
||||
|
@ -1685,11 +1656,6 @@ class input_stream_adapter : public input_adapter_protocol
|
|||
return sb.sbumpc();
|
||||
}
|
||||
|
||||
void unget_character() override
|
||||
{
|
||||
sb.sungetc(); // is.unget() avoided for performance
|
||||
}
|
||||
|
||||
private:
|
||||
/// the associated input stream
|
||||
std::istream& is;
|
||||
|
@ -1701,14 +1667,8 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
{
|
||||
public:
|
||||
input_buffer_adapter(const char* b, const std::size_t l)
|
||||
: cursor(b), limit(b + l), start(b)
|
||||
{
|
||||
// skip byte order mark
|
||||
if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
|
||||
{
|
||||
cursor += 3;
|
||||
}
|
||||
}
|
||||
: cursor(b), limit(b + l)
|
||||
{}
|
||||
|
||||
// delete because of pointer members
|
||||
input_buffer_adapter(const input_buffer_adapter&) = delete;
|
||||
|
@ -1724,21 +1684,11 @@ class input_buffer_adapter : public input_adapter_protocol
|
|||
return std::char_traits<char>::eof();
|
||||
}
|
||||
|
||||
void unget_character() noexcept override
|
||||
{
|
||||
if (JSON_LIKELY(cursor > start))
|
||||
{
|
||||
--cursor;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/// pointer to the current character
|
||||
const char* cursor;
|
||||
/// pointer past the last character
|
||||
const char* limit;
|
||||
/// pointer to the first character
|
||||
const char* start;
|
||||
};
|
||||
|
||||
class input_adapter
|
||||
|
@ -2923,7 +2873,16 @@ scan_number_done:
|
|||
std::char_traits<char>::int_type get()
|
||||
{
|
||||
++chars_read;
|
||||
if (next_unget)
|
||||
{
|
||||
// just reset the next_unget variable and work with current
|
||||
next_unget = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
current = ia->get_character();
|
||||
}
|
||||
|
||||
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
|
||||
{
|
||||
token_string.push_back(std::char_traits<char>::to_char_type(current));
|
||||
|
@ -2931,13 +2890,20 @@ scan_number_done:
|
|||
return current;
|
||||
}
|
||||
|
||||
/// unget current character (return it again on next get)
|
||||
/*!
|
||||
@brief unget current character (read it again on next get)
|
||||
|
||||
We implement unget by setting variable next_unget to true. The input is not
|
||||
changed - we just simulate ungetting by modifying chars_read and
|
||||
token_string. The next call to get() will behave as if the unget character
|
||||
is read again.
|
||||
*/
|
||||
void unget()
|
||||
{
|
||||
next_unget = true;
|
||||
--chars_read;
|
||||
if (JSON_LIKELY(current != std::char_traits<char>::eof()))
|
||||
{
|
||||
ia->unget_character();
|
||||
assert(token_string.size() != 0);
|
||||
token_string.pop_back();
|
||||
}
|
||||
|
@ -3025,8 +2991,43 @@ scan_number_done:
|
|||
// actual scanner
|
||||
/////////////////////
|
||||
|
||||
/*!
|
||||
@brief skip the UTF-8 byte order mark (0xEF 0xBB 0xBF)
|
||||
@return true iff there is no BOM or the correct BOM has been skipped; false if 0xEF is not followed by 0xBB 0xBF
|
||||
*/
|
||||
bool skip_bom()
|
||||
{
|
||||
if (get() == 0xEF)
|
||||
{
|
||||
if (get() == 0xBB and get() == 0xBF)
|
||||
{
|
||||
// we completely parsed the BOM
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// after reading 0xEF, an unexpected character followed; the bytes read so far are not ungotten
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// the first character is not the beginning of the BOM; unget it to
|
||||
// process it later
|
||||
unget();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
token_type scan()
|
||||
{
|
||||
// initially, skip the BOM
|
||||
if (chars_read == 0 and not skip_bom())
|
||||
{
|
||||
error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
|
||||
return token_type::parse_error;
|
||||
}
|
||||
|
||||
// read next character and ignore whitespace
|
||||
do
|
||||
{
|
||||
|
@ -3096,6 +3097,9 @@ scan_number_done:
|
|||
/// the current character
|
||||
std::char_traits<char>::int_type current = std::char_traits<char>::eof();
|
||||
|
||||
/// whether the next get() call should just return current
|
||||
bool next_unget = false;
|
||||
|
||||
/// the number of characters read
|
||||
std::size_t chars_read = 0;
|
||||
|
||||
|
|
|
@ -798,18 +798,18 @@ TEST_CASE("deserialization")
|
|||
{
|
||||
CHECK_THROWS_AS(json::parse(bom), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(bom),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
"[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
|
||||
CHECK_THROWS_AS(json::parse(std::istringstream(bom)), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
"[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
|
||||
SaxEventLogger l;
|
||||
CHECK(not json::sax_parse(bom, &l));
|
||||
CHECK(l.events.size() == 1);
|
||||
CHECK(l.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
"parse_error(4)"
|
||||
}));
|
||||
}
|
||||
|
||||
|
@ -836,12 +836,12 @@ TEST_CASE("deserialization")
|
|||
SECTION("2 byte of BOM")
|
||||
{
|
||||
CHECK_THROWS_AS(json::parse(bom.substr(0, 2)), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(bom),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
CHECK_THROWS_WITH(json::parse(bom.substr(0, 2)),
|
||||
"[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
|
||||
|
||||
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 2))), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 2))),
|
||||
"[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
|
||||
|
||||
SaxEventLogger l1, l2;
|
||||
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 2)), &l1));
|
||||
|
@ -849,24 +849,24 @@ TEST_CASE("deserialization")
|
|||
CHECK(l1.events.size() == 1);
|
||||
CHECK(l1.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
"parse_error(3)"
|
||||
}));
|
||||
CHECK(l2.events.size() == 1);
|
||||
CHECK(l2.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
"parse_error(3)"
|
||||
}));
|
||||
}
|
||||
|
||||
SECTION("1 byte of BOM")
|
||||
{
|
||||
CHECK_THROWS_AS(json::parse(bom.substr(0, 1)), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(bom),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
CHECK_THROWS_WITH(json::parse(bom.substr(0, 1)),
|
||||
"[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
|
||||
|
||||
CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 1))), json::parse_error&);
|
||||
CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
|
||||
"[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
|
||||
CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 1))),
|
||||
"[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
|
||||
|
||||
SaxEventLogger l1, l2;
|
||||
CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 1)), &l1));
|
||||
|
@ -874,12 +874,12 @@ TEST_CASE("deserialization")
|
|||
CHECK(l1.events.size() == 1);
|
||||
CHECK(l1.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
"parse_error(2)"
|
||||
}));
|
||||
CHECK(l2.events.size() == 1);
|
||||
CHECK(l2.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
"parse_error(2)"
|
||||
}));
|
||||
}
|
||||
|
||||
|
@ -926,11 +926,29 @@ TEST_CASE("deserialization")
|
|||
SaxEventLogger l;
|
||||
CHECK(not json::sax_parse(s + "null", &l));
|
||||
CHECK(l.events.size() == 1);
|
||||
|
||||
if (i0 != 0)
|
||||
{
|
||||
CHECK(l.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(1)"
|
||||
}));
|
||||
}
|
||||
else if (i1 != 0)
|
||||
{
|
||||
CHECK(l.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(2)"
|
||||
}));
|
||||
}
|
||||
else
|
||||
{
|
||||
CHECK(l.events == std::vector<std::string>(
|
||||
{
|
||||
"parse_error(3)"
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue