🔨 removing unget_character() function from input adapters #834

2018-04-02 21:10:48 +02:00 · 2018-04-02 21:10:48 +02:00 · aa89c5e048
commit aa89c5e048
parent ba6edd5634
4 changed files with 168 additions and 142 deletions
--- a/include/nlohmann/detail/input/input_adapters.hpp
+++ b/include/nlohmann/detail/input/input_adapters.hpp
@ -31,19 +31,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface
 Produces a stream of std::char_traits<char>::int_type characters from a
-std::istream, a buffer, or some other input type.  Accepts the return of exactly
+std::istream, a buffer, or some other input type.  Accepts the return of
-one non-EOF character for future input.  The int_type characters returned
+exactly one non-EOF character for future input. The int_type characters
-consist of all valid char values as positive values (typically unsigned char),
+returned consist of all valid char values as positive values (typically
-plus an EOF value outside that range, specified by the value of the function
+unsigned char), plus an EOF value outside that range, specified by the value
-std::char_traits<char>::eof().  This value is typically -1, but could be any
+of the function std::char_traits<char>::eof(). This value is typically -1, but
-arbitrary value which is not a valid char value.
+could be any arbitrary value which is not a valid char value.
 */
 struct input_adapter_protocol
 {
    /// get a character [0,255] or std::char_traits<char>::eof().
    /// Pure virtual: every concrete adapter supplies its own way of reading.
    virtual std::char_traits<char>::int_type get_character() = 0;
    /// restore the last non-eof() character to input
    /// Pure virtual: every concrete adapter supplies its own way of stepping back.
    virtual void unget_character() = 0;
    /// virtual destructor, so that an adapter can be destroyed through a
    /// pointer or reference to this interface
    virtual ~input_adapter_protocol() = default;
 };
@ -71,34 +69,7 @@ class input_stream_adapter : public input_adapter_protocol
    explicit input_stream_adapter(std::istream& i)
        : is(i), sb(*i.rdbuf())
-    {
+    {}
        // skip byte order mark
        std::char_traits<char>::int_type c;
        if ((c = get_character()) == 0xEF)
        {
            if ((c = get_character()) == 0xBB)
            {
                if ((c = get_character()) == 0xBF)
                {
                    return; // Ignore BOM
                }
                else if (c != std::char_traits<char>::eof())
                {
                    is.unget();
                }
                is.putback('\xBB');
            }
            else if (c != std::char_traits<char>::eof())
            {
                is.unget();
            }
            is.putback('\xEF');
        }
        else if (c != std::char_traits<char>::eof())
        {
            is.unget(); // no byte order mark; process as usual
        }
    }
    // delete because of pointer members
    input_stream_adapter(const input_stream_adapter&) = delete;
@ -112,11 +83,6 @@ class input_stream_adapter : public input_adapter_protocol
        return sb.sbumpc();
    }
    /// Restore the last read character by stepping the stream buffer `sb`
    /// back by one position. The return value of sungetc() is not inspected.
    void unget_character() override
    {
        sb.sungetc();  // is.unget() avoided for performance
    }
  private:
    /// the associated input stream
    std::istream& is;
@ -128,14 +94,8 @@ class input_buffer_adapter : public input_adapter_protocol
 {
  public:
    input_buffer_adapter(const char* b, const std::size_t l)
-        : cursor(b), limit(b + l), start(b)
+        : cursor(b), limit(b + l)
-    {
+    {}
        // skip byte order mark
        if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
        {
            cursor += 3;
        }
    }
    // delete because of pointer members
    input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -151,21 +111,11 @@ class input_buffer_adapter : public input_adapter_protocol
        return std::char_traits<char>::eof();
    }
    /// Step the read position back by one character. The cursor is never
    /// moved in front of `start`, so a call at the very beginning is a no-op.
    void unget_character() noexcept override
    {
        if (not JSON_LIKELY(cursor > start))
        {
            // already at the first character: nothing to restore
            return;
        }

        --cursor;
    }
  private:
    /// pointer to the current character
    const char* cursor;
    /// pointer past the last character
    const char* limit;
    /// pointer to the first character
    const char* start;
 };
 class input_adapter
--- a/include/nlohmann/detail/input/lexer.hpp
+++ b/include/nlohmann/detail/input/lexer.hpp
@ -1081,7 +1081,16 @@ scan_number_done:
    std::char_traits<char>::int_type get()
    {
        ++chars_read;
-        current = ia->get_character();
+        if (next_unget)
        {
            // just reset the next_unget variable and work with current
            next_unget = false;
        }
        else
        {
            current = ia->get_character();
        }
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -1089,13 +1098,20 @@ scan_number_done:
        return current;
    }
-    /// unget current character (return it again on next get)
+    /*!
    @brief unget current character (read it again on next get)
    We implement unget by setting variable next_unget to true. The input is not
    changed - we just simulate ungetting by modifying chars_read and
    token_string. The next call to get() will behave as if the unget character
    is read again.
    */
    void unget()
    {
        next_unget = true;
        --chars_read;
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            ia->unget_character();
            assert(token_string.size() != 0);
            token_string.pop_back();
        }
@ -1183,8 +1199,43 @@ scan_number_done:
    // actual scanner
    /////////////////////
    /*!
    @brief consume a leading UTF-8 byte order mark (0xEF 0xBB 0xBF), if present

    The first character is read with get(). If it is not 0xEF, it is handed
    back with unget() so that it is processed by the regular scanner. If it is
    0xEF, the next two characters must be 0xBB and 0xBF.

    @return false iff the input starts with 0xEF, but is not followed by
            0xBB 0xBF; true if there is no BOM or a complete BOM was skipped
    */
    bool skip_bom()
    {
        if (get() != 0xEF)
        {
            // the first character is not the beginning of the BOM; unget it
            // to process it later
            unget();
            return true;
        }

        // 0xEF has been read: only the complete sequence is accepted; the
        // third character is only read if the second one was 0xBB
        return get() == 0xBB and get() == 0xBF;
    }
    token_type scan()
    {
        // initially, skip the BOM
        if (chars_read == 0 and not skip_bom())
        {
            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
            return token_type::parse_error;
        }
        // read next character and ignore whitespace
        do
        {
@ -1254,6 +1305,9 @@ scan_number_done:
    /// the current character
    std::char_traits<char>::int_type current = std::char_traits<char>::eof();
    /// whether the next get() call should just return current
    bool next_unget = false;
    /// the number of characters read
    std::size_t chars_read = 0;
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@ -1604,19 +1604,17 @@ enum class input_format_t { json, cbor, msgpack, ubjson };
@brief abstract input adapter interface
 Produces a stream of std::char_traits<char>::int_type characters from a
-std::istream, a buffer, or some other input type.  Accepts the return of exactly
+std::istream, a buffer, or some other input type.  Accepts the return of
-one non-EOF character for future input.  The int_type characters returned
+exactly one non-EOF character for future input. The int_type characters
-consist of all valid char values as positive values (typically unsigned char),
+returned consist of all valid char values as positive values (typically
-plus an EOF value outside that range, specified by the value of the function
+unsigned char), plus an EOF value outside that range, specified by the value
-std::char_traits<char>::eof().  This value is typically -1, but could be any
+of the function std::char_traits<char>::eof(). This value is typically -1, but
-arbitrary value which is not a valid char value.
+could be any arbitrary value which is not a valid char value.
 */
 struct input_adapter_protocol
 {
    /// get a character [0,255] or std::char_traits<char>::eof().
    /// Pure virtual: every concrete adapter supplies its own way of reading.
    virtual std::char_traits<char>::int_type get_character() = 0;
    /// restore the last non-eof() character to input
    /// Pure virtual: every concrete adapter supplies its own way of stepping back.
    virtual void unget_character() = 0;
    /// virtual destructor, so that an adapter can be destroyed through a
    /// pointer or reference to this interface
    virtual ~input_adapter_protocol() = default;
 };
@ -1644,34 +1642,7 @@ class input_stream_adapter : public input_adapter_protocol
    explicit input_stream_adapter(std::istream& i)
        : is(i), sb(*i.rdbuf())
-    {
+    {}
        // skip byte order mark
        std::char_traits<char>::int_type c;
        if ((c = get_character()) == 0xEF)
        {
            if ((c = get_character()) == 0xBB)
            {
                if ((c = get_character()) == 0xBF)
                {
                    return; // Ignore BOM
                }
                else if (c != std::char_traits<char>::eof())
                {
                    is.unget();
                }
                is.putback('\xBB');
            }
            else if (c != std::char_traits<char>::eof())
            {
                is.unget();
            }
            is.putback('\xEF');
        }
        else if (c != std::char_traits<char>::eof())
        {
            is.unget(); // no byte order mark; process as usual
        }
    }
    // delete because of pointer members
    input_stream_adapter(const input_stream_adapter&) = delete;
@ -1685,11 +1656,6 @@ class input_stream_adapter : public input_adapter_protocol
        return sb.sbumpc();
    }
    /// Restore the last read character by stepping the stream buffer `sb`
    /// back by one position. The return value of sungetc() is not inspected.
    void unget_character() override
    {
        sb.sungetc();  // is.unget() avoided for performance
    }
  private:
    /// the associated input stream
    std::istream& is;
@ -1701,14 +1667,8 @@ class input_buffer_adapter : public input_adapter_protocol
 {
  public:
    input_buffer_adapter(const char* b, const std::size_t l)
-        : cursor(b), limit(b + l), start(b)
+        : cursor(b), limit(b + l)
-    {
+    {}
        // skip byte order mark
        if (l >= 3 and b[0] == '\xEF' and b[1] == '\xBB' and b[2] == '\xBF')
        {
            cursor += 3;
        }
    }
    // delete because of pointer members
    input_buffer_adapter(const input_buffer_adapter&) = delete;
@ -1724,21 +1684,11 @@ class input_buffer_adapter : public input_adapter_protocol
        return std::char_traits<char>::eof();
    }
    /// Step the read position back by one character. The cursor is never
    /// moved in front of `start`, so a call at the very beginning is a no-op.
    void unget_character() noexcept override
    {
        if (not JSON_LIKELY(cursor > start))
        {
            // already at the first character: nothing to restore
            return;
        }

        --cursor;
    }
  private:
    /// pointer to the current character
    const char* cursor;
    /// pointer past the last character
    const char* limit;
    /// pointer to the first character
    const char* start;
 };
 class input_adapter
@ -2923,7 +2873,16 @@ scan_number_done:
    std::char_traits<char>::int_type get()
    {
        ++chars_read;
-        current = ia->get_character();
+        if (next_unget)
        {
            // just reset the next_unget variable and work with current
            next_unget = false;
        }
        else
        {
            current = ia->get_character();
        }
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -2931,13 +2890,20 @@ scan_number_done:
        return current;
    }
-    /// unget current character (return it again on next get)
+    /*!
    @brief unget current character (read it again on next get)
    We implement unget by setting variable next_unget to true. The input is not
    changed - we just simulate ungetting by modifying chars_read and
    token_string. The next call to get() will behave as if the unget character
    is read again.
    */
    void unget()
    {
        next_unget = true;
        --chars_read;
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            ia->unget_character();
            assert(token_string.size() != 0);
            token_string.pop_back();
        }
@ -3025,8 +2991,43 @@ scan_number_done:
    // actual scanner
    /////////////////////
    /*!
    @brief consume a leading UTF-8 byte order mark (0xEF 0xBB 0xBF), if present

    The first character is read with get(). If it is not 0xEF, it is handed
    back with unget() so that it is processed by the regular scanner. If it is
    0xEF, the next two characters must be 0xBB and 0xBF.

    @return false iff the input starts with 0xEF, but is not followed by
            0xBB 0xBF; true if there is no BOM or a complete BOM was skipped
    */
    bool skip_bom()
    {
        if (get() != 0xEF)
        {
            // the first character is not the beginning of the BOM; unget it
            // to process it later
            unget();
            return true;
        }

        // 0xEF has been read: only the complete sequence is accepted; the
        // third character is only read if the second one was 0xBB
        return get() == 0xBB and get() == 0xBF;
    }
    token_type scan()
    {
        // initially, skip the BOM
        if (chars_read == 0 and not skip_bom())
        {
            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
            return token_type::parse_error;
        }
        // read next character and ignore whitespace
        do
        {
@ -3096,6 +3097,9 @@ scan_number_done:
    /// the current character
    std::char_traits<char>::int_type current = std::char_traits<char>::eof();
    /// whether the next get() call should just return current
    bool next_unget = false;
    /// the number of characters read
    std::size_t chars_read = 0;
--- a/test/src/unit-deserialization.cpp
+++ b/test/src/unit-deserialization.cpp
@ -798,18 +798,18 @@ TEST_CASE("deserialization")
        {
            CHECK_THROWS_AS(json::parse(bom), json::parse_error&);
            CHECK_THROWS_WITH(json::parse(bom),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
            CHECK_THROWS_AS(json::parse(std::istringstream(bom)), json::parse_error&);
            CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 4: syntax error - unexpected end of input; expected '[', '{', or a literal");
            SaxEventLogger l;
            CHECK(not json::sax_parse(bom, &l));
            CHECK(l.events.size() == 1);
            CHECK(l.events == std::vector<std::string>(
            {
-                "parse_error(1)"
+                "parse_error(4)"
            }));
        }
@ -836,12 +836,12 @@ TEST_CASE("deserialization")
        SECTION("2 byte of BOM")
        {
            CHECK_THROWS_AS(json::parse(bom.substr(0, 2)), json::parse_error&);
-            CHECK_THROWS_WITH(json::parse(bom),
+            CHECK_THROWS_WITH(json::parse(bom.substr(0, 2)),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
            CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 2))), json::parse_error&);
-            CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
+            CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 2))),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 3: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF\xBB'");
            SaxEventLogger l1, l2;
            CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 2)), &l1));
@ -849,24 +849,24 @@ TEST_CASE("deserialization")
            CHECK(l1.events.size() == 1);
            CHECK(l1.events == std::vector<std::string>(
            {
-                "parse_error(1)"
+                "parse_error(3)"
            }));
            CHECK(l2.events.size() == 1);
            CHECK(l2.events == std::vector<std::string>(
            {
-                "parse_error(1)"
+                "parse_error(3)"
            }));
        }
        SECTION("1 byte of BOM")
        {
            CHECK_THROWS_AS(json::parse(bom.substr(0, 1)), json::parse_error&);
-            CHECK_THROWS_WITH(json::parse(bom),
+            CHECK_THROWS_WITH(json::parse(bom.substr(0, 1)),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
            CHECK_THROWS_AS(json::parse(std::istringstream(bom.substr(0, 1))), json::parse_error&);
-            CHECK_THROWS_WITH(json::parse(std::istringstream(bom)),
+            CHECK_THROWS_WITH(json::parse(std::istringstream(bom.substr(0, 1))),
-                              "[json.exception.parse_error.101] parse error at 1: syntax error - unexpected end of input; expected '[', '{', or a literal");
+                              "[json.exception.parse_error.101] parse error at 2: syntax error - invalid BOM; must be 0xEF 0xBB 0xBF if given; last read: '\xEF'");
            SaxEventLogger l1, l2;
            CHECK(not json::sax_parse(std::istringstream(bom.substr(0, 1)), &l1));
@ -874,12 +874,12 @@ TEST_CASE("deserialization")
            CHECK(l1.events.size() == 1);
            CHECK(l1.events == std::vector<std::string>(
            {
-                "parse_error(1)"
+                "parse_error(2)"
            }));
            CHECK(l2.events.size() == 1);
            CHECK(l2.events == std::vector<std::string>(
            {
-                "parse_error(1)"
+                "parse_error(2)"
            }));
        }
@ -926,10 +926,28 @@ TEST_CASE("deserialization")
                            SaxEventLogger l;
                            CHECK(not json::sax_parse(s + "null", &l));
                            CHECK(l.events.size() == 1);
-                            CHECK(l.events == std::vector<std::string>(
+
                            if (i0 != 0)
                            {
-                                "parse_error(1)"
+                                CHECK(l.events == std::vector<std::string>(
-                            }));
+                                {
                                    "parse_error(1)"
                                }));
                            }
                            else if (i1 != 0)
                            {
                                CHECK(l.events == std::vector<std::string>(
                                {
                                    "parse_error(2)"
                                }));
                            }
                            else
                            {
                                CHECK(l.events == std::vector<std::string>(
                                {
                                    "parse_error(3)"
                                }));
                            }
                        }
                    }
                }