🔨 removing unget_character() function from input adapters #834

2018-04-02 21:10:48 +02:00 · 2018-04-02 21:10:48 +02:00 · aa89c5e048
commit aa89c5e048
parent ba6edd5634
4 changed files with 168 additions and 142 deletions
--- a/include/nlohmann/detail/input/lexer.hpp
+++ b/include/nlohmann/detail/input/lexer.hpp
@ -1081,7 +1081,16 @@ scan_number_done:
    std::char_traits<char>::int_type get()
    {
        ++chars_read;
-        current = ia->get_character();
+        if (next_unget)
+        {
+            // just reset the next_unget variable and work with current
+            next_unget = false;
+        }
+        else
+        {
+            current = ia->get_character();
+        }
+
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
            token_string.push_back(std::char_traits<char>::to_char_type(current));
@ -1089,13 +1098,20 @@ scan_number_done:
        return current;
    }

-    /// unget current character (return it again on next get)
+    /*!
+    @brief unget current character (read it again on next get)
+
+    We implement unget by setting variable next_unget to true. The input is not
+    changed - we just simulate ungetting by modifying chars_read and
+    token_string. The next call to get() will behave as if the unget character
+    is read again.
+    */
    void unget()
    {
+        next_unget = true;
        --chars_read;
        if (JSON_LIKELY(current != std::char_traits<char>::eof()))
        {
-            ia->unget_character();
            assert(token_string.size() != 0);
            token_string.pop_back();
        }
@ -1183,8 +1199,43 @@ scan_number_done:
    // actual scanner
    /////////////////////

+    /*!
+    @brief skip the UTF-8 byte order mark
+    @return true iff there is no BOM or the correct BOM has been skipped
+    */
+    bool skip_bom()
+    {
+        if (get() == 0xEF)
+        {
+            if (get() == 0xBB and get() == 0xBF)
+            {
+                // we completely parsed the BOM
+                return true;
+            }
+            else
+            {
+                // after reading 0xEF, an unexpected character followed
+                return false;
+            }
+        }
+        else
+        {
+            // the first character is not the beginning of the BOM; unget it to
+            // process is later
+            unget();
+            return true;
+        }
+    }
+
    token_type scan()
    {
+        // initially, skip the BOM
+        if (chars_read == 0 and not skip_bom())
+        {
+            error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
+            return token_type::parse_error;
+        }
+
        // read next character and ignore whitespace
        do
        {
@ -1254,6 +1305,9 @@ scan_number_done:
    /// the current character
    std::char_traits<char>::int_type current = std::char_traits<char>::eof();

+    /// whether the next get() call should just return current
+    bool next_unget = false;
+
    /// the number of characters read
    std::size_t chars_read = 0;