clean up

2015-02-15 13:35:51 +01:00 · 2015-02-15 13:35:51 +01:00 · 2fc82358ce
commit 2fc82358ce
parent b21bf95620
3 changed files with 759 additions and 779 deletions
--- a/src/json.hpp
+++ b/src/json.hpp
@ -2456,7 +2456,10 @@ class basic_json
            end_of_input
        };
-        inline lexer(const char* s) : m_content(s)
+        /// the char type to use in the lexer
        using lexer_char_t = typename string_t::value_type;
        inline lexer(const typename string_t::value_type* s) : m_content(s)
        {
            m_start = m_cursor = m_content;
            m_limit = m_content + strlen(m_content);
@ -2464,46 +2467,39 @@ class basic_json
        inline lexer() = default;
-        template<typename CharT>
+        inline static string_t to_unicode(const long codepoint)
        inline static std::basic_string<CharT> to_unicode(const long codepoint)
        {
-            std::string result;
+            string_t result;
            if (codepoint <= 0x7f)
            {
-                // 1-byte (ASCII) characters: 0xxxxxxx
+                // 1-byte characters: 0xxxxxxx (ASCII)
-                result.append(1, static_cast<char>(codepoint));
+                result.append(1, static_cast<typename string_t::value_type>(codepoint));
            }
            else if (codepoint <= 0x7ff)
            {
                // 2-byte characters: 110xxxxx 10xxxxxx
-                // the 0xC0 enables the two most significant bits to make this
+                result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
-                // a 2-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else if (codepoint <= 0xffff)
            {
                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
-                // the 0xE0 enables the three most significant bits to make
+                result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
-                // this a 3-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
-                result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else if (codepoint <= 0x10ffff)
            {
                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-                // the 0xF0 enables the four most significant bits to make this
+                result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
-                // a 4-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
-                result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
-                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else
            {
-                throw std::out_of_range("code point is invalid");
+                throw std::out_of_range("code points above 0x10FFFF are invalid");
            }
            return result;
@ -2553,22 +2549,18 @@ class basic_json
        with goto jumps.
        @return the class of the next token read from the buffer
        @todo Unicode support needs to be checked.
        */
        inline token_type scan()
        {
            // pointer for backtracking information
-            const char* m_marker = nullptr;
+            const typename string_t::value_type* m_marker = nullptr;
            while (true)
            {
            // remember the beginning of the token
            m_start = m_cursor;
            {
-                    char yych;
+                lexer_char_t yych;
                unsigned int yyaccept = 0;
                static const unsigned char yybm[] =
                {
@ -2733,7 +2725,7 @@ basic_json_parser_2:
                goto basic_json_parser_5;
 basic_json_parser_3:
                {
-                        continue;
+                    return scan();
                }
 basic_json_parser_4:
                ++m_cursor;
@ -3276,11 +3268,10 @@ basic_json_parser_59:
            }
        }
        }
-        inline std::string get_token() const
+        inline string_t get_token() const
        {
-            return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
+            return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
        }
        /*!
@ -3291,16 +3282,14 @@ basic_json_parser_59:
        from the pointer difference of the two pointers).
        @return string value of current token without opening and closing quotes
        @todo Take care of Unicode.
        */
-        inline std::string get_string() const
+        inline string_t get_string() const
        {
-            std::string result;
+            string_t result;
            result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
            // iterate the result between the quotes
-            for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
+            for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
            {
                // process escaped characters
                if (*i == '\\')
@ -3360,7 +3349,7 @@ basic_json_parser_59:
                            // get code xxxx from \uxxxx
                            auto codepoint = strtol(i + 1, nullptr, 16);
                            // add unicode character(s)
-                            result += to_unicode<char>(codepoint);
+                            result += to_unicode(codepoint);
                            // skip the next four characters (\uxxxx)
                            i += 4;
                            break;
@ -3399,20 +3388,20 @@ basic_json_parser_59:
      private:
        /// the buffer
-        const char* m_content = nullptr;
+        const typename string_t::value_type* m_content = nullptr;
        /// pointer to the beginning of the current symbol
-        const char* m_start = nullptr;
+        const typename string_t::value_type* m_start = nullptr;
        /// pointer to the current symbol
-        const char* m_cursor = nullptr;
+        const typename string_t::value_type* m_cursor = nullptr;
        /// pointer to the end of the buffer
-        const char* m_limit = nullptr;
+        const typename string_t::value_type* m_limit = nullptr;
    };
    class parser
    {
      public:
        /// constructor for strings
-        inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str())
+        inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
        {
            // read first token
            get_token();
@ -3423,7 +3412,7 @@ basic_json_parser_59:
        {
            while (_is)
            {
-                std::string input_line;
+                string_t input_line;
                std::getline(_is, input_line);
                m_buffer += input_line;
            }
@ -3617,7 +3606,7 @@ basic_json_parser_59:
      private:
        /// the buffer
-        std::string m_buffer;
+        string_t m_buffer;
        /// the type of the last read token
        typename lexer::token_type last_token = lexer::token_type::uninitialized;
        /// the lexer
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
@ -2456,7 +2456,10 @@ class basic_json
            end_of_input
        };
-        inline lexer(const char* s) : m_content(s)
+        /// the char type to use in the lexer
        using lexer_char_t = typename string_t::value_type;
        inline lexer(const typename string_t::value_type* s) : m_content(s)
        {
            m_start = m_cursor = m_content;
            m_limit = m_content + strlen(m_content);
@ -2464,46 +2467,39 @@ class basic_json
        inline lexer() = default;
-        template<typename CharT>
+        inline static string_t to_unicode(const long codepoint)
        inline static std::basic_string<CharT> to_unicode(const long codepoint)
        {
-            std::string result;
+            string_t result;
            if (codepoint <= 0x7f)
            {
-                // 1-byte (ASCII) characters: 0xxxxxxx
+                // 1-byte characters: 0xxxxxxx (ASCII)
-                result.append(1, static_cast<char>(codepoint));
+                result.append(1, static_cast<typename string_t::value_type>(codepoint));
            }
            else if (codepoint <= 0x7ff)
            {
                // 2-byte characters: 110xxxxx 10xxxxxx
-                // the 0xC0 enables the two most significant bits to make this
+                result.append(1, static_cast<typename string_t::value_type>(0xC0 | ((codepoint >> 6) & 0x1F)));
-                // a 2-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0xC0 | ((codepoint >> 6) & 0x1F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else if (codepoint <= 0xffff)
            {
                // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
-                // the 0xE0 enables the three most significant bits to make
+                result.append(1, static_cast<typename string_t::value_type>(0xE0 | ((codepoint >> 12) & 0x0F)));
-                // this a 3-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
-                result.append(1, static_cast<CharT>(0xE0 | ((codepoint >> 12) & 0x0F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else if (codepoint <= 0x10ffff)
            {
                // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-                // the 0xF0 enables the four most significant bits to make this
+                result.append(1, static_cast<typename string_t::value_type>(0xF0 | ((codepoint >> 18) & 0x07)));
-                // a 4-byte UTF-8 character
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 12) & 0x3F)));
-                result.append(1, static_cast<CharT>(0xF0 | ((codepoint >> 18) & 0x07)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | ((codepoint >> 6) & 0x3F)));
-                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 12) & 0x3F)));
+                result.append(1, static_cast<typename string_t::value_type>(0x80 | (codepoint & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | ((codepoint >> 6) & 0x3F)));
                result.append(1, static_cast<CharT>(0x80 | (codepoint & 0x3F)));
            }
            else
            {
-                throw std::out_of_range("code point is invalid");
+                throw std::out_of_range("code points above 0x10FFFF are invalid");
            }
            return result;
@ -2557,15 +2553,13 @@ class basic_json
        inline token_type scan()
        {
            // pointer for backtracking information
-            const char* m_marker = nullptr;
+            const typename string_t::value_type* m_marker = nullptr;
            while (true)
            {
            // remember the beginning of the token
            m_start = m_cursor;
            /*!re2c
-                    re2c:define:YYCTYPE  = char;
+                re2c:define:YYCTYPE  = lexer_char_t;
                re2c:define:YYCURSOR = m_cursor;
                re2c:define:YYLIMIT  = m_limit;
                re2c:define:YYMARKER = m_marker;
@ -2576,7 +2570,7 @@ class basic_json
                // whitespace
                ws = [ \t\n\r]+;
-                    ws   { continue; }
+                ws   { return scan(); }
                // structural characters
                "[" { return token_type::begin_array; }
@ -2623,11 +2617,10 @@ class basic_json
                .              { return token_type::parse_error; }
             */
        }
        }
-        inline std::string get_token() const
+        inline string_t get_token() const
        {
-            return std::string(m_start, static_cast<size_t>(m_cursor - m_start));
+            return string_t(m_start, static_cast<size_t>(m_cursor - m_start));
        }
        /*!
@ -2638,16 +2631,14 @@ class basic_json
        from the pointer difference of the two pointers).
        @return string value of current token without opening and closing quotes
        @todo Take care of Unicode.
        */
-        inline std::string get_string() const
+        inline string_t get_string() const
        {
-            std::string result;
+            string_t result;
            result.reserve(static_cast<size_t>(m_cursor - m_start - 2));
            // iterate the result between the quotes
-            for (const char* i = m_start + 1; i < m_cursor - 1; ++i)
+            for (const typename string_t::value_type* i = m_start + 1; i < m_cursor - 1; ++i)
            {
                // process escaped characters
                if (*i == '\\')
@ -2707,7 +2698,7 @@ class basic_json
                            // get code xxxx from \uxxxx
                            auto codepoint = strtol(i + 1, nullptr, 16);
                            // add unicode character(s)
-                            result += to_unicode<char>(codepoint);
+                            result += to_unicode(codepoint);
                            // skip the next four characters (\uxxxx)
                            i += 4;
                            break;
@ -2746,20 +2737,20 @@ class basic_json
      private:
        /// the buffer
-        const char* m_content = nullptr;
+        const typename string_t::value_type* m_content = nullptr;
        /// pointer to the beginning of the current symbol
-        const char* m_start = nullptr;
+        const typename string_t::value_type* m_start = nullptr;
        /// pointer to the current symbol
-        const char* m_cursor = nullptr;
+        const typename string_t::value_type* m_cursor = nullptr;
        /// pointer to the end of the buffer
-        const char* m_limit = nullptr;
+        const typename string_t::value_type* m_limit = nullptr;
    };
    class parser
    {
      public:
        /// constructor for strings
-        inline parser(const std::string& s) : m_buffer(s), m_lexer(m_buffer.c_str())
+        inline parser(const string_t& s) : m_buffer(s), m_lexer(m_buffer.c_str())
        {
            // read first token
            get_token();
@ -2770,7 +2761,7 @@ class basic_json
        {
            while (_is)
            {
-                std::string input_line;
+                string_t input_line;
                std::getline(_is, input_line);
                m_buffer += input_line;
            }
@ -2964,7 +2955,7 @@ class basic_json
      private:
        /// the buffer
-        std::string m_buffer;
+        string_t m_buffer;
        /// the type of the last read token
        typename lexer::token_type last_token = lexer::token_type::uninitialized;
        /// the lexer
--- a/test/unit.cpp
+++ b/test/unit.cpp
@ -5517,8 +5517,8 @@ TEST_CASE("lexer class")
    SECTION("to_unicode")
    {
-        CHECK(json::lexer::to_unicode<char>(0x1F4A9) == "💩");
+        CHECK(json::lexer::to_unicode(0x1F4A9) == "💩");
-        CHECK_THROWS_AS(json::lexer::to_unicode<char>(0x110000), std::out_of_range);
+        CHECK_THROWS_AS(json::lexer::to_unicode(0x200000), std::out_of_range);
    }
 }