🔨 refactored some scanner internals

2017-07-28 19:18:50 +02:00 · 2017-07-28 19:18:50 +02:00 · c819a2d732
commit c819a2d732
parent 5851daa576
2 changed files with 99 additions and 363 deletions
--- a/doc/images/scanner.png
+++ b/doc/images/scanner.png
--- a/src/json.hpp
+++ b/src/json.hpp
@ -148,7 +148,6 @@ class basic_json;
    AllocatorType, JSONSerializer>
 /*!
@brief unnamed namespace with internal helper functions
@ -665,8 +664,7 @@ struct external_constructor<value_t::object>
    template<typename BasicJsonType, typename CompatibleObjectType,
             enable_if_t<not std::is_same<CompatibleObjectType,
-                                          typename BasicJsonType::object_t>::value,
+                                          typename BasicJsonType::object_t>::value, int> = 0>
                         int> = 0>
    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
    {
        using std::begin;
@ -1118,9 +1116,7 @@ void from_json(const BasicJsonType& j, CompatibleObjectType& obj)
        std::inserter(obj, obj.begin()),
        [](typename BasicJsonType::object_t::value_type const & p)
    {
-        return value_type(
+        return value_type(p.first, p.second.template get<typename CompatibleObjectType::mapped_type>());
                   p.first, p.second
                   .template get<typename CompatibleObjectType::mapped_type>());
    });
 }
@ -1264,7 +1260,7 @@ struct input_adapter_protocol
 using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
 /// input adapter for cached stream input
-template<std::size_t N>
+template<std::size_t BufferSize>
 class cached_input_stream_adapter : public input_adapter_protocol
 {
  public:
@ -1368,14 +1364,14 @@ class cached_input_stream_adapter : public input_adapter_protocol
    const std::streampos start_position;
    /// internal buffer
-    std::array<char, N> buffer{{}};
+    std::array<char, BufferSize> buffer{{}};
 };
 /// input adapter for buffer input
 class input_buffer_adapter : public input_adapter_protocol
 {
  public:
-    input_buffer_adapter(const char* b, std::size_t l)
+    input_buffer_adapter(const char* b, const std::size_t l)
        : cursor(b), limit(b + l), start(b)
    {
        // skip byte order mark
@ -1456,8 +1452,7 @@ class input_adapter
    /// input adapter for iterator range with contiguous storage
    template<class IteratorType,
             typename std::enable_if<
-                 std::is_same<typename std::iterator_traits<
+                 std::is_same<typename std::iterator_traits<IteratorType>::iterator_category,
                                  IteratorType>::iterator_category,
                              std::random_access_iterator_tag>::value,
                 int>::type = 0>
    input_adapter(IteratorType first, IteratorType last)
@ -1485,7 +1480,7 @@ class input_adapter
        }
        else
        {
-            // the address of first cannot be used - use nullptr
+            // the address of first cannot be used: use nullptr
            ia = std::make_shared<input_buffer_adapter>(nullptr, len);
        }
    }
@ -1501,9 +1496,7 @@ class input_adapter
        typename std::enable_if <
            not std::is_pointer<ContiguousContainer>::value and
            std::is_base_of<std::random_access_iterator_tag,
-                            typename std::iterator_traits<decltype(std::begin(
+                            typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
                                        std::declval<ContiguousContainer const>()))>::
                            iterator_category>::value,
            int >::type = 0 >
    input_adapter(const ContiguousContainer& c)
        : input_adapter(std::begin(c), std::end(c)) {}
@ -1629,7 +1622,17 @@ class lexer
    /*!
    @brief get codepoint from 4 hex characters following `\u`
-    @return codepoint or -1 in case of an error (e.g. EOF or non-hex character)
+    For input "\u c1 c2 c3 c4" the codepoint is:
      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
    between the ASCII value of the character and the desired integer value.
    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
            non-hex character)
    */
    int get_codepoint()
    {
@ -1637,249 +1640,62 @@ class lexer
        assert(current == 'u');
        int codepoint = 0;
-        // byte 1: \uXxxx
+        for (int factor = 12; factor >= 0; factor -= 4)
-        switch (get())
+        {
            get();
            if (current >= '0' and current <= '9')
            {
                codepoint += ((current - 0x30) << factor);
            }
            else if (current >= 'A' and current <= 'F')
            {
                codepoint += ((current - 0x37) << factor);
            }
            else if (current >= 'a' and current <= 'f')
            {
                codepoint += ((current - 0x57) << factor);
            }
            else
            {
            case '0':
                break;
            case '1':
                codepoint += 0x1000;
                break;
            case '2':
                codepoint += 0x2000;
                break;
            case '3':
                codepoint += 0x3000;
                break;
            case '4':
                codepoint += 0x4000;
                break;
            case '5':
                codepoint += 0x5000;
                break;
            case '6':
                codepoint += 0x6000;
                break;
            case '7':
                codepoint += 0x7000;
                break;
            case '8':
                codepoint += 0x8000;
                break;
            case '9':
                codepoint += 0x9000;
                break;
            case 'A':
            case 'a':
                codepoint += 0xa000;
                break;
            case 'B':
            case 'b':
                codepoint += 0xb000;
                break;
            case 'C':
            case 'c':
                codepoint += 0xc000;
                break;
            case 'D':
            case 'd':
                codepoint += 0xd000;
                break;
            case 'E':
            case 'e':
                codepoint += 0xe000;
                break;
            case 'F':
            case 'f':
                codepoint += 0xf000;
                break;
            default:
                return -1;
            }
        // byte 2: \uxXxx
        switch (get())
        {
            case '0':
                break;
            case '1':
                codepoint += 0x0100;
                break;
            case '2':
                codepoint += 0x0200;
                break;
            case '3':
                codepoint += 0x0300;
                break;
            case '4':
                codepoint += 0x0400;
                break;
            case '5':
                codepoint += 0x0500;
                break;
            case '6':
                codepoint += 0x0600;
                break;
            case '7':
                codepoint += 0x0700;
                break;
            case '8':
                codepoint += 0x0800;
                break;
            case '9':
                codepoint += 0x0900;
                break;
            case 'A':
            case 'a':
                codepoint += 0x0a00;
                break;
            case 'B':
            case 'b':
                codepoint += 0x0b00;
                break;
            case 'C':
            case 'c':
                codepoint += 0x0c00;
                break;
            case 'D':
            case 'd':
                codepoint += 0x0d00;
                break;
            case 'E':
            case 'e':
                codepoint += 0x0e00;
                break;
            case 'F':
            case 'f':
                codepoint += 0x0f00;
                break;
            default:
                return -1;
        }
        // byte 3: \uxxXx
        switch (get())
        {
            case '0':
                break;
            case '1':
                codepoint += 0x0010;
                break;
            case '2':
                codepoint += 0x0020;
                break;
            case '3':
                codepoint += 0x0030;
                break;
            case '4':
                codepoint += 0x0040;
                break;
            case '5':
                codepoint += 0x0050;
                break;
            case '6':
                codepoint += 0x0060;
                break;
            case '7':
                codepoint += 0x0070;
                break;
            case '8':
                codepoint += 0x0080;
                break;
            case '9':
                codepoint += 0x0090;
                break;
            case 'A':
            case 'a':
                codepoint += 0x00a0;
                break;
            case 'B':
            case 'b':
                codepoint += 0x00b0;
                break;
            case 'C':
            case 'c':
                codepoint += 0x00c0;
                break;
            case 'D':
            case 'd':
                codepoint += 0x00d0;
                break;
            case 'E':
            case 'e':
                codepoint += 0x00e0;
                break;
            case 'F':
            case 'f':
                codepoint += 0x00f0;
                break;
            default:
                return -1;
        }
        // byte 4: \uxxxX
        switch (get())
        {
            case '0':
                break;
            case '1':
                codepoint += 0x0001;
                break;
            case '2':
                codepoint += 0x0002;
                break;
            case '3':
                codepoint += 0x0003;
                break;
            case '4':
                codepoint += 0x0004;
                break;
            case '5':
                codepoint += 0x0005;
                break;
            case '6':
                codepoint += 0x0006;
                break;
            case '7':
                codepoint += 0x0007;
                break;
            case '8':
                codepoint += 0x0008;
                break;
            case '9':
                codepoint += 0x0009;
                break;
            case 'A':
            case 'a':
                codepoint += 0x000a;
                break;
            case 'B':
            case 'b':
                codepoint += 0x000b;
                break;
            case 'C':
            case 'c':
                codepoint += 0x000c;
                break;
            case 'D':
            case 'd':
                codepoint += 0x000d;
                break;
            case 'E':
            case 'e':
                codepoint += 0x000e;
                break;
            case 'F':
            case 'f':
                codepoint += 0x000f;
                break;
            default:
                return -1;
        }
        assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
        return codepoint;
    }
    /*!
    @brief check if the next byte(s) are inside a given range
    Adds the current byte and, for each passed range, reads a new byte and
    checks if it is inside the range. If a violation was detected, set up an
    error message and return false. Otherwise, return true.
    @return true iff no range violation was detected
    */
    bool next_byte_in_range(std::initializer_list<std::pair<int, int>> ranges)
    {
        add(current);
        for (const auto& range : ranges)
        {
            get();
            if (JSON_LIKELY(range.first <= current and current <= range.second))
            {
                add(current);
            }
            else
            {
                error_message = "invalid string: ill-formed UTF-8 byte";
                return false;
            }
        }
        return true;
    }
    /*!
    @brief scan a string literal
@ -1965,7 +1781,7 @@ class lexer
                        case 'u':
                        {
                            int codepoint;
-                            int codepoint1 = get_codepoint();
+                            const int codepoint1 = get_codepoint();
                            if (JSON_UNLIKELY(codepoint1 == -1))
                            {
@ -2237,37 +2053,22 @@ class lexer
                case 0xde:
                case 0xdf:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not next_byte_in_range({{0x80, 0xBF}})))
                    get();
                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                    {
                        add(current);
                        continue;
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
                case 0xe0:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0xA0, 0xBF}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0xa0 <= current and current <= 0xbf))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            continue;
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
                // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
@ -2286,119 +2087,54 @@ class lexer
                case 0xee:
                case 0xef:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            continue;
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
                case 0xed:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x9F}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0x80 <= current and current <= 0x9f))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            continue;
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
                case 0xf0:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0x90 <= current and current <= 0xbf))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            get();
                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                            {
                                add(current);
                                continue;
                            }
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
                case 0xf1:
                case 0xf2:
                case 0xf3:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            get();
                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                            {
                                add(current);
                                continue;
                            }
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
                case 0xf4:
                {
-                    add(current);
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    get();
                    if (JSON_LIKELY(0x80 <= current and current <= 0x8f))
                    {
                        add(current);
                        get();
                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                        {
                            add(current);
                            get();
                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
                            {
                                add(current);
                                continue;
                            }
                        }
                    }
                    error_message = "invalid string: ill-formed UTF-8 byte";
                        return token_type::parse_error;
                    }
                    break;
                }
                // remaining bytes (80..C1 and F5..FF) are ill-formed
                default: