🔨 refactored some scanner internals

2017-07-28 19:18:50 +02:00 · 2017-07-28 19:18:50 +02:00 · c819a2d732
commit c819a2d732
parent 5851daa576
2 changed files with 99 additions and 363 deletions
--- a/doc/images/scanner.png
+++ b/doc/images/scanner.png
--- a/src/json.hpp
+++ b/src/json.hpp
@ -134,21 +134,20 @@ class basic_json;
 // Ugly macros to avoid uglier copy-paste when specializing basic_json
 // This is only temporary and will be removed in 3.0

-#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                 \
-    template<template<typename, typename, typename...> class ObjectType,  \
-             template<typename, typename...> class ArrayType,             \
+#define NLOHMANN_BASIC_JSON_TPL_DECLARATION                                \
+    template<template<typename, typename, typename...> class ObjectType,   \
+             template<typename, typename...> class ArrayType,              \
             class StringType, class BooleanType, class NumberIntegerType, \
             class NumberUnsignedType, class NumberFloatType,              \
-             template<typename> class AllocatorType,                      \
+             template<typename> class AllocatorType,                       \
             template<typename, typename = void> class JSONSerializer>

-#define NLOHMANN_BASIC_JSON_TPL                                             \
-    basic_json<ObjectType, ArrayType, StringType, BooleanType,              \
-    NumberIntegerType, NumberUnsignedType, NumberFloatType,                 \
+#define NLOHMANN_BASIC_JSON_TPL                                            \
+    basic_json<ObjectType, ArrayType, StringType, BooleanType,             \
+    NumberIntegerType, NumberUnsignedType, NumberFloatType,                \
    AllocatorType, JSONSerializer>


-
 /*!
@brief unnamed namespace with internal helper functions

@ -509,13 +508,13 @@ struct merge_and_renumber;
 template<std::size_t... I1, std::size_t... I2>
 struct merge_and_renumber<index_sequence<I1...>, index_sequence<I2...>>
        : index_sequence < I1..., (sizeof...(I1) + I2)... >
-          { };
+          {};

 template<std::size_t N>
 struct make_index_sequence
    : merge_and_renumber < typename make_index_sequence < N / 2 >::type,
      typename make_index_sequence < N - N / 2 >::type >
-{ };
+{};

 template<> struct make_index_sequence<0> : index_sequence<> { };
 template<> struct make_index_sequence<1> : index_sequence<0> { };
@ -665,8 +664,7 @@ struct external_constructor<value_t::object>

    template<typename BasicJsonType, typename CompatibleObjectType,
             enable_if_t<not std::is_same<CompatibleObjectType,
-                                          typename BasicJsonType::object_t>::value,
-                         int> = 0>
+                                          typename BasicJsonType::object_t>::value, int> = 0>
    static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
    {
        using std::begin;
@ -1118,9 +1116,7 @@ void from_json(const BasicJsonType& j, CompatibleObjectType& obj)
        std::inserter(obj, obj.begin()),
        [](typename BasicJsonType::object_t::value_type const & p)
    {
-        return value_type(
-                   p.first, p.second
-                   .template get<typename CompatibleObjectType::mapped_type>());
+        return value_type(p.first, p.second.template get<typename CompatibleObjectType::mapped_type>());
    });
 }

@ -1264,7 +1260,7 @@ struct input_adapter_protocol
 using input_adapter_t = std::shared_ptr<input_adapter_protocol>;

 /// input adapter for cached stream input
-template<std::size_t N>
+template<std::size_t BufferSize>
 class cached_input_stream_adapter : public input_adapter_protocol
 {
  public:
@ -1368,14 +1364,14 @@ class cached_input_stream_adapter : public input_adapter_protocol
    const std::streampos start_position;

    /// internal buffer
-    std::array<char, N> buffer{{}};
+    std::array<char, BufferSize> buffer{{}};
 };

 /// input adapter for buffer input
 class input_buffer_adapter : public input_adapter_protocol
 {
  public:
-    input_buffer_adapter(const char* b, std::size_t l)
+    input_buffer_adapter(const char* b, const std::size_t l)
        : cursor(b), limit(b + l), start(b)
    {
        // skip byte order mark
@ -1456,8 +1452,7 @@ class input_adapter
    /// input adapter for iterator range with contiguous storage
    template<class IteratorType,
             typename std::enable_if<
-                 std::is_same<typename std::iterator_traits<
-                                  IteratorType>::iterator_category,
+                 std::is_same<typename std::iterator_traits<IteratorType>::iterator_category,
                              std::random_access_iterator_tag>::value,
                 int>::type = 0>
    input_adapter(IteratorType first, IteratorType last)
@ -1485,7 +1480,7 @@ class input_adapter
        }
        else
        {
-            // the address of first cannot be used - use nullptr
+            // the address of first cannot be used: use nullptr
            ia = std::make_shared<input_buffer_adapter>(nullptr, len);
        }
    }
@ -1501,9 +1496,7 @@ class input_adapter
        typename std::enable_if <
            not std::is_pointer<ContiguousContainer>::value and
            std::is_base_of<std::random_access_iterator_tag,
-                            typename std::iterator_traits<decltype(std::begin(
-                                        std::declval<ContiguousContainer const>()))>::
-                            iterator_category>::value,
+                            typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
            int >::type = 0 >
    input_adapter(const ContiguousContainer& c)
        : input_adapter(std::begin(c), std::end(c)) {}
@ -1629,7 +1622,17 @@ class lexer
    /*!
    @brief get codepoint from 4 hex characters following `\u`

-    @return codepoint or -1 in case of an error (e.g. EOF or non-hex character)
+    For input "\u c1 c2 c3 c4" the codepoint is:
+      (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
+    = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
+
+    Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
+    must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
+    conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
+    between the ASCII value of the character and the desired integer value.
+
+    @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
+            non-hex character)
    */
    int get_codepoint()
    {
@ -1637,249 +1640,62 @@ class lexer
        assert(current == 'u');
        int codepoint = 0;

-        // byte 1: \uXxxx
-        switch (get())
+        for (int factor = 12; factor >= 0; factor -= 4)
        {
-            case '0':
-                break;
-            case '1':
-                codepoint += 0x1000;
-                break;
-            case '2':
-                codepoint += 0x2000;
-                break;
-            case '3':
-                codepoint += 0x3000;
-                break;
-            case '4':
-                codepoint += 0x4000;
-                break;
-            case '5':
-                codepoint += 0x5000;
-                break;
-            case '6':
-                codepoint += 0x6000;
-                break;
-            case '7':
-                codepoint += 0x7000;
-                break;
-            case '8':
-                codepoint += 0x8000;
-                break;
-            case '9':
-                codepoint += 0x9000;
-                break;
-            case 'A':
-            case 'a':
-                codepoint += 0xa000;
-                break;
-            case 'B':
-            case 'b':
-                codepoint += 0xb000;
-                break;
-            case 'C':
-            case 'c':
-                codepoint += 0xc000;
-                break;
-            case 'D':
-            case 'd':
-                codepoint += 0xd000;
-                break;
-            case 'E':
-            case 'e':
-                codepoint += 0xe000;
-                break;
-            case 'F':
-            case 'f':
-                codepoint += 0xf000;
-                break;
-            default:
-                return -1;
-        }
-
-        // byte 2: \uxXxx
-        switch (get())
-        {
-            case '0':
-                break;
-            case '1':
-                codepoint += 0x0100;
-                break;
-            case '2':
-                codepoint += 0x0200;
-                break;
-            case '3':
-                codepoint += 0x0300;
-                break;
-            case '4':
-                codepoint += 0x0400;
-                break;
-            case '5':
-                codepoint += 0x0500;
-                break;
-            case '6':
-                codepoint += 0x0600;
-                break;
-            case '7':
-                codepoint += 0x0700;
-                break;
-            case '8':
-                codepoint += 0x0800;
-                break;
-            case '9':
-                codepoint += 0x0900;
-                break;
-            case 'A':
-            case 'a':
-                codepoint += 0x0a00;
-                break;
-            case 'B':
-            case 'b':
-                codepoint += 0x0b00;
-                break;
-            case 'C':
-            case 'c':
-                codepoint += 0x0c00;
-                break;
-            case 'D':
-            case 'd':
-                codepoint += 0x0d00;
-                break;
-            case 'E':
-            case 'e':
-                codepoint += 0x0e00;
-                break;
-            case 'F':
-            case 'f':
-                codepoint += 0x0f00;
-                break;
-            default:
-                return -1;
-        }
-
-        // byte 3: \uxxXx
-        switch (get())
-        {
-            case '0':
-                break;
-            case '1':
-                codepoint += 0x0010;
-                break;
-            case '2':
-                codepoint += 0x0020;
-                break;
-            case '3':
-                codepoint += 0x0030;
-                break;
-            case '4':
-                codepoint += 0x0040;
-                break;
-            case '5':
-                codepoint += 0x0050;
-                break;
-            case '6':
-                codepoint += 0x0060;
-                break;
-            case '7':
-                codepoint += 0x0070;
-                break;
-            case '8':
-                codepoint += 0x0080;
-                break;
-            case '9':
-                codepoint += 0x0090;
-                break;
-            case 'A':
-            case 'a':
-                codepoint += 0x00a0;
-                break;
-            case 'B':
-            case 'b':
-                codepoint += 0x00b0;
-                break;
-            case 'C':
-            case 'c':
-                codepoint += 0x00c0;
-                break;
-            case 'D':
-            case 'd':
-                codepoint += 0x00d0;
-                break;
-            case 'E':
-            case 'e':
-                codepoint += 0x00e0;
-                break;
-            case 'F':
-            case 'f':
-                codepoint += 0x00f0;
-                break;
-            default:
-                return -1;
-        }
-
-        // byte 4: \uxxxX
-        switch (get())
-        {
-            case '0':
-                break;
-            case '1':
-                codepoint += 0x0001;
-                break;
-            case '2':
-                codepoint += 0x0002;
-                break;
-            case '3':
-                codepoint += 0x0003;
-                break;
-            case '4':
-                codepoint += 0x0004;
-                break;
-            case '5':
-                codepoint += 0x0005;
-                break;
-            case '6':
-                codepoint += 0x0006;
-                break;
-            case '7':
-                codepoint += 0x0007;
-                break;
-            case '8':
-                codepoint += 0x0008;
-                break;
-            case '9':
-                codepoint += 0x0009;
-                break;
-            case 'A':
-            case 'a':
-                codepoint += 0x000a;
-                break;
-            case 'B':
-            case 'b':
-                codepoint += 0x000b;
-                break;
-            case 'C':
-            case 'c':
-                codepoint += 0x000c;
-                break;
-            case 'D':
-            case 'd':
-                codepoint += 0x000d;
-                break;
-            case 'E':
-            case 'e':
-                codepoint += 0x000e;
-                break;
-            case 'F':
-            case 'f':
-                codepoint += 0x000f;
-                break;
-            default:
+            get();
+
+            if (current >= '0' and current <= '9')
+            {
+                codepoint += ((current - 0x30) << factor);
+            }
+            else if (current >= 'A' and current <= 'F')
+            {
+                codepoint += ((current - 0x37) << factor);
+            }
+            else if (current >= 'a' and current <= 'f')
+            {
+                codepoint += ((current - 0x57) << factor);
+            }
+            else
+            {
                return -1;
+            }
        }

+        assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
        return codepoint;
    }

+    /*!
+    @brief validate the byte(s) that follow the current one against ranges
+
+    The current byte is added first. Then one further byte is read per entry
+    of @a ranges and added as long as it lies within that entry's inclusive
+    bounds. The first byte outside its bounds sets the error message and
+    stops the check.
+
+    @param[in] ranges  inclusive (lower, upper) bounds, one pair per byte to read
+    @return false as soon as a byte violates its range; true otherwise
+    */
+    bool next_byte_in_range(std::initializer_list<std::pair<int, int>> ranges)
+    {
+        add(current);
+
+        for (auto bounds = ranges.begin(); bounds != ranges.end(); ++bounds)
+        {
+            get();
+            const bool inside = bounds->first <= current and current <= bounds->second;
+            if (JSON_UNLIKELY(not inside))
+            {
+                error_message = "invalid string: ill-formed UTF-8 byte";
+                return false;
+            }
+            add(current);
+        }
+
+        return true;
+    }
+
    /*!
    @brief scan a string literal

@ -1965,7 +1781,7 @@ class lexer
                        case 'u':
                        {
                            int codepoint;
-                            int codepoint1 = get_codepoint();
+                            const int codepoint1 = get_codepoint();

                            if (JSON_UNLIKELY(codepoint1 == -1))
                            {
@ -2237,36 +2053,21 @@ class lexer
                case 0xde:
                case 0xdf:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
+                    if (JSON_UNLIKELY(not next_byte_in_range({{0x80, 0xBF}})))
                    {
-                        add(current);
-                        continue;
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
                case 0xe0:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0xa0 <= current and current <= 0xbf))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0xA0, 0xBF}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            continue;
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
@ -2286,66 +2087,31 @@ class lexer
                case 0xee:
                case 0xef:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            continue;
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                // U+D000..U+D7FF: bytes ED 80..9F 80..BF
                case 0xed:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x80 <= current and current <= 0x9f))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x9F}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            continue;
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                 // U+10000..U+3FFFF: bytes F0 90..BF 80..BF 80..BF
                case 0xf0:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x90 <= current and current <= 0xbf))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            get();
-                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                            {
-                                add(current);
-                                continue;
-                            }
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
@ -2353,51 +2119,21 @@ class lexer
                case 0xf2:
                case 0xf3:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            get();
-                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                            {
-                                add(current);
-                                continue;
-                            }
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                 // U+100000..U+10FFFF: bytes F4 80..8F 80..BF 80..BF
                case 0xf4:
                {
-                    add(current);
-                    get();
-                    if (JSON_LIKELY(0x80 <= current and current <= 0x8f))
+                    if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF}}))))
                    {
-                        add(current);
-                        get();
-                        if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                        {
-                            add(current);
-                            get();
-                            if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
-                            {
-                                add(current);
-                                continue;
-                            }
-                        }
+                        return token_type::parse_error;
                    }
-
-                    error_message = "invalid string: ill-formed UTF-8 byte";
-                    return token_type::parse_error;
+                    break;
                }

                // remaining bytes (80..C1 and F5..FF) are ill-formed