🔨 refactored some scanner internals

This commit is contained in:
Niels Lohmann 2017-07-28 19:18:50 +02:00
parent 5851daa576
commit c819a2d732
No known key found for this signature in database
GPG key ID: 7F3CEA63AE251B69
2 changed files with 99 additions and 363 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 645 KiB

View file

@ -148,7 +148,6 @@ class basic_json;
AllocatorType, JSONSerializer> AllocatorType, JSONSerializer>
/*! /*!
@brief unnamed namespace with internal helper functions @brief unnamed namespace with internal helper functions
@ -665,8 +664,7 @@ struct external_constructor<value_t::object>
template<typename BasicJsonType, typename CompatibleObjectType, template<typename BasicJsonType, typename CompatibleObjectType,
enable_if_t<not std::is_same<CompatibleObjectType, enable_if_t<not std::is_same<CompatibleObjectType,
typename BasicJsonType::object_t>::value, typename BasicJsonType::object_t>::value, int> = 0>
int> = 0>
static void construct(BasicJsonType& j, const CompatibleObjectType& obj) static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
{ {
using std::begin; using std::begin;
@ -1118,9 +1116,7 @@ void from_json(const BasicJsonType& j, CompatibleObjectType& obj)
std::inserter(obj, obj.begin()), std::inserter(obj, obj.begin()),
[](typename BasicJsonType::object_t::value_type const & p) [](typename BasicJsonType::object_t::value_type const & p)
{ {
return value_type( return value_type(p.first, p.second.template get<typename CompatibleObjectType::mapped_type>());
p.first, p.second
.template get<typename CompatibleObjectType::mapped_type>());
}); });
} }
@ -1264,7 +1260,7 @@ struct input_adapter_protocol
using input_adapter_t = std::shared_ptr<input_adapter_protocol>; using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
/// input adapter for cached stream input /// input adapter for cached stream input
template<std::size_t N> template<std::size_t BufferSize>
class cached_input_stream_adapter : public input_adapter_protocol class cached_input_stream_adapter : public input_adapter_protocol
{ {
public: public:
@ -1368,14 +1364,14 @@ class cached_input_stream_adapter : public input_adapter_protocol
const std::streampos start_position; const std::streampos start_position;
/// internal buffer /// internal buffer
std::array<char, N> buffer{{}}; std::array<char, BufferSize> buffer{{}};
}; };
/// input adapter for buffer input /// input adapter for buffer input
class input_buffer_adapter : public input_adapter_protocol class input_buffer_adapter : public input_adapter_protocol
{ {
public: public:
input_buffer_adapter(const char* b, std::size_t l) input_buffer_adapter(const char* b, const std::size_t l)
: cursor(b), limit(b + l), start(b) : cursor(b), limit(b + l), start(b)
{ {
// skip byte order mark // skip byte order mark
@ -1456,8 +1452,7 @@ class input_adapter
/// input adapter for iterator range with contiguous storage /// input adapter for iterator range with contiguous storage
template<class IteratorType, template<class IteratorType,
typename std::enable_if< typename std::enable_if<
std::is_same<typename std::iterator_traits< std::is_same<typename std::iterator_traits<IteratorType>::iterator_category,
IteratorType>::iterator_category,
std::random_access_iterator_tag>::value, std::random_access_iterator_tag>::value,
int>::type = 0> int>::type = 0>
input_adapter(IteratorType first, IteratorType last) input_adapter(IteratorType first, IteratorType last)
@ -1485,7 +1480,7 @@ class input_adapter
} }
else else
{ {
// the address of first cannot be used - use nullptr // the address of first cannot be used: use nullptr
ia = std::make_shared<input_buffer_adapter>(nullptr, len); ia = std::make_shared<input_buffer_adapter>(nullptr, len);
} }
} }
@ -1501,9 +1496,7 @@ class input_adapter
typename std::enable_if < typename std::enable_if <
not std::is_pointer<ContiguousContainer>::value and not std::is_pointer<ContiguousContainer>::value and
std::is_base_of<std::random_access_iterator_tag, std::is_base_of<std::random_access_iterator_tag,
typename std::iterator_traits<decltype(std::begin( typename std::iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
std::declval<ContiguousContainer const>()))>::
iterator_category>::value,
int >::type = 0 > int >::type = 0 >
input_adapter(const ContiguousContainer& c) input_adapter(const ContiguousContainer& c)
: input_adapter(std::begin(c), std::end(c)) {} : input_adapter(std::begin(c), std::end(c)) {}
@ -1629,7 +1622,17 @@ class lexer
/*! /*!
@brief get codepoint from 4 hex characters following `\u` @brief get codepoint from 4 hex characters following `\u`
@return codepoint or -1 in case of an error (e.g. EOF or non-hex character) For input "\u c1 c2 c3 c4" the codepoint is:
(c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
= (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
between the ASCII value of the character and the desired integer value.
@return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
non-hex character)
*/ */
int get_codepoint() int get_codepoint()
{ {
@ -1637,249 +1640,62 @@ class lexer
assert(current == 'u'); assert(current == 'u');
int codepoint = 0; int codepoint = 0;
// byte 1: \uXxxx for (int factor = 12; factor >= 0; factor -= 4)
switch (get()) {
get();
if (current >= '0' and current <= '9')
{
codepoint += ((current - 0x30) << factor);
}
else if (current >= 'A' and current <= 'F')
{
codepoint += ((current - 0x37) << factor);
}
else if (current >= 'a' and current <= 'f')
{
codepoint += ((current - 0x57) << factor);
}
else
{ {
case '0':
break;
case '1':
codepoint += 0x1000;
break;
case '2':
codepoint += 0x2000;
break;
case '3':
codepoint += 0x3000;
break;
case '4':
codepoint += 0x4000;
break;
case '5':
codepoint += 0x5000;
break;
case '6':
codepoint += 0x6000;
break;
case '7':
codepoint += 0x7000;
break;
case '8':
codepoint += 0x8000;
break;
case '9':
codepoint += 0x9000;
break;
case 'A':
case 'a':
codepoint += 0xa000;
break;
case 'B':
case 'b':
codepoint += 0xb000;
break;
case 'C':
case 'c':
codepoint += 0xc000;
break;
case 'D':
case 'd':
codepoint += 0xd000;
break;
case 'E':
case 'e':
codepoint += 0xe000;
break;
case 'F':
case 'f':
codepoint += 0xf000;
break;
default:
return -1; return -1;
} }
// byte 2: \uxXxx
switch (get())
{
case '0':
break;
case '1':
codepoint += 0x0100;
break;
case '2':
codepoint += 0x0200;
break;
case '3':
codepoint += 0x0300;
break;
case '4':
codepoint += 0x0400;
break;
case '5':
codepoint += 0x0500;
break;
case '6':
codepoint += 0x0600;
break;
case '7':
codepoint += 0x0700;
break;
case '8':
codepoint += 0x0800;
break;
case '9':
codepoint += 0x0900;
break;
case 'A':
case 'a':
codepoint += 0x0a00;
break;
case 'B':
case 'b':
codepoint += 0x0b00;
break;
case 'C':
case 'c':
codepoint += 0x0c00;
break;
case 'D':
case 'd':
codepoint += 0x0d00;
break;
case 'E':
case 'e':
codepoint += 0x0e00;
break;
case 'F':
case 'f':
codepoint += 0x0f00;
break;
default:
return -1;
}
// byte 3: \uxxXx
switch (get())
{
case '0':
break;
case '1':
codepoint += 0x0010;
break;
case '2':
codepoint += 0x0020;
break;
case '3':
codepoint += 0x0030;
break;
case '4':
codepoint += 0x0040;
break;
case '5':
codepoint += 0x0050;
break;
case '6':
codepoint += 0x0060;
break;
case '7':
codepoint += 0x0070;
break;
case '8':
codepoint += 0x0080;
break;
case '9':
codepoint += 0x0090;
break;
case 'A':
case 'a':
codepoint += 0x00a0;
break;
case 'B':
case 'b':
codepoint += 0x00b0;
break;
case 'C':
case 'c':
codepoint += 0x00c0;
break;
case 'D':
case 'd':
codepoint += 0x00d0;
break;
case 'E':
case 'e':
codepoint += 0x00e0;
break;
case 'F':
case 'f':
codepoint += 0x00f0;
break;
default:
return -1;
}
// byte 4: \uxxxX
switch (get())
{
case '0':
break;
case '1':
codepoint += 0x0001;
break;
case '2':
codepoint += 0x0002;
break;
case '3':
codepoint += 0x0003;
break;
case '4':
codepoint += 0x0004;
break;
case '5':
codepoint += 0x0005;
break;
case '6':
codepoint += 0x0006;
break;
case '7':
codepoint += 0x0007;
break;
case '8':
codepoint += 0x0008;
break;
case '9':
codepoint += 0x0009;
break;
case 'A':
case 'a':
codepoint += 0x000a;
break;
case 'B':
case 'b':
codepoint += 0x000b;
break;
case 'C':
case 'c':
codepoint += 0x000c;
break;
case 'D':
case 'd':
codepoint += 0x000d;
break;
case 'E':
case 'e':
codepoint += 0x000e;
break;
case 'F':
case 'f':
codepoint += 0x000f;
break;
default:
return -1;
} }
assert(0x0000 <= codepoint and codepoint <= 0xFFFF);
return codepoint; return codepoint;
} }
/*!
@brief append the current byte, then validate and append the bytes after it

The byte held in `current` is added first. Afterwards, one further byte is
read per entry of @a ranges. A byte that lies within its entry's inclusive
bounds (first = lower bound, second = upper bound) is added as well; the
first byte found outside its bounds sets the error message and stops the
check.

@param[in] ranges  inclusive (lower, upper) bounds, one pair per byte to read

@return false as soon as a byte is outside its bounds; true otherwise
*/
bool next_byte_in_range(std::initializer_list<std::pair<int, int>> ranges)
{
    add(current);

    for (auto bounds = ranges.begin(); bounds != ranges.end(); ++bounds)
    {
        get();

        // guard clause: leave on the first byte outside its bounds
        if (not JSON_LIKELY(bounds->first <= current and current <= bounds->second))
        {
            error_message = "invalid string: ill-formed UTF-8 byte";
            return false;
        }

        add(current);
    }

    return true;
}
/*! /*!
@brief scan a string literal @brief scan a string literal
@ -1965,7 +1781,7 @@ class lexer
case 'u': case 'u':
{ {
int codepoint; int codepoint;
int codepoint1 = get_codepoint(); const int codepoint1 = get_codepoint();
if (JSON_UNLIKELY(codepoint1 == -1)) if (JSON_UNLIKELY(codepoint1 == -1))
{ {
@ -2237,37 +2053,22 @@ class lexer
case 0xde: case 0xde:
case 0xdf: case 0xdf:
{ {
add(current); if (JSON_UNLIKELY(not next_byte_in_range({{0x80, 0xBF}})))
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{ {
add(current);
continue;
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+0800..U+0FFF: bytes E0 A0..BF 80..BF // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
case 0xe0: case 0xe0:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0xA0, 0xBF}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0xa0 <= current and current <= 0xbf))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
@ -2286,119 +2087,54 @@ class lexer
case 0xee: case 0xee:
case 0xef: case 0xef:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+D000..U+D7FF: bytes ED 80..9F 80..BF // U+D000..U+D7FF: bytes ED 80..9F 80..BF
case 0xed: case 0xed:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x9F}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0x80 <= current and current <= 0x9f))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
case 0xf0: case 0xf0:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0x90 <= current and current <= 0xbf))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
case 0xf1: case 0xf1:
case 0xf2: case 0xf2:
case 0xf3: case 0xf3:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
case 0xf4: case 0xf4:
{ {
add(current); if (JSON_UNLIKELY(not (next_byte_in_range({{0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF}}))))
get();
if (JSON_LIKELY(0x80 <= current and current <= 0x8f))
{ {
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
get();
if (JSON_LIKELY(0x80 <= current and current <= 0xbf))
{
add(current);
continue;
}
}
}
error_message = "invalid string: ill-formed UTF-8 byte";
return token_type::parse_error; return token_type::parse_error;
} }
break;
}
// remaining bytes (80..C1 and F5..FF) are ill-formed // remaining bytes (80..C1 and F5..FF) are ill-formed
default: default: