diff --git a/doc/images/callback_events.png b/doc/images/callback_events.png new file mode 100644 index 00000000..09aa2b38 Binary files /dev/null and b/doc/images/callback_events.png differ diff --git a/src/json.hpp b/src/json.hpp index 29155327..00bee970 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -717,7 +717,19 @@ class basic_json This enumeration collects the different JSON types. It is internally used to distinguish the stored values, and the functions @ref is_null(), @ref is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref - is_number(), and @ref is_discarded() rely on it. + is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and + @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and + @ref is_structured() rely on it. + + @note There are three enumeration entries (number_integer, + number_unsigned, and number_float), because the library distinguishes + these three types for numbers: @ref number_unsigned_t is used for unsigned + integers, @ref number_integer_t is used for signed integers, and @ref + number_float_t is used for floating-point numbers or to approximate + integers which do not fit in the limits of their respective type. + + @sa @ref basic_json(const value_t value_type) -- create a JSON value with + the default value for a given type @since version 1.0.0 */ @@ -728,7 +740,7 @@ class basic_json array, ///< array (ordered collection of values) string, ///< string value boolean, ///< boolean value - number_integer, ///< number value (integer) + number_integer, ///< number value (signed integer) number_unsigned, ///< number value (unsigned integer) number_float, ///< number value (floating-point) discarded ///< discarded by the the parser callback function @@ -758,7 +770,24 @@ class basic_json /*! @brief a JSON value - The actual storage for a JSON value of the @ref basic_json class. + The actual storage for a JSON value of the @ref basic_json class. This + union combines the different storage types for the JSON value types + defined in @ref value_t. + + JSON type | value_t type | used type + --------- | --------------- | ------------------------ + object | object | pointer to @ref object_t + array | array | pointer to @ref array_t + string | string | pointer to @ref string_t + boolean | boolean | @ref boolean_t + number | number_integer | @ref number_integer_t + number | number_unsigned | @ref number_unsigned_t + number | number_float | @ref number_float_t + null | null | *no value is stored* + + @note Variable-length types (objects, arrays, and strings) are stored as + pointers. The size of the union should not exceed 64 bits if the default + value types are used. @since version 1.0.0 */ @@ -874,6 +903,8 @@ class basic_json This enumeration lists the parser events that can trigger calling a callback function of type @ref parser_callback_t during parsing. + @image html callback_events.png "Example when certain parse events are triggered" + @since version 1.0.0 */ enum class parse_event_t : uint8_t @@ -916,6 +947,8 @@ class basic_json parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value + @image html callback_events.png "Example when certain parse events are triggered" + Discarding a value (i.e., returning `false`) has different effects depending on the context in which function was called: @@ -2773,21 +2806,16 @@ class basic_json type of the current JSON */ template - static ReferenceType get_ref_impl(ThisType& obj) + static constexpr ReferenceType get_ref_impl(ThisType& obj) { - // delegate the call to get_ptr<>() + // helper type using PointerType = typename std::add_pointer::type; - auto ptr = obj.template get_ptr(); - if (ptr != nullptr) - { - return *ptr; - } - else - { - throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " + - obj.type_name()); - } + // delegate the call to get_ptr<>() + return obj.template get_ptr() != nullptr + ? *obj.template get_ptr() + : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " + + obj.type_name()); } public: @@ -3015,7 +3043,7 @@ class basic_json std::is_reference::value and std::is_const::type>::value , int>::type = 0> - ReferenceType get_ref() const + constexpr ReferenceType get_ref() const { // delegate call to get_ref_impl return get_ref_impl(*this); @@ -7286,6 +7314,8 @@ class basic_json @throw std::invalid_argument if the low surrogate is invalid; example: `""missing or wrong low surrogate""` + @complexity Constant. + @see */ static string_t to_unicode(const std::size_t codepoint1, @@ -7402,6 +7432,17 @@ class basic_json function consists of a large block of code with `goto` jumps. @return the class of the next token read from the buffer + + @complexity Linear in the length of the input.\n + + Proposition: The loop below will always terminate for finite input.\n + + Proof (by contradiction): Assume a finite input. To loop forever, the + loop must never hit code with a `break` statement. The only code + snippets without a `break` statement are the continue statements for + whitespace and byte-order-marks. To loop forever, the input must be an + infinite sequence of whitespace or byte-order-marks. This contradicts + the assumption of finite input, q.e.d. */ token_type scan() noexcept { @@ -7422,8 +7463,8 @@ class basic_json { 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 32, 0, 0, 32, 0, 0, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 160, 128, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 192, 192, 192, 192, 192, 192, 192, 192, @@ -7602,7 +7643,7 @@ basic_json_parser_6: basic_json_parser_9: yyaccept = 0; yych = *(m_marker = ++m_cursor); - if (yych <= 0x0F) + if (yych <= 0x1F) { goto basic_json_parser_5; } @@ -7760,7 +7801,7 @@ basic_json_parser_32: { goto basic_json_parser_31; } - if (yych <= 0x0F) + if (yych <= 0x1F) { goto basic_json_parser_33; } @@ -8233,16 +8274,53 @@ basic_json_parser_63: according to the nature of the escape. Some escapes create new characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape - `"\\uxxxx"` need special care. In this case, to_unicode takes care - of the construction of the values. + `"\\uxxxx"` need special care. In this case, @ref to_unicode takes + care of the construction of the values. 2. Unescaped characters are copied as is. + @pre `m_cursor - m_start >= 2`, meaning the length of the last token + is at least 2 bytes which is trivially true for any string (which + consists of at least two quotes). + + " c1 c2 c3 ... " + ^ ^ + m_start m_cursor + + @complexity Linear in the length of the string.\n + + Lemma: The loop body will always terminate.\n + + Proof (by contradiction): Assume the loop body does not terminate. As + the loop body does not contain another loop, one of the called + functions must never return. The called functions are `std::strtoul` + and @ref to_unicode. Neither function can loop forever, so the loop + body will never loop forever which contradicts the assumption that the + loop body does not terminate, q.e.d.\n + + Lemma: The loop condition for the for loop is eventually false.\n + + Proof (by contradiction): Assume the loop does not terminate. Due to + the above lemma, this can only be due to a tautological loop + condition; that is, the loop condition i < m_cursor - 1 must always be + true. Let x be the change of i for any loop iteration. Then + m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely. + This can be rephrased to m_cursor - m_start - 2 > x. With the + precondition, we x <= 0, meaning that the loop condition holds + indefinitly if i is always decreased. However, observe that the + value of i is strictly increasing with each iteration, as it is + incremented by 1 in the iteration expression and never + decremented inside the loop body. Hence, the loop condition + will eventually be false which contradicts the assumption that + the loop condition is a tautology, q.e.d. + @return string value of current token without opening and closing quotes @throw std::out_of_range if to_unicode fails */ string_t get_string() const { + assert(m_cursor - m_start >= 2); + string_t result; result.reserve(static_cast(m_cursor - m_start - 2)); @@ -8915,6 +8993,8 @@ basic_json_parser_63: /*! @brief create and return a reference to the pointed to value + + @complexity Linear in the number of reference tokens. */ reference get_and_create(reference j) const { @@ -9352,6 +9432,7 @@ basic_json_parser_63: basic_json result; // iterate the JSON object values + assert(value.m_value.object != nullptr); for (const auto& element : *value.m_value.object) { if (not element.second.is_primitive()) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 8eeedefe..05c49ec5 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -717,7 +717,19 @@ class basic_json This enumeration collects the different JSON types. It is internally used to distinguish the stored values, and the functions @ref is_null(), @ref is_object(), @ref is_array(), @ref is_string(), @ref is_boolean(), @ref - is_number(), and @ref is_discarded() rely on it. + is_number() (with @ref is_number_integer(), @ref is_number_unsigned(), and + @ref is_number_float()), @ref is_discarded(), @ref is_primitive(), and + @ref is_structured() rely on it. + + @note There are three enumeration entries (number_integer, + number_unsigned, and number_float), because the library distinguishes + these three types for numbers: @ref number_unsigned_t is used for unsigned + integers, @ref number_integer_t is used for signed integers, and @ref + number_float_t is used for floating-point numbers or to approximate + integers which do not fit in the limits of their respective type. + + @sa @ref basic_json(const value_t value_type) -- create a JSON value with + the default value for a given type @since version 1.0.0 */ @@ -728,7 +740,7 @@ class basic_json array, ///< array (ordered collection of values) string, ///< string value boolean, ///< boolean value - number_integer, ///< number value (integer) + number_integer, ///< number value (signed integer) number_unsigned, ///< number value (unsigned integer) number_float, ///< number value (floating-point) discarded ///< discarded by the the parser callback function @@ -758,7 +770,24 @@ class basic_json /*! @brief a JSON value - The actual storage for a JSON value of the @ref basic_json class. + The actual storage for a JSON value of the @ref basic_json class. This + union combines the different storage types for the JSON value types + defined in @ref value_t. + + JSON type | value_t type | used type + --------- | --------------- | ------------------------ + object | object | pointer to @ref object_t + array | array | pointer to @ref array_t + string | string | pointer to @ref string_t + boolean | boolean | @ref boolean_t + number | number_integer | @ref number_integer_t + number | number_unsigned | @ref number_unsigned_t + number | number_float | @ref number_float_t + null | null | *no value is stored* + + @note Variable-length types (objects, arrays, and strings) are stored as + pointers. The size of the union should not exceed 64 bits if the default + value types are used. @since version 1.0.0 */ @@ -874,6 +903,8 @@ class basic_json This enumeration lists the parser events that can trigger calling a callback function of type @ref parser_callback_t during parsing. + @image html callback_events.png "Example when certain parse events are triggered" + @since version 1.0.0 */ enum class parse_event_t : uint8_t @@ -916,6 +947,8 @@ class basic_json parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value + @image html callback_events.png "Example when certain parse events are triggered" + Discarding a value (i.e., returning `false`) has different effects depending on the context in which function was called: @@ -2773,21 +2806,16 @@ class basic_json type of the current JSON */ template - static ReferenceType get_ref_impl(ThisType& obj) + static constexpr ReferenceType get_ref_impl(ThisType& obj) { - // delegate the call to get_ptr<>() + // helper type using PointerType = typename std::add_pointer::type; - auto ptr = obj.template get_ptr(); - if (ptr != nullptr) - { - return *ptr; - } - else - { - throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " + - obj.type_name()); - } + // delegate the call to get_ptr<>() + return obj.template get_ptr() != nullptr + ? *obj.template get_ptr() + : throw std::domain_error("incompatible ReferenceType for get_ref, actual type is " + + obj.type_name()); } public: @@ -3015,7 +3043,7 @@ class basic_json std::is_reference::value and std::is_const::type>::value , int>::type = 0> - ReferenceType get_ref() const + constexpr ReferenceType get_ref() const { // delegate call to get_ref_impl return get_ref_impl(*this); @@ -7286,6 +7314,8 @@ class basic_json @throw std::invalid_argument if the low surrogate is invalid; example: `""missing or wrong low surrogate""` + @complexity Constant. + @see */ static string_t to_unicode(const std::size_t codepoint1, @@ -7402,6 +7432,17 @@ class basic_json function consists of a large block of code with `goto` jumps. @return the class of the next token read from the buffer + + @complexity Linear in the length of the input.\n + + Proposition: The loop below will always terminate for finite input.\n + + Proof (by contradiction): Assume a finite input. To loop forever, the + loop must never hit code with a `break` statement. The only code + snippets without a `break` statement are the continue statements for + whitespace and byte-order-marks. To loop forever, the input must be an + infinite sequence of whitespace or byte-order-marks. This contradicts + the assumption of finite input, q.e.d. */ token_type scan() noexcept { @@ -7447,32 +7488,32 @@ class basic_json "false" { last_token_type = token_type::literal_false; break; } // number - decimal_point = [.]; + decimal_point = "."; digit = [0-9]; digit_1_9 = [1-9]; - e = [eE]; - minus = [-]; - plus = [+]; - zero = [0]; - exp = e (minus|plus)? digit+; + e = "e" | "E"; + minus = "-"; + plus = "+"; + zero = "0"; + exp = e (minus | plus)? digit+; frac = decimal_point digit+; - int = (zero|digit_1_9 digit*); + int = (zero | digit_1_9 digit*); number = minus? int frac? exp?; number { last_token_type = token_type::value_number; break; } // string - quotation_mark = ["]; - escape = [\\]; - unescaped = [^"\\\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F]; - single_escaped = ["\\/bfnrt]; - unicode_escaped = [u][0-9a-fA-F]{4}; + quotation_mark = "\""; + escape = "\\"; + unescaped = [^"\\\x00-\x1f]; + single_escaped = "\"" | "\\" | "/" | "b" | "f" | "n" | "r" | "t"; + unicode_escaped = "u" [0-9a-fA-F]{4}; escaped = escape (single_escaped | unicode_escaped); char = unescaped | escaped; string = quotation_mark char* quotation_mark; string { last_token_type = token_type::value_string; break; } // end of file - '\000' { last_token_type = token_type::end_of_input; break; } + "\000" { last_token_type = token_type::end_of_input; break; } // anything else is an error . { last_token_type = token_type::parse_error; break; } @@ -7530,16 +7571,53 @@ class basic_json according to the nature of the escape. Some escapes create new characters (e.g., `"\\n"` is replaced by `"\n"`), some are copied as is (e.g., `"\\\\"`). Furthermore, Unicode escapes of the shape - `"\\uxxxx"` need special care. In this case, to_unicode takes care - of the construction of the values. + `"\\uxxxx"` need special care. In this case, @ref to_unicode takes + care of the construction of the values. 2. Unescaped characters are copied as is. + @pre `m_cursor - m_start >= 2`, meaning the length of the last token + is at least 2 bytes which is trivially true for any string (which + consists of at least two quotes). + + " c1 c2 c3 ... " + ^ ^ + m_start m_cursor + + @complexity Linear in the length of the string.\n + + Lemma: The loop body will always terminate.\n + + Proof (by contradiction): Assume the loop body does not terminate. As + the loop body does not contain another loop, one of the called + functions must never return. The called functions are `std::strtoul` + and @ref to_unicode. Neither function can loop forever, so the loop + body will never loop forever which contradicts the assumption that the + loop body does not terminate, q.e.d.\n + + Lemma: The loop condition for the for loop is eventually false.\n + + Proof (by contradiction): Assume the loop does not terminate. Due to + the above lemma, this can only be due to a tautological loop + condition; that is, the loop condition i < m_cursor - 1 must always be + true. Let x be the change of i for any loop iteration. Then + m_start + 1 + x < m_cursor - 1 must hold to loop indefinitely. + This can be rephrased to m_cursor - m_start - 2 > x. With the + precondition, we x <= 0, meaning that the loop condition holds + indefinitly if i is always decreased. However, observe that the + value of i is strictly increasing with each iteration, as it is + incremented by 1 in the iteration expression and never + decremented inside the loop body. Hence, the loop condition + will eventually be false which contradicts the assumption that + the loop condition is a tautology, q.e.d. + @return string value of current token without opening and closing quotes @throw std::out_of_range if to_unicode fails */ string_t get_string() const { + assert(m_cursor - m_start >= 2); + string_t result; result.reserve(static_cast(m_cursor - m_start - 2)); @@ -8212,6 +8290,8 @@ class basic_json /*! @brief create and return a reference to the pointed to value + + @complexity Linear in the number of reference tokens. */ reference get_and_create(reference j) const { @@ -8649,6 +8729,7 @@ class basic_json basic_json result; // iterate the JSON object values + assert(value.m_value.object != nullptr); for (const auto& element : *value.m_value.object) { if (not element.second.is_primitive()) diff --git a/test/src/unit.cpp b/test/src/unit.cpp index a7ca7394..8ca9b01f 100644 --- a/test/src/unit.cpp +++ b/test/src/unit.cpp @@ -9716,6 +9716,39 @@ TEST_CASE("parser class") CHECK_THROWS_WITH(json::parser("\"\b\"").parse(), "parse error - unexpected '\"'"); // improve code coverage CHECK_THROWS_AS(json::parser("\uFF01").parse(), std::invalid_argument); + // unescaped control characters + CHECK_THROWS_AS(json::parser("\"\x00\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x01\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x02\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x03\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x04\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x05\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x06\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x07\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x08\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x09\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0a\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0b\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0c\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0d\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0e\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x0f\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x10\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x11\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x12\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x13\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x14\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x15\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x16\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x17\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x18\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x19\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1a\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1b\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1c\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1d\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1e\"").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("\"\x1f\"").parse(), std::invalid_argument); } SECTION("escaped")