merged #201

2016-04-03 14:08:38 +02:00 · 2016-04-03 14:08:38 +02:00 · 9c233be567
commit 9c233be567
parent 04edafbddc
9 changed files with 980 additions and 650 deletions
--- a/README.md
+++ b/README.md
@ -402,7 +402,7 @@ I deeply appreciate the help of the following people.
 - [406345](https://github.com/406345) fixed two small warnings.
 - [Glen Fernandes](https://github.com/glenfe) noted a potential portability problem in the `has_mapped_type` function.
 - [Corbin Hughes](https://github.com/nibroc) fixed some typos in the contribution guidelines.
- [twelsby](https://github.com/twelsby) fixed the array subscript operator, an issue that failed the MSVC build, and floating-point parsing/dumping. He further added support for unsigned integer numbers.
+- [twelsby](https://github.com/twelsby) fixed the array subscript operator, an issue that failed the MSVC build, and floating-point parsing/dumping. He further added support for unsigned integer numbers and implemented better roundtrip support for parsed numbers.
 - [Volker Diels-Grabsch](https://github.com/vog) fixed a link in the README file.
 - [msm-](https://github.com/msm-) added support for american fuzzy lop. 
 - [Annihil](https://github.com/Annihil) fixed an example in the README file.
--- a/src/json.hpp
+++ b/src/json.hpp
--- a/src/json.hpp.re2c
+++ b/src/json.hpp.re2c
@ -695,6 +695,74 @@ class basic_json


  private:
+
+    /*!
+    @brief a type to hold JSON type information
+
+    This bitfield type holds information about JSON types. It is internally
+    used to hold the basic JSON type enumeration, as well as additional
+    information in the case of values that have been parsed from a string
+    including whether or not it was created directly or parsed, and in the
+    case of floating point numbers the number of significant figures in the
+    original representation and if it was in exponential form, if a '+' was
+    included in the exponent and the capitalization of the exponent marker.
+    The sole purpose of this information is to permit accurate round trips.
+
+    @since version 2.0.0
+    */
+    union type_data_t
+    {
+        struct
+        {
+            /// the type of the value (@ref value_t)
+            uint16_t type : 4;
+            /// whether the number was parsed from a string
+            uint16_t parsed : 1;
+            /// whether parsed number contained an exponent ('e'/'E')
+            uint16_t has_exp : 1;
+            /// whether parsed number contained a plus in the exponent
+            uint16_t exp_plus : 1;
+            /// whether parsed number's exponent was capitalized ('E')
+            uint16_t exp_cap : 1;
+            /// the number of figures for a parsed number
+            uint16_t precision : 8;
+        } bits;
+        /// raw view of all fields; used to clear them in one assignment
+        uint16_t data;
+
+        /// return the type as value_t (all other fields are ignored)
+        operator value_t() const
+        {
+            return static_cast<value_t>(bits.type);
+        }
+
+        /// test type for equality (ignore other fields)
+        bool operator==(const value_t& rhs) const
+        {
+            return static_cast<value_t>(bits.type) == rhs;
+        }
+
+        /// assignment: replaces only the type field, so the flags and the
+        /// precision recorded earlier are kept
+        type_data_t& operator=(value_t rhs)
+        {
+            bits.type = static_cast<uint16_t>(rhs);
+            return *this;
+        }
+
+        /// construct from value_t with all other fields cleared
+        type_data_t(value_t t) noexcept
+        {
+            // clear every field through the raw member instead of casting
+            // 'this', consistent with the default constructor
+            data = 0;
+            bits.type = static_cast<uint16_t>(t);
+        }
+
+        /// default constructor: type is value_t::null, all other fields cleared
+        type_data_t() noexcept
+        {
+            data = 0;
+            // static_cast is required here: reinterpret_cast cannot convert
+            // an enumeration to an integer type
+            bits.type = static_cast<uint16_t>(value_t::null);
+        }
+    };
+
    /// helper for exception-safe object creation
    template<typename T, typename... Args>
    static T* create(Args&& ... args)
@ -6046,23 +6114,78 @@ class basic_json

            case value_t::number_float:
            {
-                // If the number is an integer then output as a fixed with with
-                // precision 1 to output "0.0", "1.0" etc as expected for some
-                // round trip tests otherwise  15 digits of precision allows
-                // round-trip IEEE 754 string->double->string; to be safe, we
-                // read this value from
-                // std::numeric_limits<number_float_t>::digits10
-                if (std::fmod(m_value.number_float, 1) == 0)
+                // buffer size: precision (2^8-1 = 255) + other ('-.e-xxx' = 7) + null (1)
+                char buf[263];
+                int len;
+
+                // check if number was parsed from a string
+                if (m_type.bits.parsed)
                {
-                    o << std::fixed << std::setprecision(1);
+                    // check if parsed number had an exponent given
+                    if (m_type.bits.has_exp)
+                    {
+                        // handle capitalization of the exponent
+                        if (m_type.bits.exp_cap)
+                        {
+                            len = snprintf(buf, sizeof(buf), "%.*E", m_type.bits.precision, m_value.number_float) + 1;
+                        }
+                        else
+                        {
+                            len = snprintf(buf, sizeof(buf), "%.*e", m_type.bits.precision, m_value.number_float) + 1;
+                        }
+
+                        // remove '+' sign from the exponent if necessary
+                        if (not m_type.bits.exp_plus)
+                        {
+                            if (len > static_cast<int>(sizeof(buf)))
+                            {
+                                len = sizeof(buf);
+                            }
+                            for (int i = 0; i < len; i++)
+                            {
+                                if (buf[i] == '+')
+                                {
+                                    for (; i + 1 < len; i++)
+                                    {
+                                        buf[i] = buf[i + 1];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // no exponent - output as a decimal
+                        snprintf(buf, sizeof(buf), "%.*f",
+                                 m_type.bits.precision, m_value.number_float);
+                    }
+                }
+                else if (m_value.number_float == 0)
+                {
+                    // special case for zero to get "0.0"/"-0.0"
+                    if (std::signbit(m_value.number_float))
+                    {
+                        o << "-0.0";
+                    }
+                    else
+                    {
+                        o << "0.0";
+                    }
+                    return;
                }
                else
                {
-                    // std::defaultfloat not supported in gcc version < 5
-                    o.unsetf(std::ios_base::floatfield);
-                    o << std::setprecision(std::numeric_limits<double>::digits10);
+                    // Otherwise 6, 15 or 16 digits of precision allows
+                    // round-trip IEEE 754 string->float->string,
+                    // string->double->string or string->long double->string;
+                    // to be safe, we read this value from
+                    // std::numeric_limits<number_float_t>::digits10
+                    snprintf(buf, sizeof(buf), "%.*g",
+                             std::numeric_limits<double>::digits10,
+                             m_value.number_float);
                }
-                o << m_value.number_float;
+
+                o << buf;
                return;
            }

@ -6086,7 +6209,7 @@ class basic_json
    //////////////////////

    /// the type of the current element
-    value_t m_type = value_t::null;
+    type_data_t m_type = value_t::null;

    /// the value of the current element
    json_value m_value = {};
@ -7558,124 +7681,145 @@ class basic_json
            return std::strtof(reinterpret_cast<typename string_t::const_pointer>(m_start), endptr);
        }

-        /*!
-        @brief static_cast between two types and indicate if it results in error
-
-        This function performs a `static_cast` between @a source and @a dest.
-        It then checks if a `static_cast` back to @a dest produces an error.
-
-        @param[in] source  the value to cast from
-
-        @param[in, out] dest  the value to cast to
-
-        @return true iff the cast was performed without error
-        */
-        template <typename T_A, typename T_B>
-        static bool attempt_cast(T_A source, T_B& dest)
-        {
-            dest = static_cast<T_B>(source);
-            return (source == static_cast<T_A>(dest));
-        }
-
        /*!
        @brief return number value for number tokens

        This function translates the last token into the most appropriate
-        number type (either integer, unsigned integer or floating point), which
-        is passed back to the caller via the result parameter. The pointer @a
-        m_start points to the beginning of the parsed number. We first examine
-        the first character to determine the sign of the number and then pass
-        this pointer to either @a std::strtoull (if positive) or @a
-        std::strtoll (if negative), both of which set @a endptr to the first
-        character past the converted number. If this pointer is not the same as
-        @a m_cursor, then either more or less characters have been used during
-        the comparison.
+        number type (either integer, unsigned integer or floating point),
+        which is passed back to the caller via the result parameter.

-        This can happen for inputs like "01" which will be treated like number
-        0 followed by number 1. This will also occur for valid floating point
-        inputs like "12e3" will be incorrectly read as 12. Numbers that are too
-        large or too small for a signed/unsigned long long will cause a range
-        error (@a errno set to ERANGE). The parsed number is cast to a @ref
-        number_integer_t/@ref number_unsigned_t using the helper function @ref
-        attempt_cast, which returns @a false if the cast could not be peformed
-        without error.
+        This function parses the integer component up to the radix point or
+        exponent while collecting information about the 'floating point
+        representation', which it stores in the result parameter. If there is
+        no radix point or exponent, and the number can fit into a
+        @ref number_integer_t or @ref number_unsigned_t then it sets the
+        result parameter accordingly.

-        In any of these cases (more/less characters read, range error or a cast
-        error) the pointer is passed to @a std:strtod, which also sets @a
-        endptr to the first character past the converted number. The resulting
-        @ref number_float_t is then cast to a @ref number_integer_t/@ref
-        number_unsigned_t using @ref attempt_cast and if no error occurs is
-        stored in that form, otherwise it is stored as a @ref number_float_t.
+        The 'floating point representation' includes the number of significant
+        figures after the radix point, whether the number is in exponential
+        or decimal form, the capitalization of the exponent marker, and if the
+        optional '+' is present in the exponent. This information is necessary
+        to perform accurate round trips of floating point numbers.

-        A final comparison is made of @a endptr and if still not the same as
-        @ref m_cursor a bad input is assumed and @a result parameter is set to
-        NAN.
+        If the number is a floating point number the number is then parsed
+        using @a std::strtod (or @a std::strtof or @a std::strtold).

-        @param[out] result @ref basic_json object to receive the number, or NAN
-        if the conversion read past the current token. The latter case needs to
-        be treated by the caller function.
+        @param[out] result  @ref basic_json object to receive the number, or
+          NAN if the conversion read past the current token. The latter case
+          needs to be treated by the caller function.
        */
        void get_number(basic_json& result) const
        {
-            typename string_t::value_type* endptr;
            assert(m_start != nullptr);
-            errno = 0;

-            // attempt to parse it as an integer - first checking for a
-            // negative number
-            if (*reinterpret_cast<typename string_t::const_pointer>(m_start) != '-')
+            const lexer::lexer_char_t* curptr = m_start;
+
+            // remember this number was parsed (for later serialization)
+            result.m_type.bits.parsed = true;
+
+            // 'found_radix_point' will be set to 0xFF upon finding a radix
+            // point and later used to mask in/out the precision depending
+            // whether a radix is found i.e. 'precision &= found_radix_point'
+            uint8_t found_radix_point = 0;
+            uint8_t precision = 0;
+
+            // accumulate the integer conversion result (unsigned for now)
+            number_unsigned_t value = 0;
+
+            // maximum absolute value of the relevant integer type
+            number_unsigned_t max;
+
+            // temporarily store the type to avoid unnecessary bitfield access
+            value_t type;
+
+            // look for sign
+            if (*curptr == '-')
            {
-                // positive, parse with strtoull and attempt cast to
-                // number_unsigned_t
-                if (attempt_cast(std::strtoull(reinterpret_cast<typename string_t::const_pointer>(m_start), &endptr,
-                                               10), result.m_value.number_unsigned))
-                {
-                    result.m_type = value_t::number_unsigned;
-                }
-                else
-                {
-                    // cast failed due to overflow - store as float
-                    result.m_type = value_t::number_float;
-                }
+                type = value_t::number_integer;
+                max = static_cast<uint64_t>(std::numeric_limits<number_integer_t>::max()) + 1;
+                curptr++;
            }
            else
            {
-                // Negative, parse with strtoll and attempt cast to
-                // number_integer_t
-                if (attempt_cast(std::strtoll(reinterpret_cast<typename string_t::const_pointer>(m_start), &endptr,
-                                              10), result.m_value.number_integer))
+                type = value_t::number_unsigned;
+                max = static_cast<uint64_t>(std::numeric_limits<number_unsigned_t>::max());
+                if (*curptr == '+')
                {
-                    result.m_type = value_t::number_integer;
-                }
-                else
-                {
-                    // cast failed due to overflow - store as float
-                    result.m_type = value_t::number_float;
+                    curptr++;
                }
            }

-            // check the end of the number was reached and no range error
-            // occurred
-            if (reinterpret_cast<lexer_char_t*>(endptr) != m_cursor || errno == ERANGE)
+            // count the significant figures
+            for (; curptr < m_cursor; curptr++)
            {
-                result.m_type = value_t::number_float;
+                // quickly skip tests if a digit
+                if (*curptr < '0' || *curptr > '9')
+                {
+                    if (*curptr == '.')
+                    {
+                        // don't count '.' but change to float
+                        type = value_t::number_float;
+
+                        // reset precision count
+                        precision = 0;
+                        found_radix_point = 0xFF;
+                        continue;
+                    }
+                    // assume exponent (if not then will fail parse): change to
+                    // float, stop counting and record exponent details
+                    type = value_t::number_float;
+                    result.m_type.bits.has_exp = true;
+
+                    // exponent capitalization
+                    result.m_type.bits.exp_cap = (*curptr == 'E');
+
+                    // exponent '+' sign
+                    result.m_type.bits.exp_plus = (*(++curptr) == '+');
+                    break;
+                }
+
+                // skip if definitely not an integer
+                if (type != value_t::number_float)
+                {
+                    // the digit to append to the accumulated value
+                    const auto digit = static_cast<number_unsigned_t>(*curptr - '0');
+
+                    // test for overflow *before* multiplying: 'value * 10'
+                    // can wrap around and still compare greater than 'value',
+                    // so checking the wrapped result is not reliable
+                    if (value > (max - digit) / 10)
+                    {
+                        // overflow
+                        type = value_t::number_float;
+                    }
+                    else
+                    {
+                        // no overflow - save it
+                        value = value * 10 + digit;
+                    }
+                }
+
+                // saturate instead of wrapping around: the counter (and the
+                // bitfield it is stored in) holds 8 bits only
+                if (precision < std::numeric_limits<uint8_t>::max())
+                {
+                    ++precision;
+                }
            }

-            if (result.m_type  == value_t::number_float)
-            {
-                // either the number won't fit in an integer (range error from
-                // strtoull/strtoll or overflow on cast) or there was something
-                // else after the number, which could be an exponent
+            // If no radix point was found then precision would now be set to
+            // the number of digits, which is wrong - clear it.
+            result.m_type.bits.precision = precision & found_radix_point;

+            // save the value (if not a float)
+            if (type == value_t::number_unsigned)
+            {
+                result.m_value.number_unsigned = value;
+            }
+            else if (type == value_t::number_integer)
+            {
+                // for negative numbers 'max' is the magnitude of the smallest
+                // integer; negating a signed value of that magnitude would
+                // overflow, so map it to min() directly
+                result.m_value.number_integer = (value == max)
+                                                ? std::numeric_limits<number_integer_t>::min()
+                                                : -static_cast<number_integer_t>(value);
+            }
+            else
+            {
                // parse with strtod
-                result.m_value.number_float = str_to_float_t(static_cast<number_float_t*>(nullptr), &endptr);
-
-                // anything after the number is an error
-                if (reinterpret_cast<lexer_char_t*>(endptr) != m_cursor)
-                {
-                    throw std::invalid_argument(std::string("parse error - ") + get_token() + " is not a number");
-                }
+                result.m_value.number_float = str_to_float_t(static_cast<number_float_t*>(nullptr), nullptr);
            }
+
+            // save the type (only the type field is replaced; the flags and
+            // precision recorded above are kept)
+            result.m_type = type;
        }

      private:
--- a/test/json_roundtrip/roundtrip28.json
+++ b/test/json_roundtrip/roundtrip28.json
@ -0,0 +1 @@
+[4.940656458412e-324]
--- a/test/json_roundtrip/roundtrip29.json
+++ b/test/json_roundtrip/roundtrip29.json
@ -0,0 +1 @@
+[2.2250738585072e-308]
--- a/test/json_roundtrip/roundtrip30.json
+++ b/test/json_roundtrip/roundtrip30.json
@ -0,0 +1 @@
+[1.2345E-30]
--- a/test/json_roundtrip/roundtrip31.json
+++ b/test/json_roundtrip/roundtrip31.json
@ -0,0 +1 @@
+[1.2345E+30]
--- a/test/json_roundtrip/roundtrip32.json
+++ b/test/json_roundtrip/roundtrip32.json
@ -0,0 +1 @@
+[1.2345e+30]
--- a/test/unit.cpp
+++ b/test/unit.cpp
@ -9776,7 +9776,8 @@ TEST_CASE("parser class")
                CHECK_THROWS_AS(json::parser("-0e-:").parse(), std::invalid_argument);
                CHECK_THROWS_AS(json::parser("-0f").parse(), std::invalid_argument);

-                CHECK_THROWS_WITH(json::parser("01").parse(), "parse error - 0 is not a number");
+                CHECK_THROWS_WITH(json::parser("01").parse(),
+                                  "parse error - unexpected number literal; expected end of input");
                CHECK_THROWS_WITH(json::parser("--1").parse(), "parse error - unexpected '-'");
                CHECK_THROWS_WITH(json::parser("1.").parse(),
                                  "parse error - unexpected '.'; expected end of input");
@ -11823,10 +11824,15 @@ TEST_CASE("compliance tests from nativejson-benchmark")
                    "test/json_roundtrip/roundtrip21.json",
                    "test/json_roundtrip/roundtrip22.json",
                    "test/json_roundtrip/roundtrip23.json",
-                    //"test/json_roundtrip/roundtrip24.json",
-                    //"test/json_roundtrip/roundtrip25.json",
-                    //"test/json_roundtrip/roundtrip26.json",
-                    //"test/json_roundtrip/roundtrip27.json"
+                    "test/json_roundtrip/roundtrip24.json",
+                    "test/json_roundtrip/roundtrip25.json",
+                    "test/json_roundtrip/roundtrip26.json",
+                    "test/json_roundtrip/roundtrip27.json",
+                    "test/json_roundtrip/roundtrip28.json",
+                    "test/json_roundtrip/roundtrip29.json",
+                    "test/json_roundtrip/roundtrip30.json",
+                    "test/json_roundtrip/roundtrip31.json",
+                    "test/json_roundtrip/roundtrip32.json"
                })
        {
            CAPTURE(filename);