From 6b78b5c2be7e24cd4a1c59c7fa13a62c2ae33508 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 19:05:09 -0500 Subject: [PATCH 01/23] Added strtonum for locale-independent number parsing --- src/json.hpp | 374 ++++++++++++++++++++++------------- src/json.hpp.re2c | 374 ++++++++++++++++++++++------------- test/src/unit-regression.cpp | 4 +- 3 files changed, 483 insertions(+), 269 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 6fed0a12..e28333bd 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,65 +9060,225 @@ basic_json_parser_66: return result; } + + + + + + + + + + + + + + + + + /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + e.g. const auto x = static_cast("123.4"); - @param[in] type the @ref number_float_t in use - - @param[in,out] endptr recieves a pointer to the first character after - the number - - @return the floating point number + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - long double str_to_float_t(long double* /* type */, char** endptr) const + struct strtonum { - return std::strtold(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + template::value>::type > + operator T() const + { + T val{0}; - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(parse(val, std::is_integral())) { + return val; + } - @param[in] type the @ref number_float_t in use + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - @param[in,out] endptr recieves a pointer to the first character after - the number + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } + if(!p) { + return false; + } - /*! - @brief parse floating point number + if(*p == '-' or *p == '+') { + ++p; + } - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(p == m_end) { + return false; + } - @param[in] type the @ref number_float_t in use + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - @param[in,out] endptr recieves a pointer to the first character after - the number + return p == m_end; + } - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; + + static void strtof(float& f, + const char* str, + char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, + const char* str, + char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, + const char* str, + char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert + } + + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); + + if(ds_pos == len) { + break; // no decimal separator + } + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; + } + + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; + } + }; /*! @brief return number value for number tokens @@ -9126,111 +9286,57 @@ basic_json_parser_66: This function translates the last token into the most appropriate number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). - - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + @param[out] result @ref basic_json object to receive the number. */ void get_number(basic_json& result) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - const lexer::lexer_char_t* curptr = m_start; + const bool is_negative = *m_start == '-'; - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') - { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; - } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - - // skip if definitely not an integer - if (type != value_t::number_float) + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; - - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - save it - value = temp; - } + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - result.m_value.number_integer = -static_cast(value); - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + result.m_type = value_t::number_float; + result.m_value = static_cast(num); - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - type = value_t::null; - result.m_value = basic_json::json_value(); - } + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); } - - // save the type - result.m_type = type; } private: diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index c6a99b89..72c413ba 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,65 +8209,225 @@ class basic_json return result; } + + + + + + + + + + + + + + + + + /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + e.g. const auto x = static_cast("123.4"); - @param[in] type the @ref number_float_t in use - - @param[in,out] endptr recieves a pointer to the first character after - the number - - @return the floating point number + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - long double str_to_float_t(long double* /* type */, char** endptr) const + struct strtonum { - return std::strtold(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + template::value>::type > + operator T() const + { + T val{0}; - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(parse(val, std::is_integral())) { + return val; + } - @param[in] type the @ref number_float_t in use + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - @param[in,out] endptr recieves a pointer to the first character after - the number + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } + if(!p) { + return false; + } - /*! - @brief parse floating point number + if(*p == '-' or *p == '+') { + ++p; + } - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(p == m_end) { + return false; + } - @param[in] type the @ref number_float_t in use + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - @param[in,out] endptr recieves a pointer to the first character after - the number + return p == m_end; + } - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; + + static void strtof(float& f, + const char* str, + char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, + const char* str, + char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, + const char* str, + char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert + } + + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); + + if(ds_pos == len) { + break; // no decimal separator + } + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; + } + + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; + } + }; /*! @brief return number value for number tokens @@ -8275,111 +8435,57 @@ class basic_json This function translates the last token into the most appropriate number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). - - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + @param[out] result @ref basic_json object to receive the number. */ void get_number(basic_json& result) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - const lexer::lexer_char_t* curptr = m_start; + const bool is_negative = *m_start == '-'; - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') - { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; - } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - - // skip if definitely not an integer - if (type != value_t::number_float) + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; - - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - save it - value = temp; - } + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - result.m_value.number_integer = -static_cast(value); - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + result.m_type = value_t::number_float; + result.m_value = static_cast(num); - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - type = value_t::null; - result.m_value = basic_json::json_value(); - } + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); } - - // save the type - result.m_type = type; } private: diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index e04513ca..679b7f3d 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -375,7 +375,7 @@ TEST_CASE("regression tests") }; // change locale to mess with decimal points - std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); + auto orig_locale = std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); CHECK(j1a.dump() == "23.42"); CHECK(j1b.dump() == "23.42"); @@ -399,6 +399,8 @@ TEST_CASE("regression tests") CHECK(j3c.dump() == "10000"); //CHECK(j3b.dump() == "1E04"); // roundtrip error //CHECK(j3c.dump() == "1e04"); // roundtrip error + + std::locale::global(orig_locale); } SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") From 4eafaab8164667a2c309ac5b346981542b0873e5 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 22:54:36 -0500 Subject: [PATCH 02/23] Improved overflow detection; removed debugging output statements. --- src/json.hpp | 47 ++++++++++++++++++++++++----------------------- src/json.hpp.re2c | 47 ++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index e28333bd..70bfce5a 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9223,14 +9223,6 @@ basic_json_parser_66: value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -9252,31 +9244,40 @@ basic_json_parser_66: ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; + const unsigned_T upd_x = x * 10 + c; + + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 + + x = upd_x; ++beg; } - if(is_negative) { - value = 0 - value; - } + value = static_cast(is_negative ? 0 - x : x); - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; - } - - return valid; + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 72c413ba..418e302c 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8372,14 +8372,6 @@ class basic_json value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -8401,31 +8393,40 @@ class basic_json ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; + const unsigned_T upd_x = x * 10 + c; + + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 + + x = upd_x; ++beg; } - if(is_negative) { - value = 0 - value; - } + value = static_cast(is_negative ? 0 - x : x); - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; - } - - return valid; + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; From c75efedc6ec7048244300a6305b25b2e117b440c Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 23:19:43 -0500 Subject: [PATCH 03/23] stylistic changes --- src/json.hpp | 92 ++++++++++++++++++++++++++--------------------- src/json.hpp.re2c | 92 ++++++++++++++++++++++++++--------------------- 2 files changed, 104 insertions(+), 80 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 70bfce5a..56949a4c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,22 +9060,6 @@ basic_json_parser_66: return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -9086,6 +9070,9 @@ basic_json_parser_66: throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -9101,7 +9088,8 @@ basic_json_parser_66: { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -9113,24 +9101,35 @@ basic_json_parser_66: + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -9141,6 +9140,9 @@ basic_json_parser_66: const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -9173,20 +9175,22 @@ basic_json_parser_66: std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -9194,31 +9198,33 @@ basic_json_parser_66: // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -9232,15 +9238,17 @@ basic_json_parser_66: const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -9255,7 +9263,8 @@ basic_json_parser_66: and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -9310,10 +9319,11 @@ basic_json_parser_66: const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -9325,7 +9335,9 @@ basic_json_parser_66: result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 418e302c..d97057a8 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,22 +8209,6 @@ class basic_json return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -8235,6 +8219,9 @@ class basic_json throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -8250,7 +8237,8 @@ class basic_json { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -8262,24 +8250,35 @@ class basic_json + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -8290,6 +8289,9 @@ class basic_json const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -8322,20 +8324,22 @@ class basic_json std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -8343,31 +8347,33 @@ class basic_json // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -8381,15 +8387,17 @@ class basic_json const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -8404,7 +8412,8 @@ class basic_json and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -8459,10 +8468,11 @@ class basic_json const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -8474,7 +8484,9 @@ class basic_json result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } From e41a956782fd64a2bdf253a13f05bc5e8239ec61 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sun, 4 Dec 2016 13:23:39 -0500 Subject: [PATCH 04/23] Alternative handling of integer types relying on strto[u]ll --- src/json.hpp | 46 ++++++++++++++++++++++++++++++++++++++++++++-- src/json.hpp.re2c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 56949a4c..51b19518 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9073,6 +9073,8 @@ basic_json_parser_66: note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -9232,6 +9234,45 @@ basic_json_parser_66: return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -9240,7 +9281,7 @@ basic_json_parser_66: if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -9261,7 +9302,7 @@ basic_json_parser_66: bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -9288,6 +9329,7 @@ basic_json_parser_66: // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index d97057a8..e5f344b2 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8222,6 +8222,8 @@ class basic_json note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -8381,6 +8383,45 @@ class basic_json return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -8389,7 +8430,7 @@ class basic_json if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -8410,7 +8451,7 @@ class basic_json bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -8437,6 +8478,7 @@ class basic_json // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! From d64336057569281386aafed096a6db77d8aa901a Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 00:43:12 -0500 Subject: [PATCH 05/23] Bugfix: when working with C formatting functions we need to query C locales (localeconv) rather than std::locale --- src/json.hpp | 12 +++++++++++- src/json.hpp.re2c | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 51b19518..f1666c6c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9108,7 +9108,7 @@ basic_json_parser_66: // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -9172,9 +9172,19 @@ basic_json_parser_66: const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index e5f344b2..b8059035 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8257,7 +8257,7 @@ class basic_json // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -8321,9 +8321,19 @@ class basic_json const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed From 0c87d5d6b34d8595ceb645be1e0b82c4a4eaafe2 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:05 -0500 Subject: [PATCH 06/23] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index f1666c6c..19f1f109 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9169,33 +9169,28 @@ basic_json_parser_66: template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -9203,7 +9198,7 @@ basic_json_parser_66: if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -9215,18 +9210,19 @@ basic_json_parser_66: std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 7a081244a575284246d38a594fb0436ca2b6dacc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:31 -0500 Subject: [PATCH 07/23] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp.re2c | 48 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b8059035..27a53fb7 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8318,33 +8318,28 @@ class basic_json template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -8352,7 +8347,7 @@ class basic_json if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -8364,18 +8359,19 @@ class basic_json std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 6e8da7d8c4aada79740076755ed487da505ceb28 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:45:48 -0500 Subject: [PATCH 08/23] Added unit-test for issue #379 (locale-independent str-to-num) --- test/src/unit-regression.cpp | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 679b7f3d..2787fe26 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -403,6 +403,44 @@ TEST_CASE("regression tests") std::locale::global(orig_locale); } + SECTION("issue #379 - locale-independent str-to-num") + { + const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + + setlocale(LC_NUMERIC, "de_DE"); + std::array buf; + + { + // verify that snprintf now uses commas as decimal-separator + std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); + assert(std::strcmp(buf.data(), "3,14") == 0); + + // verify that strtod now uses commas as decimal-separator + const double d1 = std::strtod(buf.data(), nullptr); + assert(d1 == 3.14); + + // verify that strtod does not understand dots as decimal separator + const double d2 = std::strtod("3.14", nullptr); + assert(d2 == 3); + } + + const json j1 = json::parse("3.14"); + + // verify that parsed correctly despite using strtod internally + CHECK(j1.get() == 3.14); + + // verify that dumped correctly despite using snprintf internally + CHECK(j1.dump() == "3.14"); + + // check a different code path + const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); + CHECK(j2.get() == 1.0); + + // restore original locale + setlocale(LC_ALL, orig_locale_name.c_str()); + } + + SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") { json source = {"a", "b", "c"}; From d2e9ce270a642e4ca7e00fc160c81eadb55a5639 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:18:20 -0500 Subject: [PATCH 09/23] Trying to coerce setlocale to make snprintf use commas as delimiter; the behavior appears to be compiler/platform-specific --- test/src/unit-regression.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 2787fe26..17616a3e 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,21 +407,21 @@ TEST_CASE("regression tests") { const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "de_DE"); + setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); std::array buf; { // verify that snprintf now uses commas as decimal-separator std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - assert(std::strcmp(buf.data(), "3,14") == 0); + CHECK(std::strcmp(buf.data(), "3,14") == 0); // verify that strtod now uses commas as decimal-separator const double d1 = std::strtod(buf.data(), nullptr); - assert(d1 == 3.14); + CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator const double d2 = std::strtod("3.14", nullptr); - assert(d2 == 3); + CHECK(d2 == 3); } const json j1 = json::parse("3.14"); From d169598c6c236ce115a8453d917d99efc79fa3a3 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:20:48 -0500 Subject: [PATCH 10/23] simplified code a bit based on @gregmarr's suggestions --- src/json.hpp | 69 ++++++++++++++++++++++------------------------- src/json.hpp.re2c | 69 ++++++++++++++++++++++------------------------- 2 files changed, 64 insertions(+), 74 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 19f1f109..1644d91f 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9177,48 +9177,43 @@ basic_json_parser_66: std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) + + if (ds_pos != len) { - return m_start; // no decimal separator + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else - { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); - } - }(); + } char* endptr = nullptr; value = 0; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 27a53fb7..b541db1e 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8326,48 +8326,43 @@ class basic_json std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) + + if (ds_pos != len) { - return m_start; // no decimal separator + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else - { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); - } - }(); + } char* endptr = nullptr; value = 0; From 6774457733524e81d7aa6fd0f3df6fd48423d0f7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:59:12 -0500 Subject: [PATCH 11/23] Trying to coerce setlocale to make snprintf use commas as delimiter some more --- test/src/unit-regression.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 17616a3e..41ddfa40 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,9 +405,14 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + // If I save the locale here and restore it in the end + // of this block, then setLocale(LC_NUMERIC, "de_DE") + // does not actually make snprintf use "," as decimal separator + // on some compilers. I have no idea... + // + //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); + setlocale(LC_NUMERIC, "de_DE"); std::array buf; { @@ -437,7 +442,7 @@ TEST_CASE("regression tests") CHECK(j2.get() == 1.0); // restore original locale - setlocale(LC_ALL, orig_locale_name.c_str()); + // setlocale(LC_ALL, orig_locale_name.c_str()); } From 0a4a6a8399d1f4194d009c50ec076dd287cafa9e Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:53:27 -0500 Subject: [PATCH 12/23] Refactored to avoid using exceptions, as there are plans to support exceptionless mode --- src/json.hpp | 78 +++++++++++++++++++++++++++++++++++------------ src/json.hpp.re2c | 78 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 118 insertions(+), 38 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 1644d91f..79d6bb02 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9079,10 +9079,42 @@ basic_json_parser_66: struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -9361,31 +9393,39 @@ basic_json_parser_66: const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { - result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + else { - ; // overflow - will parse as double + number_unsigned_t val{0}; + if(num.to(val)) { + result.m_type = value_t::number_unsigned; + result.m_value = val; + } + } + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) + { + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b541db1e..24c54975 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8228,10 +8228,42 @@ class basic_json struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -8510,31 +8542,39 @@ class basic_json const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { - result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + else { - ; // overflow - will parse as double + number_unsigned_t val{0}; + if(num.to(val)) { + result.m_type = value_t::number_unsigned; + result.m_value = val; + } + } + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) + { + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) From 27d9740ad6d9b564beb0e995e0ab393c6ab3daf7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:55:07 -0500 Subject: [PATCH 13/23] Tweaks to unit-test for issue #379 --- test/src/unit-regression.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 41ddfa40..deace862 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -406,22 +406,27 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") + // of this block, then setLocale(LC_NUMERIC, "de_DE") // does not actually make snprintf use "," as decimal separator // on some compilers. I have no idea... - // //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - setlocale(LC_NUMERIC, "de_DE"); + + + // Still, just setting some comma-using locale does not + // make snprintf output commas on all platforms, so instead + // will change dot to comma in the current locale below + //setlocale(LC_NUMERIC, "de_DE"); + + auto loc = localeconv(); + auto orig_decimal_point = loc->decimal_point; + char comma[] = ","; + loc->decimal_point = comma; + std::array buf; { - // verify that snprintf now uses commas as decimal-separator - std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - CHECK(std::strcmp(buf.data(), "3,14") == 0); - // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod(buf.data(), nullptr); + const double d1 = std::strtod("3,14", nullptr); CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator @@ -429,18 +434,15 @@ TEST_CASE("regression tests") CHECK(d2 == 3); } - const json j1 = json::parse("3.14"); - // verify that parsed correctly despite using strtod internally + const json j1 = json::parse("3.14"); CHECK(j1.get() == 3.14); - // verify that dumped correctly despite using snprintf internally - CHECK(j1.dump() == "3.14"); - // check a different code path const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); CHECK(j2.get() == 1.0); + loc->decimal_point = orig_decimal_point; // restore original locale // setlocale(LC_ALL, orig_locale_name.c_str()); } From 38499e84fc771b21db7764ff4e014f7c3939d4b0 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 21:38:14 -0500 Subject: [PATCH 14/23] Removed unused struct; fixed comments --- src/json.hpp | 69 +++++++++-------------------------------------- src/json.hpp.re2c | 69 +++++++++-------------------------------------- 2 files changed, 24 insertions(+), 114 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 79d6bb02..867afbb3 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9065,12 +9065,6 @@ basic_json_parser_66: @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -9079,34 +9073,15 @@ basic_json_parser_66: struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -9115,33 +9090,13 @@ basic_json_parser_66: return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 24c54975..5b2dc8d2 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8214,12 +8214,6 @@ class basic_json @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -8228,34 +8222,15 @@ class basic_json struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -8264,33 +8239,13 @@ class basic_json return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; From 1c029b97c0ebc90b5b7f1cb875b0904f115668b8 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 22:13:05 -0500 Subject: [PATCH 15/23] Still trying to invoke locale-specific behavior in CI --- test/src/unit-regression.cpp | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index deace862..5cd6356d 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,46 +405,21 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") - // does not actually make snprintf use "," as decimal separator - // on some compilers. I have no idea... - //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - - // Still, just setting some comma-using locale does not - // make snprintf output commas on all platforms, so instead - // will change dot to comma in the current locale below - //setlocale(LC_NUMERIC, "de_DE"); - - auto loc = localeconv(); - auto orig_decimal_point = loc->decimal_point; - char comma[] = ","; - loc->decimal_point = comma; - - std::array buf; + setlocale(LC_NUMERIC, "de_DE.UTF-8"); { // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod("3,14", nullptr); - CHECK(d1 == 3.14); + CHECK(std::strtod("3,14", nullptr) == 3.14); // verify that strtod does not understand dots as decimal separator - const double d2 = std::strtod("3.14", nullptr); - CHECK(d2 == 3); + CHECK(std::strtod("3.14", nullptr) == 3); } // verify that parsed correctly despite using strtod internally - const json j1 = json::parse("3.14"); - CHECK(j1.get() == 3.14); + CHECK(json::parse("3.14").get() == 3.14); // check a different code path - const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); - CHECK(j2.get() == 1.0); - - loc->decimal_point = orig_decimal_point; - // restore original locale - // setlocale(LC_ALL, orig_locale_name.c_str()); + CHECK(json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000").get() == 1.0); } From cd0b651d43c303dbe0b8016a1305ab9e5fef6449 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:46:47 -0500 Subject: [PATCH 16/23] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 867afbb3..b526400c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9103,7 +9103,7 @@ basic_json_parser_66: if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -9113,7 +9113,7 @@ basic_json_parser_66: if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -9252,11 +9252,11 @@ basic_json_parser_66: value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually From 0f8de48ddbc03b842ef5523b10553d3d319e1666 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:48:14 -0500 Subject: [PATCH 17/23] Disabling strtod pre-check, since can't get locale-specific behavior to manifest in AppVeyor --- test/src/unit-regression.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 5cd6356d..7016fcaf 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,6 +407,9 @@ TEST_CASE("regression tests") { setlocale(LC_NUMERIC, "de_DE.UTF-8"); + // disabled, because locale-specific beharivor is not + // triggered in AppVeyor for some reason +#if 0 { // verify that strtod now uses commas as decimal-separator CHECK(std::strtod("3,14", nullptr) == 3.14); @@ -414,6 +417,7 @@ TEST_CASE("regression tests") // verify that strtod does not understand dots as decimal separator CHECK(std::strtod("3.14", nullptr) == 3); } +#endif // verify that parsed correctly despite using strtod internally CHECK(json::parse("3.14").get() == 3.14); From 5cad2006eb99534274c380af1ad92d5b257c83bc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 20:15:57 -0500 Subject: [PATCH 18/23] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp.re2c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 5b2dc8d2..208489ea 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8252,7 +8252,7 @@ class basic_json if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -8262,7 +8262,7 @@ class basic_json if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -8401,11 +8401,11 @@ class basic_json value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually From 9f5dccbab5765638632b5b380fdecd4533dc5c3e Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sat, 11 Feb 2017 15:55:53 +0100 Subject: [PATCH 19/23] :lipstick: cleanup --- src/json.hpp | 146 ++++++++++++++++++++-------------------------- src/json.hpp.re2c | 146 ++++++++++++++++++++-------------------------- 2 files changed, 128 insertions(+), 164 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 143ae0b8..2d98db8c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -10840,13 +10840,13 @@ basic_json_parser_66: /*! - @brief parse string into a built-in arithmetic type as if - the current locale is POSIX. + @brief parse string into a built-in arithmetic type as if the current + locale is POSIX. - note: in floating-point case strtod may parse - past the token's end - this is not an error. + @note in floating-point case strtod may parse past the token's end - + this is not an error - any leading blanks are not handled. + @note any leading blanks are not handled */ struct strtonum { @@ -10855,36 +10855,37 @@ basic_json_parser_66: : m_start(start), m_end(end) {} - /// return true iff parsed successfully as - /// number of type T. - /// - /// @val shall contain parsed value, or - /// undefined value if could not parse. - template::value>::type > + /*! + @return true iff parsed successfully as number of type T + + @param[in,out] val shall contain parsed value, or undefined value + if could not parse + */ + template::value>::type> bool to(T& val) const { return parse(val, std::is_integral()); } - /// return true iff token matches ^[+-]\d+$ - /// - /// this is a helper to determine whether to - /// parse the token into floating-point or - /// integral type. We wouldn't need it if - /// we had separate token types for integral - /// and floating-point cases. + /*! + This is a helper to determine whether to parse the token into + floating-point or integral type. + + @note We wouldn't need it if we had separate token types for + integral and floating-point cases. + + @return true iff token matches `^[+-]\d+$` + */ bool is_integral() const { const char* p = m_start; - if (!p) + if (p == nullptr) { return false; // LCOV_EXCL_LINE } - if (*p == '-' or * p == '+') + if ((*p == '-') or (*p == '+')) { ++p; } @@ -10894,39 +10895,31 @@ basic_json_parser_66: return false; // LCOV_EXCL_LINE } - while (p < m_end and* p >= '0' - and * p <= '9') + while ((p < m_end) and (*p >= '0') and (*p <= '9')) { ++p; } - return p == m_end; + return (p == m_end); } private: const char* const m_start = nullptr; - const char* const m_end = nullptr; + const char* const m_end = nullptr; // overloaded wrappers for strtod/strtof/strtold // that will be called from parse - - static void strtof(float& f, - const char* str, - char** endptr) + static void strtof(float& f, const char* str, char** endptr) { f = std::strtof(str, endptr); } - static void strtof(double& f, - const char* str, - char** endptr) + static void strtof(double& f, const char* str, char** endptr) { f = std::strtod(str, endptr); } - static void strtof(long double& f, - const char* str, - char** endptr) + static void strtof(long double& f, const char* str, char** endptr) { f = std::strtold(str, endptr); } @@ -10934,37 +10927,32 @@ basic_json_parser_66: template bool parse(T& value, /*is_integral=*/std::false_type) const { - // replace decimal separator with locale-specific - // version, when necessary; data will point to - // either the original string, or buf, or tempstr - // containing the fixed string. + // replace decimal separator with locale-specific version, + // when necessary; data will point to either the original + // string, or buf, or tempstr containing the fixed string. std::string tempstr; std::array buf; const size_t len = static_cast(m_end - m_start); - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; + // since dealing with strtod family of functions, we're + // getting the decimal point char from the C locale facilities + // instead of C++'s numpunct facet of the current std::locale const auto loc = localeconv(); assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; + const char decimal_point_char = (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; const char* data = m_start; if (decimal_point_char != '.') { - const size_t ds_pos = static_cast( - std::find(m_start, m_end, '.') - m_start ); + const size_t ds_pos = static_cast(std::find(m_start, m_end, '.') - m_start); if (ds_pos != len) { - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) + // copy the data into the local buffer or tempstr, if + // buffer is too small; replace decimal separator, and + // update data to point to the modified bytes + if ((len + 1) < buf.size()) { std::copy(m_start, m_end, buf.data()); buf[len] = 0; @@ -10985,13 +10973,12 @@ basic_json_parser_66: // this calls appropriate overload depending on T strtof(value, data, &endptr); - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only - // contains "123", but strtod will read the dot as well. - const bool ok = endptr >= data + len - and len > 0; + // note that reading past the end is OK, the data may be, for + // example, "123.", where the parsed token only contains + // "123", but strtod will read the dot as well. + const bool ok = (endptr >= (data + len)) and (len > 0); - if (ok and value == 0.0 and * data == '-') + if (ok and (value == 0.0) and (*data == '-')) { // some implementations forget to negate the zero value = -0.0; @@ -11000,16 +10987,12 @@ basic_json_parser_66: return ok; } - signed long long parse_integral( - char** endptr, - /*is_signed*/std::true_type) const + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const { return std::strtoll(m_start, endptr, 10); } - unsigned long long parse_integral( - char** endptr, - /*is_signed*/std::false_type) const + unsigned long long parse_integral(char** endptr, /*is_signed*/std::false_type) const { return std::strtoull(m_start, endptr, 10); } @@ -11018,20 +11001,20 @@ basic_json_parser_66: bool parse(T& value, /*is_integral=*/std::true_type) const { char* endptr = nullptr; - errno = 0; // these are thread-local + errno = 0; // these are thread-local const auto x = parse_integral(&endptr, std::is_signed()); - static_assert(std::is_signed() // called right overload? - == std::is_signed(), ""); + // called right overload? + static_assert(std::is_signed() == std::is_signed(), ""); value = static_cast(x); - return x == static_cast(value) // x fits into destination T - and (x < 0) == (value < 0) // preserved sign - and (x != 0 or is_integral()) // strto[u]ll did nto fail - and errno == 0 // strto[u]ll did not overflow - and m_start < m_end // token was not empty - and endptr == m_end; // parsed entire token exactly + return (x == static_cast(value)) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign + and ((x != 0) or is_integral()) // strto[u]ll did nto fail + and (errno == 0) // strto[u]ll did not overflow + and (m_start < m_end) // token was not empty + and (endptr == m_end); // parsed entire token exactly } }; @@ -11061,20 +11044,20 @@ basic_json_parser_66: strtonum num(reinterpret_cast(m_start), reinterpret_cast(m_cursor)); - const bool is_negative = *m_start == '-'; + const bool is_negative = (*m_start == '-'); result.m_type = value_t::discarded; if (not num.is_integral()) { - ; // will parse as float below + // will parse as float below } else if (is_negative) { number_integer_t val{0}; if (num.to(val)) { - result.m_type = value_t::number_integer; + result.m_type = value_t::number_integer; result.m_value = val; } } @@ -11089,14 +11072,13 @@ basic_json_parser_66: } number_float_t val{0}; - if (result.m_type != value_t::discarded - or !num.to(val)) + if (result.m_type != value_t::discarded or (not num.to(val))) { - return; // already have a value from above - // or couldn't parse as float_t + // already have a value from above or couldn't parse as float_t + return; } - result.m_type = value_t::number_float; + result.m_type = value_t::number_float; result.m_value = val; // replace infinity and NAN by null diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 698fbb3b..b3f7af8d 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -9990,13 +9990,13 @@ class basic_json /*! - @brief parse string into a built-in arithmetic type as if - the current locale is POSIX. + @brief parse string into a built-in arithmetic type as if the current + locale is POSIX. - note: in floating-point case strtod may parse - past the token's end - this is not an error. + @note in floating-point case strtod may parse past the token's end - + this is not an error - any leading blanks are not handled. + @note any leading blanks are not handled */ struct strtonum { @@ -10005,36 +10005,37 @@ class basic_json : m_start(start), m_end(end) {} - /// return true iff parsed successfully as - /// number of type T. - /// - /// @val shall contain parsed value, or - /// undefined value if could not parse. - template::value>::type > + /*! + @return true iff parsed successfully as number of type T + + @param[in,out] val shall contain parsed value, or undefined value + if could not parse + */ + template::value>::type> bool to(T& val) const { return parse(val, std::is_integral()); } - /// return true iff token matches ^[+-]\d+$ - /// - /// this is a helper to determine whether to - /// parse the token into floating-point or - /// integral type. We wouldn't need it if - /// we had separate token types for integral - /// and floating-point cases. + /*! + This is a helper to determine whether to parse the token into + floating-point or integral type. + + @note We wouldn't need it if we had separate token types for + integral and floating-point cases. + + @return true iff token matches `^[+-]\d+$` + */ bool is_integral() const { const char* p = m_start; - if (!p) + if (p == nullptr) { return false; // LCOV_EXCL_LINE } - if (*p == '-' or * p == '+') + if ((*p == '-') or (*p == '+')) { ++p; } @@ -10044,39 +10045,31 @@ class basic_json return false; // LCOV_EXCL_LINE } - while (p < m_end and* p >= '0' - and * p <= '9') + while ((p < m_end) and (*p >= '0') and (*p <= '9')) { ++p; } - return p == m_end; + return (p == m_end); } private: const char* const m_start = nullptr; - const char* const m_end = nullptr; + const char* const m_end = nullptr; // overloaded wrappers for strtod/strtof/strtold // that will be called from parse - - static void strtof(float& f, - const char* str, - char** endptr) + static void strtof(float& f, const char* str, char** endptr) { f = std::strtof(str, endptr); } - static void strtof(double& f, - const char* str, - char** endptr) + static void strtof(double& f, const char* str, char** endptr) { f = std::strtod(str, endptr); } - static void strtof(long double& f, - const char* str, - char** endptr) + static void strtof(long double& f, const char* str, char** endptr) { f = std::strtold(str, endptr); } @@ -10084,37 +10077,32 @@ class basic_json template bool parse(T& value, /*is_integral=*/std::false_type) const { - // replace decimal separator with locale-specific - // version, when necessary; data will point to - // either the original string, or buf, or tempstr - // containing the fixed string. + // replace decimal separator with locale-specific version, + // when necessary; data will point to either the original + // string, or buf, or tempstr containing the fixed string. std::string tempstr; std::array buf; const size_t len = static_cast(m_end - m_start); - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; + // since dealing with strtod family of functions, we're + // getting the decimal point char from the C locale facilities + // instead of C++'s numpunct facet of the current std::locale const auto loc = localeconv(); assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; + const char decimal_point_char = (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; const char* data = m_start; if (decimal_point_char != '.') { - const size_t ds_pos = static_cast( - std::find(m_start, m_end, '.') - m_start ); + const size_t ds_pos = static_cast(std::find(m_start, m_end, '.') - m_start); if (ds_pos != len) { - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) + // copy the data into the local buffer or tempstr, if + // buffer is too small; replace decimal separator, and + // update data to point to the modified bytes + if ((len + 1) < buf.size()) { std::copy(m_start, m_end, buf.data()); buf[len] = 0; @@ -10135,13 +10123,12 @@ class basic_json // this calls appropriate overload depending on T strtof(value, data, &endptr); - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only - // contains "123", but strtod will read the dot as well. - const bool ok = endptr >= data + len - and len > 0; + // note that reading past the end is OK, the data may be, for + // example, "123.", where the parsed token only contains + // "123", but strtod will read the dot as well. + const bool ok = (endptr >= (data + len)) and (len > 0); - if (ok and value == 0.0 and * data == '-') + if (ok and (value == 0.0) and (*data == '-')) { // some implementations forget to negate the zero value = -0.0; @@ -10150,16 +10137,12 @@ class basic_json return ok; } - signed long long parse_integral( - char** endptr, - /*is_signed*/std::true_type) const + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const { return std::strtoll(m_start, endptr, 10); } - unsigned long long parse_integral( - char** endptr, - /*is_signed*/std::false_type) const + unsigned long long parse_integral(char** endptr, /*is_signed*/std::false_type) const { return std::strtoull(m_start, endptr, 10); } @@ -10168,20 +10151,20 @@ class basic_json bool parse(T& value, /*is_integral=*/std::true_type) const { char* endptr = nullptr; - errno = 0; // these are thread-local + errno = 0; // these are thread-local const auto x = parse_integral(&endptr, std::is_signed()); - static_assert(std::is_signed() // called right overload? - == std::is_signed(), ""); + // called right overload? + static_assert(std::is_signed() == std::is_signed(), ""); value = static_cast(x); - return x == static_cast(value) // x fits into destination T - and (x < 0) == (value < 0) // preserved sign - and (x != 0 or is_integral()) // strto[u]ll did nto fail - and errno == 0 // strto[u]ll did not overflow - and m_start < m_end // token was not empty - and endptr == m_end; // parsed entire token exactly + return (x == static_cast(value)) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign + and ((x != 0) or is_integral()) // strto[u]ll did nto fail + and (errno == 0) // strto[u]ll did not overflow + and (m_start < m_end) // token was not empty + and (endptr == m_end); // parsed entire token exactly } }; @@ -10211,20 +10194,20 @@ class basic_json strtonum num(reinterpret_cast(m_start), reinterpret_cast(m_cursor)); - const bool is_negative = *m_start == '-'; + const bool is_negative = (*m_start == '-'); result.m_type = value_t::discarded; if (not num.is_integral()) { - ; // will parse as float below + // will parse as float below } else if (is_negative) { number_integer_t val{0}; if (num.to(val)) { - result.m_type = value_t::number_integer; + result.m_type = value_t::number_integer; result.m_value = val; } } @@ -10239,14 +10222,13 @@ class basic_json } number_float_t val{0}; - if (result.m_type != value_t::discarded - or !num.to(val)) + if (result.m_type != value_t::discarded or (not num.to(val))) { - return; // already have a value from above - // or couldn't parse as float_t + // already have a value from above or couldn't parse as float_t + return; } - result.m_type = value_t::number_float; + result.m_type = value_t::number_float; result.m_value = val; // replace infinity and NAN by null From c2d55109c1f6b5b39902f52291dee28263b90fa1 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sat, 11 Feb 2017 16:54:25 +0100 Subject: [PATCH 20/23] :hammer: make lexer distinguishes number types --- src/json.hpp | 318 ++++++++++++++++++++-------------- src/json.hpp.re2c | 136 +++++++-------- test/src/unit-class_lexer.cpp | 36 ++-- 3 files changed, 275 insertions(+), 215 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 2d98db8c..b8cdc480 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9444,7 +9444,9 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_number, ///< a number -- use get_number() for actual value + value_unsigned_integer, ///< an unsigned integer -- use get_number() for actual value + value_signed_integer, ///< a signed integer -- use get_number() for actual value + value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` end_array, ///< the character for array end `]` @@ -9596,7 +9598,9 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case token_type::value_number: + case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_signed_integer: + case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: return "'['"; @@ -9869,11 +9873,11 @@ basic_json_parser_12: } if (yych <= '0') { - goto basic_json_parser_13; + goto basic_json_parser_43; } if (yych <= '9') { - goto basic_json_parser_15; + goto basic_json_parser_45; } goto basic_json_parser_5; basic_json_parser_13: @@ -9883,23 +9887,23 @@ basic_json_parser_13: { if (yych == '.') { - goto basic_json_parser_43; + goto basic_json_parser_47; } } else { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } } basic_json_parser_14: { - last_token_type = token_type::value_number; + last_token_type = token_type::value_unsigned_integer; break; } basic_json_parser_15: @@ -9918,7 +9922,7 @@ basic_json_parser_15: { if (yych == '.') { - goto basic_json_parser_43; + goto basic_json_parser_47; } goto basic_json_parser_14; } @@ -9926,11 +9930,11 @@ basic_json_parser_15: { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } goto basic_json_parser_14; } @@ -9957,7 +9961,7 @@ basic_json_parser_23: yych = *(m_marker = ++m_cursor); if (yych == 'a') { - goto basic_json_parser_45; + goto basic_json_parser_49; } goto basic_json_parser_5; basic_json_parser_24: @@ -9965,7 +9969,7 @@ basic_json_parser_24: yych = *(m_marker = ++m_cursor); if (yych == 'u') { - goto basic_json_parser_46; + goto basic_json_parser_50; } goto basic_json_parser_5; basic_json_parser_25: @@ -9973,7 +9977,7 @@ basic_json_parser_25: yych = *(m_marker = ++m_cursor); if (yych == 'r') { - goto basic_json_parser_47; + goto basic_json_parser_51; } goto basic_json_parser_5; basic_json_parser_26: @@ -10055,13 +10059,27 @@ basic_json_parser_31: } basic_json_parser_32: m_cursor = m_marker; - if (yyaccept == 0) + if (yyaccept <= 1) { - goto basic_json_parser_5; + if (yyaccept == 0) + { + goto basic_json_parser_5; + } + else + { + goto basic_json_parser_14; + } } else { - goto basic_json_parser_14; + if (yyaccept == 2) + { + goto basic_json_parser_44; + } + else + { + goto basic_json_parser_55; + } } basic_json_parser_33: ++m_cursor; @@ -10142,7 +10160,7 @@ basic_json_parser_35: } if (yych <= 'u') { - goto basic_json_parser_48; + goto basic_json_parser_52; } goto basic_json_parser_32; } @@ -10261,6 +10279,71 @@ basic_json_parser_42: } goto basic_json_parser_32; basic_json_parser_43: + yyaccept = 2; + yych = *(m_marker = ++m_cursor); + if (yych <= 'D') + { + if (yych == '.') + { + goto basic_json_parser_47; + } + } + else + { + if (yych <= 'E') + { + goto basic_json_parser_48; + } + if (yych == 'e') + { + goto basic_json_parser_48; + } + } +basic_json_parser_44: + { + last_token_type = token_type::value_signed_integer; + break; + } +basic_json_parser_45: + yyaccept = 2; + m_marker = ++m_cursor; + if ((m_limit - m_cursor) < 3) + { + fill_line_buffer(3); // LCOV_EXCL_LINE + } + yych = *m_cursor; + if (yych <= '9') + { + if (yych == '.') + { + goto basic_json_parser_47; + } + if (yych <= '/') + { + goto basic_json_parser_44; + } + goto basic_json_parser_45; + } + else + { + if (yych <= 'E') + { + if (yych <= 'D') + { + goto basic_json_parser_44; + } + goto basic_json_parser_48; + } + else + { + if (yych == 'e') + { + goto basic_json_parser_48; + } + goto basic_json_parser_44; + } + } +basic_json_parser_47: yych = *++m_cursor; if (yych <= '/') { @@ -10268,16 +10351,16 @@ basic_json_parser_43: } if (yych <= '9') { - goto basic_json_parser_49; + goto basic_json_parser_53; } goto basic_json_parser_32; -basic_json_parser_44: +basic_json_parser_48: yych = *++m_cursor; if (yych <= ',') { if (yych == '+') { - goto basic_json_parser_51; + goto basic_json_parser_56; } goto basic_json_parser_32; } @@ -10285,7 +10368,7 @@ basic_json_parser_44: { if (yych <= '-') { - goto basic_json_parser_51; + goto basic_json_parser_56; } if (yych <= '/') { @@ -10293,32 +10376,32 @@ basic_json_parser_44: } if (yych <= '9') { - goto basic_json_parser_52; + goto basic_json_parser_57; } goto basic_json_parser_32; } -basic_json_parser_45: +basic_json_parser_49: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_54; + goto basic_json_parser_59; } goto basic_json_parser_32; -basic_json_parser_46: +basic_json_parser_50: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_55; + goto basic_json_parser_60; } goto basic_json_parser_32; -basic_json_parser_47: +basic_json_parser_51: yych = *++m_cursor; if (yych == 'u') { - goto basic_json_parser_56; + goto basic_json_parser_61; } goto basic_json_parser_32; -basic_json_parser_48: +basic_json_parser_52: ++m_cursor; if (m_limit <= m_cursor) { @@ -10333,7 +10416,7 @@ basic_json_parser_48: } if (yych <= '9') { - goto basic_json_parser_57; + goto basic_json_parser_62; } goto basic_json_parser_32; } @@ -10341,7 +10424,7 @@ basic_json_parser_48: { if (yych <= 'F') { - goto basic_json_parser_57; + goto basic_json_parser_62; } if (yych <= '`') { @@ -10349,12 +10432,12 @@ basic_json_parser_48: } if (yych <= 'f') { - goto basic_json_parser_57; + goto basic_json_parser_62; } goto basic_json_parser_32; } -basic_json_parser_49: - yyaccept = 1; +basic_json_parser_53: + yyaccept = 3; m_marker = ++m_cursor; if ((m_limit - m_cursor) < 3) { @@ -10365,27 +10448,30 @@ basic_json_parser_49: { if (yych <= '/') { - goto basic_json_parser_14; + goto basic_json_parser_55; } if (yych <= '9') { - goto basic_json_parser_49; + goto basic_json_parser_53; } - goto basic_json_parser_14; } else { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } - goto basic_json_parser_14; } -basic_json_parser_51: +basic_json_parser_55: + { + last_token_type = token_type::value_float; + break; + } +basic_json_parser_56: yych = *++m_cursor; if (yych <= '/') { @@ -10395,7 +10481,7 @@ basic_json_parser_51: { goto basic_json_parser_32; } -basic_json_parser_52: +basic_json_parser_57: ++m_cursor; if (m_limit <= m_cursor) { @@ -10404,35 +10490,35 @@ basic_json_parser_52: yych = *m_cursor; if (yych <= '/') { - goto basic_json_parser_14; + goto basic_json_parser_55; } if (yych <= '9') { - goto basic_json_parser_52; + goto basic_json_parser_57; } - goto basic_json_parser_14; -basic_json_parser_54: + goto basic_json_parser_55; +basic_json_parser_59: yych = *++m_cursor; if (yych == 's') { - goto basic_json_parser_58; + goto basic_json_parser_63; } goto basic_json_parser_32; -basic_json_parser_55: +basic_json_parser_60: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_59; + goto basic_json_parser_64; } goto basic_json_parser_32; -basic_json_parser_56: +basic_json_parser_61: yych = *++m_cursor; if (yych == 'e') { - goto basic_json_parser_61; + goto basic_json_parser_66; } goto basic_json_parser_32; -basic_json_parser_57: +basic_json_parser_62: ++m_cursor; if (m_limit <= m_cursor) { @@ -10447,7 +10533,7 @@ basic_json_parser_57: } if (yych <= '9') { - goto basic_json_parser_63; + goto basic_json_parser_68; } goto basic_json_parser_32; } @@ -10455,7 +10541,7 @@ basic_json_parser_57: { if (yych <= 'F') { - goto basic_json_parser_63; + goto basic_json_parser_68; } if (yych <= '`') { @@ -10463,30 +10549,30 @@ basic_json_parser_57: } if (yych <= 'f') { - goto basic_json_parser_63; + goto basic_json_parser_68; } goto basic_json_parser_32; } -basic_json_parser_58: +basic_json_parser_63: yych = *++m_cursor; if (yych == 'e') { - goto basic_json_parser_64; + goto basic_json_parser_69; } goto basic_json_parser_32; -basic_json_parser_59: +basic_json_parser_64: ++m_cursor; { last_token_type = token_type::literal_null; break; } -basic_json_parser_61: +basic_json_parser_66: ++m_cursor; { last_token_type = token_type::literal_true; break; } -basic_json_parser_63: +basic_json_parser_68: ++m_cursor; if (m_limit <= m_cursor) { @@ -10501,7 +10587,7 @@ basic_json_parser_63: } if (yych <= '9') { - goto basic_json_parser_66; + goto basic_json_parser_71; } goto basic_json_parser_32; } @@ -10509,7 +10595,7 @@ basic_json_parser_63: { if (yych <= 'F') { - goto basic_json_parser_66; + goto basic_json_parser_71; } if (yych <= '`') { @@ -10517,17 +10603,17 @@ basic_json_parser_63: } if (yych <= 'f') { - goto basic_json_parser_66; + goto basic_json_parser_71; } goto basic_json_parser_32; } -basic_json_parser_64: +basic_json_parser_69: ++m_cursor; { last_token_type = token_type::literal_false; break; } -basic_json_parser_66: +basic_json_parser_71: ++m_cursor; if (m_limit <= m_cursor) { @@ -10867,42 +10953,6 @@ basic_json_parser_66: return parse(val, std::is_integral()); } - /*! - This is a helper to determine whether to parse the token into - floating-point or integral type. - - @note We wouldn't need it if we had separate token types for - integral and floating-point cases. - - @return true iff token matches `^[+-]\d+$` - */ - bool is_integral() const - { - const char* p = m_start; - - if (p == nullptr) - { - return false; // LCOV_EXCL_LINE - } - - if ((*p == '-') or (*p == '+')) - { - ++p; - } - - if (p == m_end) - { - return false; // LCOV_EXCL_LINE - } - - while ((p < m_end) and (*p >= '0') and (*p <= '9')) - { - ++p; - } - - return (p == m_end); - } - private: const char* const m_start = nullptr; const char* const m_end = nullptr; @@ -11011,7 +11061,7 @@ basic_json_parser_66: return (x == static_cast(value)) // x fits into destination T and (x < 0) == (value < 0) // preserved sign - and ((x != 0) or is_integral()) // strto[u]ll did nto fail + //and ((x != 0) or is_integral()) // strto[u]ll did nto fail and (errno == 0) // strto[u]ll did not overflow and (m_start < m_end) // token was not empty and (endptr == m_end); // parsed entire token exactly @@ -11035,46 +11085,56 @@ basic_json_parser_66: interpreted as a number @param[out] result @ref basic_json object to receive the number. + @param[in] token the type of the number token */ - void get_number(basic_json& result) const + void get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); assert(m_start < m_cursor); + assert((token == token_type::value_unsigned_integer) or + (token == token_type::value_signed_integer) or + (token == token_type::value_float)); strtonum num(reinterpret_cast(m_start), reinterpret_cast(m_cursor)); - const bool is_negative = (*m_start == '-'); - - result.m_type = value_t::discarded; - - if (not num.is_integral()) + switch (token) { - // will parse as float below - } - else if (is_negative) - { - number_integer_t val{0}; - if (num.to(val)) + case lexer::token_type::value_unsigned_integer: { - result.m_type = value_t::number_integer; - result.m_value = val; + number_unsigned_t val{0}; + if (num.to(val)) + { + result.m_type = value_t::number_unsigned; + result.m_value = val; + return; + } + break; } - } - else - { - number_unsigned_t val{0}; - if (num.to(val)) + + case lexer::token_type::value_signed_integer: { - result.m_type = value_t::number_unsigned; - result.m_value = val; + number_integer_t val{0}; + if (num.to(val)) + { + result.m_type = value_t::number_integer; + result.m_value = val; + return; + } + break; + } + + default: + { + break; } } number_float_t val{0}; - if (result.m_type != value_t::discarded or (not num.to(val))) + if (not num.to(val)) { - // already have a value from above or couldn't parse as float_t + // couldn't parse as float_t + result.m_type = value_t::discarded; return; } @@ -11330,9 +11390,11 @@ basic_json_parser_66: break; } - case lexer::token_type::value_number: + case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_signed_integer: + case lexer::token_type::value_float: { - m_lexer.get_number(result); + m_lexer.get_number(result, last_token); get_token(); break; } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b3f7af8d..51760eef 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -9444,7 +9444,9 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_number, ///< a number -- use get_number() for actual value + value_unsigned_integer, ///< an unsigned integer -- use get_number() for actual value + value_signed_integer, ///< a signed integer -- use get_number() for actual value + value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` end_array, ///< the character for array end `]` @@ -9596,7 +9598,9 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case token_type::value_number: + case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_signed_integer: + case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: return "'['"; @@ -9684,18 +9688,22 @@ class basic_json "false" { last_token_type = token_type::literal_false; break; } // number - decimal_point = "."; - digit = [0-9]; - digit_1_9 = [1-9]; - e = "e" | "E"; - minus = "-"; - plus = "+"; - zero = "0"; - exp = e (minus | plus)? digit+; - frac = decimal_point digit+; - int = (zero | digit_1_9 digit*); - number = minus? int frac? exp?; - number { last_token_type = token_type::value_number; break; } + decimal_point = "."; + digit = [0-9]; + digit_1_9 = [1-9]; + e = "e" | "E"; + minus = "-"; + plus = "+"; + zero = "0"; + exp = e (minus | plus)? digit+; + frac = decimal_point digit+; + int = (zero | digit_1_9 digit*); + number_unsigned = int; + number_unsigned { last_token_type = token_type::value_unsigned_integer; break; } + number_signed = minus int; + number_signed { last_token_type = token_type::value_signed_integer; break; } + number_float = minus? int frac? exp?; + number_float { last_token_type = token_type::value_float; break; } // string quotation_mark = "\""; @@ -10017,42 +10025,6 @@ class basic_json return parse(val, std::is_integral()); } - /*! - This is a helper to determine whether to parse the token into - floating-point or integral type. - - @note We wouldn't need it if we had separate token types for - integral and floating-point cases. - - @return true iff token matches `^[+-]\d+$` - */ - bool is_integral() const - { - const char* p = m_start; - - if (p == nullptr) - { - return false; // LCOV_EXCL_LINE - } - - if ((*p == '-') or (*p == '+')) - { - ++p; - } - - if (p == m_end) - { - return false; // LCOV_EXCL_LINE - } - - while ((p < m_end) and (*p >= '0') and (*p <= '9')) - { - ++p; - } - - return (p == m_end); - } - private: const char* const m_start = nullptr; const char* const m_end = nullptr; @@ -10161,7 +10133,7 @@ class basic_json return (x == static_cast(value)) // x fits into destination T and (x < 0) == (value < 0) // preserved sign - and ((x != 0) or is_integral()) // strto[u]ll did nto fail + //and ((x != 0) or is_integral()) // strto[u]ll did nto fail and (errno == 0) // strto[u]ll did not overflow and (m_start < m_end) // token was not empty and (endptr == m_end); // parsed entire token exactly @@ -10185,46 +10157,56 @@ class basic_json interpreted as a number @param[out] result @ref basic_json object to receive the number. + @param[in] token the type of the number token */ - void get_number(basic_json& result) const + void get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); assert(m_start < m_cursor); + assert((token == token_type::value_unsigned_integer) or + (token == token_type::value_signed_integer) or + (token == token_type::value_float)); strtonum num(reinterpret_cast(m_start), reinterpret_cast(m_cursor)); - const bool is_negative = (*m_start == '-'); - - result.m_type = value_t::discarded; - - if (not num.is_integral()) + switch (token) { - // will parse as float below - } - else if (is_negative) - { - number_integer_t val{0}; - if (num.to(val)) + case lexer::token_type::value_unsigned_integer: { - result.m_type = value_t::number_integer; - result.m_value = val; + number_unsigned_t val{0}; + if (num.to(val)) + { + result.m_type = value_t::number_unsigned; + result.m_value = val; + return; + } + break; } - } - else - { - number_unsigned_t val{0}; - if (num.to(val)) + + case lexer::token_type::value_signed_integer: { - result.m_type = value_t::number_unsigned; - result.m_value = val; + number_integer_t val{0}; + if (num.to(val)) + { + result.m_type = value_t::number_integer; + result.m_value = val; + return; + } + break; + } + + default: + { + break; } } number_float_t val{0}; - if (result.m_type != value_t::discarded or (not num.to(val))) + if (not num.to(val)) { - // already have a value from above or couldn't parse as float_t + // couldn't parse as float_t + result.m_type = value_t::discarded; return; } @@ -10480,9 +10462,11 @@ class basic_json break; } - case lexer::token_type::value_number: + case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_signed_integer: + case lexer::token_type::value_float: { - m_lexer.get_number(result); + m_lexer.get_number(result, last_token); get_token(); break; } diff --git a/test/src/unit-class_lexer.cpp b/test/src/unit-class_lexer.cpp index ac43de8a..268f503a 100644 --- a/test/src/unit-class_lexer.cpp +++ b/test/src/unit-class_lexer.cpp @@ -65,25 +65,37 @@ TEST_CASE("lexer class") SECTION("numbers") { CHECK((json::lexer(reinterpret_cast("0"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("1"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("2"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("3"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("4"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("5"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("6"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("7"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("8"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); CHECK((json::lexer(reinterpret_cast("9"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned_integer)); + + CHECK((json::lexer(reinterpret_cast("-0"), + 2).scan() == json::lexer::token_type::value_signed_integer)); + CHECK((json::lexer(reinterpret_cast("-1"), + 2).scan() == json::lexer::token_type::value_signed_integer)); + + CHECK((json::lexer(reinterpret_cast("1.1"), + 3).scan() == json::lexer::token_type::value_float)); + CHECK((json::lexer(reinterpret_cast("-1.1"), + 4).scan() == json::lexer::token_type::value_float)); + CHECK((json::lexer(reinterpret_cast("1E10"), + 4).scan() == json::lexer::token_type::value_float)); } SECTION("whitespace") @@ -109,7 +121,9 @@ TEST_CASE("lexer class") CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_false) == "false literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_null) == "null literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::value_string) == "string literal")); - CHECK((json::lexer::token_type_name(json::lexer::token_type::value_number) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_unsigned_integer) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_signed_integer) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_float) == "number literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_array) == "'['")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_object) == "'{'")); CHECK((json::lexer::token_type_name(json::lexer::token_type::end_array) == "']'")); From c8191c81728a389cc8869935575eae9cca8117dd Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 12 Feb 2017 18:50:17 +0100 Subject: [PATCH 21/23] :hammer: further cleanup --- Makefile | 2 +- src/json.hpp | 42 +++++++++++++++++---------------- src/json.hpp.re2c | 44 ++++++++++++++++++----------------- test/src/unit-class_lexer.cpp | 28 +++++++++++----------- test/src/unit-regression.cpp | 2 +- 5 files changed, 61 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index 56cbdb0c..8bd7a2dd 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,7 @@ cppcheck: # run clang sanitize (we are overrding the CXXFLAGS provided by travis in order to use gcc's libstdc++) clang_sanitize: clean - CXX=clang++ CXXFLAGS="-g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" $(MAKE) + CXX=clang++ CXXFLAGS="-g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" $(MAKE) check ########################################################################## diff --git a/src/json.hpp b/src/json.hpp index b8cdc480..873b8252 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9444,8 +9444,8 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_unsigned_integer, ///< an unsigned integer -- use get_number() for actual value - value_signed_integer, ///< a signed integer -- use get_number() for actual value + value_unsigned, ///< an unsigned integer -- use get_number() for actual value + value_integer, ///< a signed integer -- use get_number() for actual value value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` @@ -9598,8 +9598,8 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case lexer::token_type::value_unsigned_integer: - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: @@ -9903,7 +9903,7 @@ basic_json_parser_13: } basic_json_parser_14: { - last_token_type = token_type::value_unsigned_integer; + last_token_type = token_type::value_unsigned; break; } basic_json_parser_15: @@ -10301,7 +10301,7 @@ basic_json_parser_43: } basic_json_parser_44: { - last_token_type = token_type::value_signed_integer; + last_token_type = token_type::value_integer; break; } basic_json_parser_45: @@ -11091,19 +11091,19 @@ basic_json_parser_71: { assert(m_start != nullptr); assert(m_start < m_cursor); - assert((token == token_type::value_unsigned_integer) or - (token == token_type::value_signed_integer) or + assert((token == token_type::value_unsigned) or + (token == token_type::value_integer) or (token == token_type::value_float)); - strtonum num(reinterpret_cast(m_start), - reinterpret_cast(m_cursor)); + strtonum num_converter(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); switch (token) { - case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_unsigned: { - number_unsigned_t val{0}; - if (num.to(val)) + number_unsigned_t val; + if (num_converter.to(val)) { result.m_type = value_t::number_unsigned; result.m_value = val; @@ -11112,10 +11112,10 @@ basic_json_parser_71: break; } - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_integer: { - number_integer_t val{0}; - if (num.to(val)) + number_integer_t val; + if (num_converter.to(val)) { result.m_type = value_t::number_integer; result.m_value = val; @@ -11130,8 +11130,10 @@ basic_json_parser_71: } } - number_float_t val{0}; - if (not num.to(val)) + // parse float (either explicitly or because a previous conversion + // failed) + number_float_t val; + if (not num_converter.to(val)) { // couldn't parse as float_t result.m_type = value_t::discarded; @@ -11390,8 +11392,8 @@ basic_json_parser_71: break; } - case lexer::token_type::value_unsigned_integer: - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: case lexer::token_type::value_float: { m_lexer.get_number(result, last_token); diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 51760eef..cf6ed432 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -9444,8 +9444,8 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_unsigned_integer, ///< an unsigned integer -- use get_number() for actual value - value_signed_integer, ///< a signed integer -- use get_number() for actual value + value_unsigned, ///< an unsigned integer -- use get_number() for actual value + value_integer, ///< a signed integer -- use get_number() for actual value value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` @@ -9598,8 +9598,8 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case lexer::token_type::value_unsigned_integer: - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: @@ -9699,9 +9699,9 @@ class basic_json frac = decimal_point digit+; int = (zero | digit_1_9 digit*); number_unsigned = int; - number_unsigned { last_token_type = token_type::value_unsigned_integer; break; } - number_signed = minus int; - number_signed { last_token_type = token_type::value_signed_integer; break; } + number_unsigned { last_token_type = token_type::value_unsigned; break; } + number_integer = minus int; + number_integer { last_token_type = token_type::value_integer; break; } number_float = minus? int frac? exp?; number_float { last_token_type = token_type::value_float; break; } @@ -10163,19 +10163,19 @@ class basic_json { assert(m_start != nullptr); assert(m_start < m_cursor); - assert((token == token_type::value_unsigned_integer) or - (token == token_type::value_signed_integer) or + assert((token == token_type::value_unsigned) or + (token == token_type::value_integer) or (token == token_type::value_float)); - strtonum num(reinterpret_cast(m_start), - reinterpret_cast(m_cursor)); + strtonum num_converter(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); switch (token) { - case lexer::token_type::value_unsigned_integer: + case lexer::token_type::value_unsigned: { - number_unsigned_t val{0}; - if (num.to(val)) + number_unsigned_t val; + if (num_converter.to(val)) { result.m_type = value_t::number_unsigned; result.m_value = val; @@ -10184,10 +10184,10 @@ class basic_json break; } - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_integer: { - number_integer_t val{0}; - if (num.to(val)) + number_integer_t val; + if (num_converter.to(val)) { result.m_type = value_t::number_integer; result.m_value = val; @@ -10202,8 +10202,10 @@ class basic_json } } - number_float_t val{0}; - if (not num.to(val)) + // parse float (either explicitly or because a previous conversion + // failed) + number_float_t val; + if (not num_converter.to(val)) { // couldn't parse as float_t result.m_type = value_t::discarded; @@ -10462,8 +10464,8 @@ class basic_json break; } - case lexer::token_type::value_unsigned_integer: - case lexer::token_type::value_signed_integer: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: case lexer::token_type::value_float: { m_lexer.get_number(result, last_token); diff --git a/test/src/unit-class_lexer.cpp b/test/src/unit-class_lexer.cpp index 268f503a..79dbb766 100644 --- a/test/src/unit-class_lexer.cpp +++ b/test/src/unit-class_lexer.cpp @@ -65,30 +65,30 @@ TEST_CASE("lexer class") SECTION("numbers") { CHECK((json::lexer(reinterpret_cast("0"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("1"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("2"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("3"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("4"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("5"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("6"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("7"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("8"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("9"), - 1).scan() == json::lexer::token_type::value_unsigned_integer)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("-0"), - 2).scan() == json::lexer::token_type::value_signed_integer)); + 2).scan() == json::lexer::token_type::value_integer)); CHECK((json::lexer(reinterpret_cast("-1"), - 2).scan() == json::lexer::token_type::value_signed_integer)); + 2).scan() == json::lexer::token_type::value_integer)); CHECK((json::lexer(reinterpret_cast("1.1"), 3).scan() == json::lexer::token_type::value_float)); @@ -121,8 +121,8 @@ TEST_CASE("lexer class") CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_false) == "false literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_null) == "null literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::value_string) == "string literal")); - CHECK((json::lexer::token_type_name(json::lexer::token_type::value_unsigned_integer) == "number literal")); - CHECK((json::lexer::token_type_name(json::lexer::token_type::value_signed_integer) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_unsigned) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_integer) == "number literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::value_float) == "number literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_array) == "'['")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_object) == "'{'")); diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 251c4cb8..dcfcf62b 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -417,7 +417,7 @@ TEST_CASE("regression tests") // disabled, because locale-specific beharivor is not // triggered in AppVeyor for some reason -#if 0 +#ifndef _MSC_VER { // verify that strtod now uses commas as decimal-separator CHECK(std::strtod("3,14", nullptr) == 3.14); From b84705d5577c8e44bbb9e3fde81886ade19fcb81 Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Sun, 12 Feb 2017 20:14:43 +0100 Subject: [PATCH 22/23] :white_check_mark: improved test coverage --- test/src/unit-class_parser.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index 59ea3cef..caa672a1 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -101,6 +101,7 @@ TEST_CASE("parser class") CHECK_THROWS_WITH(json::parser("\"\b\"").parse(), "parse error - unexpected '\"'"); // improve code coverage CHECK_THROWS_AS(json::parser("\uFF01").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("[-4:1,]").parse(), std::invalid_argument); // unescaped control characters CHECK_THROWS_AS(json::parser("\"\x00\"").parse(), std::invalid_argument); CHECK_THROWS_AS(json::parser("\"\x01\"").parse(), std::invalid_argument); From 265c5b5207cec0327c9e4844a56bcecaaa1f9a8d Mon Sep 17 00:00:00 2001 From: Niels Lohmann Date: Mon, 13 Feb 2017 18:51:23 +0100 Subject: [PATCH 23/23] :hammer: more work on the number parser --- src/json.hpp | 60 ++++++++++++++++++++++------------ src/json.hpp.re2c | 60 ++++++++++++++++++++++------------ test/src/unit-class_parser.cpp | 7 +++- 3 files changed, 84 insertions(+), 43 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 873b8252..bf9386b3 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -10957,6 +10957,8 @@ basic_json_parser_71: const char* const m_start = nullptr; const char* const m_end = nullptr; + // floating-point conversion + // overloaded wrappers for strtod/strtof/strtold // that will be called from parse static void strtof(float& f, const char* str, char** endptr) @@ -10984,6 +10986,9 @@ basic_json_parser_71: std::array buf; const size_t len = static_cast(m_end - m_start); + // lexer will reject empty numbers + assert(len > 0); + // since dealing with strtod family of functions, we're // getting the decimal point char from the C locale facilities // instead of C++'s numpunct facet of the current std::locale @@ -11023,10 +11028,9 @@ basic_json_parser_71: // this calls appropriate overload depending on T strtof(value, data, &endptr); - // note that reading past the end is OK, the data may be, for - // example, "123.", where the parsed token only contains - // "123", but strtod will read the dot as well. - const bool ok = (endptr >= (data + len)) and (len > 0); + // parsing was successful iff strtof parsed exactly the number + // of characters determined by the lexer (len) + const bool ok = (endptr == (data + len)); if (ok and (value == 0.0) and (*data == '-')) { @@ -11037,6 +11041,8 @@ basic_json_parser_71: return ok; } + // integral conversion + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const { return std::strtoll(m_start, endptr, 10); @@ -11087,7 +11093,7 @@ basic_json_parser_71: @param[out] result @ref basic_json object to receive the number. @param[in] token the type of the number token */ - void get_number(basic_json& result, const token_type token) const + bool get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); assert(m_start < m_cursor); @@ -11105,9 +11111,10 @@ basic_json_parser_71: number_unsigned_t val; if (num_converter.to(val)) { + // parsing successful result.m_type = value_t::number_unsigned; result.m_value = val; - return; + return true; } break; } @@ -11117,9 +11124,10 @@ basic_json_parser_71: number_integer_t val; if (num_converter.to(val)) { + // parsing successful result.m_type = value_t::number_integer; result.m_value = val; - return; + return true; } break; } @@ -11133,22 +11141,24 @@ basic_json_parser_71: // parse float (either explicitly or because a previous conversion // failed) number_float_t val; - if (not num_converter.to(val)) + if (num_converter.to(val)) { - // couldn't parse as float_t - result.m_type = value_t::discarded; - return; + // parsing successful + result.m_type = value_t::number_float; + result.m_value = val; + + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); + } + + return true; } - result.m_type = value_t::number_float; - result.m_value = val; - - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - result.m_type = value_t::null; - result.m_value = basic_json::json_value(); - } + // couldn't parse number in any format + return false; } private: @@ -11396,8 +11406,16 @@ basic_json_parser_71: case lexer::token_type::value_integer: case lexer::token_type::value_float: { - m_lexer.get_number(result, last_token); + const bool ok = m_lexer.get_number(result, last_token); get_token(); + + // if number conversion was unsuccessful, then is is + // because the number was directly followed by an + // unexpected character (e.g. "01" where "1" is unexpected) + if (not ok) + { + unexpect(last_token); + } break; } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index cf6ed432..e15f7b1f 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -10029,6 +10029,8 @@ class basic_json const char* const m_start = nullptr; const char* const m_end = nullptr; + // floating-point conversion + // overloaded wrappers for strtod/strtof/strtold // that will be called from parse static void strtof(float& f, const char* str, char** endptr) @@ -10056,6 +10058,9 @@ class basic_json std::array buf; const size_t len = static_cast(m_end - m_start); + // lexer will reject empty numbers + assert(len > 0); + // since dealing with strtod family of functions, we're // getting the decimal point char from the C locale facilities // instead of C++'s numpunct facet of the current std::locale @@ -10095,10 +10100,9 @@ class basic_json // this calls appropriate overload depending on T strtof(value, data, &endptr); - // note that reading past the end is OK, the data may be, for - // example, "123.", where the parsed token only contains - // "123", but strtod will read the dot as well. - const bool ok = (endptr >= (data + len)) and (len > 0); + // parsing was successful iff strtof parsed exactly the number + // of characters determined by the lexer (len) + const bool ok = (endptr == (data + len)); if (ok and (value == 0.0) and (*data == '-')) { @@ -10109,6 +10113,8 @@ class basic_json return ok; } + // integral conversion + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const { return std::strtoll(m_start, endptr, 10); @@ -10159,7 +10165,7 @@ class basic_json @param[out] result @ref basic_json object to receive the number. @param[in] token the type of the number token */ - void get_number(basic_json& result, const token_type token) const + bool get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); assert(m_start < m_cursor); @@ -10177,9 +10183,10 @@ class basic_json number_unsigned_t val; if (num_converter.to(val)) { + // parsing successful result.m_type = value_t::number_unsigned; result.m_value = val; - return; + return true; } break; } @@ -10189,9 +10196,10 @@ class basic_json number_integer_t val; if (num_converter.to(val)) { + // parsing successful result.m_type = value_t::number_integer; result.m_value = val; - return; + return true; } break; } @@ -10205,22 +10213,24 @@ class basic_json // parse float (either explicitly or because a previous conversion // failed) number_float_t val; - if (not num_converter.to(val)) + if (num_converter.to(val)) { - // couldn't parse as float_t - result.m_type = value_t::discarded; - return; + // parsing successful + result.m_type = value_t::number_float; + result.m_value = val; + + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); + } + + return true; } - result.m_type = value_t::number_float; - result.m_value = val; - - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - result.m_type = value_t::null; - result.m_value = basic_json::json_value(); - } + // couldn't parse number in any format + return false; } private: @@ -10468,8 +10478,16 @@ class basic_json case lexer::token_type::value_integer: case lexer::token_type::value_float: { - m_lexer.get_number(result, last_token); + const bool ok = m_lexer.get_number(result, last_token); get_token(); + + // if number conversion was unsuccessful, then is is + // because the number was directly followed by an + // unexpected character (e.g. "01" where "1" is unexpected) + if (not ok) + { + unexpect(last_token); + } break; } diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index caa672a1..2fb0da2b 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -270,6 +270,11 @@ TEST_CASE("parser class") } } + SECTION("overflow") + { + CHECK(json::parser("1.18973e+4932").parse() == json()); + } + SECTION("invalid numbers") { CHECK_THROWS_AS(json::parser("01").parse(), std::invalid_argument); @@ -294,7 +299,7 @@ TEST_CASE("parser class") CHECK_THROWS_AS(json::parser("+0").parse(), std::invalid_argument); CHECK_THROWS_WITH(json::parser("01").parse(), - "parse error - unexpected number literal; expected end of input"); + "parse error - unexpected number literal"); CHECK_THROWS_WITH(json::parser("--1").parse(), "parse error - unexpected '-'"); CHECK_THROWS_WITH(json::parser("1.").parse(), "parse error - unexpected '.'; expected end of input");