From 6b78b5c2be7e24cd4a1c59c7fa13a62c2ae33508 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 19:05:09 -0500 Subject: [PATCH 01/18] Added strtonum for locale-independent number parsing --- src/json.hpp | 374 ++++++++++++++++++++++------------- src/json.hpp.re2c | 374 ++++++++++++++++++++++------------- test/src/unit-regression.cpp | 4 +- 3 files changed, 483 insertions(+), 269 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 6fed0a12..e28333bd 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,65 +9060,225 @@ basic_json_parser_66: return result; } + + + + + + + + + + + + + + + + + /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + e.g. const auto x = static_cast("123.4"); - @param[in] type the @ref number_float_t in use - - @param[in,out] endptr recieves a pointer to the first character after - the number - - @return the floating point number + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - long double str_to_float_t(long double* /* type */, char** endptr) const + struct strtonum { - return std::strtold(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + template::value>::type > + operator T() const + { + T val{0}; - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(parse(val, std::is_integral())) { + return val; + } - @param[in] type the @ref number_float_t in use + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - @param[in,out] endptr recieves a pointer to the first character after - the number + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } + if(!p) { + return false; + } - /*! - @brief parse floating point number + if(*p == '-' or *p == '+') { + ++p; + } - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(p == m_end) { + return false; + } - @param[in] type the @ref number_float_t in use + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - @param[in,out] endptr recieves a pointer to the first character after - the number + return p == m_end; + } - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; + + static void strtof(float& f, + const char* str, + char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, + const char* str, + char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, + const char* str, + char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert + } + + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); + + if(ds_pos == len) { + break; // no decimal separator + } + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; + } + + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; + } + }; /*! @brief return number value for number tokens @@ -9126,111 +9286,57 @@ basic_json_parser_66: This function translates the last token into the most appropriate number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). - - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + @param[out] result @ref basic_json object to receive the number. */ void get_number(basic_json& result) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - const lexer::lexer_char_t* curptr = m_start; + const bool is_negative = *m_start == '-'; - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') - { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; - } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - - // skip if definitely not an integer - if (type != value_t::number_float) + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; - - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - save it - value = temp; - } + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - result.m_value.number_integer = -static_cast(value); - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + result.m_type = value_t::number_float; + result.m_value = static_cast(num); - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - type = value_t::null; - result.m_value = basic_json::json_value(); - } + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); } - - // save the type - result.m_type = type; } private: diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index c6a99b89..72c413ba 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,65 +8209,225 @@ class basic_json return result; } + + + + + + + + + + + + + + + + + /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if + the current locale is POSIX. - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + e.g. const auto x = static_cast("123.4"); - @param[in] type the @ref number_float_t in use - - @param[in,out] endptr recieves a pointer to the first character after - the number - - @return the floating point number + throw if can't parse the entire string as a number, + or if the destination type is integral and the value + is outside of the type's range. */ - long double str_to_float_t(long double* /* type */, char** endptr) const + struct strtonum { - return std::strtold(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + template::value>::type > + operator T() const + { + T val{0}; - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(parse(val, std::is_integral())) { + return val; + } - @param[in] type the @ref number_float_t in use + throw std::invalid_argument( + std::string() + + "Can't parse '" + + std::string(m_start, m_end) + + "' as type " + + typeid(T).name()); + } - @param[in,out] endptr recieves a pointer to the first character after - the number + /// return true iff token matches ^[+-]\d+$ + bool is_integral() const + { + const char* p = m_start; - @return the floating point number - */ - double str_to_float_t(double* /* type */, char** endptr) const - { - return std::strtod(reinterpret_cast(m_start), endptr); - } + if(!p) { + return false; + } - /*! - @brief parse floating point number + if(*p == '-' or *p == '+') { + ++p; + } - This function (and its overloads) serves to select the most approprate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + if(p == m_end) { + return false; + } - @param[in] type the @ref number_float_t in use + while(p < m_end and *p >= '0' and *p <= '9') { + ++p; + } - @param[in,out] endptr recieves a pointer to the first character after - the number + return p == m_end; + } - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; + + static void strtof(float& f, + const char* str, + char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, + const char* str, + char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, + const char* str, + char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + const char* data = m_start; + const size_t len = static_cast(m_end - m_start); + + const char decimal_point_char = + std::use_facet< std::numpunct >( + std::locale()).decimal_point(); + + // replace decimal separator with locale-specific + // version, if necessary; data will be repointed + // to either buf or tempstr containing the fixed + // string. + std::string tempstr; + std::array buf; + do { + if(decimal_point_char == '.') { + break; // don't need to convert + } + + const size_t ds_pos = static_cast( + std::find(m_start, m_end, '.') - m_start ); + + if(ds_pos == len) { + break; // no decimal separator + } + + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if(len + 1 < buf.size()) { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } else { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } while(0); + + char* endptr = nullptr; + value = 0; + strtof(value, data, &endptr); + + + + // note that reading past the end is OK, the data may be, + // for example, "123.", where the parsed token only contains "123", + // but strtod will read the dot as well. + const bool ok = endptr >= data + len + and len > 0; + + if(ok and value == 0.0 and *data == '-') { + // some implementations forget to negate the zero + value = -0.0; + } + + if(!ok) { + std::cerr << "data:" << data + << " token:" << std::string(m_start, len) + << " value:" << value + << " len:" << len + << " parsed_len:" << (endptr - data) << std::endl; + } + + return ok; + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + const char* beg = m_start; + const char* const end = m_end; + + if(beg == end) { + return false; + } + + const bool is_negative = (*beg == '-'); + + // json numbers can't start with '+' but + // this code is not json-specific + if(is_negative or *beg == '+') { + ++beg; // skip the leading sign + } + + bool valid = beg < end // must have some digits; + and ( T(-1) < 0 // type must be signed + or !is_negative); // if value is negative + value = 0; + + while(beg < end and valid) { + const uint8_t c = static_cast(*beg - '0'); + const T upd_value = value * 10 + c; + valid &= value <= std::numeric_limits::max() / 10 + and value <= upd_value // did not overflow + and c < 10; // char was digit + value = upd_value; + ++beg; + } + + if(is_negative) { + value = 0 - value; + } + + if(!valid) { + std::cerr << " token:" << std::string(m_start, m_end) + << std::endl; + } + + return valid; + } + }; /*! @brief return number value for number tokens @@ -8275,111 +8435,57 @@ class basic_json This function translates the last token into the most appropriate number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. + + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null + + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). - - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + @param[out] result @ref basic_json object to receive the number. */ void get_number(basic_json& result) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + + strtonum num(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - const lexer::lexer_char_t* curptr = m_start; + const bool is_negative = *m_start == '-'; - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') - { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' || *curptr > '9') + try { + if(not num.is_integral()) { + ; + } + else if(is_negative) { - if (*curptr == '.') - { - // don't count '.' but change to float - type = value_t::number_float; - continue; - } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; - break; - } - - // skip if definitely not an integer - if (type != value_t::number_float) + result.m_type = value_t::number_integer; + result.m_value = static_cast(num); + return; + } + else { - // multiply last value by ten and add the new digit - auto temp = value * 10 + *curptr - '0'; - - // test for overflow - if (temp < value || temp > max) - { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - save it - value = temp; - } + result.m_type = value_t::number_unsigned; + result.m_value = static_cast(num); + return; } + } catch (std::invalid_argument&) { + ; // overflow - will parse as double } - // save the value (if not a float) - if (type == value_t::number_unsigned) - { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - result.m_value.number_integer = -static_cast(value); - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), NULL); + result.m_type = value_t::number_float; + result.m_value = static_cast(num); - // replace infinity and NAN by null - if (not std::isfinite(result.m_value.number_float)) - { - type = value_t::null; - result.m_value = basic_json::json_value(); - } + // replace infinity and NAN by null + if (not std::isfinite(result.m_value.number_float)) + { + result.m_type = value_t::null; + result.m_value = basic_json::json_value(); } - - // save the type - result.m_type = type; } private: diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index e04513ca..679b7f3d 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -375,7 +375,7 @@ TEST_CASE("regression tests") }; // change locale to mess with decimal points - std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); + auto orig_locale = std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); CHECK(j1a.dump() == "23.42"); CHECK(j1b.dump() == "23.42"); @@ -399,6 +399,8 @@ TEST_CASE("regression tests") CHECK(j3c.dump() == "10000"); //CHECK(j3b.dump() == "1E04"); // roundtrip error //CHECK(j3c.dump() == "1e04"); // roundtrip error + + std::locale::global(orig_locale); } SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") From 4eafaab8164667a2c309ac5b346981542b0873e5 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 22:54:36 -0500 Subject: [PATCH 02/18] Improved overflow detection; removed debugging output statements. --- src/json.hpp | 47 ++++++++++++++++++++++++----------------------- src/json.hpp.re2c | 47 ++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index e28333bd..70bfce5a 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9223,14 +9223,6 @@ basic_json_parser_66: value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -9252,31 +9244,40 @@ basic_json_parser_66: ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; + const unsigned_T upd_x = x * 10 + c; + + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 + + x = upd_x; ++beg; } - if(is_negative) { - value = 0 - value; - } + value = static_cast(is_negative ? 0 - x : x); - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; - } - - return valid; + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 72c413ba..418e302c 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8372,14 +8372,6 @@ class basic_json value = -0.0; } - if(!ok) { - std::cerr << "data:" << data - << " token:" << std::string(m_start, len) - << " value:" << value - << " len:" << len - << " parsed_len:" << (endptr - data) << std::endl; - } - return ok; } @@ -8401,31 +8393,40 @@ class basic_json ++beg; // skip the leading sign } + // using unsigned accumulator x because the signed + // type can't hold absolute value of largest + // negative value of this type, so overflow detection + // becomes problematic. + using unsigned_T = typename std::make_unsigned::type; + unsigned_T x = 0; + bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - value = 0; while(beg < end and valid) { const uint8_t c = static_cast(*beg - '0'); - const T upd_value = value * 10 + c; - valid &= value <= std::numeric_limits::max() / 10 - and value <= upd_value // did not overflow - and c < 10; // char was digit - value = upd_value; + const unsigned_T upd_x = x * 10 + c; + + using lim_T = std::numeric_limits; + constexpr unsigned_T thr1 = lim_T::max() / 10; + valid &= c < 10 // char was digit + and x <= thr1 // multiplication did not overflow + and x <= upd_x; // addition did not overflow. + + // note that x <= upd_x alone can't detect overflow since + // we're not just adding a value of decltype(x) but + // also multiplying by 10 + + x = upd_x; ++beg; } - if(is_negative) { - value = 0 - value; - } + value = static_cast(is_negative ? 0 - x : x); - if(!valid) { - std::cerr << " token:" << std::string(m_start, m_end) - << std::endl; - } - - return valid; + // the final check to make sure the value did not roll over + // into positives is for edge cases, e.g. -2^63 + return valid && (is_negative == (value < 0)); } }; From c75efedc6ec7048244300a6305b25b2e117b440c Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sat, 3 Dec 2016 23:19:43 -0500 Subject: [PATCH 03/18] stylistic changes --- src/json.hpp | 92 ++++++++++++++++++++++++++--------------------- src/json.hpp.re2c | 92 ++++++++++++++++++++++++++--------------------- 2 files changed, 104 insertions(+), 80 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 70bfce5a..56949a4c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9060,22 +9060,6 @@ basic_json_parser_66: return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -9086,6 +9070,9 @@ basic_json_parser_66: throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -9101,7 +9088,8 @@ basic_json_parser_66: { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -9113,24 +9101,35 @@ basic_json_parser_66: + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -9141,6 +9140,9 @@ basic_json_parser_66: const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -9173,20 +9175,22 @@ basic_json_parser_66: std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -9194,31 +9198,33 @@ basic_json_parser_66: // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -9232,15 +9238,17 @@ basic_json_parser_66: const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -9255,7 +9263,8 @@ basic_json_parser_66: and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -9310,10 +9319,11 @@ basic_json_parser_66: const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -9325,7 +9335,9 @@ basic_json_parser_66: result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 418e302c..d97057a8 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8209,22 +8209,6 @@ class basic_json return result; } - - - - - - - - - - - - - - - - /*! @brief parse string into a built-in arithmetic type as if @@ -8235,6 +8219,9 @@ class basic_json throw if can't parse the entire string as a number, or if the destination type is integral and the value is outside of the type's range. + + note: in floating-point case strtod may parse + past the token's end - this is not an error. */ struct strtonum { @@ -8250,7 +8237,8 @@ class basic_json { T val{0}; - if(parse(val, std::is_integral())) { + if (parse(val, std::is_integral())) + { return val; } @@ -8262,24 +8250,35 @@ class basic_json + typeid(T).name()); } - /// return true iff token matches ^[+-]\d+$ + // return true iff token matches ^[+-]\d+$ + // + // this is a helper to determine whether to + // parse the token into floating-point or + // integral type. We wouldn't need it if + // we had separate token types for interal + // and floating-point cases. bool is_integral() const { const char* p = m_start; - if(!p) { + if (!p) + { return false; } - if(*p == '-' or *p == '+') { + if (*p == '-' or *p == '+') + { ++p; } - if(p == m_end) { + if (p == m_end) + { return false; } - while(p < m_end and *p >= '0' and *p <= '9') { + while (p < m_end and *p >= '0' + and *p <= '9') + { ++p; } @@ -8290,6 +8289,9 @@ class basic_json const char* const m_start = nullptr; const char* const m_end = nullptr; + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) @@ -8322,20 +8324,22 @@ class basic_json std::locale()).decimal_point(); // replace decimal separator with locale-specific - // version, if necessary; data will be repointed + // version, when necessary; data will be repointed // to either buf or tempstr containing the fixed // string. std::string tempstr; std::array buf; do { - if(decimal_point_char == '.') { + if (decimal_point_char == '.') + { break; // don't need to convert } const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - if(ds_pos == len) { + if (ds_pos == len) + { break; // no decimal separator } @@ -8343,31 +8347,33 @@ class basic_json // tempstr, if buffer is too small; // replace decimal separator, and update // data to point to the modified bytes - if(len + 1 < buf.size()) { + if (len + 1 < buf.size()) + { std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; data = buf.data(); - } else { + } + else + { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; data = tempstr.c_str(); } - } while(0); + } while (0); char* endptr = nullptr; value = 0; strtof(value, data, &endptr); - - // note that reading past the end is OK, the data may be, - // for example, "123.", where the parsed token only contains "123", - // but strtod will read the dot as well. + // for example, "123.", where the parsed token only + // contains "123", but strtod will read the dot as well. const bool ok = endptr >= data + len and len > 0; - if(ok and value == 0.0 and *data == '-') { + if (ok and value == 0.0 and *data == '-') + { // some implementations forget to negate the zero value = -0.0; } @@ -8381,15 +8387,17 @@ class basic_json const char* beg = m_start; const char* const end = m_end; - if(beg == end) { + if (beg == end) + { return false; } const bool is_negative = (*beg == '-'); // json numbers can't start with '+' but - // this code is not json-specific - if(is_negative or *beg == '+') { + // this code is not strictly json-specific + if (is_negative or *beg == '+') + { ++beg; // skip the leading sign } @@ -8404,7 +8412,8 @@ class basic_json and ( T(-1) < 0 // type must be signed or !is_negative); // if value is negative - while(beg < end and valid) { + while (beg < end and valid) + { const uint8_t c = static_cast(*beg - '0'); const unsigned_T upd_x = x * 10 + c; @@ -8459,10 +8468,11 @@ class basic_json const bool is_negative = *m_start == '-'; try { - if(not num.is_integral()) { + if (not num.is_integral()) + { ; } - else if(is_negative) + else if (is_negative) { result.m_type = value_t::number_integer; result.m_value = static_cast(num); @@ -8474,7 +8484,9 @@ class basic_json result.m_value = static_cast(num); return; } - } catch (std::invalid_argument&) { + } + catch (std::invalid_argument&) + { ; // overflow - will parse as double } From e41a956782fd64a2bdf253a13f05bc5e8239ec61 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Sun, 4 Dec 2016 13:23:39 -0500 Subject: [PATCH 04/18] Alternative handling of integer types relying on strto[u]ll --- src/json.hpp | 46 ++++++++++++++++++++++++++++++++++++++++++++-- src/json.hpp.re2c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 56949a4c..51b19518 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9073,6 +9073,8 @@ basic_json_parser_66: note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -9232,6 +9234,45 @@ basic_json_parser_66: return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -9240,7 +9281,7 @@ basic_json_parser_66: if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -9261,7 +9302,7 @@ basic_json_parser_66: bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -9288,6 +9329,7 @@ basic_json_parser_66: // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index d97057a8..e5f344b2 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8222,6 +8222,8 @@ class basic_json note: in floating-point case strtod may parse past the token's end - this is not an error. + + any leading blanks are not handled. */ struct strtonum { @@ -8381,6 +8383,45 @@ class basic_json return ok; } + +#if 1 // parsing with strto[u]ll - easier to understand but slightly slower + + signed long long parse_integral( + char** endptr, + /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral( + char** endptr, + /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + static_assert(std::is_signed() // called right overload? + == std::is_signed(), ""); + + value = static_cast(x); + + return x == static_cast(value) // x fits into destination T + and (x != 0 or is_integral()) // strto[u]ll did nto fail + and errno == 0 // strto[u]ll did not overflow + and m_start < m_end // token was not empty + and endptr == m_end // parsed entire token exactly + and (x < 0) == (*m_start == '-'); // input was sign-compatible + } + +#else // parsing integral types manually + template bool parse(T& value, /*is_integral=*/std::true_type) const { @@ -8389,7 +8430,7 @@ class basic_json if (beg == end) { - return false; + return false; // empty token } const bool is_negative = (*beg == '-'); @@ -8410,7 +8451,7 @@ class basic_json bool valid = beg < end // must have some digits; and ( T(-1) < 0 // type must be signed - or !is_negative); // if value is negative + or !is_negative); // ...if value is negative while (beg < end and valid) { @@ -8437,6 +8478,7 @@ class basic_json // into positives is for edge cases, e.g. -2^63 return valid && (is_negative == (value < 0)); } +#endif }; /*! From d64336057569281386aafed096a6db77d8aa901a Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 00:43:12 -0500 Subject: [PATCH 05/18] Bugfix: when working with C formatting functions we need to query C locales (localeconv) rather than std::locale --- src/json.hpp | 12 +++++++++++- src/json.hpp.re2c | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 51b19518..f1666c6c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9108,7 +9108,7 @@ basic_json_parser_66: // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -9172,9 +9172,19 @@ basic_json_parser_66: const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index e5f344b2..b8059035 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8257,7 +8257,7 @@ class basic_json // this is a helper to determine whether to // parse the token into floating-point or // integral type. We wouldn't need it if - // we had separate token types for interal + // we had separate token types for integral // and floating-point cases. bool is_integral() const { @@ -8321,9 +8321,19 @@ class basic_json const char* data = m_start; const size_t len = static_cast(m_end - m_start); +#if 0 const char decimal_point_char = std::use_facet< std::numpunct >( std::locale()).decimal_point(); +#else + // Since dealing with strtod family of functions, + // need to use the C locales instead of C++ + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = + !loc->decimal_point ? '.' + : loc->decimal_point[0]; +#endif // replace decimal separator with locale-specific // version, when necessary; data will be repointed From 0c87d5d6b34d8595ceb645be1e0b82c4a4eaafe2 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:05 -0500 Subject: [PATCH 06/18] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp | 48 ++++++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index f1666c6c..19f1f109 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9169,33 +9169,28 @@ basic_json_parser_66: template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -9203,7 +9198,7 @@ basic_json_parser_66: if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -9215,18 +9210,19 @@ basic_json_parser_66: std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 7a081244a575284246d38a594fb0436ca2b6dacc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:41:31 -0500 Subject: [PATCH 07/18] Refactored preprocessing with a lambda instead of do{...}while(0) --- src/json.hpp.re2c | 48 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b8059035..27a53fb7 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8318,33 +8318,28 @@ class basic_json template bool parse(T& value, /*is_integral=*/std::false_type) const { - const char* data = m_start; - const size_t len = static_cast(m_end - m_start); - -#if 0 - const char decimal_point_char = - std::use_facet< std::numpunct >( - std::locale()).decimal_point(); -#else - // Since dealing with strtod family of functions, - // need to use the C locales instead of C++ - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = - !loc->decimal_point ? '.' - : loc->decimal_point[0]; -#endif - // replace decimal separator with locale-specific - // version, when necessary; data will be repointed - // to either buf or tempstr containing the fixed - // string. + // version, when necessary; data will point to + // either the original string, or buf, or tempstr + // containing the fixed string. std::string tempstr; std::array buf; - do { + const size_t len = static_cast(m_end - m_start); + + const char* const data = [this, &tempstr, &buf, len]() -> const char* + { + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + if (decimal_point_char == '.') { - break; // don't need to convert + return m_start; // don't need to convert } const size_t ds_pos = static_cast( @@ -8352,7 +8347,7 @@ class basic_json if (ds_pos == len) { - break; // no decimal separator + return m_start; // no decimal separator } // copy the data into the local buffer or @@ -8364,18 +8359,19 @@ class basic_json std::copy(m_start, m_end, buf.data()); buf[len] = 0; buf[ds_pos] = decimal_point_char; - data = buf.data(); + return buf.data(); } else { tempstr.assign(m_start, m_end); tempstr[ds_pos] = decimal_point_char; - data = tempstr.c_str(); + return tempstr.c_str(); } - } while (0); + }(); char* endptr = nullptr; value = 0; + // this calls appropriate overload depending on T strtof(value, data, &endptr); // note that reading past the end is OK, the data may be, From 6e8da7d8c4aada79740076755ed487da505ceb28 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 19:45:48 -0500 Subject: [PATCH 08/18] Added unit-test for issue #379 (locale-independent str-to-num) --- test/src/unit-regression.cpp | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 679b7f3d..2787fe26 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -403,6 +403,44 @@ TEST_CASE("regression tests") std::locale::global(orig_locale); } + SECTION("issue #379 - locale-independent str-to-num") + { + const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + + setlocale(LC_NUMERIC, "de_DE"); + std::array buf; + + { + // verify that snprintf now uses commas as decimal-separator + std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); + assert(std::strcmp(buf.data(), "3,14") == 0); + + // verify that strtod now uses commas as decimal-separator + const double d1 = std::strtod(buf.data(), nullptr); + assert(d1 == 3.14); + + // verify that strtod does not understand dots as decimal separator + const double d2 = std::strtod("3.14", nullptr); + assert(d2 == 3); + } + + const json j1 = json::parse("3.14"); + + // verify that parsed correctly despite using strtod internally + CHECK(j1.get() == 3.14); + + // verify that dumped correctly despite using snprintf internally + CHECK(j1.dump() == "3.14"); + + // check a different code path + const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); + CHECK(j2.get() == 1.0); + + // restore original locale + setlocale(LC_ALL, orig_locale_name.c_str()); + } + + SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") { json source = {"a", "b", "c"}; From d2e9ce270a642e4ca7e00fc160c81eadb55a5639 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:18:20 -0500 Subject: [PATCH 09/18] Trying to coerce setlocale to make snprintf use commas as delimiter; the behavior appears to be compiler/platform-specific --- test/src/unit-regression.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 2787fe26..17616a3e 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,21 +407,21 @@ TEST_CASE("regression tests") { const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "de_DE"); + setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); std::array buf; { // verify that snprintf now uses commas as decimal-separator std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - assert(std::strcmp(buf.data(), "3,14") == 0); + CHECK(std::strcmp(buf.data(), "3,14") == 0); // verify that strtod now uses commas as decimal-separator const double d1 = std::strtod(buf.data(), nullptr); - assert(d1 == 3.14); + CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator const double d2 = std::strtod("3.14", nullptr); - assert(d2 == 3); + CHECK(d2 == 3); } const json j1 = json::parse("3.14"); From d169598c6c236ce115a8453d917d99efc79fa3a3 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:20:48 -0500 Subject: [PATCH 10/18] simplified code a bit based on @gregmarr's suggestions --- src/json.hpp | 69 ++++++++++++++++++++++------------------------- src/json.hpp.re2c | 69 ++++++++++++++++++++++------------------------- 2 files changed, 64 insertions(+), 74 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 19f1f109..1644d91f 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9177,48 +9177,43 @@ basic_json_parser_66: std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) + + if (ds_pos != len) { - return m_start; // no decimal separator + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else - { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); - } - }(); + } char* endptr = nullptr; value = 0; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 27a53fb7..b541db1e 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8326,48 +8326,43 @@ class basic_json std::array buf; const size_t len = static_cast(m_end - m_start); - const char* const data = [this, &tempstr, &buf, len]() -> const char* + // Since dealing with strtod family of functions, + // we're getting the decimal point char from the + // C locale facilities instead of C++'s numpunct + // facet of the current std::locale; + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = !loc->decimal_point ? '.' + : loc->decimal_point[0]; + + const char *data = m_start; + + if (decimal_point_char != '.') { - // Since dealing with strtod family of functions, - // we're getting the decimal point char from the - // C locale facilities instead of C++'s numpunct - // facet of the current std::locale; - const auto loc = localeconv(); - assert(loc != nullptr); - const char decimal_point_char = !loc->decimal_point ? '.' - : loc->decimal_point[0]; - - if (decimal_point_char == '.') - { - return m_start; // don't need to convert - } - const size_t ds_pos = static_cast( std::find(m_start, m_end, '.') - m_start ); - - if (ds_pos == len) + + if (ds_pos != len) { - return m_start; // no decimal separator + // copy the data into the local buffer or + // tempstr, if buffer is too small; + // replace decimal separator, and update + // data to point to the modified bytes + if (len + 1 < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } } - - // copy the data into the local buffer or - // tempstr, if buffer is too small; - // replace decimal separator, and update - // data to point to the modified bytes - if (len + 1 < buf.size()) - { - std::copy(m_start, m_end, buf.data()); - buf[len] = 0; - buf[ds_pos] = decimal_point_char; - return buf.data(); - } - else - { - tempstr.assign(m_start, m_end); - tempstr[ds_pos] = decimal_point_char; - return tempstr.c_str(); - } - }(); + } char* endptr = nullptr; value = 0; From 6774457733524e81d7aa6fd0f3df6fd48423d0f7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Tue, 6 Dec 2016 22:59:12 -0500 Subject: [PATCH 11/18] Trying to coerce setlocale to make snprintf use commas as delimiter some more --- test/src/unit-regression.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 17616a3e..41ddfa40 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,9 +405,14 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - const std::string orig_locale_name(setlocale(LC_ALL, NULL)); + // If I save the locale here and restore it in the end + // of this block, then setLocale(LC_NUMERIC, "de_DE") + // does not actually make snprintf use "," as decimal separator + // on some compilers. I have no idea... + // + //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - setlocale(LC_NUMERIC, "fr_Fr.UTF-8"); + setlocale(LC_NUMERIC, "de_DE"); std::array buf; { @@ -437,7 +442,7 @@ TEST_CASE("regression tests") CHECK(j2.get() == 1.0); // restore original locale - setlocale(LC_ALL, orig_locale_name.c_str()); + // setlocale(LC_ALL, orig_locale_name.c_str()); } From 0a4a6a8399d1f4194d009c50ec076dd287cafa9e Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:53:27 -0500 Subject: [PATCH 12/18] Refactored to avoid using exceptions, as there are plans to support exceptionless mode --- src/json.hpp | 78 +++++++++++++++++++++++++++++++++++------------ src/json.hpp.re2c | 78 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 118 insertions(+), 38 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 1644d91f..79d6bb02 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9079,10 +9079,42 @@ basic_json_parser_66: struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -9361,31 +9393,39 @@ basic_json_parser_66: const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { - result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + else { - ; // overflow - will parse as double + number_unsigned_t val{0}; + if(num.to(val)) { + result.m_type = value_t::number_unsigned; + result.m_value = val; + } + } + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) + { + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index b541db1e..24c54975 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8228,10 +8228,42 @@ class basic_json struct strtonum { public: - strtonum(const char* start, const char* end) + template + struct maybe + { + T x; + bool valid; + + maybe(const T& xx) + : x(xx), valid(true) + {} + + maybe() : x(), valid(false) + {} + + operator bool() const + { + return valid; + } + + const T& operator*() const + { + return x; + } + }; + + strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + template::value>::type > + bool to(T& val) const + { + return parse(val, std::is_integral()); + } + template::value>::type > @@ -8510,31 +8542,39 @@ class basic_json const bool is_negative = *m_start == '-'; - try { - if (not num.is_integral()) - { - ; - } - else if (is_negative) - { + result.m_type = value_t::discarded; + + if (not num.is_integral()) + { + ; // will parse as float below + } + else if (is_negative) + { + number_integer_t val{0}; + if(num.to(val)) { result.m_type = value_t::number_integer; - result.m_value = static_cast(num); - return; - } - else - { - result.m_type = value_t::number_unsigned; - result.m_value = static_cast(num); - return; + result.m_value = val; } } - catch (std::invalid_argument&) + else { - ; // overflow - will parse as double + number_unsigned_t val{0}; + if(num.to(val)) { + result.m_type = value_t::number_unsigned; + result.m_value = val; + } + } + + number_float_t val{0}; + if (result.m_type != value_t::discarded + or !num.to(val)) + { + return; // already have a value from above + // or couldn't parse as float_t } result.m_type = value_t::number_float; - result.m_value = static_cast(num); + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) From 27d9740ad6d9b564beb0e995e0ab393c6ab3daf7 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Wed, 7 Dec 2016 19:55:07 -0500 Subject: [PATCH 13/18] Tweaks to unit-test for issue #379 --- test/src/unit-regression.cpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 41ddfa40..deace862 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -406,22 +406,27 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") + // of this block, then setLocale(LC_NUMERIC, "de_DE") // does not actually make snprintf use "," as decimal separator // on some compilers. I have no idea... - // //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - setlocale(LC_NUMERIC, "de_DE"); + + + // Still, just setting some comma-using locale does not + // make snprintf output commas on all platforms, so instead + // will change dot to comma in the current locale below + //setlocale(LC_NUMERIC, "de_DE"); + + auto loc = localeconv(); + auto orig_decimal_point = loc->decimal_point; + char comma[] = ","; + loc->decimal_point = comma; + std::array buf; { - // verify that snprintf now uses commas as decimal-separator - std::snprintf(buf.data(), buf.size(), "%.2f", 3.14); - CHECK(std::strcmp(buf.data(), "3,14") == 0); - // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod(buf.data(), nullptr); + const double d1 = std::strtod("3,14", nullptr); CHECK(d1 == 3.14); // verify that strtod does not understand dots as decimal separator @@ -429,18 +434,15 @@ TEST_CASE("regression tests") CHECK(d2 == 3); } - const json j1 = json::parse("3.14"); - // verify that parsed correctly despite using strtod internally + const json j1 = json::parse("3.14"); CHECK(j1.get() == 3.14); - // verify that dumped correctly despite using snprintf internally - CHECK(j1.dump() == "3.14"); - // check a different code path const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); CHECK(j2.get() == 1.0); + loc->decimal_point = orig_decimal_point; // restore original locale // setlocale(LC_ALL, orig_locale_name.c_str()); } From 38499e84fc771b21db7764ff4e014f7c3939d4b0 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 21:38:14 -0500 Subject: [PATCH 14/18] Removed unused struct; fixed comments --- src/json.hpp | 69 +++++++++-------------------------------------- src/json.hpp.re2c | 69 +++++++++-------------------------------------- 2 files changed, 24 insertions(+), 114 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 79d6bb02..867afbb3 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9065,12 +9065,6 @@ basic_json_parser_66: @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -9079,34 +9073,15 @@ basic_json_parser_66: struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -9115,33 +9090,13 @@ basic_json_parser_66: return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 24c54975..5b2dc8d2 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8214,12 +8214,6 @@ class basic_json @brief parse string into a built-in arithmetic type as if the current locale is POSIX. - e.g. const auto x = static_cast("123.4"); - - throw if can't parse the entire string as a number, - or if the destination type is integral and the value - is outside of the type's range. - note: in floating-point case strtod may parse past the token's end - this is not an error. @@ -8228,34 +8222,15 @@ class basic_json struct strtonum { public: - template - struct maybe - { - T x; - bool valid; - - maybe(const T& xx) - : x(xx), valid(true) - {} - - maybe() : x(), valid(false) - {} - - operator bool() const - { - return valid; - } - - const T& operator*() const - { - return x; - } - }; - strtonum(const char* start, const char* end) : m_start(start), m_end(end) {} + /// return true iff parsed successfully as + /// number of type T. + /// + /// @val shall contain parsed value, or + /// undefined value if could not parse. template::value>::type > @@ -8264,33 +8239,13 @@ class basic_json return parse(val, std::is_integral()); } - template::value>::type > - operator T() const - { - T val{0}; - - if (parse(val, std::is_integral())) - { - return val; - } - - throw std::invalid_argument( - std::string() - + "Can't parse '" - + std::string(m_start, m_end) - + "' as type " - + typeid(T).name()); - } - - // return true iff token matches ^[+-]\d+$ - // - // this is a helper to determine whether to - // parse the token into floating-point or - // integral type. We wouldn't need it if - // we had separate token types for integral - // and floating-point cases. + /// return true iff token matches ^[+-]\d+$ + /// + /// this is a helper to determine whether to + /// parse the token into floating-point or + /// integral type. We wouldn't need it if + /// we had separate token types for integral + /// and floating-point cases. bool is_integral() const { const char* p = m_start; From 1c029b97c0ebc90b5b7f1cb875b0904f115668b8 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Thu, 8 Dec 2016 22:13:05 -0500 Subject: [PATCH 15/18] Still trying to invoke locale-specific behavior in CI --- test/src/unit-regression.cpp | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index deace862..5cd6356d 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -405,46 +405,21 @@ TEST_CASE("regression tests") SECTION("issue #379 - locale-independent str-to-num") { - // If I save the locale here and restore it in the end - // of this block, then setLocale(LC_NUMERIC, "de_DE") - // does not actually make snprintf use "," as decimal separator - // on some compilers. I have no idea... - //const std::string orig_locale_name(setlocale(LC_ALL, NULL)); - - - // Still, just setting some comma-using locale does not - // make snprintf output commas on all platforms, so instead - // will change dot to comma in the current locale below - //setlocale(LC_NUMERIC, "de_DE"); - - auto loc = localeconv(); - auto orig_decimal_point = loc->decimal_point; - char comma[] = ","; - loc->decimal_point = comma; - - std::array buf; + setlocale(LC_NUMERIC, "de_DE.UTF-8"); { // verify that strtod now uses commas as decimal-separator - const double d1 = std::strtod("3,14", nullptr); - CHECK(d1 == 3.14); + CHECK(std::strtod("3,14", nullptr) == 3.14); // verify that strtod does not understand dots as decimal separator - const double d2 = std::strtod("3.14", nullptr); - CHECK(d2 == 3); + CHECK(std::strtod("3.14", nullptr) == 3); } // verify that parsed correctly despite using strtod internally - const json j1 = json::parse("3.14"); - CHECK(j1.get() == 3.14); + CHECK(json::parse("3.14").get() == 3.14); // check a different code path - const json j2 = json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000"); - CHECK(j2.get() == 1.0); - - loc->decimal_point = orig_decimal_point; - // restore original locale - // setlocale(LC_ALL, orig_locale_name.c_str()); + CHECK(json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000").get() == 1.0); } From cd0b651d43c303dbe0b8016a1305ab9e5fef6449 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:46:47 -0500 Subject: [PATCH 16/18] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 867afbb3..b526400c 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9103,7 +9103,7 @@ basic_json_parser_66: if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -9113,7 +9113,7 @@ basic_json_parser_66: if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -9252,11 +9252,11 @@ basic_json_parser_66: value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually From 0f8de48ddbc03b842ef5523b10553d3d319e1666 Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 19:48:14 -0500 Subject: [PATCH 17/18] Disabling strtod pre-check, since can't get locale-specific behavior to manifest in AppVeyor --- test/src/unit-regression.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 5cd6356d..7016fcaf 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -407,6 +407,9 @@ TEST_CASE("regression tests") { setlocale(LC_NUMERIC, "de_DE.UTF-8"); + // disabled, because locale-specific beharivor is not + // triggered in AppVeyor for some reason +#if 0 { // verify that strtod now uses commas as decimal-separator CHECK(std::strtod("3,14", nullptr) == 3.14); @@ -414,6 +417,7 @@ TEST_CASE("regression tests") // verify that strtod does not understand dots as decimal separator CHECK(std::strtod("3.14", nullptr) == 3); } +#endif // verify that parsed correctly despite using strtod internally CHECK(json::parse("3.14").get() == 3.14); From 5cad2006eb99534274c380af1ad92d5b257c83bc Mon Sep 17 00:00:00 2001 From: Alex Astashyn Date: Mon, 12 Dec 2016 20:15:57 -0500 Subject: [PATCH 18/18] Tweaked check for preserved sign; added LCOV_EXCL_LINE --- src/json.hpp.re2c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 5b2dc8d2..208489ea 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -8252,7 +8252,7 @@ class basic_json if (!p) { - return false; + return false; // LCOV_EXCL_LINE } if (*p == '-' or *p == '+') @@ -8262,7 +8262,7 @@ class basic_json if (p == m_end) { - return false; + return false; // LCOV_EXCL_LINE } while (p < m_end and *p >= '0' @@ -8401,11 +8401,11 @@ class basic_json value = static_cast(x); return x == static_cast(value) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign and (x != 0 or is_integral()) // strto[u]ll did nto fail and errno == 0 // strto[u]ll did not overflow and m_start < m_end // token was not empty - and endptr == m_end // parsed entire token exactly - and (x < 0) == (*m_start == '-'); // input was sign-compatible + and endptr == m_end; // parsed entire token exactly } #else // parsing integral types manually