diff --git a/Makefile b/Makefile index 56cbdb0c..8bd7a2dd 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,7 @@ cppcheck: # run clang sanitize (we are overrding the CXXFLAGS provided by travis in order to use gcc's libstdc++) clang_sanitize: clean - CXX=clang++ CXXFLAGS="-g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" $(MAKE) + CXX=clang++ CXXFLAGS="-g -O2 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer" $(MAKE) check ########################################################################## diff --git a/src/json.hpp b/src/json.hpp index 233cf8fa..bf9386b3 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -9444,7 +9444,9 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_number, ///< a number -- use get_number() for actual value + value_unsigned, ///< an unsigned integer -- use get_number() for actual value + value_integer, ///< a signed integer -- use get_number() for actual value + value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` end_array, ///< the character for array end `]` @@ -9596,7 +9598,9 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case token_type::value_number: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: + case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: return "'['"; @@ -9869,11 +9873,11 @@ basic_json_parser_12: } if (yych <= '0') { - goto basic_json_parser_13; + goto basic_json_parser_43; } if (yych <= '9') { - goto basic_json_parser_15; + goto basic_json_parser_45; } goto basic_json_parser_5; basic_json_parser_13: @@ -9883,23 +9887,23 @@ basic_json_parser_13: { if (yych == '.') { - goto basic_json_parser_43; + goto basic_json_parser_47; } } else { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } } basic_json_parser_14: { - last_token_type = token_type::value_number; + last_token_type = token_type::value_unsigned; break; } basic_json_parser_15: @@ -9918,7 +9922,7 @@ basic_json_parser_15: { if (yych == '.') { - goto basic_json_parser_43; + goto basic_json_parser_47; } goto basic_json_parser_14; } @@ -9926,11 +9930,11 @@ basic_json_parser_15: { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } goto basic_json_parser_14; } @@ -9957,7 +9961,7 @@ basic_json_parser_23: yych = *(m_marker = ++m_cursor); if (yych == 'a') { - goto basic_json_parser_45; + goto basic_json_parser_49; } goto basic_json_parser_5; basic_json_parser_24: @@ -9965,7 +9969,7 @@ basic_json_parser_24: yych = *(m_marker = ++m_cursor); if (yych == 'u') { - goto basic_json_parser_46; + goto basic_json_parser_50; } goto basic_json_parser_5; basic_json_parser_25: @@ -9973,7 +9977,7 @@ basic_json_parser_25: yych = *(m_marker = ++m_cursor); if (yych == 'r') { - goto basic_json_parser_47; + goto basic_json_parser_51; } goto basic_json_parser_5; basic_json_parser_26: @@ -10055,13 +10059,27 @@ basic_json_parser_31: } basic_json_parser_32: m_cursor = m_marker; - if (yyaccept == 0) + if (yyaccept <= 1) { - goto basic_json_parser_5; + if (yyaccept == 0) + { + goto basic_json_parser_5; + } + else + { + goto basic_json_parser_14; + } } else { - goto basic_json_parser_14; + if (yyaccept == 2) + { + goto basic_json_parser_44; + } + else + { + goto basic_json_parser_55; + } } basic_json_parser_33: ++m_cursor; @@ -10142,7 +10160,7 @@ basic_json_parser_35: } if (yych <= 'u') { - goto basic_json_parser_48; + goto basic_json_parser_52; } goto basic_json_parser_32; } @@ -10261,6 +10279,71 @@ basic_json_parser_42: } goto basic_json_parser_32; basic_json_parser_43: + yyaccept = 2; + yych = *(m_marker = ++m_cursor); + if (yych <= 'D') + { + if (yych == '.') + { + goto basic_json_parser_47; + } + } + else + { + if (yych <= 'E') + { + goto basic_json_parser_48; + } + if (yych == 'e') + { + goto basic_json_parser_48; + } + } +basic_json_parser_44: + { + last_token_type = token_type::value_integer; + break; + } +basic_json_parser_45: + yyaccept = 2; + m_marker = ++m_cursor; + if ((m_limit - m_cursor) < 3) + { + fill_line_buffer(3); // LCOV_EXCL_LINE + } + yych = *m_cursor; + if (yych <= '9') + { + if (yych == '.') + { + goto basic_json_parser_47; + } + if (yych <= '/') + { + goto basic_json_parser_44; + } + goto basic_json_parser_45; + } + else + { + if (yych <= 'E') + { + if (yych <= 'D') + { + goto basic_json_parser_44; + } + goto basic_json_parser_48; + } + else + { + if (yych == 'e') + { + goto basic_json_parser_48; + } + goto basic_json_parser_44; + } + } +basic_json_parser_47: yych = *++m_cursor; if (yych <= '/') { @@ -10268,16 +10351,16 @@ basic_json_parser_43: } if (yych <= '9') { - goto basic_json_parser_49; + goto basic_json_parser_53; } goto basic_json_parser_32; -basic_json_parser_44: +basic_json_parser_48: yych = *++m_cursor; if (yych <= ',') { if (yych == '+') { - goto basic_json_parser_51; + goto basic_json_parser_56; } goto basic_json_parser_32; } @@ -10285,7 +10368,7 @@ basic_json_parser_44: { if (yych <= '-') { - goto basic_json_parser_51; + goto basic_json_parser_56; } if (yych <= '/') { @@ -10293,32 +10376,32 @@ basic_json_parser_44: } if (yych <= '9') { - goto basic_json_parser_52; + goto basic_json_parser_57; } goto basic_json_parser_32; } -basic_json_parser_45: +basic_json_parser_49: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_54; + goto basic_json_parser_59; } goto basic_json_parser_32; -basic_json_parser_46: +basic_json_parser_50: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_55; + goto basic_json_parser_60; } goto basic_json_parser_32; -basic_json_parser_47: +basic_json_parser_51: yych = *++m_cursor; if (yych == 'u') { - goto basic_json_parser_56; + goto basic_json_parser_61; } goto basic_json_parser_32; -basic_json_parser_48: +basic_json_parser_52: ++m_cursor; if (m_limit <= m_cursor) { @@ -10333,7 +10416,7 @@ basic_json_parser_48: } if (yych <= '9') { - goto basic_json_parser_57; + goto basic_json_parser_62; } goto basic_json_parser_32; } @@ -10341,7 +10424,7 @@ basic_json_parser_48: { if (yych <= 'F') { - goto basic_json_parser_57; + goto basic_json_parser_62; } if (yych <= '`') { @@ -10349,12 +10432,12 @@ basic_json_parser_48: } if (yych <= 'f') { - goto basic_json_parser_57; + goto basic_json_parser_62; } goto basic_json_parser_32; } -basic_json_parser_49: - yyaccept = 1; +basic_json_parser_53: + yyaccept = 3; m_marker = ++m_cursor; if ((m_limit - m_cursor) < 3) { @@ -10365,27 +10448,30 @@ basic_json_parser_49: { if (yych <= '/') { - goto basic_json_parser_14; + goto basic_json_parser_55; } if (yych <= '9') { - goto basic_json_parser_49; + goto basic_json_parser_53; } - goto basic_json_parser_14; } else { if (yych <= 'E') { - goto basic_json_parser_44; + goto basic_json_parser_48; } if (yych == 'e') { - goto basic_json_parser_44; + goto basic_json_parser_48; } - goto basic_json_parser_14; } -basic_json_parser_51: +basic_json_parser_55: + { + last_token_type = token_type::value_float; + break; + } +basic_json_parser_56: yych = *++m_cursor; if (yych <= '/') { @@ -10395,7 +10481,7 @@ basic_json_parser_51: { goto basic_json_parser_32; } -basic_json_parser_52: +basic_json_parser_57: ++m_cursor; if (m_limit <= m_cursor) { @@ -10404,35 +10490,35 @@ basic_json_parser_52: yych = *m_cursor; if (yych <= '/') { - goto basic_json_parser_14; + goto basic_json_parser_55; } if (yych <= '9') { - goto basic_json_parser_52; + goto basic_json_parser_57; } - goto basic_json_parser_14; -basic_json_parser_54: + goto basic_json_parser_55; +basic_json_parser_59: yych = *++m_cursor; if (yych == 's') { - goto basic_json_parser_58; + goto basic_json_parser_63; } goto basic_json_parser_32; -basic_json_parser_55: +basic_json_parser_60: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_59; + goto basic_json_parser_64; } goto basic_json_parser_32; -basic_json_parser_56: +basic_json_parser_61: yych = *++m_cursor; if (yych == 'e') { - goto basic_json_parser_61; + goto basic_json_parser_66; } goto basic_json_parser_32; -basic_json_parser_57: +basic_json_parser_62: ++m_cursor; if (m_limit <= m_cursor) { @@ -10447,7 +10533,7 @@ basic_json_parser_57: } if (yych <= '9') { - goto basic_json_parser_63; + goto basic_json_parser_68; } goto basic_json_parser_32; } @@ -10455,7 +10541,7 @@ basic_json_parser_57: { if (yych <= 'F') { - goto basic_json_parser_63; + goto basic_json_parser_68; } if (yych <= '`') { @@ -10463,30 +10549,30 @@ basic_json_parser_57: } if (yych <= 'f') { - goto basic_json_parser_63; + goto basic_json_parser_68; } goto basic_json_parser_32; } -basic_json_parser_58: +basic_json_parser_63: yych = *++m_cursor; if (yych == 'e') { - goto basic_json_parser_64; + goto basic_json_parser_69; } goto basic_json_parser_32; -basic_json_parser_59: +basic_json_parser_64: ++m_cursor; { last_token_type = token_type::literal_null; break; } -basic_json_parser_61: +basic_json_parser_66: ++m_cursor; { last_token_type = token_type::literal_true; break; } -basic_json_parser_63: +basic_json_parser_68: ++m_cursor; if (m_limit <= m_cursor) { @@ -10501,7 +10587,7 @@ basic_json_parser_63: } if (yych <= '9') { - goto basic_json_parser_66; + goto basic_json_parser_71; } goto basic_json_parser_32; } @@ -10509,7 +10595,7 @@ basic_json_parser_63: { if (yych <= 'F') { - goto basic_json_parser_66; + goto basic_json_parser_71; } if (yych <= '`') { @@ -10517,17 +10603,17 @@ basic_json_parser_63: } if (yych <= 'f') { - goto basic_json_parser_66; + goto basic_json_parser_71; } goto basic_json_parser_32; } -basic_json_parser_64: +basic_json_parser_69: ++m_cursor; { last_token_type = token_type::literal_false; break; } -basic_json_parser_66: +basic_json_parser_71: ++m_cursor; if (m_limit <= m_cursor) { @@ -10838,59 +10924,155 @@ basic_json_parser_66: return result; } - /*! - @brief parse floating point number - - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - - @param[in,out] endptr receives a pointer to the first character after - the number - - @return the floating point number - */ - long double str_to_float_t(long double* /* type */, char** endptr) const - { - return std::strtold(reinterpret_cast(m_start), endptr); - } /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if the current + locale is POSIX. - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + @note in floating-point case strtod may parse past the token's end - + this is not an error - @param[in,out] endptr receives a pointer to the first character after - the number - - @return the floating point number + @note any leading blanks are not handled */ - double str_to_float_t(double* /* type */, char** endptr) const + struct strtonum { - return std::strtod(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + /*! + @return true iff parsed successfully as number of type T - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + @param[in,out] val shall contain parsed value, or undefined value + if could not parse + */ + template::value>::type> + bool to(T& val) const + { + return parse(val, std::is_integral()); + } - @param[in,out] endptr receives a pointer to the first character after - the number + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + // floating-point conversion + + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, const char* str, char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, const char* str, char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + // replace decimal separator with locale-specific version, + // when necessary; data will point to either the original + // string, or buf, or tempstr containing the fixed string. + std::string tempstr; + std::array buf; + const size_t len = static_cast(m_end - m_start); + + // lexer will reject empty numbers + assert(len > 0); + + // since dealing with strtod family of functions, we're + // getting the decimal point char from the C locale facilities + // instead of C++'s numpunct facet of the current std::locale + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; + + const char* data = m_start; + + if (decimal_point_char != '.') + { + const size_t ds_pos = static_cast(std::find(m_start, m_end, '.') - m_start); + + if (ds_pos != len) + { + // copy the data into the local buffer or tempstr, if + // buffer is too small; replace decimal separator, and + // update data to point to the modified bytes + if ((len + 1) < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } + } + + char* endptr = nullptr; + value = 0; + // this calls appropriate overload depending on T + strtof(value, data, &endptr); + + // parsing was successful iff strtof parsed exactly the number + // of characters determined by the lexer (len) + const bool ok = (endptr == (data + len)); + + if (ok and (value == 0.0) and (*data == '-')) + { + // some implementations forget to negate the zero + value = -0.0; + } + + return ok; + } + + // integral conversion + + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral(char** endptr, /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + // called right overload? + static_assert(std::is_signed() == std::is_signed(), ""); + + value = static_cast(x); + + return (x == static_cast(value)) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign + //and ((x != 0) or is_integral()) // strto[u]ll did nto fail + and (errno == 0) // strto[u]ll did not overflow + and (m_start < m_end) // token was not empty + and (endptr == m_end); // parsed entire token exactly + } + }; /*! @brief return number value for number tokens @@ -10899,125 +11081,84 @@ basic_json_parser_66: number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number + + @param[out] result @ref basic_json object to receive the number. + @param[in] token the type of the number token */ - void get_number(basic_json& result) const + bool get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + assert((token == token_type::value_unsigned) or + (token == token_type::value_integer) or + (token == token_type::value_float)); - const lexer::lexer_char_t* curptr = m_start; + strtonum num_converter(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unnecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') + switch (token) { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' or* curptr > '9') + case lexer::token_type::value_unsigned: { - if (*curptr == '.') + number_unsigned_t val; + if (num_converter.to(val)) { - // don't count '.' but change to float - type = value_t::number_float; - continue; + // parsing successful + result.m_type = value_t::number_unsigned; + result.m_value = val; + return true; } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; break; } - // skip if definitely not an integer - if (type != value_t::number_float) + case lexer::token_type::value_integer: { - auto digit = static_cast(*curptr - '0'); - - // overflow if value * 10 + digit > max, move terms around - // to avoid overflow in intermediate values - if (value > (max - digit) / 10) + number_integer_t val; + if (num_converter.to(val)) { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - value = value * 10 + digit; + // parsing successful + result.m_type = value_t::number_integer; + result.m_value = val; + return true; } + break; + } + + default: + { + break; } } - // save the value (if not a float) - if (type == value_t::number_unsigned) + // parse float (either explicitly or because a previous conversion + // failed) + number_float_t val; + if (num_converter.to(val)) { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - // invariant: if we parsed a '-', the absolute value is between - // 0 (we allow -0) and max == -INT64_MIN - assert(value >= 0); - assert(value <= max); - - if (value == max) - { - // we cannot simply negate value (== max == -INT64_MIN), - // see https://github.com/nlohmann/json/issues/389 - result.m_value.number_integer = static_cast(INT64_MIN); - } - else - { - // all other values can be negated safely - result.m_value.number_integer = -static_cast(value); - } - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), nullptr); + // parsing successful + result.m_type = value_t::number_float; + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) { - type = value_t::null; + result.m_type = value_t::null; result.m_value = basic_json::json_value(); } + + return true; } - // save the type - result.m_type = type; + // couldn't parse number in any format + return false; } private: @@ -11261,10 +11402,20 @@ basic_json_parser_66: break; } - case lexer::token_type::value_number: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: + case lexer::token_type::value_float: { - m_lexer.get_number(result); + const bool ok = m_lexer.get_number(result, last_token); get_token(); + + // if number conversion was unsuccessful, then is is + // because the number was directly followed by an + // unexpected character (e.g. "01" where "1" is unexpected) + if (not ok) + { + unexpect(last_token); + } break; } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index c477ba22..e15f7b1f 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -9444,7 +9444,9 @@ class basic_json literal_false, ///< the `false` literal literal_null, ///< the `null` literal value_string, ///< a string -- use get_string() for actual value - value_number, ///< a number -- use get_number() for actual value + value_unsigned, ///< an unsigned integer -- use get_number() for actual value + value_integer, ///< a signed integer -- use get_number() for actual value + value_float, ///< an floating point number -- use get_number() for actual value begin_array, ///< the character for array begin `[` begin_object, ///< the character for object begin `{` end_array, ///< the character for array end `]` @@ -9596,7 +9598,9 @@ class basic_json return "null literal"; case token_type::value_string: return "string literal"; - case token_type::value_number: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: + case lexer::token_type::value_float: return "number literal"; case token_type::begin_array: return "'['"; @@ -9684,18 +9688,22 @@ class basic_json "false" { last_token_type = token_type::literal_false; break; } // number - decimal_point = "."; - digit = [0-9]; - digit_1_9 = [1-9]; - e = "e" | "E"; - minus = "-"; - plus = "+"; - zero = "0"; - exp = e (minus | plus)? digit+; - frac = decimal_point digit+; - int = (zero | digit_1_9 digit*); - number = minus? int frac? exp?; - number { last_token_type = token_type::value_number; break; } + decimal_point = "."; + digit = [0-9]; + digit_1_9 = [1-9]; + e = "e" | "E"; + minus = "-"; + plus = "+"; + zero = "0"; + exp = e (minus | plus)? digit+; + frac = decimal_point digit+; + int = (zero | digit_1_9 digit*); + number_unsigned = int; + number_unsigned { last_token_type = token_type::value_unsigned; break; } + number_integer = minus int; + number_integer { last_token_type = token_type::value_integer; break; } + number_float = minus? int frac? exp?; + number_float { last_token_type = token_type::value_float; break; } // string quotation_mark = "\""; @@ -9988,59 +9996,155 @@ class basic_json return result; } - /*! - @brief parse floating point number - - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). - - @param[in,out] endptr receives a pointer to the first character after - the number - - @return the floating point number - */ - long double str_to_float_t(long double* /* type */, char** endptr) const - { - return std::strtold(reinterpret_cast(m_start), endptr); - } /*! - @brief parse floating point number + @brief parse string into a built-in arithmetic type as if the current + locale is POSIX. - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + @note in floating-point case strtod may parse past the token's end - + this is not an error - @param[in,out] endptr receives a pointer to the first character after - the number - - @return the floating point number + @note any leading blanks are not handled */ - double str_to_float_t(double* /* type */, char** endptr) const + struct strtonum { - return std::strtod(reinterpret_cast(m_start), endptr); - } + public: + strtonum(const char* start, const char* end) + : m_start(start), m_end(end) + {} - /*! - @brief parse floating point number + /*! + @return true iff parsed successfully as number of type T - This function (and its overloads) serves to select the most appropriate - standard floating point number parsing function based on the type - supplied via the first parameter. Set this to @a - static_cast(nullptr). + @param[in,out] val shall contain parsed value, or undefined value + if could not parse + */ + template::value>::type> + bool to(T& val) const + { + return parse(val, std::is_integral()); + } - @param[in,out] endptr receives a pointer to the first character after - the number + private: + const char* const m_start = nullptr; + const char* const m_end = nullptr; - @return the floating point number - */ - float str_to_float_t(float* /* type */, char** endptr) const - { - return std::strtof(reinterpret_cast(m_start), endptr); - } + // floating-point conversion + + // overloaded wrappers for strtod/strtof/strtold + // that will be called from parse + static void strtof(float& f, const char* str, char** endptr) + { + f = std::strtof(str, endptr); + } + + static void strtof(double& f, const char* str, char** endptr) + { + f = std::strtod(str, endptr); + } + + static void strtof(long double& f, const char* str, char** endptr) + { + f = std::strtold(str, endptr); + } + + template + bool parse(T& value, /*is_integral=*/std::false_type) const + { + // replace decimal separator with locale-specific version, + // when necessary; data will point to either the original + // string, or buf, or tempstr containing the fixed string. + std::string tempstr; + std::array buf; + const size_t len = static_cast(m_end - m_start); + + // lexer will reject empty numbers + assert(len > 0); + + // since dealing with strtod family of functions, we're + // getting the decimal point char from the C locale facilities + // instead of C++'s numpunct facet of the current std::locale + const auto loc = localeconv(); + assert(loc != nullptr); + const char decimal_point_char = (loc->decimal_point == nullptr) ? '.' : loc->decimal_point[0]; + + const char* data = m_start; + + if (decimal_point_char != '.') + { + const size_t ds_pos = static_cast(std::find(m_start, m_end, '.') - m_start); + + if (ds_pos != len) + { + // copy the data into the local buffer or tempstr, if + // buffer is too small; replace decimal separator, and + // update data to point to the modified bytes + if ((len + 1) < buf.size()) + { + std::copy(m_start, m_end, buf.data()); + buf[len] = 0; + buf[ds_pos] = decimal_point_char; + data = buf.data(); + } + else + { + tempstr.assign(m_start, m_end); + tempstr[ds_pos] = decimal_point_char; + data = tempstr.c_str(); + } + } + } + + char* endptr = nullptr; + value = 0; + // this calls appropriate overload depending on T + strtof(value, data, &endptr); + + // parsing was successful iff strtof parsed exactly the number + // of characters determined by the lexer (len) + const bool ok = (endptr == (data + len)); + + if (ok and (value == 0.0) and (*data == '-')) + { + // some implementations forget to negate the zero + value = -0.0; + } + + return ok; + } + + // integral conversion + + signed long long parse_integral(char** endptr, /*is_signed*/std::true_type) const + { + return std::strtoll(m_start, endptr, 10); + } + + unsigned long long parse_integral(char** endptr, /*is_signed*/std::false_type) const + { + return std::strtoull(m_start, endptr, 10); + } + + template + bool parse(T& value, /*is_integral=*/std::true_type) const + { + char* endptr = nullptr; + errno = 0; // these are thread-local + const auto x = parse_integral(&endptr, std::is_signed()); + + // called right overload? + static_assert(std::is_signed() == std::is_signed(), ""); + + value = static_cast(x); + + return (x == static_cast(value)) // x fits into destination T + and (x < 0) == (value < 0) // preserved sign + //and ((x != 0) or is_integral()) // strto[u]ll did nto fail + and (errno == 0) // strto[u]ll did not overflow + and (m_start < m_end) // token was not empty + and (endptr == m_end); // parsed entire token exactly + } + }; /*! @brief return number value for number tokens @@ -10049,125 +10153,84 @@ class basic_json number type (either integer, unsigned integer or floating point), which is passed back to the caller via the result parameter. - This function parses the integer component up to the radix point or - exponent while collecting information about the 'floating point - representation', which it stores in the result parameter. If there is - no radix point or exponent, and the number can fit into a @ref - number_integer_t or @ref number_unsigned_t then it sets the result - parameter accordingly. + integral numbers that don't fit into the the range of the respective + type are parsed as number_float_t - If the number is a floating point number the number is then parsed - using @a std:strtod (or @a std:strtof or @a std::strtold). + floating-point values do not satisfy std::isfinite predicate + are converted to value_t::null - @param[out] result @ref basic_json object to receive the number, or - NAN if the conversion read past the current token. The latter case - needs to be treated by the caller function. + throws if the entire string [m_start .. m_cursor) cannot be + interpreted as a number + + @param[out] result @ref basic_json object to receive the number. + @param[in] token the type of the number token */ - void get_number(basic_json& result) const + bool get_number(basic_json& result, const token_type token) const { assert(m_start != nullptr); + assert(m_start < m_cursor); + assert((token == token_type::value_unsigned) or + (token == token_type::value_integer) or + (token == token_type::value_float)); - const lexer::lexer_char_t* curptr = m_start; + strtonum num_converter(reinterpret_cast(m_start), + reinterpret_cast(m_cursor)); - // accumulate the integer conversion result (unsigned for now) - number_unsigned_t value = 0; - - // maximum absolute value of the relevant integer type - number_unsigned_t max; - - // temporarily store the type to avoid unnecessary bitfield access - value_t type; - - // look for sign - if (*curptr == '-') + switch (token) { - type = value_t::number_integer; - max = static_cast((std::numeric_limits::max)()) + 1; - curptr++; - } - else - { - type = value_t::number_unsigned; - max = static_cast((std::numeric_limits::max)()); - } - - // count the significant figures - for (; curptr < m_cursor; curptr++) - { - // quickly skip tests if a digit - if (*curptr < '0' or* curptr > '9') + case lexer::token_type::value_unsigned: { - if (*curptr == '.') + number_unsigned_t val; + if (num_converter.to(val)) { - // don't count '.' but change to float - type = value_t::number_float; - continue; + // parsing successful + result.m_type = value_t::number_unsigned; + result.m_value = val; + return true; } - // assume exponent (if not then will fail parse): change to - // float, stop counting and record exponent details - type = value_t::number_float; break; } - // skip if definitely not an integer - if (type != value_t::number_float) + case lexer::token_type::value_integer: { - auto digit = static_cast(*curptr - '0'); - - // overflow if value * 10 + digit > max, move terms around - // to avoid overflow in intermediate values - if (value > (max - digit) / 10) + number_integer_t val; + if (num_converter.to(val)) { - // overflow - type = value_t::number_float; - } - else - { - // no overflow - value = value * 10 + digit; + // parsing successful + result.m_type = value_t::number_integer; + result.m_value = val; + return true; } + break; + } + + default: + { + break; } } - // save the value (if not a float) - if (type == value_t::number_unsigned) + // parse float (either explicitly or because a previous conversion + // failed) + number_float_t val; + if (num_converter.to(val)) { - result.m_value.number_unsigned = value; - } - else if (type == value_t::number_integer) - { - // invariant: if we parsed a '-', the absolute value is between - // 0 (we allow -0) and max == -INT64_MIN - assert(value >= 0); - assert(value <= max); - - if (value == max) - { - // we cannot simply negate value (== max == -INT64_MIN), - // see https://github.com/nlohmann/json/issues/389 - result.m_value.number_integer = static_cast(INT64_MIN); - } - else - { - // all other values can be negated safely - result.m_value.number_integer = -static_cast(value); - } - } - else - { - // parse with strtod - result.m_value.number_float = str_to_float_t(static_cast(nullptr), nullptr); + // parsing successful + result.m_type = value_t::number_float; + result.m_value = val; // replace infinity and NAN by null if (not std::isfinite(result.m_value.number_float)) { - type = value_t::null; + result.m_type = value_t::null; result.m_value = basic_json::json_value(); } + + return true; } - // save the type - result.m_type = type; + // couldn't parse number in any format + return false; } private: @@ -10411,10 +10474,20 @@ class basic_json break; } - case lexer::token_type::value_number: + case lexer::token_type::value_unsigned: + case lexer::token_type::value_integer: + case lexer::token_type::value_float: { - m_lexer.get_number(result); + const bool ok = m_lexer.get_number(result, last_token); get_token(); + + // if number conversion was unsuccessful, then is is + // because the number was directly followed by an + // unexpected character (e.g. "01" where "1" is unexpected) + if (not ok) + { + unexpect(last_token); + } break; } diff --git a/test/src/unit-class_lexer.cpp b/test/src/unit-class_lexer.cpp index ac43de8a..79dbb766 100644 --- a/test/src/unit-class_lexer.cpp +++ b/test/src/unit-class_lexer.cpp @@ -65,25 +65,37 @@ TEST_CASE("lexer class") SECTION("numbers") { CHECK((json::lexer(reinterpret_cast("0"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("1"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("2"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("3"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("4"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("5"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("6"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("7"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("8"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); CHECK((json::lexer(reinterpret_cast("9"), - 1).scan() == json::lexer::token_type::value_number)); + 1).scan() == json::lexer::token_type::value_unsigned)); + + CHECK((json::lexer(reinterpret_cast("-0"), + 2).scan() == json::lexer::token_type::value_integer)); + CHECK((json::lexer(reinterpret_cast("-1"), + 2).scan() == json::lexer::token_type::value_integer)); + + CHECK((json::lexer(reinterpret_cast("1.1"), + 3).scan() == json::lexer::token_type::value_float)); + CHECK((json::lexer(reinterpret_cast("-1.1"), + 4).scan() == json::lexer::token_type::value_float)); + CHECK((json::lexer(reinterpret_cast("1E10"), + 4).scan() == json::lexer::token_type::value_float)); } SECTION("whitespace") @@ -109,7 +121,9 @@ TEST_CASE("lexer class") CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_false) == "false literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::literal_null) == "null literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::value_string) == "string literal")); - CHECK((json::lexer::token_type_name(json::lexer::token_type::value_number) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_unsigned) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_integer) == "number literal")); + CHECK((json::lexer::token_type_name(json::lexer::token_type::value_float) == "number literal")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_array) == "'['")); CHECK((json::lexer::token_type_name(json::lexer::token_type::begin_object) == "'{'")); CHECK((json::lexer::token_type_name(json::lexer::token_type::end_array) == "']'")); diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index 59ea3cef..2fb0da2b 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -101,6 +101,7 @@ TEST_CASE("parser class") CHECK_THROWS_WITH(json::parser("\"\b\"").parse(), "parse error - unexpected '\"'"); // improve code coverage CHECK_THROWS_AS(json::parser("\uFF01").parse(), std::invalid_argument); + CHECK_THROWS_AS(json::parser("[-4:1,]").parse(), std::invalid_argument); // unescaped control characters CHECK_THROWS_AS(json::parser("\"\x00\"").parse(), std::invalid_argument); CHECK_THROWS_AS(json::parser("\"\x01\"").parse(), std::invalid_argument); @@ -269,6 +270,11 @@ TEST_CASE("parser class") } } + SECTION("overflow") + { + CHECK(json::parser("1.18973e+4932").parse() == json()); + } + SECTION("invalid numbers") { CHECK_THROWS_AS(json::parser("01").parse(), std::invalid_argument); @@ -293,7 +299,7 @@ TEST_CASE("parser class") CHECK_THROWS_AS(json::parser("+0").parse(), std::invalid_argument); CHECK_THROWS_WITH(json::parser("01").parse(), - "parse error - unexpected number literal; expected end of input"); + "parse error - unexpected number literal"); CHECK_THROWS_WITH(json::parser("--1").parse(), "parse error - unexpected '-'"); CHECK_THROWS_WITH(json::parser("1.").parse(), "parse error - unexpected '.'; expected end of input"); diff --git a/test/src/unit-regression.cpp b/test/src/unit-regression.cpp index 17edbd5e..dcfcf62b 100644 --- a/test/src/unit-regression.cpp +++ b/test/src/unit-regression.cpp @@ -383,7 +383,7 @@ TEST_CASE("regression tests") }; // change locale to mess with decimal points - std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); + auto orig_locale = std::locale::global(std::locale(std::locale(), new CommaDecimalSeparator)); CHECK(j1a.dump() == "23.42"); CHECK(j1b.dump() == "23.42"); @@ -407,8 +407,34 @@ TEST_CASE("regression tests") CHECK(j3c.dump() == "10000"); //CHECK(j3b.dump() == "1E04"); // roundtrip error //CHECK(j3c.dump() == "1e04"); // roundtrip error + + std::locale::global(orig_locale); } + SECTION("issue #379 - locale-independent str-to-num") + { + setlocale(LC_NUMERIC, "de_DE.UTF-8"); + + // disabled, because locale-specific beharivor is not + // triggered in AppVeyor for some reason +#ifndef _MSC_VER + { + // verify that strtod now uses commas as decimal-separator + CHECK(std::strtod("3,14", nullptr) == 3.14); + + // verify that strtod does not understand dots as decimal separator + CHECK(std::strtod("3.14", nullptr) == 3); + } +#endif + + // verify that parsed correctly despite using strtod internally + CHECK(json::parse("3.14").get() == 3.14); + + // check a different code path + CHECK(json::parse("1.000000000000000000000000000000000000000000000000000000000000000000000000").get() == 1.0); + } + + SECTION("issue #233 - Can't use basic_json::iterator as a base iterator for std::move_iterator") { json source = {"a", "b", "c"};