diff --git a/src/json.hpp b/src/json.hpp index f7fbfd71..82f69abe 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -11127,6 +11127,13 @@ class basic_json return codepoint; } + static std::string codepoint_to_string(int codepoint) + { + std::stringstream ss; + ss << "U+" << std::setw(4) << std::uppercase << std::setfill('0') << std::hex << codepoint; + return ss.str(); + } + token_type scan_string() { // reset yytext (ignore opening quote) @@ -11237,13 +11244,13 @@ class basic_json } else { - error_message = "invalid string: invalid low surrogate"; + error_message = "invalid string: surrogate " + codepoint_to_string(codepoint1) + " must be followed by U+DC00..U+DFFF instead of " + codepoint_to_string(codepoint2); return token_type::parse_error; } } else { - error_message = "invalid string: missing low surrogate"; + error_message = "invalid string: surrogate " + codepoint_to_string(codepoint1) + " must be followed by U+DC00..U+DFFF"; return token_type::parse_error; } } @@ -11251,7 +11258,7 @@ class basic_json { if (JSON_UNLIKELY(0xDC00 <= codepoint1 and codepoint1 <= 0xDFFF)) { - error_message = "invalid string: missing high surrogate"; + error_message = "invalid string: surrogate " + codepoint_to_string(codepoint1) + " must follow U+D800..U+DBFF"; return token_type::parse_error; } @@ -11336,7 +11343,7 @@ class basic_json case 0x1e: case 0x1f: { - error_message = "invalid string: control characters (U+0000 through U+001f) must be escaped"; + error_message = "invalid string: control character " + codepoint_to_string(current) + " must be escaped"; return token_type::parse_error; } @@ -11480,7 +11487,7 @@ class basic_json continue; } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } @@ -11500,7 +11507,7 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return 
token_type::parse_error; } @@ -11534,7 +11541,7 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } @@ -11554,7 +11561,7 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } @@ -11579,7 +11586,7 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } @@ -11606,7 +11613,7 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } @@ -11631,14 +11638,14 @@ class basic_json } } - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } - // remaining bytes (80..C1 and F5..FF) are not well-formed + // remaining bytes (80..C1 and F5..FF) are ill-formed default: { - error_message = "invalid string: not well-formed UTF-8 byte"; + error_message = "invalid string: ill-formed UTF-8 byte"; return token_type::parse_error; } } @@ -11681,7 +11688,7 @@ class basic_json // be changed if minus sign, decimal point or exponent is read token_type number_type = token_type::value_unsigned; - // state: we just found out we need to scan a number + // state (init): we just found out we need to scan a number switch (current) { case '-': @@ -12001,6 +12008,8 @@ scan_number_done: } } + // this code is reached if we parse a floating-point number or if + // an integer conversion above failed strtof(value_float, yytext.data(), nullptr); return token_type::value_float; } @@ -12064,7 +12073,8 @@ scan_number_done: /// add a character to yytext void add(int c) { - // resize yytext if necessary + // resize 
yytext if necessary; this condition is deemed unlikely, + // because we start with a 1024-byte buffer if (JSON_UNLIKELY((yylen + 1 > yytext.capacity()))) { yytext.resize(2 * yytext.capacity(), '\0'); @@ -12120,7 +12130,7 @@ scan_number_done: std::string s = ia->read(start_pos, chars_read - start_pos); // escape control characters - std::stringstream ss; + std::string result; for (auto c : s) { if (c == '\0' or c == std::char_traits<char>::eof()) @@ -12131,16 +12141,16 @@ scan_number_done: else if ('\x00' <= c and c <= '\x1f') { // escape control characters - ss << "<U+" << std::setw(4) << std::uppercase << std::setfill('0') << std::hex << int(c) << ">"; + result += "<" + codepoint_to_string(c) + ">"; } else { // add character as is - ss << c; + result.append(1, c); } } - return ss.str(); + return result; } /// return syntax error message @@ -12204,7 +12214,8 @@ scan_number_done: case '9': return scan_number(); - // end of input + // end of input (the null byte is needed when parsing from + // string literals) case '\0': case std::char_traits<char>::eof(): return token_type::end_of_input; diff --git a/test/src/unit-class_parser.cpp b/test/src/unit-class_parser.cpp index e0fffac4..b631a978 100644 --- a/test/src/unit-class_parser.cpp +++ b/test/src/unit-class_parser.cpp @@ -98,18 +98,18 @@ TEST_CASE("parser class") // error: tab in string CHECK_THROWS_AS(parse_string("\"\t\"").parse(), json::parse_error); CHECK_THROWS_WITH(parse_string("\"\t\"").parse(), - "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control characters (U+0000 through U+001f) must be escaped; last read '\"'"); + "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character U+0009 must be escaped; last read '\"'"); // error: newline in string CHECK_THROWS_AS(parse_string("\"\n\"").parse(), json::parse_error); CHECK_THROWS_AS(parse_string("\"\r\"").parse(), json::parse_error); CHECK_THROWS_WITH(parse_string("\"\n\"").parse(), - "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control 
characters (U+0000 through U+001f) must be escaped; last read '\"'"); + "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character U+000A must be escaped; last read '\"'"); CHECK_THROWS_WITH(parse_string("\"\r\"").parse(), - "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control characters (U+0000 through U+001f) must be escaped; last read '\"'"); + "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character U+000D must be escaped; last read '\"'"); // error: backspace in string CHECK_THROWS_AS(parse_string("\"\b\"").parse(), json::parse_error); CHECK_THROWS_WITH(parse_string("\"\b\"").parse(), - "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control characters (U+0000 through U+001f) must be escaped; last read '\"'"); + "[json.exception.parse_error.101] parse error at 2: syntax error - invalid string: control character U+0008 must be escaped; last read '\"'"); // improve code coverage CHECK_THROWS_AS(parse_string("\uFF01").parse(), json::parse_error); CHECK_THROWS_AS(parse_string("[-4:1,]").parse(), json::parse_error); @@ -648,17 +648,17 @@ TEST_CASE("parser class") // missing part of a surrogate pair CHECK_THROWS_AS(json::parse("\"\\uD80C\""), json::parse_error); CHECK_THROWS_WITH(json::parse("\"\\uD80C\""), - "[json.exception.parse_error.101] parse error at 8: syntax error - invalid string: missing low surrogate; last read '\"\\uD80C\"'"); + "[json.exception.parse_error.101] parse error at 8: syntax error - invalid string: surrogate U+D80C must be followed by U+DC00..U+DFFF; last read '\"\\uD80C\"'"); // invalid surrogate pair CHECK_THROWS_AS(json::parse("\"\\uD80C\\uD80C\""), json::parse_error); CHECK_THROWS_AS(json::parse("\"\\uD80C\\u0000\""), json::parse_error); CHECK_THROWS_AS(json::parse("\"\\uD80C\\uFFFF\""), json::parse_error); CHECK_THROWS_WITH(json::parse("\"\\uD80C\\uD80C\""), - 
"[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: invalid low surrogate; last read '\"\\uD80C\\uD80C'"); + "[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: surrogate U+D80C must be followed by U+DC00..U+DFFF instead of U+D80C; last read '\"\\uD80C\\uD80C'"); CHECK_THROWS_WITH(json::parse("\"\\uD80C\\u0000\""), - "[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: invalid low surrogate; last read '\"\\uD80C\\u0000'"); + "[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: surrogate U+D80C must be followed by U+DC00..U+DFFF instead of U+0000; last read '\"\\uD80C\\u0000'"); CHECK_THROWS_WITH(json::parse("\"\\uD80C\\uFFFF\""), - "[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: invalid low surrogate; last read '\"\\uD80C\\uFFFF'"); + "[json.exception.parse_error.101] parse error at 13: syntax error - invalid string: surrogate U+D80C must be followed by U+DC00..U+DFFF instead of U+FFFF; last read '\"\\uD80C\\uFFFF'"); } SECTION("tests found by mutate++")