✅ added more Unicode test cases
This commit is contained in:
parent
734297ff45
commit
6d2c0a7928
1 changed file with 631 additions and 606 deletions
|
@ -74,8 +74,10 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("RFC 3629", "[hide]")
|
TEST_CASE("Unicode", "[hide]")
|
||||||
{
|
{
|
||||||
|
SECTION("RFC 3629")
|
||||||
|
{
|
||||||
/*
|
/*
|
||||||
RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
|
RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
|
||||||
follows:
|
follows:
|
||||||
|
@ -850,20 +852,14 @@ TEST_CASE("RFC 3629", "[hide]")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("Unicode", "[hide]")
|
SECTION("\\uxxxx sequences")
|
||||||
{
|
|
||||||
/* NOTE: to_unicode is not used any more
|
|
||||||
SECTION("full enumeration of Unicode code points")
|
|
||||||
{
|
{
|
||||||
// lexer to call to_unicode on
|
|
||||||
json::lexer dummy_lexer("", 0);
|
|
||||||
|
|
||||||
// create an escaped string from a code point
|
// create an escaped string from a code point
|
||||||
const auto codepoint_to_unicode = [](std::size_t cp)
|
const auto codepoint_to_unicode = [](std::size_t cp)
|
||||||
{
|
{
|
||||||
// copd points are represented as a six-character sequence: a
|
// code points are represented as a six-character sequence: a
|
||||||
// reverse solidus, followed by the lowercase letter u, followed
|
// reverse solidus, followed by the lowercase letter u, followed
|
||||||
// by four hexadecimal digits that encode the character's code
|
// by four hexadecimal digits that encode the character's code
|
||||||
// point
|
// point
|
||||||
|
@ -872,10 +868,18 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
return ss.str();
|
return ss.str();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
SECTION("correct sequences")
|
||||||
|
{
|
||||||
// generate all Unicode code points; in total, 1112064 code points are
|
// generate all Unicode code points; in total, 1112064 code points are
|
||||||
// generated: 0x110000 code points - 2048 invalid values between
|
// generated: 0x110000 code points - 2048 invalid values between
|
||||||
// 0xD800 and 0xDFFF.
|
// 0xD800 and 0xDFFF.
|
||||||
for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
|
for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
|
||||||
|
{
|
||||||
|
// string to store the code point as in \uxxxx format
|
||||||
|
std::string json_text = "\"";
|
||||||
|
|
||||||
|
// decide whether to use one or two \uxxxx sequences
|
||||||
|
if (cp < 0x10000u)
|
||||||
{
|
{
|
||||||
// The Unicode standard permanently reserves these code point
|
// The Unicode standard permanently reserves these code point
|
||||||
// values for UTF-16 encoding of the high and low surrogates, and
|
// values for UTF-16 encoding of the high and low surrogates, and
|
||||||
|
@ -889,26 +893,9 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// string to store the code point as in \uxxxx format
|
|
||||||
std::string escaped_string;
|
|
||||||
// string to store the code point as unescaped character sequence
|
|
||||||
std::string unescaped_string;
|
|
||||||
|
|
||||||
if (cp < 0x10000u)
|
|
||||||
{
|
|
||||||
// code points in the Basic Multilingual Plane can be
|
// code points in the Basic Multilingual Plane can be
|
||||||
// represented with one \\uxxxx sequence
|
// represented with one \uxxxx sequence
|
||||||
escaped_string = codepoint_to_unicode(cp);
|
json_text += codepoint_to_unicode(cp);
|
||||||
|
|
||||||
// All Unicode characters may be placed within the quotation
|
|
||||||
// marks, except for the characters that must be escaped:
|
|
||||||
// quotation mark, reverse solidus, and the control characters
|
|
||||||
// (U+0000 through U+001F); we ignore these code points as
|
|
||||||
// they are checked with codepoint_to_unicode.
|
|
||||||
if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
|
|
||||||
{
|
|
||||||
unescaped_string = dummy_lexer.to_unicode(cp);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -917,27 +904,65 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
// 12-character sequence, encoding the UTF-16 surrogate pair
|
// 12-character sequence, encoding the UTF-16 surrogate pair
|
||||||
const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
|
const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
|
||||||
const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
|
const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
|
||||||
escaped_string = codepoint_to_unicode(codepoint1);
|
json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
|
||||||
escaped_string += codepoint_to_unicode(codepoint2);
|
}
|
||||||
unescaped_string += dummy_lexer.to_unicode(codepoint1, codepoint2);
|
|
||||||
|
json_text += "\"";
|
||||||
|
CAPTURE(json_text);
|
||||||
|
CHECK_NOTHROW(json::parse(json_text));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SECTION("incorrect sequences")
|
||||||
|
{
|
||||||
|
SECTION("high surrogate without low surrogate")
|
||||||
|
{
|
||||||
|
// D800..DBFF are high surrogates and must be followed by low
|
||||||
|
// surrogates DC00..DFFF; here, nothing follows
|
||||||
|
for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
|
||||||
|
{
|
||||||
|
std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
|
||||||
|
CAPTURE(json_text);
|
||||||
|
CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
SECTION("high surrogate with wrong low surrogate")
|
||||||
|
{
|
||||||
|
// D800..DBFF are high surrogates and must be followed by low
|
||||||
|
// surrogates DC00..DFFF; here a different sequence follows
|
||||||
|
for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
|
||||||
|
{
|
||||||
|
for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
|
||||||
|
{
|
||||||
|
if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
|
||||||
|
CAPTURE(json_text);
|
||||||
|
CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
SECTION("low surrogate without high surrogate")
|
||||||
|
{
|
||||||
|
// low surrogates DC00..DFFF must follow high surrogates; here,
|
||||||
|
// they occur alone
|
||||||
|
for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
|
||||||
|
{
|
||||||
|
std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
|
||||||
|
CAPTURE(json_text);
|
||||||
|
CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// all other code points are valid and must not yield parse errors
|
|
||||||
CAPTURE(cp);
|
|
||||||
CAPTURE(escaped_string);
|
|
||||||
CAPTURE(unescaped_string);
|
|
||||||
|
|
||||||
json j1, j2, j3, j4;
|
|
||||||
CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
|
|
||||||
CHECK_NOTHROW(j2 = json::parse(j1.dump()));
|
|
||||||
CHECK(j1 == j2);
|
|
||||||
|
|
||||||
CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
|
|
||||||
CHECK_NOTHROW(j4 = json::parse(j3.dump()));
|
|
||||||
CHECK(j3 == j4);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
SECTION("read all unicode characters")
|
SECTION("read all unicode characters")
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue