added more Unicode tests
This commit is contained in:
parent
dcea3a283c
commit
369671f028
2 changed files with 56 additions and 27 deletions
|
@ -505,7 +505,7 @@ $ make
|
||||||
$ ./json_unit "*"
|
$ ./json_unit "*"
|
||||||
|
|
||||||
===============================================================================
|
===============================================================================
|
||||||
All tests passed (5568718 assertions in 32 test cases)
|
All tests passed (8905012 assertions in 32 test cases)
|
||||||
```
|
```
|
||||||
|
|
||||||
For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
|
For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
|
||||||
|
|
|
@ -12287,17 +12287,21 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
{
|
{
|
||||||
SECTION("full enumeration of Unicode code points")
|
SECTION("full enumeration of Unicode code points")
|
||||||
{
|
{
|
||||||
// create a string from a codepoint
|
// create an escaped string from a code point
|
||||||
auto codepoint_to_unicode = [](std::size_t cp)
|
const auto codepoint_to_unicode = [](std::size_t cp)
|
||||||
{
|
{
|
||||||
char* buffer = new char[10];
|
// code points are represented as a six-character sequence: a
|
||||||
sprintf(buffer, "\\u%04lx", cp);
|
// reverse solidus, followed by the lowercase letter u, followed
|
||||||
std::string result(buffer);
|
// by four hexadecimal digits that encode the character's code
|
||||||
delete[] buffer;
|
// point
|
||||||
return result;
|
std::stringstream ss;
|
||||||
|
ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << cp;
|
||||||
|
return ss.str();
|
||||||
};
|
};
|
||||||
|
|
||||||
// generate all codepoints
|
// generate all Unicode code points; in total, 1112064 code points are
|
||||||
|
// generated: 0x110000 code points - 2048 invalid values between
|
||||||
|
// 0xD800 and 0xDFFF.
|
||||||
for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
|
for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
|
||||||
{
|
{
|
||||||
// The Unicode standard permanently reserves these code point
|
// The Unicode standard permanently reserves these code point
|
||||||
|
@ -12307,34 +12311,57 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
// no UTF forms, including UTF-16, can encode these code points.
|
// no UTF forms, including UTF-16, can encode these code points.
|
||||||
if (cp >= 0xD800u and cp <= 0xDFFFu)
|
if (cp >= 0xD800u and cp <= 0xDFFFu)
|
||||||
{
|
{
|
||||||
|
// if we did not skip these code points, we would get a
|
||||||
|
// "missing low surrogate" exception
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string res;
|
// string to store the code point in \uxxxx format
|
||||||
|
std::string escaped_string;
|
||||||
|
// string to store the code point as unescaped character sequence
|
||||||
|
std::string unescaped_string;
|
||||||
|
|
||||||
if (cp < 0x10000u)
|
if (cp < 0x10000u)
|
||||||
{
|
{
|
||||||
// codepoint can be represented with 16 bit
|
// code points in the Basic Multilingual Plane can be
|
||||||
res += codepoint_to_unicode(cp);
|
// represented with one \\uxxxx sequence
|
||||||
|
escaped_string = codepoint_to_unicode(cp);
|
||||||
|
|
||||||
|
// All Unicode characters may be placed within the quotation
|
||||||
|
// marks, except for the characters that must be escaped:
|
||||||
|
// quotation mark, reverse solidus, and the control characters
|
||||||
|
// (U+0000 through U+001F); we ignore these code points as
|
||||||
|
// they are checked with codepoint_to_unicode.
|
||||||
|
if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
|
||||||
|
{
|
||||||
|
unescaped_string = json::lexer::to_unicode(cp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// codepoint can be represented with a pair
|
// To escape an extended character that is not in the Basic
|
||||||
res += codepoint_to_unicode(0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu));
|
// Multilingual Plane, the character is represented as a
|
||||||
res += codepoint_to_unicode(0xdc00u + ((cp - 0x10000u) & 0x3ffu));
|
// 12-character sequence, encoding the UTF-16 surrogate pair
|
||||||
|
const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
|
||||||
|
const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
|
||||||
|
escaped_string = codepoint_to_unicode(codepoint1);
|
||||||
|
escaped_string += codepoint_to_unicode(codepoint2);
|
||||||
|
unescaped_string += json::lexer::to_unicode(codepoint1, codepoint2);
|
||||||
}
|
}
|
||||||
|
|
||||||
try
|
// all other code points are valid and must not yield parse errors
|
||||||
{
|
CAPTURE(cp);
|
||||||
json j1, j2;
|
CAPTURE(escaped_string);
|
||||||
CHECK_NOTHROW(j1 = json::parse("\"" + res + "\""));
|
CAPTURE(unescaped_string);
|
||||||
|
|
||||||
|
json j1, j2, j3, j4;
|
||||||
|
CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
|
||||||
CHECK_NOTHROW(j2 = json::parse(j1.dump()));
|
CHECK_NOTHROW(j2 = json::parse(j1.dump()));
|
||||||
CHECK(j1 == j2);
|
CHECK(j1 == j2);
|
||||||
}
|
|
||||||
catch (std::invalid_argument)
|
CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
|
||||||
{
|
CHECK_NOTHROW(j4 = json::parse(j3.dump()));
|
||||||
// we ignore parsing errors
|
CHECK(j3 == j4);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -12347,6 +12374,8 @@ TEST_CASE("Unicode", "[hide]")
|
||||||
CHECK_NOTHROW(j << f);
|
CHECK_NOTHROW(j << f);
|
||||||
|
|
||||||
// the array has 1112064 + 1 elements (a terminating "null" value)
|
// the array has 1112064 + 1 elements (a terminating "null" value)
|
||||||
|
// Note: 1112064 = 0x110000 code points - 2048 invalid values between
|
||||||
|
// 0xD800 and 0xDFFF.
|
||||||
CHECK(j.size() == 1112065);
|
CHECK(j.size() == 1112065);
|
||||||
|
|
||||||
SECTION("check JSON Pointers")
|
SECTION("check JSON Pointers")
|
||||||
|
|
Loading…
Reference in a new issue