added extensive Unicode tests
This commit is contained in:
parent
6fb829062c
commit
2e9a13bd88
5 changed files with 1112146 additions and 5 deletions
|
@ -22,8 +22,6 @@ Other aspects were not so important to us:
|
||||||
|
|
||||||
- **Speed**. We currently implement the parser as naive [recursive descent parser](http://en.wikipedia.org/wiki/Recursive_descent_parser) with hand coded string handling. It is fast enough, but a [LALR-parser](http://en.wikipedia.org/wiki/LALR_parser) with a decent regular expression processor should be even faster (but would consist of more files which makes the integration harder).
|
- **Speed**. We currently implement the parser as naive [recursive descent parser](http://en.wikipedia.org/wiki/Recursive_descent_parser) with hand coded string handling. It is fast enough, but a [LALR-parser](http://en.wikipedia.org/wiki/LALR_parser) with a decent regular expression processor should be even faster (but would consist of more files which makes the integration harder).
|
||||||
|
|
||||||
- **Rigorous Unicode compliance**. We did our best to implement some robust Unicode support. There are still some issues with escaping, and if you run into a problem, please [tell me](https://github.com/nlohmann/json/issues).
|
|
||||||
|
|
||||||
## Updates since last version
|
## Updates since last version
|
||||||
|
|
||||||
As of February 2015, the following updates were made to the library
|
As of February 2015, the following updates were made to the library
|
||||||
|
@ -400,7 +398,7 @@ $ make
|
||||||
$ ./json_unit
|
$ ./json_unit
|
||||||
|
|
||||||
===============================================================================
|
===============================================================================
|
||||||
All tests passed (4800 assertions in 21 test cases)
|
All tests passed (3341006 assertions in 22 test cases)
|
||||||
```
|
```
|
||||||
|
|
||||||
For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
|
For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
|
||||||
|
|
|
@ -3513,8 +3513,11 @@ class basic_json
|
||||||
|
|
||||||
// calculate the codepoint from the given code points
|
// calculate the codepoint from the given code points
|
||||||
std::size_t codepoint = codepoint1;
|
std::size_t codepoint = codepoint1;
|
||||||
|
|
||||||
|
// check if codepoint1 is a high surrogate
|
||||||
if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
|
if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
|
||||||
{
|
{
|
||||||
|
// check if codepoint2 is a low surrogate
|
||||||
if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
|
if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
|
||||||
{
|
{
|
||||||
codepoint =
|
codepoint =
|
||||||
|
@ -3533,7 +3536,7 @@ class basic_json
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (codepoint <= 0x7f)
|
if (codepoint < 0x80)
|
||||||
{
|
{
|
||||||
// 1-byte characters: 0xxxxxxx (ASCII)
|
// 1-byte characters: 0xxxxxxx (ASCII)
|
||||||
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
||||||
|
@ -4494,6 +4497,7 @@ basic_json_parser_59:
|
||||||
auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
|
auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
|
||||||
4).c_str(), nullptr, 16);
|
4).c_str(), nullptr, 16);
|
||||||
|
|
||||||
|
// check if codepoint is a high surrogate
|
||||||
if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
|
if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
|
||||||
{
|
{
|
||||||
// make sure there is a subsequent unicode
|
// make sure there is a subsequent unicode
|
||||||
|
|
|
@ -3513,8 +3513,11 @@ class basic_json
|
||||||
|
|
||||||
// calculate the codepoint from the given code points
|
// calculate the codepoint from the given code points
|
||||||
std::size_t codepoint = codepoint1;
|
std::size_t codepoint = codepoint1;
|
||||||
|
|
||||||
|
// check if codepoint1 is a high surrogate
|
||||||
if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
|
if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
|
||||||
{
|
{
|
||||||
|
// check if codepoint2 is a low surrogate
|
||||||
if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
|
if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
|
||||||
{
|
{
|
||||||
codepoint =
|
codepoint =
|
||||||
|
@ -3533,7 +3536,7 @@ class basic_json
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (codepoint <= 0x7f)
|
if (codepoint < 0x80)
|
||||||
{
|
{
|
||||||
// 1-byte characters: 0xxxxxxx (ASCII)
|
// 1-byte characters: 0xxxxxxx (ASCII)
|
||||||
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
result.append(1, static_cast<typename string_t::value_type>(codepoint));
|
||||||
|
@ -3800,6 +3803,7 @@ class basic_json
|
||||||
auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
|
auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
|
||||||
4).c_str(), nullptr, 16);
|
4).c_str(), nullptr, 16);
|
||||||
|
|
||||||
|
// check if codepoint is a high surrogate
|
||||||
if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
|
if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
|
||||||
{
|
{
|
||||||
// make sure there is a subsequent unicode
|
// make sure there is a subsequent unicode
|
||||||
|
|
1112067
test/json_nlohmann_tests/all_unicode.json
Normal file
1112067
test/json_nlohmann_tests/all_unicode.json
Normal file
File diff suppressed because it is too large
Load diff
|
@ -8716,6 +8716,74 @@ TEST_CASE("compliance tests from nativejson-benchmark")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE("Unicode")
|
||||||
|
{
|
||||||
|
SECTION("full enumeration of Unicode codepoints")
|
||||||
|
{
|
||||||
|
// create a string from a codepoint
|
||||||
|
// Build the JSON escape sequence for a single 16-bit code unit.
//
// The value is rendered as a backslash, the letter 'u', and the value in
// lowercase hexadecimal, zero-padded to at least four digits.
//
// cp: the numeric value to escape.
// returns: the escape sequence as a std::string.
auto codepoint_to_unicode = [](std::size_t cp)
{
    // room for the two-character prefix, every hex digit a std::size_t can
    // produce (two per byte), and the terminating NUL
    char buffer[2 + 2 * sizeof(std::size_t) + 1];

    // %zx is the conversion that matches std::size_t; snprintf bounds the
    // write so an unexpectedly large value can never overrun the buffer
    snprintf(buffer, sizeof(buffer), "\\u%04zx", cp);

    return std::string(buffer);
};
|
||||||
|
|
||||||
|
// generate all codepoints
|
||||||
|
// Walk every Unicode code point (U+0000 .. U+10FFFF), build its JSON
// "\uXXXX" escape (a surrogate pair above the BMP), and check that parsing
// the escaped string and re-parsing its dump yield equal values.
for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
{
    // The Unicode standard permanently reserves these code point
    // values for UTF-16 encoding of the high and low surrogates, and
    // they will never be assigned a character, so there should be no
    // reason to encode them. The official Unicode standard says that
    // no UTF forms, including UTF-16, can encode these code points.
    if (cp >= 0xD800u and cp <= 0xDFFFu)
    {
        continue;
    }

    std::string res;

    if (cp < 0x10000u)
    {
        // codepoint can be represented with 16 bit
        res += codepoint_to_unicode(cp);
    }
    else
    {
        // codepoint can be represented with a pair: the upper 10 bits of
        // (cp - 0x10000) go into the high surrogate, the lower 10 bits
        // into the low surrogate
        res += codepoint_to_unicode(0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu));
        res += codepoint_to_unicode(0xdc00u + ((cp - 0x10000u) & 0x3ffu));
    }

    try
    {
        json j1, j2;
        CHECK_NOTHROW(j1 = json::parse("\"" + res + "\""));
        CHECK_NOTHROW(j2 = json::parse(j1.dump()));
        CHECK(j1 == j2);
    }
    // catch by const reference: no copy of the exception object is made
    catch (const std::invalid_argument&)
    {
        // we ignore parsing errors
    }
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SECTION("read all unicode characters")
|
||||||
|
{
|
||||||
|
// read a file with all unicode characters stored as single-character
|
||||||
|
// strings in a JSON array
|
||||||
|
std::ifstream f("test/json_nlohmann_tests/all_unicode.json");
|
||||||
|
json j;
|
||||||
|
CHECK_NOTHROW(j << f);
|
||||||
|
|
||||||
|
// the array has 1112064 + 1 elements (a terminating "null" value)
|
||||||
|
CHECK(j.size() == 1112065);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST_CASE("regression tests")
|
TEST_CASE("regression tests")
|
||||||
{
|
{
|
||||||
SECTION("issue #60 - Double quotation mark is not parsed correctly")
|
SECTION("issue #60 - Double quotation mark is not parsed correctly")
|
||||||
|
|
Loading…
Reference in a new issue