added extensive Unicode tests
This commit is contained in:
		
							parent
							
								
									6fb829062c
								
							
						
					
					
						commit
						2e9a13bd88
					
				
					 5 changed files with 1112146 additions and 5 deletions
				
			
		| 
						 | 
				
			
			@ -22,8 +22,6 @@ Other aspects were not so important to us:
 | 
			
		|||
 | 
			
		||||
- **Speed**. We currently implement the parser as a naive [recursive descent parser](http://en.wikipedia.org/wiki/Recursive_descent_parser) with hand-coded string handling. It is fast enough, but a [LALR-parser](http://en.wikipedia.org/wiki/LALR_parser) with a decent regular expression processor should be even faster (but would consist of more files which makes the integration harder).
 | 
			
		||||
 | 
			
		||||
- **Rigorous Unicode compliance**. We did our best to implement some robust Unicode support. There are still some issues with escaping, and if you run into a problem, please [tell me](https://github.com/nlohmann/json/issues).
 | 
			
		||||
 | 
			
		||||
## Updates since last version
 | 
			
		||||
 | 
			
		||||
As of February 2015, the following updates were made to the library
 | 
			
		||||
| 
						 | 
				
			
			@ -400,7 +398,7 @@ $ make
 | 
			
		|||
$ ./json_unit
 | 
			
		||||
 | 
			
		||||
===============================================================================
 | 
			
		||||
All tests passed (4800 assertions in 21 test cases)
 | 
			
		||||
All tests passed (3341006 assertions in 22 test cases)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
For more information, have a look at the file [.travis.yml](https://github.com/nlohmann/json/blob/master/.travis.yml).
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3513,8 +3513,11 @@ class basic_json
 | 
			
		|||
 | 
			
		||||
            // calculate the codepoint from the given code points
 | 
			
		||||
            std::size_t codepoint = codepoint1;
 | 
			
		||||
 | 
			
		||||
            // check if codepoint1 is a high surrogate
 | 
			
		||||
            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
 | 
			
		||||
            {
 | 
			
		||||
                // check if codepoint2 is a low surrogate
 | 
			
		||||
                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
 | 
			
		||||
                {
 | 
			
		||||
                    codepoint =
 | 
			
		||||
| 
						 | 
				
			
			@ -3533,7 +3536,7 @@ class basic_json
 | 
			
		|||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            if (codepoint <= 0x7f)
 | 
			
		||||
            if (codepoint < 0x80)
 | 
			
		||||
            {
 | 
			
		||||
                // 1-byte characters: 0xxxxxxx (ASCII)
 | 
			
		||||
                result.append(1, static_cast<typename string_t::value_type>(codepoint));
 | 
			
		||||
| 
						 | 
				
			
			@ -4494,6 +4497,7 @@ basic_json_parser_59:
 | 
			
		|||
                            auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
 | 
			
		||||
                                                          4).c_str(), nullptr, 16);
 | 
			
		||||
 | 
			
		||||
                            // check if codepoint is a high surrogate
 | 
			
		||||
                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
 | 
			
		||||
                            {
 | 
			
		||||
                                // make sure there is a subsequent unicode
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -3513,8 +3513,11 @@ class basic_json
 | 
			
		|||
 | 
			
		||||
            // calculate the codepoint from the given code points
 | 
			
		||||
            std::size_t codepoint = codepoint1;
 | 
			
		||||
 | 
			
		||||
            // check if codepoint1 is a high surrogate
 | 
			
		||||
            if (codepoint1 >= 0xD800 and codepoint1 <= 0xDBFF)
 | 
			
		||||
            {
 | 
			
		||||
                // check if codepoint2 is a low surrogate
 | 
			
		||||
                if (codepoint2 >= 0xDC00 and codepoint2 <= 0xDFFF)
 | 
			
		||||
                {
 | 
			
		||||
                    codepoint =
 | 
			
		||||
| 
						 | 
				
			
			@ -3533,7 +3536,7 @@ class basic_json
 | 
			
		|||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            if (codepoint <= 0x7f)
 | 
			
		||||
            if (codepoint < 0x80)
 | 
			
		||||
            {
 | 
			
		||||
                // 1-byte characters: 0xxxxxxx (ASCII)
 | 
			
		||||
                result.append(1, static_cast<typename string_t::value_type>(codepoint));
 | 
			
		||||
| 
						 | 
				
			
			@ -3800,6 +3803,7 @@ class basic_json
 | 
			
		|||
                            auto codepoint = std::strtoul(std::string(reinterpret_cast<typename string_t::const_pointer>(i + 1),
 | 
			
		||||
                                                          4).c_str(), nullptr, 16);
 | 
			
		||||
 | 
			
		||||
                            // check if codepoint is a high surrogate
 | 
			
		||||
                            if (codepoint >= 0xD800 and codepoint <= 0xDBFF)
 | 
			
		||||
                            {
 | 
			
		||||
                                // make sure there is a subsequent unicode
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										1112067
									
								
								test/json_nlohmann_tests/all_unicode.json
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1112067
									
								
								test/json_nlohmann_tests/all_unicode.json
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							| 
						 | 
				
			
			@ -8716,6 +8716,74 @@ TEST_CASE("compliance tests from nativejson-benchmark")
 | 
			
		|||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST_CASE("Unicode")
 | 
			
		||||
{
 | 
			
		||||
    SECTION("full enumeration of Unicode codepoints")
 | 
			
		||||
    {
 | 
			
		||||
        // create a string from a codepoint
 | 
			
		||||
        auto codepoint_to_unicode = [](std::size_t cp)
 | 
			
		||||
        {
 | 
			
		||||
            char* buffer = new char[10];
 | 
			
		||||
            sprintf(buffer, "\\u%04lx", cp);
 | 
			
		||||
            std::string result(buffer);
 | 
			
		||||
            delete[] buffer;
 | 
			
		||||
            return result;
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        // generate all codepoints
 | 
			
		||||
        for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
 | 
			
		||||
        {
 | 
			
		||||
            // The Unicode standard permanently reserves these code point
 | 
			
		||||
            // values for UTF-16 encoding of the high and low surrogates, and
 | 
			
		||||
            // they will never be assigned a character, so there should be no
 | 
			
		||||
            // reason to encode them. The official Unicode standard says that
 | 
			
		||||
            // no UTF forms, including UTF-16, can encode these code points.
 | 
			
		||||
            if (cp >= 0xD800u and cp <= 0xDFFFu)
 | 
			
		||||
            {
 | 
			
		||||
                continue;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            std::string res;
 | 
			
		||||
 | 
			
		||||
            if (cp < 0x10000u)
 | 
			
		||||
            {
 | 
			
		||||
                // codepoint can be represented with 16 bit
 | 
			
		||||
                res += codepoint_to_unicode(cp);
 | 
			
		||||
            }
 | 
			
		||||
            else
 | 
			
		||||
            {
 | 
			
		||||
                // codepoint can be represented with a pair
 | 
			
		||||
                res += codepoint_to_unicode(0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu));
 | 
			
		||||
                res += codepoint_to_unicode(0xdc00u + ((cp - 0x10000u) & 0x3ffu));
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            try
 | 
			
		||||
            {
 | 
			
		||||
                json j1, j2;
 | 
			
		||||
                CHECK_NOTHROW(j1 = json::parse("\"" + res + "\""));
 | 
			
		||||
                CHECK_NOTHROW(j2 = json::parse(j1.dump()));
 | 
			
		||||
                CHECK(j1 == j2);
 | 
			
		||||
            }
 | 
			
		||||
            catch (std::invalid_argument)
 | 
			
		||||
            {
 | 
			
		||||
                // we ignore parsing errors
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    SECTION("read all unicode characters")
 | 
			
		||||
    {
 | 
			
		||||
        // read a file with all unicode characters stored as single-character
 | 
			
		||||
        // strings in a JSON array
 | 
			
		||||
        std::ifstream f("test/json_nlohmann_tests/all_unicode.json");
 | 
			
		||||
        json j;
 | 
			
		||||
        CHECK_NOTHROW(j << f);
 | 
			
		||||
 | 
			
		||||
        // the array has 1112064 + 1 elements (a terminating "null" value)
 | 
			
		||||
        CHECK(j.size() == 1112065);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST_CASE("regression tests")
 | 
			
		||||
{
 | 
			
		||||
    SECTION("issue #60 - Double quotation mark is not parsed correctly")
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue