✅ added more Unicode test cases

2017-04-23 22:54:21 +02:00 · 2017-04-23 22:54:21 +02:00 · 6d2c0a7928
commit 6d2c0a7928
parent 734297ff45
1 changed files with 631 additions and 606 deletions
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@ -74,8 +74,10 @@ void check_utf8string(bool success_expected, int byte1, int byte2 = -1, int byte
    }
 }

-TEST_CASE("RFC 3629", "[hide]")
+TEST_CASE("Unicode", "[hide]")
 {
+    SECTION("RFC 3629")
+    {
        /*
        RFC 3629 describes in Sect. 4 the syntax of UTF-8 byte sequences as
        follows:
@ -850,20 +852,14 @@ TEST_CASE("RFC 3629", "[hide]")
                }
            }
        }
-}
+    }

-TEST_CASE("Unicode", "[hide]")
-{
-    /* NOTE: to_unicode is not used any more
-    SECTION("full enumeration of Unicode code points")
+    SECTION("\\uxxxx sequences")
    {
-        // lexer to call to_unicode on
-        json::lexer dummy_lexer("", 0);
-
        // create an escaped string from a code point
        const auto codepoint_to_unicode = [](std::size_t cp)
        {
-            // copd points are represented as a six-character sequence: a
+            // code points are represented as a six-character sequence: a
            // reverse solidus, followed by the lowercase letter u, followed
            // by four hexadecimal digits that encode the character's code
            // point
@ -872,10 +868,18 @@ TEST_CASE("Unicode", "[hide]")
            return ss.str();
        };

+        SECTION("correct sequences")
+        {
            // generate all Unicode code points; in total, 1112064 code points
            // are generated: 0x110000 (1114112) code points minus the 2048
            // surrogate values between 0xD800 and 0xDFFF.
            for (std::size_t cp = 0; cp <= 0x10FFFFu; ++cp)
+            {
+                // string to store the code point in \uxxxx format
+                std::string json_text = "\"";
+
+                // decide whether to use one or two \uxxxx sequences
+                if (cp < 0x10000u)
                {
                    // The Unicode standard permanently reserves these code point
                    // values for UTF-16 encoding of the high and low surrogates, and
@ -889,26 +893,9 @@ TEST_CASE("Unicode", "[hide]")
                        continue;
                    }

-            // string to store the code point as in \uxxxx format
-            std::string escaped_string;
-            // string to store the code point as unescaped character sequence
-            std::string unescaped_string;
-
-            if (cp < 0x10000u)
-            {
                    // code points in the Basic Multilingual Plane can be
-                // represented with one \\uxxxx sequence
-                escaped_string = codepoint_to_unicode(cp);
-
-                // All Unicode characters may be placed within the quotation
-                // marks, except for the characters that must be escaped:
-                // quotation mark, reverse solidus, and the control characters
-                // (U+0000 through U+001F); we ignore these code points as
-                // they are checked with codepoint_to_unicode.
-                if (cp > 0x1f and cp != 0x22 and cp != 0x5c)
-                {
-                    unescaped_string = dummy_lexer.to_unicode(cp);
-                }
+                    // represented with one \uxxxx sequence
+                    json_text += codepoint_to_unicode(cp);
                }
                else
                {
@ -917,27 +904,65 @@ TEST_CASE("Unicode", "[hide]")
                    // 12-character sequence, encoding the UTF-16 surrogate pair
                    const auto codepoint1 = 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu);
                    const auto codepoint2 = 0xdc00u + ((cp - 0x10000u) & 0x3ffu);
-                escaped_string = codepoint_to_unicode(codepoint1);
-                escaped_string += codepoint_to_unicode(codepoint2);
-                unescaped_string += dummy_lexer.to_unicode(codepoint1, codepoint2);
+                    json_text += codepoint_to_unicode(codepoint1) + codepoint_to_unicode(codepoint2);
+                }
+
+                json_text += "\"";
+                CAPTURE(json_text);
+                CHECK_NOTHROW(json::parse(json_text));
+            }
+        }
+
+        SECTION("incorrect sequences")
+        {
+            SECTION("high surrogate without low surrogate")
+            {
+                // D800..DBFF are high surrogates and must be followed by low
+                // surrogates DC00..DFFF; here, nothing follows
+                for (std::size_t cp = 0xD800u; cp <= 0xDBFFu; ++cp)
+                {
+                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
+                    CAPTURE(json_text);
+                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
+                }
+            }
+
+#if 0
+            SECTION("high surrogate with wrong low surrogate")
+            {
+                // D800..DBFF are high surrogates and must be followed by low
+                // surrogates DC00..DFFF; here a different sequence follows
+                for (std::size_t cp1 = 0xD800u; cp1 <= 0xDBFFu; ++cp1)
+                {
+                    for (std::size_t cp2 = 0x0000u; cp2 <= 0xFFFFu; ++cp2)
+                    {
+                        if (0xDC00u <= cp2 and cp2 <= 0xDFFFu)
+                        {
+                            continue;
+                        }
+
+                        std::string json_text = "\"" + codepoint_to_unicode(cp1) + codepoint_to_unicode(cp2) + "\"";
+                        CAPTURE(json_text);
+                        CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
+                    }
+                }
+            }
+#endif
+
+            SECTION("low surrogate without high surrogate")
+            {
+                // low surrogates DC00..DFFF must follow high surrogates; here,
+                // they occur alone
+                for (std::size_t cp = 0xDC00u; cp <= 0xDFFFu; ++cp)
+                {
+                    std::string json_text = "\"" + codepoint_to_unicode(cp) + "\"";
+                    CAPTURE(json_text);
+                    CHECK_THROWS_AS(json::parse(json_text), json::parse_error);
+                }
            }

-            // all other code points are valid and must not yield parse errors
-            CAPTURE(cp);
-            CAPTURE(escaped_string);
-            CAPTURE(unescaped_string);
-
-            json j1, j2, j3, j4;
-            CHECK_NOTHROW(j1 = json::parse("\"" + escaped_string + "\""));
-            CHECK_NOTHROW(j2 = json::parse(j1.dump()));
-            CHECK(j1 == j2);
-
-            CHECK_NOTHROW(j3 = json::parse("\"" + unescaped_string + "\""));
-            CHECK_NOTHROW(j4 = json::parse(j3.dump()));
-            CHECK(j3 == j4);
        }
    }
-     */

    SECTION("read all unicode characters")
    {