From 0671e92ced71187690c81db645279d39ecf92e16 Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Tue, 16 Oct 2018 20:38:50 +0200
Subject: [PATCH 1/8] :construction: proposal for different error handlers
 #1198

Proof of concept; currently only as parameter to the internal dump_escaped function; that is, not yet exposed to the dump function.
---
 include/nlohmann/detail/output/serializer.hpp | 67 +++++++++++++++++--
 single_include/nlohmann/json.hpp              | 67 +++++++++++++++++--
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index bb74a86e..7adf0c2f 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -39,6 +39,14 @@ class serializer
     static constexpr uint8_t UTF8_REJECT = 1;
 
   public:
+    /// how to treat decoding errors
+    enum class error_handler_t
+    {
+        strict,  ///< throw a type_error exception in case of invalid UTF-8
+        replace, ///< replace invalid UTF-8 sequences with U+FFFD
+        ignore   ///< ignore invalid UTF-8 sequences
+    };
+
     /*!
     @param[in] s  output stream to serialize to
     @param[in] ichar  indentation character to use
@@ -278,10 +286,12 @@ class serializer
     @param[in] s  the string to escape
     @param[in] ensure_ascii  whether to escape non-ASCII characters with
                              \uXXXX sequences
+    @param[in] error_handler how to react on decoding errors
 
     @complexity Linear in the length of string @a s.
     */
-    void dump_escaped(const string_t& s, const bool ensure_ascii)
+    void dump_escaped(const string_t& s, const bool ensure_ascii,
+                      const error_handler_t error_handler = error_handler_t::strict)
     {
         uint32_t codepoint;
         uint8_t state = UTF8_ACCEPT;
@@ -389,9 +399,33 @@ class serializer
 
                 case UTF8_REJECT:  // decode found invalid UTF-8 byte
                 {
-                    std::string sn(3, '\0');
-                    snprintf(&sn[0], sn.size(), "%.2X", byte);
-                    JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                    switch (error_handler)
+                    {
+                        case error_handler_t::strict:
+                        {
+                            std::string sn(3, '\0');
+                            snprintf(&sn[0], sn.size(), "%.2X", byte);
+                            JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                        }
+
+                        case error_handler_t::ignore:
+                        {
+                            state = UTF8_ACCEPT;
+                            continue;
+                        }
+
+                        case error_handler_t::replace:
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'u';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'd';
+                            state = UTF8_ACCEPT;
+                            continue;
+                        }
+                    }
                 }
 
                 default:  // decode found yet incomplete multi-byte code point
@@ -417,9 +451,28 @@ class serializer
         else
         {
             // we finish reading, but do not accept: string was incomplete
-            std::string sn(3, '\0');
-            snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
-            JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+            switch (error_handler)
+            {
+                case error_handler_t::strict:
+                {
+                    std::string sn(3, '\0');
+                    snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
+                    JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+                }
+
+                case error_handler_t::ignore:
+                {
+                    break;
+                }
+
+                case error_handler_t::replace:
+                {
+                    // write buffer, but replace last byte
+                    o->write_characters(string_buffer.data(), bytes - 1);
+                    o->write_characters("\\ufffd", 6);
+                    break;
+                }
+            }
         }
     }
 
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index dc206d30..c4681d7b 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -9991,6 +9991,14 @@ class serializer
     static constexpr uint8_t UTF8_REJECT = 1;
 
   public:
+    /// how to treat decoding errors
+    enum class error_handler_t
+    {
+        strict,  ///< throw a type_error exception in case of invalid UTF-8
+        replace, ///< replace invalid UTF-8 sequences with U+FFFD
+        ignore   ///< ignore invalid UTF-8 sequences
+    };
+
     /*!
     @param[in] s  output stream to serialize to
     @param[in] ichar  indentation character to use
@@ -10230,10 +10238,12 @@ class serializer
     @param[in] s  the string to escape
     @param[in] ensure_ascii  whether to escape non-ASCII characters with
                              \uXXXX sequences
+    @param[in] error_handler how to react on decoding errors
 
     @complexity Linear in the length of string @a s.
     */
-    void dump_escaped(const string_t& s, const bool ensure_ascii)
+    void dump_escaped(const string_t& s, const bool ensure_ascii,
+                      const error_handler_t error_handler = error_handler_t::strict)
     {
         uint32_t codepoint;
         uint8_t state = UTF8_ACCEPT;
@@ -10341,9 +10351,33 @@ class serializer
 
                 case UTF8_REJECT:  // decode found invalid UTF-8 byte
                 {
-                    std::string sn(3, '\0');
-                    snprintf(&sn[0], sn.size(), "%.2X", byte);
-                    JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                    switch (error_handler)
+                    {
+                        case error_handler_t::strict:
+                        {
+                            std::string sn(3, '\0');
+                            snprintf(&sn[0], sn.size(), "%.2X", byte);
+                            JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
+                        }
+
+                        case error_handler_t::ignore:
+                        {
+                            state = UTF8_ACCEPT;
+                            continue;
+                        }
+
+                        case error_handler_t::replace:
+                        {
+                            string_buffer[bytes++] = '\\';
+                            string_buffer[bytes++] = 'u';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'f';
+                            string_buffer[bytes++] = 'd';
+                            state = UTF8_ACCEPT;
+                            continue;
+                        }
+                    }
                 }
 
                 default:  // decode found yet incomplete multi-byte code point
@@ -10369,9 +10403,28 @@ class serializer
         else
         {
             // we finish reading, but do not accept: string was incomplete
-            std::string sn(3, '\0');
-            snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
-            JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+            switch (error_handler)
+            {
+                case error_handler_t::strict:
+                {
+                    std::string sn(3, '\0');
+                    snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
+                    JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
+                }
+
+                case error_handler_t::ignore:
+                {
+                    break;
+                }
+
+                case error_handler_t::replace:
+                {
+                    // write buffer, but replace last byte
+                    o->write_characters(string_buffer.data(), bytes - 1);
+                    o->write_characters("\\ufffd", 6);
+                    break;
+                }
+            }
         }
     }
 

From c5821d91e51f9f2b233689e3f7d3f274cc8d27fc Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Sun, 21 Oct 2018 11:49:37 +0200
Subject: [PATCH 2/8] :construction: overworked error handlers #1198

---
 include/nlohmann/detail/output/serializer.hpp |  88 ++++++++++-----
 include/nlohmann/json.hpp                     |  17 ++-
 single_include/nlohmann/json.hpp              | 105 ++++++++++++------
 test/src/unit-serialization.cpp               |  37 ++++++
 4 files changed, 181 insertions(+), 66 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index 7adf0c2f..3ccca482 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -28,6 +28,14 @@ namespace detail
 // serialization //
 ///////////////////
 
+/// how to treat decoding errors
+enum class error_handler_t
+{
+    strict,  ///< throw a type_error exception in case of invalid UTF-8
+    replace, ///< replace invalid UTF-8 sequences with U+FFFD
+    ignore   ///< ignore invalid UTF-8 sequences
+};
+
 template<typename BasicJsonType>
 class serializer
 {
@@ -39,23 +47,20 @@ class serializer
     static constexpr uint8_t UTF8_REJECT = 1;
 
   public:
-    /// how to treat decoding errors
-    enum class error_handler_t
-    {
-        strict,  ///< throw a type_error exception in case of invalid UTF-8
-        replace, ///< replace invalid UTF-8 sequences with U+FFFD
-        ignore   ///< ignore invalid UTF-8 sequences
-    };
-
     /*!
     @param[in] s  output stream to serialize to
     @param[in] ichar  indentation character to use
+    @param[in] error_handler_  how to react on decoding errors
     */
-    serializer(output_adapter_t<char> s, const char ichar)
-        : o(std::move(s)), loc(std::localeconv()),
-          thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
-          decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
-          indent_char(ichar), indent_string(512, indent_char)
+    serializer(output_adapter_t<char> s, const char ichar,
+               error_handler_t error_handler_ = error_handler_t::strict)
+        : o(std::move(s))
+        , loc(std::localeconv())
+        , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
+        , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
+        , indent_char(ichar)
+        , indent_string(512, indent_char)
+        , error_handler(error_handler_)
     {}
 
     // delete because of pointer members
@@ -286,17 +291,18 @@ class serializer
     @param[in] s  the string to escape
     @param[in] ensure_ascii  whether to escape non-ASCII characters with
                              \uXXXX sequences
-    @param[in] error_handler how to react on decoding errors
 
     @complexity Linear in the length of string @a s.
     */
-    void dump_escaped(const string_t& s, const bool ensure_ascii,
-                      const error_handler_t error_handler = error_handler_t::strict)
+    void dump_escaped(const string_t& s, const bool ensure_ascii)
     {
         uint32_t codepoint;
         uint8_t state = UTF8_ACCEPT;
         std::size_t bytes = 0;  // number of bytes written to string_buffer
 
+        // number of bytes written at the point of the last valid byte
+        std::size_t bytes_after_last_accept = 0;
+
         for (std::size_t i = 0; i < s.size(); ++i)
         {
             const auto byte = static_cast<uint8_t>(s[i]);
@@ -394,6 +400,9 @@ class serializer
                         o->write_characters(string_buffer.data(), bytes);
                         bytes = 0;
                     }
+
+                    // remember the byte position of this accept
+                    bytes_after_last_accept = bytes;
                     break;
                 }
 
@@ -409,19 +418,33 @@ class serializer
                         }
 
                         case error_handler_t::ignore:
-                        {
-                            state = UTF8_ACCEPT;
-                            continue;
-                        }
-
                         case error_handler_t::replace:
                         {
-                            string_buffer[bytes++] = '\\';
-                            string_buffer[bytes++] = 'u';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'd';
+                            // in case we saw this character the first time, we
+                            // would like to read it again, because the byte
+                            // may be OK for itself, but just not OK for the
+                            // previous sequence
+                            if (bytes_after_last_accept != bytes)
+                            {
+                                --i;
+                            }
+
+                            // reset length buffer to the last accepted index;
+                            // thus removing/ignoring the invalid characters
+                            bytes = bytes_after_last_accept;
+
+                            if (error_handler == error_handler_t::replace)
+                            {
+                                // add a replacement character
+                                string_buffer[bytes++] = '\\';
+                                string_buffer[bytes++] = 'u';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'd';
+                            }
+
+                            // continue processing the string
                             state = UTF8_ACCEPT;
                             continue;
                         }
@@ -440,6 +463,7 @@ class serializer
             }
         }
 
+        // we finished processing the string
         if (JSON_LIKELY(state == UTF8_ACCEPT))
         {
             // write buffer
@@ -462,13 +486,16 @@ class serializer
 
                 case error_handler_t::ignore:
                 {
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                     break;
                 }
 
                 case error_handler_t::replace:
                 {
-                    // write buffer, but replace last byte
-                    o->write_characters(string_buffer.data(), bytes - 1);
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
+                    // add a replacement character
                     o->write_characters("\\ufffd", 6);
                     break;
                 }
@@ -682,6 +709,9 @@ class serializer
     const char indent_char;
     /// the indentation string
     string_t indent_string;
+
+    /// error_handler how to react on decoding errors
+    const error_handler_t error_handler;
 };
 }  // namespace detail
 }  // namespace nlohmann
diff --git a/include/nlohmann/json.hpp b/include/nlohmann/json.hpp
index 619fd9e6..a5f7d47d 100644
--- a/include/nlohmann/json.hpp
+++ b/include/nlohmann/json.hpp
@@ -208,6 +208,8 @@ class basic_json
     using json_pointer = ::nlohmann::json_pointer<basic_json>;
     template<typename T, typename SFINAE>
     using json_serializer = JSONSerializer<T, SFINAE>;
+    /// how to treat decoding errors
+    using error_handler_t = detail::error_handler_t;
     /// helper type for initializer lists of basic_json values
     using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
 
@@ -1932,6 +1934,10 @@ class basic_json
     @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
     in the output are escaped with `\uXXXX` sequences, and the result consists
     of ASCII characters only.
+    @param[in] error_handler  how to react on decoding errors; there are three
+    possible values: `strict` (throws an exception in case a decoding error
+    occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
+    and `ignore` (ignore invalid UTF-8 sequences during serialization).
 
     @return string containing the serialization of the JSON value
 
@@ -1950,13 +1956,16 @@ class basic_json
     @see https://docs.python.org/2/library/json.html#json.dump
 
     @since version 1.0.0; indentation character @a indent_char, option
-           @a ensure_ascii and exceptions added in version 3.0.0
+           @a ensure_ascii and exceptions added in version 3.0.0; error
+           handlers added in version 3.4.0.
     */
-    string_t dump(const int indent = -1, const char indent_char = ' ',
-                  const bool ensure_ascii = false) const
+    string_t dump(const int indent = -1,
+                  const char indent_char = ' ',
+                  const bool ensure_ascii = false,
+                  const error_handler_t error_handler = error_handler_t::strict) const
     {
         string_t result;
-        serializer s(detail::output_adapter<char, string_t>(result), indent_char);
+        serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);
 
         if (indent >= 0)
         {
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index c4681d7b..b3a4487e 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -9980,6 +9980,14 @@ namespace detail
 // serialization //
 ///////////////////
 
+/// how to treat decoding errors
+enum class error_handler_t
+{
+    strict,  ///< throw a type_error exception in case of invalid UTF-8
+    replace, ///< replace invalid UTF-8 sequences with U+FFFD
+    ignore   ///< ignore invalid UTF-8 sequences
+};
+
 template<typename BasicJsonType>
 class serializer
 {
@@ -9991,23 +9999,20 @@ class serializer
     static constexpr uint8_t UTF8_REJECT = 1;
 
   public:
-    /// how to treat decoding errors
-    enum class error_handler_t
-    {
-        strict,  ///< throw a type_error exception in case of invalid UTF-8
-        replace, ///< replace invalid UTF-8 sequences with U+FFFD
-        ignore   ///< ignore invalid UTF-8 sequences
-    };
-
     /*!
     @param[in] s  output stream to serialize to
     @param[in] ichar  indentation character to use
+    @param[in] error_handler_  how to react on decoding errors
     */
-    serializer(output_adapter_t<char> s, const char ichar)
-        : o(std::move(s)), loc(std::localeconv()),
-          thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
-          decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
-          indent_char(ichar), indent_string(512, indent_char)
+    serializer(output_adapter_t<char> s, const char ichar,
+               error_handler_t error_handler_ = error_handler_t::strict)
+        : o(std::move(s))
+        , loc(std::localeconv())
+        , thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
+        , decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
+        , indent_char(ichar)
+        , indent_string(512, indent_char)
+        , error_handler(error_handler_)
     {}
 
     // delete because of pointer members
@@ -10238,17 +10243,18 @@ class serializer
     @param[in] s  the string to escape
     @param[in] ensure_ascii  whether to escape non-ASCII characters with
                              \uXXXX sequences
-    @param[in] error_handler how to react on decoding errors
 
     @complexity Linear in the length of string @a s.
     */
-    void dump_escaped(const string_t& s, const bool ensure_ascii,
-                      const error_handler_t error_handler = error_handler_t::strict)
+    void dump_escaped(const string_t& s, const bool ensure_ascii)
     {
         uint32_t codepoint;
         uint8_t state = UTF8_ACCEPT;
         std::size_t bytes = 0;  // number of bytes written to string_buffer
 
+        // number of bytes written at the point of the last valid byte
+        std::size_t bytes_after_last_accept = 0;
+
         for (std::size_t i = 0; i < s.size(); ++i)
         {
             const auto byte = static_cast<uint8_t>(s[i]);
@@ -10346,6 +10352,9 @@ class serializer
                         o->write_characters(string_buffer.data(), bytes);
                         bytes = 0;
                     }
+
+                    // remember the byte position of this accept
+                    bytes_after_last_accept = bytes;
                     break;
                 }
 
@@ -10361,19 +10370,33 @@ class serializer
                         }
 
                         case error_handler_t::ignore:
-                        {
-                            state = UTF8_ACCEPT;
-                            continue;
-                        }
-
                         case error_handler_t::replace:
                         {
-                            string_buffer[bytes++] = '\\';
-                            string_buffer[bytes++] = 'u';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'f';
-                            string_buffer[bytes++] = 'd';
+                            // in case we saw this character the first time, we
+                            // would like to read it again, because the byte
+                            // may be OK for itself, but just not OK for the
+                            // previous sequence
+                            if (bytes_after_last_accept != bytes)
+                            {
+                                --i;
+                            }
+
+                            // reset length buffer to the last accepted index;
+                            // thus removing/ignoring the invalid characters
+                            bytes = bytes_after_last_accept;
+
+                            if (error_handler == error_handler_t::replace)
+                            {
+                                // add a replacement character
+                                string_buffer[bytes++] = '\\';
+                                string_buffer[bytes++] = 'u';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'f';
+                                string_buffer[bytes++] = 'd';
+                            }
+
+                            // continue processing the string
                             state = UTF8_ACCEPT;
                             continue;
                         }
@@ -10392,6 +10415,7 @@ class serializer
             }
         }
 
+        // we finished processing the string
         if (JSON_LIKELY(state == UTF8_ACCEPT))
         {
             // write buffer
@@ -10414,13 +10438,16 @@ class serializer
 
                 case error_handler_t::ignore:
                 {
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
                     break;
                 }
 
                 case error_handler_t::replace:
                 {
-                    // write buffer, but replace last byte
-                    o->write_characters(string_buffer.data(), bytes - 1);
+                    // write all accepted bytes
+                    o->write_characters(string_buffer.data(), bytes_after_last_accept);
+                    // add a replacement character
                     o->write_characters("\\ufffd", 6);
                     break;
                 }
@@ -10634,6 +10661,9 @@ class serializer
     const char indent_char;
     /// the indentation string
     string_t indent_string;
+
+    /// error_handler how to react on decoding errors
+    const error_handler_t error_handler;
 };
 }  // namespace detail
 }  // namespace nlohmann
@@ -11603,6 +11633,8 @@ class basic_json
     using json_pointer = ::nlohmann::json_pointer<basic_json>;
     template<typename T, typename SFINAE>
     using json_serializer = JSONSerializer<T, SFINAE>;
+    /// how to treat decoding errors
+    using error_handler_t = detail::error_handler_t;
     /// helper type for initializer lists of basic_json values
     using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
 
@@ -13327,6 +13359,10 @@ class basic_json
     @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
     in the output are escaped with `\uXXXX` sequences, and the result consists
     of ASCII characters only.
+    @param[in] error_handler  how to react on decoding errors; there are three
+    possible values: `strict` (throws an exception in case a decoding error
+    occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
+    and `ignore` (ignore invalid UTF-8 sequences during serialization).
 
     @return string containing the serialization of the JSON value
 
@@ -13345,13 +13381,16 @@ class basic_json
     @see https://docs.python.org/2/library/json.html#json.dump
 
     @since version 1.0.0; indentation character @a indent_char, option
-           @a ensure_ascii and exceptions added in version 3.0.0
+           @a ensure_ascii and exceptions added in version 3.0.0; error
+           handlers added in version 3.4.0.
     */
-    string_t dump(const int indent = -1, const char indent_char = ' ',
-                  const bool ensure_ascii = false) const
+    string_t dump(const int indent = -1,
+                  const char indent_char = ' ',
+                  const bool ensure_ascii = false,
+                  const error_handler_t error_handler = error_handler_t::strict) const
     {
         string_t result;
-        serializer s(detail::output_adapter<char, string_t>(result), indent_char);
+        serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);
 
         if (indent >= 0)
         {
diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp
index 0eed7246..e4069f73 100644
--- a/test/src/unit-serialization.cpp
+++ b/test/src/unit-serialization.cpp
@@ -94,4 +94,41 @@ TEST_CASE("serialization")
                   "[\n\t\"foo\",\n\t1,\n\t2,\n\t3,\n\tfalse,\n\t{\n\t\t\"one\": 1\n\t}\n]");
         }
     }
+
+    SECTION("dump")
+    {
+        SECTION("invalid character")
+        {
+            json j = "ä\xA9ü";
+
+            CHECK_THROWS_AS(j.dump(), json::type_error&);
+            CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9");
+            CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
+            CHECK_THROWS_WITH(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\\ufffdü\"");
+        }
+
+        SECTION("ending with incomplete character")
+        {
+            json j = "123\xC2";
+
+            CHECK_THROWS_AS(j.dump(), json::type_error&);
+            CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2");
+            CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd\"");
+        }
+
+        SECTION("unexpected character")
+        {
+            json j = "123\xF1\xB0\x34\x35\x36";
+
+            CHECK_THROWS_AS(j.dump(), json::type_error&);
+            CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34");
+            CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd456\"");
+        }
+    }
 }

From e5dce641159874b9485d4c8a310b82e34b7aca18 Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Sun, 21 Oct 2018 23:26:25 +0200
Subject: [PATCH 3/8] :green_heart: added tests #1198

Test every prefix of Unicode sequences against the different dump functions.
---
 include/nlohmann/detail/output/serializer.hpp |   1 +
 single_include/nlohmann/json.hpp              |   1 +
 test/src/unit-unicode.cpp                     | 120 ++++++++++++++++++
 3 files changed, 122 insertions(+)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index 3ccca482..c022c307 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -442,6 +442,7 @@ class serializer
                                 string_buffer[bytes++] = 'f';
                                 string_buffer[bytes++] = 'f';
                                 string_buffer[bytes++] = 'd';
+                                bytes_after_last_accept = bytes;
                             }
 
                             // continue processing the string
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index b3a4487e..4e86fafc 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -10394,6 +10394,7 @@ class serializer
                                 string_buffer[bytes++] = 'f';
                                 string_buffer[bytes++] = 'f';
                                 string_buffer[bytes++] = 'd';
+                                bytes_after_last_accept = bytes;
                             }
 
                             // continue processing the string
diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp
index 4def1a3e..c71f6f1d 100644
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@@ -39,6 +39,79 @@ using nlohmann::json;
 extern size_t calls;
 size_t calls = 0;
 
+void check_utf8dump(bool success_expected, int byte1, int byte2, int byte3, int byte4);
+
+void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3 = -1, int byte4 = -1)
+{
+    std::string json_string;
+
+    CAPTURE(byte1);
+    CAPTURE(byte2);
+    CAPTURE(byte3);
+    CAPTURE(byte4);
+
+    json_string += std::string(1, static_cast<char>(byte1));
+
+    if (byte2 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte2));
+    }
+
+    if (byte3 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte3));
+    }
+
+    if (byte4 != -1)
+    {
+        json_string += std::string(1, static_cast<char>(byte4));
+    }
+
+    CAPTURE(json_string);
+
+    // store the string in a JSON value
+    json j = json_string;
+    json j2 = "abc" + json_string + "xyz";
+
+    // dumping with ignore/replace must not throw in any case
+    auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
+
+    if (success_expected)
+    {
+        // strict mode must not throw if success is expected
+        auto s_strict = j.dump();
+        // all dumps should agree on the string
+        CHECK(s_strict == s_ignored);
+        CHECK(s_strict == s_replaced);
+
+        // check that ignore/replace string does not contain a replacement character
+        CHECK(s_ignored.find("\\ufffd") == std::string::npos);
+        CHECK(s_replaced.find("\\ufffd") == std::string::npos);
+    }
+    else
+    {
+        // strict mode must throw if success is not expected
+        CHECK_THROWS_AS(j.dump(), json::type_error&);
+        // ignore and replace must create different dumps
+        CHECK(s_ignored != s_replaced);
+
+        // check that ignore string does not contain a replacement character
+        CHECK(s_ignored.find("\\ufffd") == std::string::npos);
+        // check that replace string contains a replacement character
+        CHECK(s_replaced.find("\\ufffd") != std::string::npos);
+
+    }
+
+    // check that prefix and suffix are preserved
+    CHECK(s_ignored2.substr(1, 3) == "abc");
+    CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2.substr(1, 3) == "abc");
+    CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
+}
+
 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);
 
 // create and check a JSON string with up to four UTF-8 bytes
@@ -115,11 +188,13 @@ TEST_CASE("Unicode", "[hide]")
             for (int byte1 = 0x80; byte1 <= 0xC1; ++byte1)
             {
                 check_utf8string(false, byte1);
+                check_utf8dump(false, byte1);
             }
 
             for (int byte1 = 0xF5; byte1 <= 0xFF; ++byte1)
             {
                 check_utf8string(false, byte1);
+                check_utf8dump(false, byte1);
             }
         }
 
@@ -152,6 +227,7 @@ TEST_CASE("Unicode", "[hide]")
 
                     // all other characters are OK
                     check_utf8string(true, byte1);
+                    check_utf8dump(true, byte1);
                 }
             }
         }
@@ -165,6 +241,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(true, byte1, byte2);
+                        check_utf8dump(true, byte1, byte2);
                     }
                 }
             }
@@ -174,6 +251,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xC2; byte1 <= 0xDF; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -190,6 +268,7 @@ TEST_CASE("Unicode", "[hide]")
                         }
 
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -206,6 +285,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(true, byte1, byte2, byte3);
+                            check_utf8dump(true, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -216,6 +296,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xE0; byte1 <= 0xE0; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -226,6 +307,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0xA0; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -245,6 +327,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -265,6 +348,7 @@ TEST_CASE("Unicode", "[hide]")
                             }
 
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -282,6 +366,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(true, byte1, byte2, byte3);
+                            check_utf8dump(true, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -292,6 +377,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xE1; byte1 <= 0xEC; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -302,6 +388,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -321,6 +408,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -341,6 +429,7 @@ TEST_CASE("Unicode", "[hide]")
                             }
 
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -358,6 +447,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(true, byte1, byte2, byte3);
+                            check_utf8dump(true, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -368,6 +458,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xED; byte1 <= 0xED; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -378,6 +469,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0x9F; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -397,6 +489,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -417,6 +510,7 @@ TEST_CASE("Unicode", "[hide]")
                             }
 
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -434,6 +528,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(true, byte1, byte2, byte3);
+                            check_utf8dump(true, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -444,6 +539,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xEE; byte1 <= 0xEF; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -454,6 +550,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -473,6 +570,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -493,6 +591,7 @@ TEST_CASE("Unicode", "[hide]")
                             }
 
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -512,6 +611,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(true, byte1, byte2, byte3, byte4);
+                                check_utf8dump(true, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -523,6 +623,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xF0; byte1 <= 0xF0; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -533,6 +634,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x90; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -546,6 +648,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -568,6 +671,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -591,6 +695,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -614,6 +719,7 @@ TEST_CASE("Unicode", "[hide]")
                                 }
 
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -634,6 +740,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(true, byte1, byte2, byte3, byte4);
+                                check_utf8dump(true, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -645,6 +752,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xF1; byte1 <= 0xF3; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -655,6 +763,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0xBF; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -668,6 +777,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -690,6 +800,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -713,6 +824,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -736,6 +848,7 @@ TEST_CASE("Unicode", "[hide]")
                                 }
 
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -756,6 +869,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(true, byte1, byte2, byte3, byte4);
+                                check_utf8dump(true, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -767,6 +881,7 @@ TEST_CASE("Unicode", "[hide]")
                 for (int byte1 = 0xF4; byte1 <= 0xF4; ++byte1)
                 {
                     check_utf8string(false, byte1);
+                    check_utf8dump(false, byte1);
                 }
             }
 
@@ -777,6 +892,7 @@ TEST_CASE("Unicode", "[hide]")
                     for (int byte2 = 0x80; byte2 <= 0x8F; ++byte2)
                     {
                         check_utf8string(false, byte1, byte2);
+                        check_utf8dump(false, byte1, byte2);
                     }
                 }
             }
@@ -790,6 +906,7 @@ TEST_CASE("Unicode", "[hide]")
                         for (int byte3 = 0x80; byte3 <= 0xBF; ++byte3)
                         {
                             check_utf8string(false, byte1, byte2, byte3);
+                            check_utf8dump(false, byte1, byte2, byte3);
                         }
                     }
                 }
@@ -812,6 +929,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -835,6 +953,7 @@ TEST_CASE("Unicode", "[hide]")
                             for (int byte4 = 0x80; byte4 <= 0xBF; ++byte4)
                             {
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }
@@ -858,6 +977,7 @@ TEST_CASE("Unicode", "[hide]")
                                 }
 
                                 check_utf8string(false, byte1, byte2, byte3, byte4);
+                                check_utf8dump(false, byte1, byte2, byte3, byte4);
                             }
                         }
                     }

From c7af027cbb2edc39bb35ed5881adfac3ac45f14d Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Mon, 22 Oct 2018 09:18:16 +0200
Subject: [PATCH 4/8] :construction: respect ensure_ascii parameter #1198

---
 include/nlohmann/detail/output/serializer.hpp | 30 ++++++++++++++-----
 single_include/nlohmann/json.hpp              | 30 ++++++++++++++-----
 test/src/unit-unicode.cpp                     |  8 +----
 3 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index c022c307..d21cd8fe 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -436,12 +436,21 @@ class serializer
                             if (error_handler == error_handler_t::replace)
                             {
                                 // add a replacement character
-                                string_buffer[bytes++] = '\\';
-                                string_buffer[bytes++] = 'u';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'd';
+                                if (ensure_ascii)
+                                {
+                                    string_buffer[bytes++] = '\\';
+                                    string_buffer[bytes++] = 'u';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'd';
+                                }
+                                else
+                                {
+                                    string_buffer[bytes++] = '\xEF';
+                                    string_buffer[bytes++] = '\xBF';
+                                    string_buffer[bytes++] = '\xBD';
+                                }
                                 bytes_after_last_accept = bytes;
                             }
 
@@ -497,7 +506,14 @@ class serializer
                     // write all accepted bytes
                     o->write_characters(string_buffer.data(), bytes_after_last_accept);
                     // add a replacement character
-                    o->write_characters("\\ufffd", 6);
+                    if (ensure_ascii)
+                    {
+                        o->write_characters("\\ufffd", 6);
+                    }
+                    else
+                    {
+                        o->write_characters("\xEF\xBF\xBD", 3);
+                    }
                     break;
                 }
             }
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index 4e86fafc..f1335cd4 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -10388,12 +10388,21 @@ class serializer
                             if (error_handler == error_handler_t::replace)
                             {
                                 // add a replacement character
-                                string_buffer[bytes++] = '\\';
-                                string_buffer[bytes++] = 'u';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'f';
-                                string_buffer[bytes++] = 'd';
+                                if (ensure_ascii)
+                                {
+                                    string_buffer[bytes++] = '\\';
+                                    string_buffer[bytes++] = 'u';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'f';
+                                    string_buffer[bytes++] = 'd';
+                                }
+                                else
+                                {
+                                    string_buffer[bytes++] = '\xEF';
+                                    string_buffer[bytes++] = '\xBF';
+                                    string_buffer[bytes++] = '\xBD';
+                                }
                                 bytes_after_last_accept = bytes;
                             }
 
@@ -10449,7 +10458,14 @@ class serializer
                     // write all accepted bytes
                     o->write_characters(string_buffer.data(), bytes_after_last_accept);
                     // add a replacement character
-                    o->write_characters("\\ufffd", 6);
+                    if (ensure_ascii)
+                    {
+                        o->write_characters("\\ufffd", 6);
+                    }
+                    else
+                    {
+                        o->write_characters("\xEF\xBF\xBD", 3);
+                    }
                     break;
                 }
             }
diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp
index c71f6f1d..acc384bd 100644
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@@ -86,10 +86,6 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         // all dumps should agree on the string
         CHECK(s_strict == s_ignored);
         CHECK(s_strict == s_replaced);
-
-        // check that ignore/replace string does not contain a replacement character
-        CHECK(s_ignored.find("\\ufffd") == std::string::npos);
-        CHECK(s_replaced.find("\\ufffd") == std::string::npos);
     }
     else
     {
@@ -98,10 +94,8 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
         // ignore and replace must create different dumps
         CHECK(s_ignored != s_replaced);
 
-        // check that ignore string does not contain a replacement character
-        CHECK(s_ignored.find("\\ufffd") == std::string::npos);
         // check that replace string contains a replacement character
-        CHECK(s_replaced.find("\\ufffd") != std::string::npos);
+        CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
 
     }
 

From c51b1e6fabd676dcd84c327a99586778855b7bda Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Mon, 22 Oct 2018 15:53:36 +0200
Subject: [PATCH 5/8] :construction: fixed an issue with ensure_ascii #1198

---
 include/nlohmann/detail/output/serializer.hpp | 7 ++++++-
 single_include/nlohmann/json.hpp              | 7 ++++++-
 test/src/unit-unicode.cpp                     | 8 ++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index d21cd8fe..f1c0b051 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -302,6 +302,7 @@ class serializer
 
         // number of bytes written at the point of the last valid byte
         std::size_t bytes_after_last_accept = 0;
+        std::size_t undumped_chars = 0;
 
         for (std::size_t i = 0; i < s.size(); ++i)
         {
@@ -403,6 +404,7 @@ class serializer
 
                     // remember the byte position of this accept
                     bytes_after_last_accept = bytes;
+                    undumped_chars = 0;
                     break;
                 }
 
@@ -424,7 +426,7 @@ class serializer
                             // would like to read it again, because the byte
                             // may be OK for itself, but just not OK for the
                             // previous sequence
-                            if (bytes_after_last_accept != bytes)
+                            if (undumped_chars > 0)
                             {
                                 --i;
                             }
@@ -454,6 +456,8 @@ class serializer
                                 bytes_after_last_accept = bytes;
                             }
 
+                            undumped_chars = 0;
+
                             // continue processing the string
                             state = UTF8_ACCEPT;
                             continue;
@@ -468,6 +472,7 @@ class serializer
                         // code point will not be escaped - copy byte to buffer
                         string_buffer[bytes++] = s[i];
                     }
+                    ++undumped_chars;
                     break;
                 }
             }
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index f1335cd4..025dfeaf 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -10254,6 +10254,7 @@ class serializer
 
         // number of bytes written at the point of the last valid byte
         std::size_t bytes_after_last_accept = 0;
+        std::size_t undumped_chars = 0;
 
         for (std::size_t i = 0; i < s.size(); ++i)
         {
@@ -10355,6 +10356,7 @@ class serializer
 
                     // remember the byte position of this accept
                     bytes_after_last_accept = bytes;
+                    undumped_chars = 0;
                     break;
                 }
 
@@ -10376,7 +10378,7 @@ class serializer
                             // would like to read it again, because the byte
                             // may be OK for itself, but just not OK for the
                             // previous sequence
-                            if (bytes_after_last_accept != bytes)
+                            if (undumped_chars > 0)
                             {
                                 --i;
                             }
@@ -10406,6 +10408,8 @@ class serializer
                                 bytes_after_last_accept = bytes;
                             }
 
+                            undumped_chars = 0;
+
                             // continue processing the string
                             state = UTF8_ACCEPT;
                             continue;
@@ -10420,6 +10424,7 @@ class serializer
                         // code point will not be escaped - copy byte to buffer
                         string_buffer[bytes++] = s[i];
                     }
+                    ++undumped_chars;
                     break;
                 }
             }
diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp
index acc384bd..fe16eb0d 100644
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@@ -76,8 +76,12 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     // dumping with ignore/replace must not throw in any case
     auto s_ignored = j.dump(-1, ' ', false, json::error_handler_t::ignore);
     auto s_ignored2 = j2.dump(-1, ' ', false, json::error_handler_t::ignore);
+    auto s_ignored_ascii = j.dump(-1, ' ', true, json::error_handler_t::ignore);
+    auto s_ignored2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::ignore);
     auto s_replaced = j.dump(-1, ' ', false, json::error_handler_t::replace);
     auto s_replaced2 = j2.dump(-1, ' ', false, json::error_handler_t::replace);
+    auto s_replaced_ascii = j.dump(-1, ' ', true, json::error_handler_t::replace);
+    auto s_replaced2_ascii = j2.dump(-1, ' ', true, json::error_handler_t::replace);
 
     if (success_expected)
     {
@@ -102,8 +106,12 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
     // check that prefix and suffix are preserved
     CHECK(s_ignored2.substr(1, 3) == "abc");
     CHECK(s_ignored2.substr(s_ignored2.size() - 4, 3) == "xyz");
+    CHECK(s_ignored2_ascii.substr(1, 3) == "abc");
+    CHECK(s_ignored2_ascii.substr(s_ignored2_ascii.size() - 4, 3) == "xyz");
     CHECK(s_replaced2.substr(1, 3) == "abc");
     CHECK(s_replaced2.substr(s_replaced2.size() - 4, 3) == "xyz");
+    CHECK(s_replaced2_ascii.substr(1, 3) == "abc");
+    CHECK(s_replaced2_ascii.substr(s_replaced2_ascii.size() - 4, 3) == "xyz");
 }
 
 void check_utf8string(bool success_expected, int byte1, int byte2, int byte3, int byte4);

From 951a7a64559a746e4cc6e99c0e59a7a00bd72394 Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Mon, 22 Oct 2018 18:20:45 +0200
Subject: [PATCH 6/8] :construction: fixed test cases #1198

---
 test/src/unit-serialization.cpp | 9 ++++++---
 test/src/unit-unicode.cpp       | 1 -
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp
index e4069f73..0255edc9 100644
--- a/test/src/unit-serialization.cpp
+++ b/test/src/unit-serialization.cpp
@@ -106,7 +106,8 @@ TEST_CASE("serialization")
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK_THROWS_WITH(j.dump(1, ' ', false, json::error_handler_t::strict), "[json.exception.type_error.316] invalid UTF-8 byte at index 2: 0xA9");
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"äü\"");
-            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\\ufffdü\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"ä\xEF\xBF\xBDü\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"\\u00e4\\ufffd\\u00fc\"");
         }
 
         SECTION("ending with incomplete character")
@@ -117,7 +118,8 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] incomplete UTF-8 string; last byte: 0xC2");
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123\"");
-            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd\"");
         }
 
         SECTION("unexpected character")
@@ -128,7 +130,8 @@ TEST_CASE("serialization")
             CHECK_THROWS_WITH(j.dump(), "[json.exception.type_error.316] invalid UTF-8 byte at index 5: 0x34");
             CHECK_THROWS_AS(j.dump(1, ' ', false, json::error_handler_t::strict), json::type_error&);
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::ignore) == "\"123456\"");
-            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\\ufffd456\"");
+            CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
+            CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
         }
     }
 }
diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp
index fe16eb0d..a8a0a938 100644
--- a/test/src/unit-unicode.cpp
+++ b/test/src/unit-unicode.cpp
@@ -100,7 +100,6 @@ void check_utf8dump(bool success_expected, int byte1, int byte2 = -1, int byte3
 
         // check that replace string contains a replacement character
         CHECK(s_replaced.find("\xEF\xBF\xBD") != std::string::npos);
-
     }
 
     // check that prefix and suffix are preserved

From 2343d9caeb12cb0047762c38f22cf428eea61d7f Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Tue, 23 Oct 2018 17:22:13 +0200
Subject: [PATCH 7/8] :green_heart: additional tests from the Unicode spec
 #1198

Thanks @abolz!
---
 test/src/unit-serialization.cpp | 36 +++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/test/src/unit-serialization.cpp b/test/src/unit-serialization.cpp
index 0255edc9..1fe796e5 100644
--- a/test/src/unit-serialization.cpp
+++ b/test/src/unit-serialization.cpp
@@ -133,5 +133,41 @@ TEST_CASE("serialization")
             CHECK(j.dump(-1, ' ', false, json::error_handler_t::replace) == "\"123\xEF\xBF\xBD\x34\x35\x36\"");
             CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"123\\ufffd456\"");
         }
+
+        SECTION("U+FFFD Substitution of Maximal Subparts")
+        {
+            // Some tests (mostly) from Section 3.9 ("U+FFFD Substitution of Maximal
+            // Subparts") of https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
+
+            // test(): dump input with ensure_ascii = true and the replace handler; must equal expected in quotes
+            auto test = [&](std::string const & input, std::string const & expected)
+            {
+                json j = input;
+                CHECK(j.dump(-1, ' ', true, json::error_handler_t::replace) == "\"" + expected + "\"");
+            };
+            // C2 lacks its continuation byte: replaced by one U+FFFD, following bytes are handled on their own
+            test("\xC2", "\\ufffd");
+            test("\xC2\x41\x42", "\\ufffd" "\x41" "\x42");
+            test("\xC2\xF4", "\\ufffd" "\\ufffd");
+            // F0..F5 followed by 80 80: one U+FFFD for F1..F4, one per byte for F0 and F5
+            test("\xF0\x80\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            test("\xF1\x80\x80\x41", "\\ufffd" "\x41");
+            test("\xF2\x80\x80\x41", "\\ufffd" "\x41");
+            test("\xF3\x80\x80\x41", "\\ufffd" "\x41");
+            test("\xF4\x80\x80\x41", "\\ufffd" "\x41");
+            test("\xF5\x80\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            // F0..F5 followed by 90 80: one U+FFFD for F0..F3, one per byte for F4 and F5
+            test("\xF0\x90\x80\x41", "\\ufffd" "\x41");
+            test("\xF1\x90\x80\x41", "\\ufffd" "\x41");
+            test("\xF2\x90\x80\x41", "\\ufffd" "\x41");
+            test("\xF3\x90\x80\x41", "\\ufffd" "\x41");
+            test("\xF4\x90\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            test("\xF5\x90\x80\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            // longer inputs mixing several ill-formed sequences; 0x41 and 0x42 pass through unchanged
+            test("\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            test("\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+            test("\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41" "\\ufffd""\\ufffd" "\x42");
+            test("\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41", "\\ufffd" "\\ufffd" "\\ufffd" "\\ufffd" "\x41");
+        }
     }
 }

From 87ef3f25f2bb325e96db0e4b167d63c45f682af5 Mon Sep 17 00:00:00 2001
From: Niels Lohmann <mail@nlohmann.me>
Date: Tue, 23 Oct 2018 22:56:10 +0200
Subject: [PATCH 8/8] :pencil2: fixed a typo #1314

---
 include/nlohmann/detail/output/serializer.hpp | 2 +-
 single_include/nlohmann/json.hpp              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nlohmann/detail/output/serializer.hpp b/include/nlohmann/detail/output/serializer.hpp
index f1c0b051..1d107ce0 100644
--- a/include/nlohmann/detail/output/serializer.hpp
+++ b/include/nlohmann/detail/output/serializer.hpp
@@ -422,7 +422,7 @@ class serializer
                         case error_handler_t::ignore:
                         case error_handler_t::replace:
                         {
-                            // in case we saw this chatacter the first time, we
+                            // in case we saw this character the first time, we
                             // would like to read it again, because the byte
                             // may be OK for itself, but just not OK for the
                             // previous sequence
diff --git a/single_include/nlohmann/json.hpp b/single_include/nlohmann/json.hpp
index 025dfeaf..be3f8417 100644
--- a/single_include/nlohmann/json.hpp
+++ b/single_include/nlohmann/json.hpp
@@ -10374,7 +10374,7 @@ class serializer
                         case error_handler_t::ignore:
                         case error_handler_t::replace:
                         {
-                            // in case we saw this chatacter the first time, we
+                            // in case we saw this character the first time, we
                             // would like to read it again, because the byte
                             // may be OK for itself, but just not OK for the
                             // previous sequence