From a8136c578a8bfe700f3b6ee942291d053f582716 Mon Sep 17 00:00:00 2001 From: Niels Date: Sat, 29 Oct 2016 21:08:30 +0200 Subject: [PATCH] fixed BOM handling #344 --- src/json.hpp | 521 ++++++++++++++++++-------------------- src/json.hpp.re2c | 12 +- test/src/unit-unicode.cpp | 4 +- 3 files changed, 252 insertions(+), 285 deletions(-) diff --git a/src/json.hpp b/src/json.hpp index 9c142443..13a821ec 100644 --- a/src/json.hpp +++ b/src/json.hpp @@ -7620,6 +7620,14 @@ class basic_json { // fill buffer fill_line_buffer(); + + // skip UTF-8 byte-order mark + if (m_line_buffer.size() >= 3 and m_line_buffer.substr(0, 3) == "\xEF\xBB\xBF") + { + m_line_buffer[0] = ' '; + m_line_buffer[1] = ' '; + m_line_buffer[2] = ' '; + } } // switch off unwanted functions (due to pointer members) @@ -7826,7 +7834,7 @@ class basic_json 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - if ((m_limit - m_cursor) < 6) + if ((m_limit - m_cursor) < 5) { fill_line_buffer(); } @@ -7835,7 +7843,7 @@ class basic_json { goto basic_json_parser_6; } - if (yych <= '\\') + if (yych <= '[') { if (yych <= '-') { @@ -7884,63 +7892,59 @@ class basic_json { goto basic_json_parser_17; } - if (yych == '[') + if (yych <= 'Z') { - goto basic_json_parser_19; + goto basic_json_parser_4; } - goto basic_json_parser_4; + goto basic_json_parser_19; } } } else { - if (yych <= 't') + if (yych <= 'n') { - if (yych <= 'f') + if (yych <= 'e') { - if (yych <= ']') + if (yych == ']') { goto basic_json_parser_21; } - if (yych <= 'e') - { - goto basic_json_parser_4; - } - goto basic_json_parser_23; - } - else - { - if (yych == 'n') - { - goto basic_json_parser_24; - } - if (yych <= 's') - { - goto basic_json_parser_4; - } - goto basic_json_parser_25; - } - } - else - { - if (yych <= '|') - { - if (yych == '{') - { - goto basic_json_parser_26; - } goto basic_json_parser_4; } else { - if (yych <= '}') + if (yych <= 'f') + { + goto basic_json_parser_23; + } + if (yych <= 'm') + { + goto basic_json_parser_4; + } + goto basic_json_parser_24; + } + } + else + { + if (yych <= 'z') + { + if (yych == 't') + { + goto basic_json_parser_25; + } + goto basic_json_parser_4; + } + else + { + if (yych <= '{') + { + goto basic_json_parser_26; + } + if (yych == '}') { goto basic_json_parser_28; } - if (yych == 0xC3) - { - goto basic_json_parser_30; - } goto basic_json_parser_4; } } @@ -7981,7 +7985,7 @@ basic_json_parser_9: } if (yych <= 0x7F) { - goto basic_json_parser_32; + goto basic_json_parser_31; } if (yych <= 0xC1) { @@ -7989,7 +7993,7 @@ basic_json_parser_9: } if (yych <= 0xF4) { - goto basic_json_parser_32; + goto basic_json_parser_31; } goto basic_json_parser_5; basic_json_parser_10: @@ -8020,18 +8024,18 @@ basic_json_parser_13: { if (yych == '.') { - goto basic_json_parser_44; + goto basic_json_parser_43; } } else { if (yych <= 'E') { - goto basic_json_parser_45; + goto basic_json_parser_44; } if (yych == 'e') { - goto basic_json_parser_45; + goto basic_json_parser_44; } } basic_json_parser_14: @@ -8055,7 +8059,7 @@ basic_json_parser_15: { if (yych == '.') { - goto basic_json_parser_44; + goto basic_json_parser_43; } goto basic_json_parser_14; } @@ -8063,11 +8067,11 @@ basic_json_parser_15: { if (yych <= 'E') { - goto basic_json_parser_45; + goto basic_json_parser_44; } if (yych == 'e') { - goto basic_json_parser_45; + goto basic_json_parser_44; } goto basic_json_parser_14; } @@ -8094,7 +8098,7 @@ basic_json_parser_23: yych = *(m_marker = ++m_cursor); if (yych == 'a') { - goto basic_json_parser_46; + goto basic_json_parser_45; } goto basic_json_parser_5; basic_json_parser_24: @@ -8102,7 +8106,7 @@ basic_json_parser_24: yych = *(m_marker = ++m_cursor); if (yych == 'u') { - goto basic_json_parser_47; + goto basic_json_parser_46; } goto basic_json_parser_5; basic_json_parser_25: @@ -8110,7 +8114,7 @@ basic_json_parser_25: yych = *(m_marker = ++m_cursor); if (yych == 'r') { - goto basic_json_parser_48; + goto basic_json_parser_47; } goto basic_json_parser_5; basic_json_parser_26: @@ -8126,24 +8130,16 @@ basic_json_parser_28: break; } basic_json_parser_30: - yyaccept = 0; - yych = *(m_marker = ++m_cursor); - if (yych == 0xAF) - { - goto basic_json_parser_49; - } - goto basic_json_parser_5; -basic_json_parser_31: ++m_cursor; if (m_limit <= m_cursor) { fill_line_buffer(); } yych = *m_cursor; -basic_json_parser_32: +basic_json_parser_31: if (yybm[0 + yych] & 128) { - goto basic_json_parser_31; + goto basic_json_parser_30; } if (yych <= 0xE0) { @@ -8151,25 +8147,25 @@ basic_json_parser_32: { if (yych <= 0x1F) { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '"') { - goto basic_json_parser_34; + goto basic_json_parser_33; } - goto basic_json_parser_36; + goto basic_json_parser_35; } else { if (yych <= 0xC1) { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0xDF) { - goto basic_json_parser_37; + goto basic_json_parser_36; } - goto basic_json_parser_38; + goto basic_json_parser_37; } } else @@ -8178,27 +8174,27 @@ basic_json_parser_32: { if (yych == 0xED) { - goto basic_json_parser_40; + goto basic_json_parser_39; } - goto basic_json_parser_39; + goto basic_json_parser_38; } else { if (yych <= 0xF0) { - goto basic_json_parser_41; + goto basic_json_parser_40; } if (yych <= 0xF3) { - goto basic_json_parser_42; + goto basic_json_parser_41; } if (yych <= 0xF4) { - goto basic_json_parser_43; + goto basic_json_parser_42; } } } -basic_json_parser_33: +basic_json_parser_32: m_cursor = m_marker; if (yyaccept == 0) { @@ -8208,13 +8204,13 @@ basic_json_parser_33: { goto basic_json_parser_14; } -basic_json_parser_34: +basic_json_parser_33: ++m_cursor; { last_token_type = token_type::value_string; break; } -basic_json_parser_36: +basic_json_parser_35: ++m_cursor; if (m_limit <= m_cursor) { @@ -8227,13 +8223,13 @@ basic_json_parser_36: { if (yych == '"') { - goto basic_json_parser_31; + goto basic_json_parser_30; } if (yych <= '.') { - goto basic_json_parser_33; + goto basic_json_parser_32; } - goto basic_json_parser_31; + goto basic_json_parser_30; } else { @@ -8241,17 +8237,17 @@ basic_json_parser_36: { if (yych <= '[') { - goto basic_json_parser_33; + goto basic_json_parser_32; } - goto basic_json_parser_31; + goto basic_json_parser_30; } else { if (yych == 'b') { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; + goto basic_json_parser_32; } } } @@ -8261,13 +8257,13 @@ basic_json_parser_36: { if (yych <= 'f') { - goto basic_json_parser_31; + goto basic_json_parser_30; } if (yych == 'n') { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { @@ -8275,25 +8271,25 @@ basic_json_parser_36: { if (yych <= 'r') { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { if (yych <= 't') { - goto basic_json_parser_31; + goto basic_json_parser_30; } if (yych <= 'u') { - goto basic_json_parser_50; + goto basic_json_parser_48; } - goto basic_json_parser_33; + goto basic_json_parser_32; } } } -basic_json_parser_37: +basic_json_parser_36: ++m_cursor; if (m_limit <= m_cursor) { @@ -8302,14 +8298,14 @@ basic_json_parser_37: yych = *m_cursor; if (yych <= 0x7F) { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0xBF) { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; -basic_json_parser_38: + goto basic_json_parser_32; +basic_json_parser_37: ++m_cursor; if (m_limit <= m_cursor) { @@ -8318,13 +8314,29 @@ basic_json_parser_38: yych = *m_cursor; if (yych <= 0x9F) { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0xBF) { - goto basic_json_parser_37; + goto basic_json_parser_36; } - goto basic_json_parser_33; + goto basic_json_parser_32; +basic_json_parser_38: + ++m_cursor; + if (m_limit <= m_cursor) + { + fill_line_buffer(); + } + yych = *m_cursor; + if (yych <= 0x7F) + { + goto basic_json_parser_32; + } + if (yych <= 0xBF) + { + goto basic_json_parser_36; + } + goto basic_json_parser_32; basic_json_parser_39: ++m_cursor; if (m_limit <= m_cursor) @@ -8334,30 +8346,14 @@ basic_json_parser_39: yych = *m_cursor; if (yych <= 0x7F) { - goto basic_json_parser_33; - } - if (yych <= 0xBF) - { - goto basic_json_parser_37; - } - goto basic_json_parser_33; -basic_json_parser_40: - ++m_cursor; - if (m_limit <= m_cursor) - { - fill_line_buffer(); - } - yych = *m_cursor; - if (yych <= 0x7F) - { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0x9F) { - goto basic_json_parser_37; + goto basic_json_parser_36; } - goto basic_json_parser_33; -basic_json_parser_41: + goto basic_json_parser_32; +basic_json_parser_40: ++m_cursor; if (m_limit <= m_cursor) { @@ -8366,13 +8362,29 @@ basic_json_parser_41: yych = *m_cursor; if (yych <= 0x8F) { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0xBF) { - goto basic_json_parser_39; + goto basic_json_parser_38; } - goto basic_json_parser_33; + goto basic_json_parser_32; +basic_json_parser_41: + ++m_cursor; + if (m_limit <= m_cursor) + { + fill_line_buffer(); + } + yych = *m_cursor; + if (yych <= 0x7F) + { + goto basic_json_parser_32; + } + if (yych <= 0xBF) + { + goto basic_json_parser_38; + } + goto basic_json_parser_32; basic_json_parser_42: ++m_cursor; if (m_limit <= m_cursor) @@ -8382,95 +8394,72 @@ basic_json_parser_42: yych = *m_cursor; if (yych <= 0x7F) { - goto basic_json_parser_33; - } - if (yych <= 0xBF) - { - goto basic_json_parser_39; - } - goto basic_json_parser_33; -basic_json_parser_43: - ++m_cursor; - if (m_limit <= m_cursor) - { - fill_line_buffer(); - } - yych = *m_cursor; - if (yych <= 0x7F) - { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 0x8F) { - goto basic_json_parser_39; + goto basic_json_parser_38; } - goto basic_json_parser_33; -basic_json_parser_44: + goto basic_json_parser_32; +basic_json_parser_43: yych = *++m_cursor; if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '9') { - goto basic_json_parser_51; + goto basic_json_parser_49; } - goto basic_json_parser_33; -basic_json_parser_45: + goto basic_json_parser_32; +basic_json_parser_44: yych = *++m_cursor; if (yych <= ',') { if (yych == '+') { - goto basic_json_parser_53; + goto basic_json_parser_51; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { if (yych <= '-') { - goto basic_json_parser_53; + goto basic_json_parser_51; } if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '9') { - goto basic_json_parser_54; + goto basic_json_parser_52; } - goto basic_json_parser_33; + goto basic_json_parser_32; } +basic_json_parser_45: + yych = *++m_cursor; + if (yych == 'l') + { + goto basic_json_parser_54; + } + goto basic_json_parser_32; basic_json_parser_46: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_56; + goto basic_json_parser_55; } - goto basic_json_parser_33; + goto basic_json_parser_32; basic_json_parser_47: - yych = *++m_cursor; - if (yych == 'l') - { - goto basic_json_parser_57; - } - goto basic_json_parser_33; -basic_json_parser_48: yych = *++m_cursor; if (yych == 'u') { - goto basic_json_parser_58; + goto basic_json_parser_56; } - goto basic_json_parser_33; -basic_json_parser_49: - yych = *++m_cursor; - if (yych == 0xC2) - { - goto basic_json_parser_59; - } - goto basic_json_parser_33; -basic_json_parser_50: + goto basic_json_parser_32; +basic_json_parser_48: ++m_cursor; if (m_limit <= m_cursor) { @@ -8481,31 +8470,31 @@ basic_json_parser_50: { if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '9') { - goto basic_json_parser_60; + goto basic_json_parser_57; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { if (yych <= 'F') { - goto basic_json_parser_60; + goto basic_json_parser_57; } if (yych <= '`') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 'f') { - goto basic_json_parser_60; + goto basic_json_parser_57; } - goto basic_json_parser_33; + goto basic_json_parser_32; } -basic_json_parser_51: +basic_json_parser_49: yyaccept = 1; m_marker = ++m_cursor; if ((m_limit - m_cursor) < 3) @@ -8521,7 +8510,7 @@ basic_json_parser_51: } if (yych <= '9') { - goto basic_json_parser_51; + goto basic_json_parser_49; } goto basic_json_parser_14; } @@ -8529,25 +8518,25 @@ basic_json_parser_51: { if (yych <= 'E') { - goto basic_json_parser_45; + goto basic_json_parser_44; } if (yych == 'e') { - goto basic_json_parser_45; + goto basic_json_parser_44; } goto basic_json_parser_14; } -basic_json_parser_53: +basic_json_parser_51: yych = *++m_cursor; if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych >= ':') { - goto basic_json_parser_33; + goto basic_json_parser_32; } -basic_json_parser_54: +basic_json_parser_52: ++m_cursor; if (m_limit <= m_cursor) { @@ -8560,99 +8549,85 @@ basic_json_parser_54: } if (yych <= '9') { - goto basic_json_parser_54; + goto basic_json_parser_52; } goto basic_json_parser_14; -basic_json_parser_56: +basic_json_parser_54: yych = *++m_cursor; if (yych == 's') { - goto basic_json_parser_61; + goto basic_json_parser_58; } - goto basic_json_parser_33; -basic_json_parser_57: + goto basic_json_parser_32; +basic_json_parser_55: yych = *++m_cursor; if (yych == 'l') { - goto basic_json_parser_62; + goto basic_json_parser_59; + } + goto basic_json_parser_32; +basic_json_parser_56: + yych = *++m_cursor; + if (yych == 'e') + { + goto basic_json_parser_61; + } + goto basic_json_parser_32; +basic_json_parser_57: + ++m_cursor; + if (m_limit <= m_cursor) + { + fill_line_buffer(); + } + yych = *m_cursor; + if (yych <= '@') + { + if (yych <= '/') + { + goto basic_json_parser_32; + } + if (yych <= '9') + { + goto basic_json_parser_63; + } + goto basic_json_parser_32; + } + else + { + if (yych <= 'F') + { + goto basic_json_parser_63; + } + if (yych <= '`') + { + goto basic_json_parser_32; + } + if (yych <= 'f') + { + goto basic_json_parser_63; + } + goto basic_json_parser_32; } - goto basic_json_parser_33; basic_json_parser_58: yych = *++m_cursor; if (yych == 'e') { goto basic_json_parser_64; } - goto basic_json_parser_33; + goto basic_json_parser_32; basic_json_parser_59: - yych = *++m_cursor; - if (yych == 0xBB) - { - goto basic_json_parser_66; - } - goto basic_json_parser_33; -basic_json_parser_60: - ++m_cursor; - if (m_limit <= m_cursor) - { - fill_line_buffer(); - } - yych = *m_cursor; - if (yych <= '@') - { - if (yych <= '/') - { - goto basic_json_parser_33; - } - if (yych <= '9') - { - goto basic_json_parser_67; - } - goto basic_json_parser_33; - } - else - { - if (yych <= 'F') - { - goto basic_json_parser_67; - } - if (yych <= '`') - { - goto basic_json_parser_33; - } - if (yych <= 'f') - { - goto basic_json_parser_67; - } - goto basic_json_parser_33; - } -basic_json_parser_61: - yych = *++m_cursor; - if (yych == 'e') - { - goto basic_json_parser_68; - } - goto basic_json_parser_33; -basic_json_parser_62: ++m_cursor; { last_token_type = token_type::literal_null; break; } -basic_json_parser_64: +basic_json_parser_61: ++m_cursor; { last_token_type = token_type::literal_true; break; } -basic_json_parser_66: - yych = *++m_cursor; - if (yych == 0xC2) - { - goto basic_json_parser_70; - } - goto basic_json_parser_33; -basic_json_parser_67: +basic_json_parser_63: ++m_cursor; if (m_limit <= m_cursor) { @@ -8663,44 +8638,37 @@ basic_json_parser_67: { if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '9') { - goto basic_json_parser_71; + goto basic_json_parser_66; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { if (yych <= 'F') { - goto basic_json_parser_71; + goto basic_json_parser_66; } if (yych <= '`') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 'f') { - goto basic_json_parser_71; + goto basic_json_parser_66; } - goto basic_json_parser_33; + goto basic_json_parser_32; } -basic_json_parser_68: +basic_json_parser_64: ++m_cursor; { last_token_type = token_type::literal_false; break; } -basic_json_parser_70: - yych = *++m_cursor; - if (yych == 0xBF) - { - goto basic_json_parser_72; - } - goto basic_json_parser_33; -basic_json_parser_71: +basic_json_parser_66: ++m_cursor; if (m_limit <= m_cursor) { @@ -8711,34 +8679,29 @@ basic_json_parser_71: { if (yych <= '/') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= '9') { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; + goto basic_json_parser_32; } else { if (yych <= 'F') { - goto basic_json_parser_31; + goto basic_json_parser_30; } if (yych <= '`') { - goto basic_json_parser_33; + goto basic_json_parser_32; } if (yych <= 'f') { - goto basic_json_parser_31; + goto basic_json_parser_30; } - goto basic_json_parser_33; - } -basic_json_parser_72: - ++m_cursor; - { - continue; + goto basic_json_parser_32; } } diff --git a/src/json.hpp.re2c b/src/json.hpp.re2c index 470ade21..f6884cb4 100644 --- a/src/json.hpp.re2c +++ b/src/json.hpp.re2c @@ -7620,6 +7620,14 @@ class basic_json { // fill buffer fill_line_buffer(); + + // skip UTF-8 byte-order mark + if (m_line_buffer.size() >= 3 and m_line_buffer.substr(0, 3) == "\xEF\xBB\xBF") + { + m_line_buffer[0] = ' '; + m_line_buffer[1] = ' '; + m_line_buffer[2] = ' '; + } } // switch off unwanted functions (due to pointer members) @@ -7802,10 +7810,6 @@ class basic_json ws = [ \t\n\r]+; ws { continue; } - // ignore byte-order-mark - bom = "\xEF\xBB\xBF"; - bom { continue; } - // structural characters "[" { last_token_type = token_type::begin_array; break; } "]" { last_token_type = token_type::end_array; break; } diff --git a/test/src/unit-unicode.cpp b/test/src/unit-unicode.cpp index aa1b7de6..5ab8f524 100644 --- a/test/src/unit-unicode.cpp +++ b/test/src/unit-unicode.cpp @@ -159,7 +159,7 @@ TEST_CASE("Unicode", "[hide]") } } } -/* + SECTION("ignore byte-order-mark") { // read a file with a UTF-8 BOM @@ -167,7 +167,7 @@ TEST_CASE("Unicode", "[hide]") json j; CHECK_NOTHROW(j << f); } -*/ + SECTION("error for incomplete/wrong BOM") { CHECK_THROWS_AS(json::parse("\xef\xbb"), std::invalid_argument);