From 0ca5675d59dbcb52cedea56729de26b41074024a Mon Sep 17 00:00:00 2001 From: BenJilks Date: Tue, 6 Aug 2024 11:06:05 +0100 Subject: [PATCH] LibTextCodec: Implement `iso-2022-jp` encoder Implements the `iso-2022-jp` encoder, as specified by https://encoding.spec.whatwg.org/#iso-2022-jp-encoder --- .../LibTextCodec/GenerateEncodingIndexes.cpp | 1 + Tests/LibTextCodec/TestTextEncoders.cpp | 73 +++++--- Userland/Libraries/LibTextCodec/Encoder.cpp | 157 ++++++++++++++++-- Userland/Libraries/LibTextCodec/Encoder.h | 28 +++- Userland/Libraries/LibURL/Parser.cpp | 57 ++++--- 5 files changed, 255 insertions(+), 61 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp index 53918f50d54..cb8bbfea3ee 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp @@ -272,6 +272,7 @@ ErrorOr serenity_main(Main::Arguments arguments) { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) }, { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) }, { "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) }, + { "iso_2022_jp_katakana"sv, prepare_table(data.get("iso-2022-jp-katakana"sv)->as_array(), GenerateAccessor::Yes) }, { "iso_8859_2"sv, prepare_table(data.get("iso-8859-2"sv)->as_array()) }, { "iso_8859_3"sv, prepare_table(data.get("iso-8859-3"sv)->as_array()) }, { "iso_8859_4"sv, prepare_table(data.get("iso-8859-4"sv)->as_array()) }, diff --git a/Tests/LibTextCodec/TestTextEncoders.cpp b/Tests/LibTextCodec/TestTextEncoders.cpp index d9898d0931e..8ed0759bb68 100644 --- a/Tests/LibTextCodec/TestTextEncoders.cpp +++ b/Tests/LibTextCodec/TestTextEncoders.cpp @@ -14,9 +14,10 @@ TEST_CASE(test_utf8_encode) auto test_string = "\U0001F600"sv; Vector processed_bytes; - 
MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); EXPECT(processed_bytes.size() == 4); EXPECT(processed_bytes[0] == 0xF0); EXPECT(processed_bytes[1] == 0x9F); @@ -33,9 +34,10 @@ TEST_CASE(test_euc_jp_encoder) auto test_string = "\U000000A5\U00003088\U000030C4"sv; Vector processed_bytes; - MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); EXPECT(processed_bytes.size() == 5); EXPECT(processed_bytes[0] == 0x5C); EXPECT(processed_bytes[1] == 0xA4); @@ -44,6 +46,36 @@ TEST_CASE(test_euc_jp_encoder) EXPECT(processed_bytes[4] == 0xC4); } +TEST_CASE(test_iso_2022_jp_encoder) +{ + TextCodec::ISO2022JPEncoder encoder; + // U+A5 Yen Sign + // U+3088 Hiragana Letter Yo + // U+30C4 Katakana Letter Tu + auto test_string = "\U000000A5\U00003088\U000030C4"sv; + + Vector processed_bytes; + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); + EXPECT(processed_bytes.size() == 14); + EXPECT(processed_bytes[0] == 0x1B); + EXPECT(processed_bytes[1] == 0x28); + EXPECT(processed_bytes[2] == 0x4A); + EXPECT(processed_bytes[3] == 0x5C); + EXPECT(processed_bytes[4] == 0x1B); + EXPECT(processed_bytes[5] == 0x24); + EXPECT(processed_bytes[6] == 0x42); + EXPECT(processed_bytes[7] == 0x24); + EXPECT(processed_bytes[8] == 0x68); + EXPECT(processed_bytes[9] == 0x25); + EXPECT(processed_bytes[10] == 0x44); + EXPECT(processed_bytes[11] == 0x1B); + EXPECT(processed_bytes[12] == 0x28); + EXPECT(processed_bytes[13] == 0x42); +} + 
TEST_CASE(test_shift_jis_encoder) { TextCodec::ShiftJISEncoder encoder; @@ -53,9 +85,10 @@ TEST_CASE(test_shift_jis_encoder) auto test_string = "\U000000A5\U00003088\U000030C4"sv; Vector processed_bytes; - MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); EXPECT(processed_bytes.size() == 5); EXPECT(processed_bytes[0] == 0x5C); EXPECT(processed_bytes[1] == 0x82); @@ -72,9 +105,10 @@ TEST_CASE(test_euc_kr_encoder) auto test_string = "\U0000B29F\U00007C97"sv; Vector processed_bytes; - MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); EXPECT(processed_bytes.size() == 4); EXPECT(processed_bytes[0] == 0x88); EXPECT(processed_bytes[1] == 0x6B); @@ -90,9 +124,10 @@ TEST_CASE(test_big5_encoder) auto test_string = "\U000000A7\U000070D7"sv; Vector processed_bytes; - MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); EXPECT(processed_bytes.size() == 4); EXPECT(processed_bytes[0] == 0xA1); EXPECT(processed_bytes[1] == 0xB1); @@ -108,10 +143,10 @@ TEST_CASE(test_gb18030_encoder) auto test_string = "\U000020AC\U0000E4C5"sv; Vector processed_bytes; - MUST(encoder.process(Utf8View(test_string), [&](u8 byte) { - return processed_bytes.try_append(byte); - })); - + MUST(encoder.process( + Utf8View(test_string), + [&](u8 byte) { return processed_bytes.try_append(byte); }, + [&](u32) -> ErrorOr { EXPECT(false); return {}; })); 
EXPECT(processed_bytes.size() == 4); EXPECT(processed_bytes[0] == 0xA2); EXPECT(processed_bytes[1] == 0xE3); diff --git a/Userland/Libraries/LibTextCodec/Encoder.cpp b/Userland/Libraries/LibTextCodec/Encoder.cpp index 3448defb43d..540e3e782bd 100644 --- a/Userland/Libraries/LibTextCodec/Encoder.cpp +++ b/Userland/Libraries/LibTextCodec/Encoder.cpp @@ -19,6 +19,7 @@ GB18030Encoder s_gb18030_encoder; GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes); Big5Encoder s_big5_encoder; EUCJPEncoder s_euc_jp_encoder; +ISO2022JPEncoder s_iso_2022_jp_encoder; ShiftJISEncoder s_shift_jis_encoder; EUCKREncoder s_euc_kr_encoder; } @@ -31,6 +32,8 @@ Optional encoder_for_exact_name(StringView encoding) return s_big5_encoder; if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) return s_euc_jp_encoder; + if (encoding.equals_ignoring_ascii_case("iso-2022-jp"sv)) + return s_iso_2022_jp_encoder; if (encoding.equals_ignoring_ascii_case("shift_jis"sv)) return s_shift_jis_encoder; if (encoding.equals_ignoring_ascii_case("euc-kr"sv)) @@ -50,7 +53,7 @@ Optional encoder_for(StringView label) } // https://encoding.spec.whatwg.org/#utf-8-encoder -ErrorOr UTF8Encoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr UTF8Encoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)>) { ReadonlyBytes bytes { input.bytes(), input.byte_length() }; for (auto byte : bytes) @@ -59,7 +62,7 @@ ErrorOr UTF8Encoder::process(Utf8View input, Function(u8)> o } // https://encoding.spec.whatwg.org/#euc-jp-encoder -ErrorOr EUCJPEncoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr EUCJPEncoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) { for (auto item : input) { // 1. If code point is end-of-queue, return finished. @@ -98,7 +101,7 @@ ErrorOr EUCJPEncoder::process(Utf8View input, Function(u8)> // 8. If pointer is null, return error with code point. if (!pointer.has_value()) { - // TODO: Report error. 
+ TRY(on_error(item)); continue; } @@ -116,6 +119,136 @@ ErrorOr EUCJPEncoder::process(Utf8View input, Function(u8)> return {}; } +// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder +ErrorOr ISO2022JPEncoder::process_item(u32 item, State state, Function(u8)>& on_byte, Function(u32)>& on_error) +{ + // 3. If ISO-2022-JP encoder state is ASCII or Roman, and code point is U+000E, U+000F, or U+001B, return error with U+FFFD. + if (state == State::ASCII || state == State::Roman) { + if (item == 0x000E || item == 0x000F || item == 0x001B) { + TRY(on_error(0xFFFD)); + return state; + } + } + + // 4. If ISO-2022-JP encoder state is ASCII and code point is an ASCII code point, return a byte whose value is code point. + if (state == State::ASCII && is_ascii(item)) { + TRY(on_byte(static_cast(item))); + return state; + } + + // 5. If ISO-2022-JP encoder state is Roman and code point is an ASCII code point, excluding U+005C and U+007E, or is U+00A5 or U+203E, then: + if (state == State::Roman && ((is_ascii(item) && item != 0x005C && item != 0x007E) || (item == 0x00A5 || item == 0x203E))) { + // 1. If code point is an ASCII code point, return a byte whose value is code point. + if (is_ascii(item)) { + TRY(on_byte(static_cast(item))); + return state; + } + + // 2. If code point is U+00A5, return byte 0x5C. + if (item == 0x00A5) { + TRY(on_byte(0x5C)); + return state; + } + + // 3. If code point is U+203E, return byte 0x7E. + if (item == 0x203E) { + TRY(on_byte(0x7E)); + return state; + } + } + + // 6. If code point is an ASCII code point, and ISO-2022-JP encoder state is not ASCII, restore code point to ioQueue, set + // ISO-2022-JP encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. + if (is_ascii(item) && state != State::ASCII) { + TRY(on_byte(0x1B)); + TRY(on_byte(0x28)); + TRY(on_byte(0x42)); + return process_item(item, State::ASCII, on_byte, on_error); + } + + // 7. 
If code point is either U+00A5 or U+203E, and ISO-2022-JP encoder state is not Roman, restore code point to ioQueue, + // set ISO-2022-JP encoder state to Roman, and return three bytes 0x1B 0x28 0x4A. + if ((item == 0x00A5 || item == 0x203E) && state != State::Roman) { + TRY(on_byte(0x1B)); + TRY(on_byte(0x28)); + TRY(on_byte(0x4A)); + return process_item(item, State::Roman, on_byte, on_error); + } + + // 8. If code point is U+2212, set it to U+FF0D. + if (item == 0x2212) + item = 0xFF0D; + + // 9. If code point is in the range U+FF61 to U+FF9F, inclusive, set it to the index code point for code point − 0xFF61 + // in index ISO-2022-JP katakana. + if (item >= 0xFF61 && item <= 0xFF9F) { + item = *index_iso_2022_jp_katakana_code_point(item - 0xFF61); + } + + // 10. Let pointer be the index pointer for code point in index jis0208. + auto pointer = code_point_jis0208_index(item); + + // 11. If pointer is null, then: + if (!pointer.has_value()) { + // 1. If ISO-2022-JP encoder state is jis0208, then restore code point to ioQueue, set ISO-2022-JP encoder state to + // ASCII, and return three bytes 0x1B 0x28 0x42. NOTE: 0x42 (ESC ( B) selects ASCII; 0x4A (ESC ( J) would select Roman and desynchronize the decoder from the encoder state. + if (state == State::jis0208) { + TRY(on_byte(0x1B)); + TRY(on_byte(0x28)); + TRY(on_byte(0x42)); + return process_item(item, State::ASCII, on_byte, on_error); + } + + // 2. Return error with code point. + TRY(on_error(item)); + return state; + } + + // 12. If ISO-2022-JP encoder state is not jis0208, restore code point to ioQueue, set ISO-2022-JP encoder state to + // jis0208, and return three bytes 0x1B 0x24 0x42. + if (state != State::jis0208) { + TRY(on_byte(0x1B)); + TRY(on_byte(0x24)); + TRY(on_byte(0x42)); + return process_item(item, State::jis0208, on_byte, on_error); + } + + // 13. Let lead be pointer / 94 + 0x21. + auto lead = *pointer / 94 + 0x21; + + // 14. Let trail be pointer % 94 + 0x21. + auto trail = *pointer % 94 + 0x21; + + // 15. Return two bytes whose values are lead and trail. 
+ TRY(on_byte(static_cast(lead))); + TRY(on_byte(static_cast(trail))); + return state; +} + +// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder +ErrorOr ISO2022JPEncoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) +{ + // ISO-2022-JP’s encoder has an associated ISO-2022-JP encoder state which is ASCII, Roman, or jis0208 (initially ASCII). + auto state = State::ASCII; + + for (u32 item : input) { + state = TRY(process_item(item, state, on_byte, on_error)); + } + + // 1. If code point is end-of-queue and ISO-2022-JP encoder state is not ASCII, set ISO-2022-JP + // encoder state to ASCII, and return three bytes 0x1B 0x28 0x42. + if (state != State::ASCII) { + state = State::ASCII; + TRY(on_byte(0x1B)); + TRY(on_byte(0x28)); + TRY(on_byte(0x42)); + return {}; + } + + // 2. If code point is end-of-queue and ISO-2022-JP encoder state is ASCII, return finished. + return {}; +} + static Optional code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to) { VERIFY(skip_to >= skip_from); @@ -141,7 +274,7 @@ static Optional index_shift_jis_pointer(u32 code_point) } // https://encoding.spec.whatwg.org/#shift_jis-encoder -ErrorOr ShiftJISEncoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr ShiftJISEncoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) { for (u32 item : input) { // 1. If code point is end-of-queue, return finished. @@ -179,7 +312,7 @@ ErrorOr ShiftJISEncoder::process(Utf8View input, Function(u8 // 8. If pointer is null, return error with code point. if (!pointer.has_value()) { - // TODO: Report error. + TRY(on_error(item)); continue; } @@ -208,7 +341,7 @@ ErrorOr ShiftJISEncoder::process(Utf8View input, Function(u8 } // https://encoding.spec.whatwg.org/#euc-kr-encoder -ErrorOr EUCKREncoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr EUCKREncoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) { for (u32 item : input) { // 1. 
If code point is end-of-queue, return finished. @@ -224,7 +357,7 @@ ErrorOr EUCKREncoder::process(Utf8View input, Function(u8)> // 4. If pointer is null, return error with code point. if (!pointer.has_value()) { - // TODO: Report error. + TRY(on_error(item)); continue; } @@ -269,7 +402,7 @@ static Optional index_big5_pointer(u32 code_point) } // https://encoding.spec.whatwg.org/#big5-encoder -ErrorOr Big5Encoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr Big5Encoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) { for (u32 item : input) { // 1. If code point is end-of-queue, return finished. @@ -285,7 +418,7 @@ ErrorOr Big5Encoder::process(Utf8View input, Function(u8)> o // 4. If pointer is null, return error with code point. if (!pointer.has_value()) { - // TODO: Report error. + TRY(on_error(item)); continue; } @@ -334,7 +467,7 @@ GB18030Encoder::GB18030Encoder(IsGBK is_gbk) } // https://encoding.spec.whatwg.org/#gb18030-encoder -ErrorOr GB18030Encoder::process(Utf8View input, Function(u8)> on_byte) +ErrorOr GB18030Encoder::process(Utf8View input, Function(u8)> on_byte, Function(u32)> on_error) { bool gbk = (m_is_gbk == IsGBK::Yes); @@ -349,7 +482,7 @@ ErrorOr GB18030Encoder::process(Utf8View input, Function(u8) // 3. If code point is U+E5E5, return error with code point. if (item == 0xE5E5) { - // TODO: Report error. + TRY(on_error(item)); continue; } @@ -383,7 +516,7 @@ ErrorOr GB18030Encoder::process(Utf8View input, Function(u8) // 7. If is GBK is true, return error with code point. if (gbk) { - // TODO: Report error. 
+ TRY(on_error(item)); continue; } diff --git a/Userland/Libraries/LibTextCodec/Encoder.h b/Userland/Libraries/LibTextCodec/Encoder.h index d21828dfa47..8241fb67153 100644 --- a/Userland/Libraries/LibTextCodec/Encoder.h +++ b/Userland/Libraries/LibTextCodec/Encoder.h @@ -13,7 +13,7 @@ namespace TextCodec { class Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) = 0; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) = 0; protected: virtual ~Encoder() = default; @@ -21,27 +21,41 @@ protected: class UTF8Encoder final : public Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; }; class EUCJPEncoder final : public Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; +}; + +class ISO2022JPEncoder final : public Encoder { +public: + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; + +private: + enum class State { + ASCII, + Roman, + jis0208, + }; + + ErrorOr process_item(u32 item, State, Function(u8)>& on_byte, Function(u32)>& on_error); }; class ShiftJISEncoder final : public Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; }; class EUCKREncoder final : public Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; }; class Big5Encoder final : public Encoder { public: - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; }; class GB18030Encoder final : 
public Encoder { @@ -53,7 +67,7 @@ public: GB18030Encoder(IsGBK is_gbk = IsGBK::No); - virtual ErrorOr process(Utf8View, Function(u8)> on_byte) override; + virtual ErrorOr process(Utf8View, Function(u8)> on_byte, Function(u32)> on_error) override; private: IsGBK m_is_gbk { IsGBK::No }; diff --git a/Userland/Libraries/LibURL/Parser.cpp b/Userland/Libraries/LibURL/Parser.cpp index f752410f958..af1c710acb1 100644 --- a/Userland/Libraries/LibURL/Parser.cpp +++ b/Userland/Libraries/LibURL/Parser.cpp @@ -775,31 +775,42 @@ ErrorOr Parser::percent_encode_after_encoding(TextCodec::Encoder& encode // 1. Let encodeOutput be an empty I/O queue. StringBuilder output; - // 3. For each byte of encodeOutput converted to a byte sequence: - TRY(encoder.process(Utf8View(input), [&](u8 byte) -> ErrorOr { - // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue. - if (space_as_plus && byte == ' ') { - output.append('+'); + // 2. Set potentialError to the result of running encode or fail with inputQueue, encoder, and encodeOutput. + TRY(encoder.process( + Utf8View(input), + + // 3. For each byte of encodeOutput converted to a byte sequence: + [&](u8 byte) -> ErrorOr { + // 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue. + if (space_as_plus && byte == ' ') { + output.append('+'); + return {}; + } + + // 2. Let isomorph be a code point whose value is byte’s value. + u32 isomorph = byte; + + // 3. Assert: percentEncodeSet includes all non-ASCII code points. + + // 4. If isomorph is not in percentEncodeSet, then append isomorph to output. + if (!code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) { + output.append_code_point(isomorph); + } + + // 5. Otherwise, percent-encode byte and append the result to output. + else { + output.appendff("%{:02X}", byte); + } + return {}; - } + }, - // 2. Let isomorph be a code point whose value is byte’s value. - u32 isomorph = byte; - - // 3. 
Assert: percentEncodeSet includes all non-ASCII code points. - - // 4. If isomorphic is not in percentEncodeSet, then append isomorph to output. - if (!code_point_is_in_percent_encode_set(isomorph, percent_encode_set)) { - output.append_code_point(isomorph); - } - - // 5. Otherwise, percent-encode byte and append the result to output. - else { - output.appendff("%{:02X}", byte); - } - - return {}; - })); + // 4. If potentialError is non-null, then append "%26%23", followed by the shortest sequence of ASCII digits + // representing potentialError in base ten, followed by "%3B", to output. + [&](u32 error) -> ErrorOr { + output.appendff("%26%23{}%3B", error); + return {}; + })); // 6. Return output. return output.to_string();