LibTextCodec: Implement shift_jis encoder

Implements the `shift_jis` encoder, as specified by
https://encoding.spec.whatwg.org/#shift_jis-encoder
This commit is contained in:
BenJilks 2024-08-05 22:38:07 +01:00 committed by Tim Ledbetter
parent c1958437f9
commit 08a8d67a5b
Notes: github-actions[bot] 2024-08-08 16:51:03 +00:00
3 changed files with 119 additions and 0 deletions

View file

@ -44,6 +44,26 @@ TEST_CASE(test_euc_jp_encoder)
EXPECT(processed_bytes[4] == 0xC4); EXPECT(processed_bytes[4] == 0xC4);
} }
TEST_CASE(test_shift_jis_encoder)
{
    // Code points fed to the encoder and the bytes each one is expected to produce:
    //   U+A5   Yen Sign            -> 0x5C
    //   U+3088 Hiragana Letter Yo  -> 0x82 0xE6
    //   U+30C4 Katakana Letter Tu  -> 0x83 0x63
    auto input = "\U000000A5\U00003088\U000030C4"sv;

    Vector<u8> encoded;
    TextCodec::ShiftJISEncoder shift_jis_encoder;

    // Collect every byte the encoder emits, in order.
    MUST(shift_jis_encoder.process(Utf8View(input), [&](u8 byte) {
        return encoded.try_append(byte);
    }));

    // One single-byte sequence followed by two double-byte sequences.
    EXPECT(encoded.size() == 5);

    EXPECT(encoded[0] == 0x5C);

    EXPECT(encoded[1] == 0x82);
    EXPECT(encoded[2] == 0xE6);

    EXPECT(encoded[3] == 0x83);
    EXPECT(encoded[4] == 0x63);
}
TEST_CASE(test_euc_kr_encoder) TEST_CASE(test_euc_kr_encoder)
{ {
TextCodec::EUCKREncoder encoder; TextCodec::EUCKREncoder encoder;

View file

@ -19,6 +19,7 @@ GB18030Encoder s_gb18030_encoder;
GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes); GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
Big5Encoder s_big5_encoder; Big5Encoder s_big5_encoder;
EUCJPEncoder s_euc_jp_encoder; EUCJPEncoder s_euc_jp_encoder;
ShiftJISEncoder s_shift_jis_encoder;
EUCKREncoder s_euc_kr_encoder; EUCKREncoder s_euc_kr_encoder;
} }
@ -30,6 +31,8 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
return s_big5_encoder; return s_big5_encoder;
if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
return s_euc_jp_encoder; return s_euc_jp_encoder;
if (encoding.equals_ignoring_ascii_case("shift_jis"sv))
return s_shift_jis_encoder;
if (encoding.equals_ignoring_ascii_case("euc-kr"sv)) if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
return s_euc_kr_encoder; return s_euc_kr_encoder;
if (encoding.equals_ignoring_ascii_case("gb18030"sv)) if (encoding.equals_ignoring_ascii_case("gb18030"sv))
@ -113,6 +116,97 @@ ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)>
return {}; return {};
} }
// Finds the first pointer in index jis0208 whose entry equals `code_point`, ignoring every
// pointer in the inclusive range [skip_from, skip_to]. Returns an empty Optional when no
// entry outside that range matches.
static Optional<u32> code_point_jis0208_index_skipping_range(u32 code_point, u32 skip_from, u32 skip_to)
{
    VERIFY(skip_from <= skip_to);

    for (u32 pointer = 0; pointer < s_jis0208_index.size(); ++pointer) {
        bool const is_excluded = pointer >= skip_from && pointer <= skip_to;
        if (!is_excluded && s_jis0208_index[pointer] == code_point)
            return pointer;
    }

    return {};
}
// https://encoding.spec.whatwg.org/#index-shift_jis-pointer
static Optional<u32> index_shift_jis_pointer(u32 code_point)
{
    // 1. Let index be index jis0208 excluding all entries whose pointer is in the range 8272 to 8835, inclusive.
    constexpr u32 first_excluded_pointer = 8272;
    constexpr u32 last_excluded_pointer = 8835;

    // 2. Return the index pointer for code point in index.
    // NOTE: The lookup below performs both steps at once; an empty Optional is returned when there is no match.
    return code_point_jis0208_index_skipping_range(code_point, first_excluded_pointer, last_excluded_pointer);
}
// https://encoding.spec.whatwg.org/#shift_jis-encoder
// Encodes every code point of `input` and hands the resulting bytes to `on_byte` one at a time.
// Any error returned by `on_byte` is propagated to the caller.
ErrorOr<void> ShiftJISEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{
    for (u32 code_point : input) {
        // 1. If code point is end-of-queue, return finished.
        // NOTE: Running out of input simply ends the loop.

        // 2. If code point is an ASCII code point or U+0080, return a byte whose value is code point.
        if (is_ascii(code_point) || code_point == 0x0080) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. If code point is U+00A5, return byte 0x5C.
        if (code_point == 0x00A5) {
            TRY(on_byte(0x5C));
            continue;
        }

        // 4. If code point is U+203E, return byte 0x7E.
        if (code_point == 0x203E) {
            TRY(on_byte(0x7E));
            continue;
        }

        // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return a byte whose value is code point − 0xFF61 + 0xA1.
        if (code_point >= 0xFF61 && code_point <= 0xFF9F) {
            TRY(on_byte(static_cast<u8>(code_point - 0xFF61 + 0xA1)));
            continue;
        }

        // 6. If code point is U+2212, set it to U+FF0D.
        if (code_point == 0x2212)
            code_point = 0xFF0D;

        // 7. Let pointer be the index Shift_JIS pointer for code point.
        auto maybe_pointer = index_shift_jis_pointer(code_point);

        // 8. If pointer is null, return error with code point.
        if (!maybe_pointer.has_value()) {
            // TODO: Report error.
            continue;
        }
        u32 const pointer = *maybe_pointer;

        // 9. Let lead be pointer / 188.
        u32 const lead = pointer / 188;

        // 10. Let lead offset be 0x81 if lead is less than 0x1F, otherwise 0xC1.
        u32 const lead_offset = lead < 0x1F ? 0x81 : 0xC1;

        // 11. Let trail be pointer % 188.
        u32 const trail = pointer % 188;

        // 12. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
        u32 const trail_offset = trail < 0x3F ? 0x40 : 0x41;

        // 13. Return two bytes whose values are lead + lead offset and trail + offset.
        TRY(on_byte(static_cast<u8>(lead + lead_offset)));
        TRY(on_byte(static_cast<u8>(trail + trail_offset)));
    }

    return {};
}
// https://encoding.spec.whatwg.org/#euc-kr-encoder // https://encoding.spec.whatwg.org/#euc-kr-encoder
ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte) ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{ {

View file

@ -29,6 +29,11 @@ public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override; virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
}; };
// Encoder for the `shift_jis` encoding.
// https://encoding.spec.whatwg.org/#shift_jis-encoder
class ShiftJISEncoder final : public Encoder {
public:
// Encodes the given UTF-8 input and passes each resulting byte to `on_byte`.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
class EUCKREncoder final : public Encoder { class EUCKREncoder final : public Encoder {
public: public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override; virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;