LibWeb: Escape HTML text fragments with multi-byte code point awareness

The UTF-8 encoding of U+00A0 (NBSP) is the bytes 0xc2 0xa0. By looping
over the string to escape byte-by-byte, we replace the second byte with
"&nbsp;", but leave the first byte in the resulting text. This creates
an invalid UTF-8 string, with a lone leading byte.
This commit is contained in:
Timothy Flynn 2023-03-12 22:05:03 -04:00 committed by Linus Groh
parent 3219ecbdc0
commit f5f1a5228e
Notes: sideshowbarker 2024-07-17 21:26:19 +09:00

View file

@ -3586,23 +3586,23 @@ DeprecatedString HTMLParser::serialize_html_fragment(DOM::Node const& node)
auto escape_string = [](StringView string, AttributeMode attribute_mode) -> DeprecatedString {
// https://html.spec.whatwg.org/multipage/parsing.html#escapingString
StringBuilder builder;
for (auto& ch : string) {
for (auto code_point : Utf8View { string }) {
// 1. Replace any occurrence of the "&" character by the string "&amp;".
if (ch == '&')
if (code_point == '&')
builder.append("&amp;"sv);
// 2. Replace any occurrences of the U+00A0 NO-BREAK SPACE character by the string "&nbsp;".
else if (ch == '\xA0')
else if (code_point == 0xA0)
builder.append("&nbsp;"sv);
// 3. If the algorithm was invoked in the attribute mode, replace any occurrences of the """ character by the string "&quot;".
else if (ch == '"' && attribute_mode == AttributeMode::Yes)
else if (code_point == '"' && attribute_mode == AttributeMode::Yes)
builder.append("&quot;"sv);
// 4. If the algorithm was not invoked in the attribute mode, replace any occurrences of the "<" character by the string "&lt;", and any occurrences of the ">" character by the string "&gt;".
else if (ch == '<' && attribute_mode == AttributeMode::No)
else if (code_point == '<' && attribute_mode == AttributeMode::No)
builder.append("&lt;"sv);
else if (ch == '>' && attribute_mode == AttributeMode::No)
else if (code_point == '>' && attribute_mode == AttributeMode::No)
builder.append("&gt;"sv);
else
builder.append(ch);
builder.append_code_point(code_point);
}
return builder.to_deprecated_string();
};