From 8b2499b11211556cc5998aa4e08501cf580e4c93 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Mon, 21 Feb 2022 21:54:21 +0100 Subject: [PATCH] LibWeb: Make document.write() work while document is parsing This necessitated making HTMLParser ref-counted, and having it register itself with Document when created. That makes it possible for scripts to add new input at the current parser insertion point. There is now a reference cycle between Document and HTMLParser. This cycle is explicitly broken by calling Document::detach_parser() at the end of HTMLParser::run(). This is a huge progression on ACID3, from 31% to 49%! :^) --- Tests/LibWeb/test-web.cpp | 6 +-- Userland/Libraries/LibWeb/DOM/Document.cpp | 12 ++++- Userland/Libraries/LibWeb/DOM/Document.h | 5 +- Userland/Libraries/LibWeb/HTML/DOMParser.cpp | 4 +- .../LibWeb/HTML/Parser/HTMLParser.cpp | 53 ++++++++++++------- .../Libraries/LibWeb/HTML/Parser/HTMLParser.h | 11 ++-- .../Libraries/LibWeb/Loader/FrameLoader.cpp | 14 ++--- 7 files changed, 67 insertions(+), 38 deletions(-) diff --git a/Tests/LibWeb/test-web.cpp b/Tests/LibWeb/test-web.cpp index 14819e41c1b..a7d9913d3d2 100644 --- a/Tests/LibWeb/test-web.cpp +++ b/Tests/LibWeb/test-web.cpp @@ -150,10 +150,10 @@ JS_DEFINE_NATIVE_FUNCTION(TestWebGlobalObject::wait_for_page_to_load) loader.load_sync( request, [&](auto data, auto&, auto) { - Web::HTML::HTMLParser parser(document, data, "utf-8"); + auto parser = Web::HTML::HTMLParser::create(document, data, "utf-8"); // Now parse the HTML page. - parser.run(next_page_to_load.value()); - g_page_view->set_document(&parser.document()); + parser->run(next_page_to_load.value()); + g_page_view->set_document(&parser->document()); // Note: Unhandled exceptions are just dropped here. // Run the "after" hooks diff --git a/Userland/Libraries/LibWeb/DOM/Document.cpp b/Userland/Libraries/LibWeb/DOM/Document.cpp index 19c36132f53..999a8fcd28b 100644 --- a/Userland/Libraries/LibWeb/DOM/Document.cpp +++ b/Userland/Libraries/LibWeb/DOM/Document.cpp @@ -256,7 +256,7 @@ ExceptionOr Document::open(String const&, String const&) set_quirks_mode(QuirksMode::No); // 16. Create a new HTML parser and associate it with document. This is a script-created parser (meaning that it can be closed by the document.open() and document.close() methods, and that the tokenizer will wait for an explicit call to document.close() before emitting an end-of-file token). The encoding confidence is irrelevant. - m_parser = make(*this); + m_parser = HTML::HTMLParser::create_for_scripting(*this); // 17. Set the insertion point to point at just before the end of the input stream (which at this point will be empty). m_parser->tokenizer().update_insertion_point(); @@ -1324,4 +1324,14 @@ bool Document::has_focus() const return true; } +void Document::set_parser(Badge, HTML::HTMLParser& parser) +{ + m_parser = parser; +} + +void Document::detach_parser(Badge) +{ + m_parser = nullptr; +} + } diff --git a/Userland/Libraries/LibWeb/DOM/Document.h b/Userland/Libraries/LibWeb/DOM/Document.h index 650b14b36e3..c856387c742 100644 --- a/Userland/Libraries/LibWeb/DOM/Document.h +++ b/Userland/Libraries/LibWeb/DOM/Document.h @@ -313,6 +313,9 @@ public: bool has_focus() const; + void set_parser(Badge, HTML::HTMLParser&); + void detach_parser(Badge); + private: explicit Document(const AK::URL&); @@ -358,7 +361,7 @@ private: RefPtr m_style_update_timer; RefPtr m_layout_update_timer; - OwnPtr m_parser; + RefPtr m_parser; bool m_active_parser_was_aborted { false }; String m_source; diff --git a/Userland/Libraries/LibWeb/HTML/DOMParser.cpp b/Userland/Libraries/LibWeb/HTML/DOMParser.cpp index de8ca9c07be..ae2b255700c 100644 --- a/Userland/Libraries/LibWeb/HTML/DOMParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/DOMParser.cpp @@ -34,11 +34,11 @@ NonnullRefPtr DOMParser::parse_from_string(String const& string, // 2. Create an HTML parser parser, associated with document. // 3. Place string into the input stream for parser. The encoding confidence is irrelevant. // FIXME: We don't have the concept of encoding confidence yet. - HTMLParser parser(document, string, "UTF-8"); + auto parser = HTMLParser::create(document, string, "UTF-8"); // 4. Start parser and let it run until it has consumed all the characters just inserted into the input stream. // FIXME: This is to match the default URL. Instead, pass in this's relevant global object's associated Document's URL. - parser.run("about:blank"); + parser->run("about:blank"); } else { // -> Otherwise // FIXME: 1. Create an XML parser parse, associated with document, and with XML scripting support disabled. diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index 89e9d34231c..4c2e4dda3fc 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -121,8 +121,8 @@ static bool is_html_integration_point(DOM::Element const& element) RefPtr parse_html_document(StringView data, const AK::URL& url, const String& encoding) { auto document = DOM::Document::create(url); - HTMLParser parser(document, data, encoding); - parser.run(url); + auto parser = HTMLParser::create(document, data, encoding); + parser->run(url); return document; } @@ -131,6 +131,7 @@ HTMLParser::HTMLParser(DOM::Document& document, StringView input, const String& , m_document(document) { m_tokenizer.set_parser({}, *this); + m_document->set_parser({}, *this); m_document->set_should_invalidate_styles_on_attribute_changes(false); auto standardized_encoding = TextCodec::get_standardized_encoding(encoding); VERIFY(standardized_encoding.has_value()); @@ -140,6 +141,7 @@ HTMLParser::HTMLParser(DOM::Document& document, StringView input, const String& HTMLParser::HTMLParser(DOM::Document& document) : m_document(document) { + m_document->set_parser({}, *this); m_tokenizer.set_parser({}, *this); } @@ -201,6 +203,7 @@ void HTMLParser::run(const AK::URL& url) m_document->set_source(m_tokenizer.source()); run(); the_end(); + m_document->detach_parser({}); } // https://html.spec.whatwg.org/multipage/parsing.html#the-end @@ -3180,44 +3183,44 @@ DOM::Document& HTMLParser::document() NonnullRefPtrVector HTMLParser::parse_html_fragment(DOM::Element& context_element, StringView markup) { auto temp_document = DOM::Document::create(); - HTMLParser parser(*temp_document, markup, "utf-8"); - parser.m_context_element = context_element; - parser.m_parsing_fragment = true; - parser.document().set_quirks_mode(context_element.document().mode()); + auto parser = HTMLParser::create(*temp_document, markup, "utf-8"); + parser->m_context_element = context_element; + parser->m_parsing_fragment = true; + parser->document().set_quirks_mode(context_element.document().mode()); if (context_element.local_name().is_one_of(HTML::TagNames::title, HTML::TagNames::textarea)) { - parser.m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA); + parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA); } else if (context_element.local_name().is_one_of(HTML::TagNames::style, HTML::TagNames::xmp, HTML::TagNames::iframe, HTML::TagNames::noembed, HTML::TagNames::noframes)) { - parser.m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); + parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); } else if (context_element.local_name().is_one_of(HTML::TagNames::script)) { - parser.m_tokenizer.switch_to({}, HTMLTokenizer::State::ScriptData); + parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::ScriptData); } else if (context_element.local_name().is_one_of(HTML::TagNames::noscript)) { if (context_element.document().is_scripting_enabled()) - parser.m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); + parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::RAWTEXT); } else if (context_element.local_name().is_one_of(HTML::TagNames::plaintext)) { - parser.m_tokenizer.switch_to({}, HTMLTokenizer::State::PLAINTEXT); + parser->m_tokenizer.switch_to({}, HTMLTokenizer::State::PLAINTEXT); } auto root = create_element(context_element.document(), HTML::TagNames::html, Namespace::HTML); - parser.document().append_child(root); - parser.m_stack_of_open_elements.push(root); + parser->document().append_child(root); + parser->m_stack_of_open_elements.push(root); if (context_element.local_name() == HTML::TagNames::template_) { - parser.m_stack_of_template_insertion_modes.append(InsertionMode::InTemplate); + parser->m_stack_of_template_insertion_modes.append(InsertionMode::InTemplate); } // FIXME: Create a start tag token whose name is the local name of context and whose attributes are the attributes of context. - parser.reset_the_insertion_mode_appropriately(); + parser->reset_the_insertion_mode_appropriately(); for (auto* form_candidate = &context_element; form_candidate; form_candidate = form_candidate->parent_element()) { if (is(*form_candidate)) { - parser.m_form_element = verify_cast(*form_candidate); + parser->m_form_element = verify_cast(*form_candidate); break; } } - parser.run(context_element.document().url()); + parser->run(context_element.document().url()); NonnullRefPtrVector children; while (RefPtr child = root->first_child()) { @@ -3228,13 +3231,23 @@ NonnullRefPtrVector HTMLParser::parse_html_fragment(DOM::Element& con return children; } -NonnullOwnPtr HTMLParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input) +NonnullRefPtr HTMLParser::create_for_scripting(DOM::Document& document) +{ + return adopt_ref(*new HTMLParser(document)); +} + +NonnullRefPtr HTMLParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input) { if (document.has_encoding()) - return make(document, input, document.encoding().value()); + return adopt_ref(*new HTMLParser(document, input, document.encoding().value())); auto encoding = run_encoding_sniffing_algorithm(document, input); dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding); - return make(document, input, encoding); + return adopt_ref(*new HTMLParser(document, input, encoding)); +} + +NonnullRefPtr HTMLParser::create(DOM::Document& document, StringView input, String const& encoding) +{ + return adopt_ref(*new HTMLParser(document, input, encoding)); } // https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-serialisation-algorithm diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.h index cbfa767632f..34ffcdead6a 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.h @@ -41,15 +41,15 @@ namespace Web::HTML { RefPtr parse_html_document(StringView, const AK::URL&, const String& encoding); -class HTMLParser { +class HTMLParser : public RefCounted { friend class HTMLTokenizer; public: - HTMLParser(DOM::Document&, StringView input, const String& encoding); - HTMLParser(DOM::Document&); ~HTMLParser(); - static NonnullOwnPtr create_with_uncertain_encoding(DOM::Document&, const ByteBuffer& input); + static NonnullRefPtr create_for_scripting(DOM::Document&); + static NonnullRefPtr create_with_uncertain_encoding(DOM::Document&, ByteBuffer const& input); + static NonnullRefPtr create(DOM::Document&, StringView input, String const& encoding); void run(); void run(const AK::URL&); @@ -76,6 +76,9 @@ public: size_t script_nesting_level() const { return m_script_nesting_level; } private: + HTMLParser(DOM::Document&, StringView input, const String& encoding); + HTMLParser(DOM::Document&); + const char* insertion_mode_name() const; DOM::QuirksMode which_quirks_mode(const HTMLToken&) const; diff --git a/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp b/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp index 97529dda05c..ded5417615d 100644 --- a/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp +++ b/Userland/Libraries/LibWeb/Loader/FrameLoader.cpp @@ -46,8 +46,8 @@ static bool build_markdown_document(DOM::Document& document, const ByteBuffer& d if (!markdown_document) return false; - HTML::HTMLParser parser(document, markdown_document->render_to_html(), "utf-8"); - parser.run(document.url()); + auto parser = HTML::HTMLParser::create(document, markdown_document->render_to_html(), "utf-8"); + parser->run(document.url()); return true; } @@ -116,8 +116,8 @@ static bool build_gemini_document(DOM::Document& document, const ByteBuffer& dat dbgln_if(GEMINI_DEBUG, "Gemini data:\n\"\"\"{}\"\"\"", gemini_data); dbgln_if(GEMINI_DEBUG, "Converted to HTML:\n\"\"\"{}\"\"\"", html_data); - HTML::HTMLParser parser(document, html_data, "utf-8"); - parser.run(document.url()); + auto parser = HTML::HTMLParser::create(document, html_data, "utf-8"); + parser->run(document.url()); return true; } @@ -226,9 +226,9 @@ bool FrameLoader::load(const AK::URL& url, Type type) void FrameLoader::load_html(StringView html, const AK::URL& url) { auto document = DOM::Document::create(url); - HTML::HTMLParser parser(document, html, "utf-8"); - parser.run(url); - browsing_context().set_active_document(&parser.document()); + auto parser = HTML::HTMLParser::create(document, html, "utf-8"); + parser->run(url); + browsing_context().set_active_document(&parser->document()); } // FIXME: Use an actual templating engine (our own one when it's built, preferably