srctree

Robin Linden parent 95d9cb7a 12bc83b6
html2: Implement more of the InBody parsing

html/parser.cpp added: 123, removed: 7, total 116
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
@@ -118,6 +118,10 @@ void Parser::on_token(html2::Tokenizer &, html2::Token &&token) {
start != nullptr && start->tag_name == "body") {
return;
}
 
if (std::holds_alternative<html2::CharacterToken>(token)) {
return;
}
}
 
if (kHandledByOldParser(insertion_mode_)) {
@@ -257,6 +261,13 @@ void Parser::generate_text_node_if_needed() {
return;
}
 
if (!open_elements_.back()->children.empty()) {
if (auto *t = std::get_if<dom::Text>(&open_elements_.back()->children.back()); t != nullptr) {
t->text += text;
return;
}
}
 
open_elements_.back()->children.emplace_back(dom::Text{std::move(text)});
}
 
 
html/parser_actions.h added: 123, removed: 7, total 116
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2023-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
@@ -120,6 +120,10 @@ public:
open_elements_.erase(it);
}
 
// Hook that InBody::process calls right before it inserts a character token.
// Currently a no-op placeholder; the body contains nothing but the TODO below.
void reconstruct_active_formatting_elements() override {
// TODO(robinlinden): Implement.
}
 
private:
void insert(dom::Element element) {
if (element.name == "html") {
 
html2/iparser_actions.h added: 123, removed: 7, total 116
@@ -40,6 +40,7 @@ public:
virtual void set_frameset_ok(bool) = 0;
virtual void push_head_as_current_open_element() = 0;
virtual void remove_from_open_elements(std::string_view element_name) = 0;
// Invoked by InBody::process immediately before a character token is inserted.
virtual void reconstruct_active_formatting_elements() = 0;
 
virtual InsertionMode current_insertion_mode() const = 0;
};
 
html2/parser_states.cpp added: 123, removed: 7, total 116
@@ -51,6 +51,7 @@ public:
// Pure pass-through: hands element_name to the wrapped IActions unchanged.
void remove_from_open_elements(std::string_view element_name) override {
wrapped_.remove_from_open_elements(element_name);
}
// Pure pass-through to the wrapped IActions.
void reconstruct_active_formatting_elements() override { wrapped_.reconstruct_active_formatting_elements(); }
 
private:
IActions &wrapped_;
@@ -500,6 +501,35 @@ std::optional<InsertionMode> AfterHead::process(IActions &a, html2::Token const
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
// Incomplete.
std::optional<InsertionMode> InBody::process(IActions &a, html2::Token const &token) {
auto const *character = std::get_if<html2::CharacterToken>(&token);
if (character != nullptr && character->data == '\0') {
// Parse error.
return {};
}
 
if (is_boring_whitespace(token)) {
a.reconstruct_active_formatting_elements();
a.insert_character(std::get<html2::CharacterToken>(token));
return {};
}
 
if (character != nullptr) {
a.reconstruct_active_formatting_elements();
a.insert_character(*character);
a.set_frameset_ok(false);
return {};
}
 
if (std::holds_alternative<html2::CommentToken>(token)) {
// TODO(robinlinden): Insert.
return {};
}
 
if (std::holds_alternative<html2::DoctypeToken>(token)) {
// Parse error.
return {};
}
 
if (auto const *start = std::get_if<html2::StartTagToken>(&token); start != nullptr && start->tag_name == "html") {
// Parse error.
// TODO(robinlinden): If there is a template element on the stack of open elements, then ignore the token.
@@ -507,6 +537,29 @@ std::optional<InsertionMode> InBody::process(IActions &a, html2::Token const &to
// The spec says to add attributes not already in the top element of the
// stack of open elements. By top, they obviously mean the <html> tag.
a.merge_into_html_node(start->attributes);
return {};
}
 
// Tag names whose start tags InBody does not handle itself: a start tag with
// one of these names is passed on to InHead's process() as-is.
static constexpr auto kInHeadElements = std::to_array<std::string_view>({
"base"sv,
"basefont"sv,
"bgsound"sv,
"link"sv,
"meta"sv,
"noframes"sv,
"script"sv,
"style"sv,
"template"sv,
"title"sv,
});
 
if (auto const *start = std::get_if<html2::StartTagToken>(&token);
start != nullptr && is_in_array<kInHeadElements>(start->tag_name)) {
return InHead{}.process(a, token);
}
 
if (auto const *end = std::get_if<html2::EndTagToken>(&token); end != nullptr && end->tag_name == "template") {
return InHead{}.process(a, token);
}
 
return {};
 
html2/parser_states_test.cpp added: 123, removed: 7, total 116
@@ -16,6 +16,8 @@
#include <variant>
#include <vector>
 
using namespace std::literals;
 
using etest::expect_eq;
 
using NodeVec = std::vector<dom::Node>;
@@ -393,6 +395,50 @@ void after_head_tests() {
});
}
 
void in_body_tests() {
etest::test("InBody: null character", [] {
auto res = parse("<body>\0"sv, {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body"});
});
 
etest::test("InBody: boring whitespace", [] {
auto res = parse("<body>\t"sv, {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body", {}, {dom::Text{"\t"}}});
});
 
etest::test("InBody: character", [] {
auto res = parse("<body>asdf"sv, {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body", {}, {dom::Text{"asdf"}}});
});
 
etest::test("InBody: comment", [] {
auto res = parse("<body><!-- comment -->", {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body"});
});
 
etest::test("InBody: doctype", [] {
auto res = parse("<body><!doctype html>", {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body"});
});
 
etest::test("InBody: in-head-element", [] {
auto res = parse("<body><title><html>&amp;</title>", {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body", {}, {dom::Element{"title", {}, {dom::Text{"<html>&"}}}}});
});
 
etest::test("InBody: template end tag", [] {
auto res = parse("<body></template>", {});
auto const &actual_body = std::get<dom::Element>(res.document.html().children.at(1));
expect_eq(actual_body, dom::Element{"body"});
});
}
 
void in_frameset_tests() {
etest::test("InFrameset: boring whitespace", [] {
auto res = parse("<head></head><frameset> ", {});
@@ -499,6 +545,7 @@ int main() {
in_head_tests();
in_head_noscript_tests();
after_head_tests();
in_body_tests();
in_frameset_tests();
return etest::run_all_tests();
}