srctree

Robin Linden parent 79a89eee c9a3b0fd
html: Keep the 'stack' of open elements as a vector

The spec has a very strange idea of what a stack is. We have to be able to e.g. remove elements from the middle of it.

inlinesplit
html/parser.cpp added: 41, removed: 39, total 2
@@ -141,15 +141,15 @@ void Parser::operator()(html2::StartTagToken const &start_tag) {
// comment, except if the first thing inside the body element is a meta,
// noscript, link, script, style, or template element.
if (doc_.html().children.size() == 1 && start_tag.tag_name != "body") {
auto &body = open_elements_.top()->children.emplace_back(dom::Element{.name{"body"}});
open_elements_.push(&std::get<dom::Element>(body));
auto &body = open_elements_.back()->children.emplace_back(dom::Element{.name{"body"}});
open_elements_.push_back(&std::get<dom::Element>(body));
}
 
generate_text_node_if_needed();
 
// https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
if (open_elements_.top()->name == "p" && is_in_array<kAllowsParagraphEndTagOmission>(start_tag.tag_name)) {
open_elements_.pop();
if (open_elements_.back()->name == "p" && is_in_array<kAllowsParagraphEndTagOmission>(start_tag.tag_name)) {
open_elements_.pop_back();
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
@@ -157,7 +157,7 @@ void Parser::operator()(html2::StartTagToken const &start_tag) {
tokenizer_.set_state(html2::State::Rawtext);
}
 
auto &new_element = open_elements_.top()->children.emplace_back(
auto &new_element = open_elements_.back()->children.emplace_back(
dom::Element{start_tag.tag_name, into_dom_attributes(start_tag.attributes), {}});
 
if (!start_tag.self_closing) {
@@ -165,13 +165,13 @@ void Parser::operator()(html2::StartTagToken const &start_tag) {
// if they need it, but we only ever add new children to the
// top-most element in the stack, so this pointer will be valid
// until it's been popped from the stack and we add its siblings.
open_elements_.push(std::get_if<dom::Element>(&new_element));
open_elements_.push_back(std::get_if<dom::Element>(&new_element));
}
 
// Special cases from https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
// Immediately popped off the stack of open elements special cases.
if (!start_tag.self_closing && is_in_array<kImmediatelyPopped>(start_tag.tag_name)) {
open_elements_.pop();
open_elements_.pop_back();
}
}
 
@@ -182,9 +182,9 @@ void Parser::operator()(html2::EndTagToken const &end_tag) {
}
 
if (end_tag.tag_name == "html" && doc_.html().children.size() == 1) {
if (open_elements_.top()->name == "html") {
auto &body = open_elements_.top()->children.emplace_back(dom::Element{.name = "body"});
open_elements_.push(&std::get<dom::Element>(body));
if (open_elements_.back()->name == "html") {
auto &body = open_elements_.back()->children.emplace_back(dom::Element{.name = "body"});
open_elements_.push_back(&std::get<dom::Element>(body));
}
}
 
@@ -192,22 +192,22 @@ void Parser::operator()(html2::EndTagToken const &end_tag) {
 
// https://html.spec.whatwg.org/multipage/grouping-content.html#the-p-element
// TODO(robinlinden): or if the parent element is an autonomous custom element.
if (open_elements_.top()->name == "p" && end_tag.tag_name != "p"
if (open_elements_.back()->name == "p" && end_tag.tag_name != "p"
&& !is_in_array<kDisallowsParagraphEndTagOmissionWhenClosed>(end_tag.tag_name)) {
open_elements_.pop();
open_elements_.pop_back();
}
 
if (end_tag.tag_name == "html" && open_elements_.top()->name == "body") {
open_elements_.pop();
if (end_tag.tag_name == "html" && open_elements_.back()->name == "body") {
open_elements_.pop_back();
}
 
auto const &expected_tag = open_elements_.top()->name;
auto const &expected_tag = open_elements_.back()->name;
if (end_tag.tag_name != expected_tag) {
spdlog::warn("Unexpected end_tag name, expected [{}] but got [{}]", expected_tag, end_tag.tag_name);
return;
}
 
open_elements_.pop();
open_elements_.pop_back();
}
 
void Parser::operator()(html2::CommentToken const &) {
@@ -219,22 +219,23 @@ void Parser::operator()(html2::CharacterToken const &character) {
}
 
void Parser::operator()(html2::EndOfFileToken const &) {
if (!open_elements_.empty() && open_elements_.top()->name == "html" && open_elements_.top()->children.size() == 1) {
if (!open_elements_.empty() && open_elements_.back()->name == "html"
&& open_elements_.back()->children.size() == 1) {
auto &body = doc_.html().children.emplace_back(dom::Element{.name = "body"});
open_elements_.push(&std::get<dom::Element>(body));
open_elements_.push_back(&std::get<dom::Element>(body));
}
 
if (!open_elements_.empty()) {
generate_text_node_if_needed();
}
 
if (!open_elements_.empty() && open_elements_.top()->name == "body") {
open_elements_.pop();
if (!open_elements_.empty() && open_elements_.back()->name == "body") {
open_elements_.pop_back();
}
 
// https://html.spec.whatwg.org/multipage/semantics.html#the-html-element
if (!open_elements_.empty() && open_elements_.top()->name == "html") {
open_elements_.pop();
if (!open_elements_.empty() && open_elements_.back()->name == "html") {
open_elements_.pop_back();
}
 
if (!open_elements_.empty()) {
@@ -250,7 +251,7 @@ void Parser::generate_text_node_if_needed() {
return;
}
 
open_elements_.top()->children.emplace_back(dom::Text{std::move(text)});
open_elements_.back()->children.emplace_back(dom::Text{std::move(text)});
}
 
} // namespace html
 
html/parser.h added: 41, removed: 39, total 2
@@ -13,9 +13,9 @@
 
#include <functional>
#include <sstream>
#include <stack>
#include <string_view>
#include <utility>
#include <vector>
 
namespace html {
 
@@ -53,7 +53,7 @@ private:
 
html2::Tokenizer tokenizer_;
dom::Document doc_{};
std::stack<dom::Element *> open_elements_{};
std::vector<dom::Element *> open_elements_{};
std::stringstream current_text_{};
bool scripting_{false};
html2::InsertionMode insertion_mode_{};
 
html/parser_actions.h added: 41, removed: 39, total 2
@@ -13,7 +13,6 @@
#include <algorithm>
#include <cassert>
#include <span>
#include <stack>
#include <utility>
#include <variant>
#include <vector>
@@ -26,7 +25,7 @@ public:
html2::Tokenizer &tokenizer,
bool scripting,
html2::InsertionMode &current_insertion_mode,
std::stack<dom::Element *> &open_elements)
std::vector<dom::Element *> &open_elements)
: document_{document}, tokenizer_{tokenizer}, scripting_{scripting},
current_insertion_mode_{current_insertion_mode}, open_elements_{open_elements} {}
 
@@ -61,8 +60,8 @@ public:
insert({token.tag_name, into_dom_attributes(token.attributes)});
}
 
void pop_current_node() override { open_elements_.pop(); }
std::string_view current_node_name() const override { return open_elements_.top()->name; }
void pop_current_node() override { open_elements_.pop_back(); }
std::string_view current_node_name() const override { return open_elements_.back()->name; }
 
void merge_into_html_node(std::span<html2::Attribute const> attrs) override {
auto &html = document_.html();
@@ -76,7 +75,7 @@ public:
}
 
void insert_character(html2::CharacterToken const &character) override {
auto &current_element = open_elements_.top();
auto &current_element = open_elements_.back();
if (current_element->children.empty() || !std::holds_alternative<dom::Text>(current_element->children.back())) {
current_element->children.emplace_back(dom::Text{});
}
@@ -104,12 +103,12 @@ private:
assert(open_elements_.empty());
document_.html().name = std::move(element.name);
document_.html().attributes = std::move(element.attributes);
open_elements_.push(&document_.html());
open_elements_.push_back(&document_.html());
return;
}
 
dom::Node &node = open_elements_.top()->children.emplace_back(std::move(element));
open_elements_.push(&std::get<dom::Element>(node));
dom::Node &node = open_elements_.back()->children.emplace_back(std::move(element));
open_elements_.push_back(&std::get<dom::Element>(node));
}
 
dom::Document &document_;
@@ -117,7 +116,7 @@ private:
bool scripting_;
html2::InsertionMode original_insertion_mode_;
html2::InsertionMode &current_insertion_mode_;
std::stack<dom::Element *> &open_elements_;
std::vector<dom::Element *> &open_elements_;
};
 
} // namespace html
 
html2/parser_states_test.cpp added: 41, removed: 39, total 2
@@ -9,6 +9,8 @@
#include "etest/etest.h"
#include "html/parser_actions.h"
 
#include <vector>
 
using etest::expect_eq;
 
using NodeVec = std::vector<dom::Node>;
@@ -30,7 +32,7 @@ ParseResult parse(std::string_view html, ParseOptions opts) {
 
ParseResult res{};
html2::InsertionMode mode{opts.initial_insertion_mode};
std::stack<dom::Element *> open_elements{};
std::vector<dom::Element *> open_elements{};
html::Actions actions{res.document, tokenizer, opts.scripting, mode, open_elements};
 
auto on_token = [&](html2::Tokenizer &, html2::Token const &token) {