srctree

Robin Linden parent 6f3620c6 cfb07a5d
html2: Handle cdata during tokenization

inlinesplit
html2/tokenizer.cpp added: 123, removed: 10, total 113
@@ -1059,7 +1059,16 @@ void Tokenizer::run() {
}
 
// "[CDATA[" branch of the markup declaration open state:
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
if (input_.substr(pos_, std::strlen("[CDATA[")) == "[CDATA["sv) {
    // Consume the marker itself before deciding how to tokenize what follows.
    pos_ += std::strlen("[CDATA[");
    if (adjusted_current_node_not_in_html_namespace_) {
        // Outside the HTML namespace this opens a real CDATA section.
        state_ = State::CdataSection;
        continue;
    }

    // Otherwise the marker is a parse error and the construct is tokenized
    // as a bogus comment whose data starts with the marker text.
    emit(ParseError::CdataInHtmlContent);
    current_token_ = CommentToken{.data = "[CDATA["};
    state_ = State::BogusComment;
    continue;
}
 
emit(ParseError::IncorrectlyOpenedComment);
@@ -1832,6 +1841,57 @@ void Tokenizer::run() {
}
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
// Emits every character as a character token until a ']' is seen, which may
// be the start of the closing "]]>" sequence. Running out of input here is an
// eof-in-cdata parse error followed by the end-of-file token.
case State::CdataSection: {
    auto next = consume_next_input_character();
    if (!next) {
        emit(ParseError::EofInCdata);
        emit(EndOfFileToken{});
        return;
    }

    if (*next == ']') {
        state_ = State::CdataSectionBracket;
    } else {
        emit(CharacterToken{*next});
    }
    continue;
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
// One ']' has been seen. A second one moves on to the end state; anything
// else means the first ']' was ordinary text, so it is emitted and the
// current input is handed back to the CDATA section state.
case State::CdataSectionBracket: {
    auto next = consume_next_input_character();
    if (next != ']') {
        emit(CharacterToken{']'});
        reconsume_in(State::CdataSection);
        continue;
    }

    state_ = State::CdataSectionEnd;
    continue;
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
// Two ']' have been seen. '>' closes the section and returns to the data
// state. A further ']' emits one bracket and stays in this state. Anything
// else emits both pending brackets and reprocesses the current input in the
// CDATA section state.
case State::CdataSectionEnd: {
    auto next = consume_next_input_character();
    if (next == '>') {
        state_ = State::Data;
        continue;
    }

    if (next != ']') {
        emit(CharacterToken{']'});
        emit(CharacterToken{']'});
        reconsume_in(State::CdataSection);
        continue;
    }

    emit(CharacterToken{']'});
    continue;
}
 
case State::CharacterReference: {
temporary_buffer_ = "&"s;
 
 
html2/tokenizer.h added: 123, removed: 10, total 113
@@ -105,7 +105,9 @@ enum class ParseError {
AbruptDoctypePublicIdentifier,
AbruptDoctypeSystemIdentifier,
AbsenceOfDigitsInNumericCharacterReference,
CdataInHtmlContent,
ControlCharacterReference,
EofInCdata,
EofInComment,
EofInDoctype,
EofInTag,
@@ -140,6 +142,12 @@ public:
void set_state(State);
void run();
 
// Sets the flag the tokenizer consults when it encounters "<![CDATA[": when
// true, a CDATA section is tokenized; when false, the marker is reported as
// an error and handled as a bogus comment.
// This will definitely change once we implement the tree construction, but this works for now.
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
void set_adjusted_current_node_not_in_html_namespace(bool not_in_html_namespace) {
    adjusted_current_node_not_in_html_namespace_ = not_in_html_namespace;
}
 
private:
std::string_view input_;
std::size_t pos_{0};
@@ -151,6 +159,7 @@ private:
std::string last_start_tag_name_{};
 
std::uint32_t character_reference_code_{};
bool adjusted_current_node_not_in_html_namespace_{false};
 
std::function<void(Tokenizer &, Token &&)> on_emit_{};
std::function<void(Tokenizer &, ParseError)> on_error_{};
 
html2/tokenizer_test.cpp added: 123, removed: 10, total 113
@@ -40,22 +40,26 @@ public:
etest::source_location loc;
};
 
// Runs a Tokenizer over `input` and collects everything it reports.
//
// input:             the markup to tokenize.
// in_html_namespace: negated and forwarded to
//                    Tokenizer::set_adjusted_current_node_not_in_html_namespace
//                    before the tokenizer runs.
// loc:               call-site location, stored in the returned output.
//
// Returns the emitted tokens and parse errors in the order they were reported.
TokenizerOutput run_tokenizer(std::string_view input,
        bool in_html_namespace = true,
        etest::source_location loc = etest::source_location::current()) {
    std::vector<Token> tokens;
    std::vector<ParseError> errors;
    Tokenizer tokenizer{input,
            // The callback's tokenizer parameter is named `the` so it does not
            // shadow the local `tokenizer` captured by reference.
            [&](Tokenizer &the, Token &&t) {
                // A "script" start tag switches the tokenizer to the script data state.
                if (auto const *start_tag = std::get_if<StartTagToken>(&t);
                        start_tag != nullptr && start_tag->tag_name == "script") {
                    the.set_state(State::ScriptData);
                }
                tokens.push_back(std::move(t));
            },
            [&](auto &, ParseError e) {
                errors.push_back(e);
            }};
    tokenizer.set_adjusted_current_node_not_in_html_namespace(!in_html_namespace);
    tokenizer.run();

    return {std::move(tokens), std::move(errors), std::move(loc)};
}
 
@@ -81,6 +85,45 @@ void expect_error(
output.errors.erase(begin(output.errors));
}
 
void cdata_tests() {
etest::test("cdata, currently in html", [] {
auto tokens = run_tokenizer("<![CDATA["sv);
expect_error(tokens, ParseError::CdataInHtmlContent);
expect_token(tokens, CommentToken{.data = "[CDATA["});
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("cdata, eof", [] {
auto tokens = run_tokenizer("<![CDATA["sv, false);
expect_error(tokens, html2::ParseError::EofInCdata);
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("cdata, bracket", [] {
auto tokens = run_tokenizer("<![CDATA[]hello"sv, false);
expect_error(tokens, html2::ParseError::EofInCdata);
expect_text(tokens, "]hello");
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("cdata, end", [] {
auto tokens = run_tokenizer("<![CDATA[]]>"sv, false);
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("cdata, end, extra bracket", [] {
auto tokens = run_tokenizer("<![CDATA[]]]>"sv, false);
expect_token(tokens, CharacterToken{']'});
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("cdata, end, extra text", [] {
auto tokens = run_tokenizer("<![CDATA[]]a]]>"sv, false);
expect_text(tokens, "]]a");
expect_token(tokens, EndOfFileToken{});
});
}
 
void doctype_system_keyword_tests() {
etest::test("doctype system keyword, single-quoted system identifier, missing space", [] {
auto tokens = run_tokenizer("<!DOCTYPE HTML SYSTEM'great'>");
@@ -154,6 +197,7 @@ void doctype_system_keyword_tests() {
} // namespace
 
int main() {
cdata_tests();
doctype_system_keyword_tests();
 
etest::test("script, empty", [] {