srctree

Robin Linden parent 6f3620c6 cfb07a5d
html2: Handle cdata during tokenization

inline split

html2/tokenizer.cpp added: 123, removed: 10, total 113

@@ -1059,7 +1059,16 @@ void Tokenizer::run() {

}

// "[CDATA[" after "<!" only opens a real CDATA section when the adjusted
// current node is outside the HTML namespace. In HTML content it is a parse
// error and the text is instead treated as the start of a bogus comment.
//
// The unconditional std::terminate() that used to sit at the top of this
// branch has been removed: it aborted the process before any of the handling
// below could run.
if (input_.substr(pos_, std::strlen("[CDATA[")) == "[CDATA["sv) {
    // Skip past the "[CDATA[" marker itself.
    pos_ += std::strlen("[CDATA[");

    if (adjusted_current_node_not_in_html_namespace_) {
        state_ = State::CdataSection;
        continue;
    }

    emit(ParseError::CdataInHtmlContent);
    // The comment's data starts out as the marker text we just skipped.
    current_token_ = CommentToken{.data = "[CDATA["};
    state_ = State::BogusComment;
    continue;
}

emit(ParseError::IncorrectlyOpenedComment);

@@ -1832,6 +1841,57 @@ void Tokenizer::run() {

}

// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
case State::CdataSection: {
    auto c = consume_next_input_character();
    if (!c) {
        // Input ended while still inside the CDATA section.
        emit(ParseError::EofInCdata);
        emit(EndOfFileToken{});
        return;
    }

    // A ']' may be the start of the "]]>" terminator; hold it back and let
    // the bracket state decide whether it was literal text.
    if (*c == ']') {
        state_ = State::CdataSectionBracket;
        continue;
    }

    // Everything else is passed through verbatim as character data.
    emit(CharacterToken{*c});
    continue;
}

// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
case State::CdataSectionBracket: {
    // One ']' has been seen and is being held back. A second ']' moves us
    // closer to the "]]>" terminator; anything else means the held-back
    // bracket was ordinary text, so flush it and reprocess this character.
    if (auto const next = consume_next_input_character(); next == ']') {
        state_ = State::CdataSectionEnd;
    } else {
        emit(CharacterToken{']'});
        reconsume_in(State::CdataSection);
    }
    continue;
}

// https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
case State::CdataSectionEnd: {
    // Two ']' characters are pending when this state is entered.
    auto c = consume_next_input_character();

    if (c == ']') {
        // A third ']' means the oldest pending bracket was literal text;
        // emit it and keep the most recent two pending.
        emit(CharacterToken{']'});
        continue;
    }

    if (c == '>') {
        // "]]>" completes the section; the terminator itself emits nothing.
        state_ = State::Data;
        continue;
    }

    // Anything else: neither pending bracket was part of a terminator, so
    // both must be emitted as text. Emitting only one would silently drop a
    // ']' from input such as "]]x".
    emit(CharacterToken{']'});
    emit(CharacterToken{']'});
    reconsume_in(State::CdataSection);
    continue;
}

case State::CharacterReference: {

temporary_buffer_ = "&"s;

html2/tokenizer.h added: 123, removed: 10, total 113

@@ -105,7 +105,9 @@ enum class ParseError {

AbruptDoctypePublicIdentifier,

AbruptDoctypeSystemIdentifier,

AbsenceOfDigitsInNumericCharacterReference,

CdataInHtmlContent,

ControlCharacterReference,

EofInCdata,

EofInComment,

EofInDoctype,

EofInTag,

@@ -140,6 +142,12 @@ public:

void set_state(State);

void run();

// Tells the tokenizer whether the adjusted current node is outside the HTML
// namespace. Pass true when it is NOT in the HTML namespace; run() then
// treats "<![CDATA[" as a real CDATA section instead of a bogus comment.
//
// This will definitely change once we implement the tree construction, but this works for now.
// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
void set_adjusted_current_node_not_in_html_namespace(bool not_in_html_namespace) {
    adjusted_current_node_not_in_html_namespace_ = not_in_html_namespace;
}

private:

std::string_view input_;

std::size_t pos_{0};

@@ -151,6 +159,7 @@ private:

std::string last_start_tag_name_{};

std::uint32_t character_reference_code_{};

bool adjusted_current_node_not_in_html_namespace_{false};

std::function<void(Tokenizer &, Token &&)> on_emit_{};

std::function<void(Tokenizer &, ParseError)> on_error_{};

html2/tokenizer_test.cpp added: 123, removed: 10, total 113

@@ -40,22 +40,26 @@ public:

etest::source_location loc;

};

// Runs the tokenizer over `input` and collects everything it produces.
//
// input:             the markup to tokenize.
// in_html_namespace: when false, the tokenizer is told that the adjusted
//                    current node is outside the HTML namespace before it runs.
// loc:               call-site location, forwarded into the returned output.
//
// Returns the emitted tokens and parse errors, in emission order, together
// with `loc`.
TokenizerOutput run_tokenizer(std::string_view input,
        bool in_html_namespace = true,
        etest::source_location loc = etest::source_location::current()) {
    std::vector<Token> tokens;
    std::vector<ParseError> errors;

    Tokenizer tokenizer{input,
            [&](Tokenizer &self, Token &&t) {
                // Switch to script-data tokenization as soon as a <script>
                // start tag is emitted.
                if (std::holds_alternative<StartTagToken>(t)) {
                    if (std::get<StartTagToken>(t).tag_name == "script") {
                        self.set_state(State::ScriptData);
                    }
                }
                tokens.push_back(std::move(t));
            },
            [&](auto &, ParseError e) {
                errors.push_back(e);
            }};

    // The setter takes the negated sense of this helper's flag.
    tokenizer.set_adjusted_current_node_not_in_html_namespace(!in_html_namespace);
    tokenizer.run();

    return {std::move(tokens), std::move(errors), std::move(loc)};
}

@@ -81,6 +85,45 @@ void expect_error(

output.errors.erase(begin(output.errors));

}

void cdata_tests() {

etest::test("cdata, currently in html", [] {

auto tokens = run_tokenizer("<![CDATA["sv);

expect_error(tokens, ParseError::CdataInHtmlContent);

expect_token(tokens, CommentToken{.data = "[CDATA["});