srctree

Robin Linden parent bd51687a 4818d0b4
html2: Add support for tokenizing ambiguous ampersands

inlinesplit
html2/tokenizer.cpp added: 45, removed: 2, total 43
@@ -1394,6 +1394,30 @@ void Tokenizer::run() {
continue;
}
 
case State::AmbiguousAmpersand: {
    // Ambiguous ampersand state: ASCII alphanumerics following an unresolved
    // '&' are passed through unchanged; anything else hands control back to
    // the state stored in return_state_.
    auto next = consume_next_input_character();

    // End of input and every non-alphanumeric character terminate the run.
    // A ';' at this point is an unknown-named-character-reference parse
    // error; this state does not report it and treats ';' like any other
    // terminating character.
    if (!next || !is_ascii_alphanumeric(*next)) {
        reconsume_in(return_state_);
        continue;
    }

    // Alphanumeric: append to the attribute value when the reference was
    // consumed as part of an attribute, otherwise emit a character token.
    if (consumed_as_part_of_an_attribute()) {
        current_attribute().value += *next;
    } else {
        emit(CharacterToken{*next});
    }
    continue;
}
 
case State::NumericCharacterReference: {
character_reference_code_ = 0;
auto c = consume_next_input_character();
 
html2/tokenizer_test.cpp added: 45, removed: 2, total 43
@@ -459,6 +459,25 @@ int main() {
expect_eq(glyph, "∾̳"sv);
});
 
etest::test("ambiguous ampersand", [] {
    // "&blah;" is not a known named character reference, so every character
    // of the input is expected back as its own character token, in order.
    auto output = run_tokenizer("&blah;");
    for (char expected : "&blah;"sv) {
        expect_token(output, CharacterToken{expected});
    }

    // Only the end-of-file token may follow, and nothing after that.
    expect_token(output, EndOfFileToken{});
    expect(output.empty());
});
 
etest::test("ambiguous ampersand in attribute", [] {
    auto output = run_tokenizer("<p attr='&blah;'>");

    // The unknown reference inside the quoted attribute value is expected to
    // come through verbatim as part of that value.
    StartTagToken const expected_tag{.tag_name = "p", .attributes = {{"attr", "&blah;"}}};
    expect_token(output, expected_tag);

    // Only the end-of-file token may follow, and nothing after that.
    expect_token(output, EndOfFileToken{});
    expect(output.empty());
});
 
etest::test("attribute, one attribute single quoted", [] {
auto tokens = run_tokenizer("<tag a='b'>");