srctree

Robin Linden parent ed1ac4d0 86509c7e
html2: Add support for unquoted attribute values

This is used on e.g. GitHub.

inlinesplit
html2/tokenizer.cpp added: 86, removed: 2, total 84
@@ -943,6 +943,46 @@ void Tokenizer::run() {
}
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
// Consumes the characters of an attribute value that was not opened with a
// quote character, e.g. the b in <tag a=b>. NOTE(review): the continue
// statements presumably re-enter the tokenizer's outer consume loop in
// run() -- that loop is outside this view, confirm against the full file.
case State::AttributeValueUnquoted: {
auto c = consume_next_input_character();
if (!c) {
// This is an eof-in-tag parse error.
// The spec says to emit an end-of-file token here; no start tag is produced.
emit(EndOfFileToken{});
return;
}
 
switch (*c) {
// Whitespace ends the unquoted value and hands control back to
// attribute-name parsing (e.g. the space in <tag a=b c=d>).
case '\t':
case '\n':
case '\f':
case ' ':
state_ = State::BeforeAttributeName;
continue;
// '&' begins a character reference (e.g. &amp;); return_state_ records
// where to resume once the reference has been consumed.
case '&':
return_state_ = State::AttributeValueUnquoted;
state_ = State::CharacterReference;
continue;
// '>' closes the tag: emit the accumulated token and return to data.
case '>':
state_ = State::Data;
emit(std::move(current_token_));
continue;
case '\0':
// This is an unexpected-null-character parse error.
// Per spec, U+0000 is replaced with U+FFFD REPLACEMENT CHARACTER.
current_attribute().value.append(util::unicode_to_utf8(0xFFFD));
continue;
case '"':
case '\'':
case '<':
case '=':
case '`':
// This is an unexpected-character-in-unquoted-attribute-value parse error.
// The spec treats these like any other character, hence the fallthrough.
[[fallthrough]];
default:
current_attribute().value += *c;
continue;
}
}
 
case State::AfterAttributeValueQuoted: {
auto c = consume_next_input_character();
if (!c) {
 
html2/tokenizer_test.cpp added: 86, removed: 2, total 84
@@ -22,6 +22,9 @@ using etest::require;
using namespace html2;
 
namespace {
 
static constexpr char const *kReplacementCharacter = "\xef\xbf\xbd";
 
std::vector<Token> run_tokenizer(std::string_view input) {
std::vector<Token> tokens;
Tokenizer{input,
@@ -507,6 +510,47 @@ int main() {
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, one attribute unquoted", [] {
    auto tokens = run_tokenizer("<tag a=b>");
    // A bare (unquoted) value should parse the same way a quoted one does.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, multiple attributes unquoted", [] {
    auto tokens = run_tokenizer("<tag a=b c=d>");
    // Both pairs should land on the same start tag, in source order.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b"}, {"c", "d"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
// NOTE(review): this block was a byte-for-byte duplicate (same name, same
// body) of the "attribute, multiple attributes unquoted" test above --
// almost certainly a copy-paste slip. Repurposed to cover a tab as the
// whitespace between unquoted attributes, exercising the '\t' arm of the
// AttributeValueUnquoted state instead of re-running the space case.
etest::test("attribute, multiple attributes unquoted, tab separated", [] {
    auto tokens = run_tokenizer("<tag a=b\tc=d>");
    expect_token(tokens, StartTagToken{.tag_name = "tag", .attributes = {{"a", "b"}, {"c", "d"}}});
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unexpected-character-in-unquoted-attribute", [] {
    // A '=' inside an unquoted value is a parse error, but the character
    // is still kept as part of the value per the spec's fallthrough rule.
    auto tokens = run_tokenizer("<tag a=b=c>");
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b=c"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, eof-in-tag", [] {
    // Input ends mid-tag, so no start tag is ever emitted -- only EOF.
    auto tokens = run_tokenizer("<tag a=b");
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, with character reference", [] {
    auto tokens = run_tokenizer("<tag a=&amp>");
    // &amp (even without a trailing ';') should decode to a literal '&'.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "&"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, unexpected-null-character", [] {
    // The embedded \0 would truncate a plain C string, hence the sv literal.
    auto tokens = run_tokenizer("<tag a=\0>"sv);
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", kReplacementCharacter}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("numeric character reference", [] {
auto tokens = run_tokenizer("&#9731;"); // U+2603: SNOWMAN
expect_token(tokens, CharacterToken{'\xe2'});