srctree

Robin Linden parent ed1ac4d0 86509c7e
html2: Add support for unquoted attribute values

This is used on e.g. GitHub.

inlinesplit
html2/tokenizer.cpp added: 86, removed: 2, total 84
@@ -943,6 +943,46 @@ void Tokenizer::run() {
}
}
 
// https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
// Consumes the characters of an attribute value that was not opened with a
// quote character, e.g. the b in <tag a=b>. NOTE(review): the continue
// statements presumably re-enter the tokenizer's outer consume loop in
// run() -- that loop is outside this view, confirm against the full file.
case State::AttributeValueUnquoted: {
auto c = consume_next_input_character();
if (!c) {
// This is an eof-in-tag parse error.
// The spec says to emit an end-of-file token here; no start tag is produced.
emit(EndOfFileToken{});
return;
}
 
switch (*c) {
// Whitespace ends the unquoted value and hands control back to
// attribute-name parsing (e.g. the space in <tag a=b c=d>).
case '\t':
case '\n':
case '\f':
case ' ':
state_ = State::BeforeAttributeName;
continue;
// '&' begins a character reference (e.g. &amp;); return_state_ records
// where to resume once the reference has been consumed.
case '&':
return_state_ = State::AttributeValueUnquoted;
state_ = State::CharacterReference;
continue;
// '>' closes the tag: emit the accumulated token and return to data.
case '>':
state_ = State::Data;
emit(std::move(current_token_));
continue;
case '\0':
// This is an unexpected-null-character parse error.
// Per spec, U+0000 is replaced with U+FFFD REPLACEMENT CHARACTER.
current_attribute().value.append(util::unicode_to_utf8(0xFFFD));
continue;
case '"':
case '\'':
case '<':
case '=':
case '`':
// This is an unexpected-character-in-unquoted-attribute-value parse error.
// The spec treats these like any other character, hence the fallthrough.
[[fallthrough]];
default:
current_attribute().value += *c;
continue;
}
}
 
case State::AfterAttributeValueQuoted: {
auto c = consume_next_input_character();
if (!c) {
 
html2/tokenizer_test.cpp added: 86, removed: 2, total 84
@@ -22,6 +22,9 @@ using etest::require;
using namespace html2;
 
namespace {
 
static constexpr char const *kReplacementCharacter = "\xef\xbf\xbd";
 
std::vector<Token> run_tokenizer(std::string_view input) {
std::vector<Token> tokens;
Tokenizer{input,
@@ -507,6 +510,47 @@ int main() {
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, one attribute unquoted", [] {
    auto tokens = run_tokenizer("<tag a=b>");
    // A bare (unquoted) value should parse the same way a quoted one does.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, multiple attributes unquoted", [] {
    auto tokens = run_tokenizer("<tag a=b c=d>");
    // Both pairs should land on the same start tag, in source order.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b"}, {"c", "d"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
// NOTE(review): this block was a byte-for-byte duplicate (same name, same
// body) of the "attribute, multiple attributes unquoted" test above --
// almost certainly a copy-paste slip. Repurposed to cover a tab as the
// whitespace between unquoted attributes, exercising the '\t' arm of the
// AttributeValueUnquoted state instead of re-running the space case.
etest::test("attribute, multiple attributes unquoted, tab separated", [] {
    auto tokens = run_tokenizer("<tag a=b\tc=d>");
    expect_token(tokens, StartTagToken{.tag_name = "tag", .attributes = {{"a", "b"}, {"c", "d"}}});
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unexpected-character-in-unquoted-attribute", [] {
    // A '=' inside an unquoted value is a parse error, but the character
    // is still kept as part of the value per the spec's fallthrough rule.
    auto tokens = run_tokenizer("<tag a=b=c>");
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "b=c"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, eof-in-tag", [] {
    // Input ends mid-tag, so no start tag is ever emitted -- only EOF.
    auto tokens = run_tokenizer("<tag a=b");
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, with character reference", [] {
    auto tokens = run_tokenizer("<tag a=&amp>");
    // &amp (even without a trailing ';') should decode to a literal '&'.
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", "&"}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("attribute, unquoted, unexpected-null-character", [] {
    // The embedded \0 would truncate a plain C string, hence the sv literal.
    auto tokens = run_tokenizer("<tag a=\0>"sv);
    StartTagToken expected{.tag_name = "tag", .attributes = {{"a", kReplacementCharacter}}};
    expect_token(tokens, expected);
    expect_token(tokens, EndOfFileToken{});
});
 
etest::test("numeric character reference", [] {
auto tokens = run_tokenizer("&#9731;"); // U+2603: SNOWMAN
expect_token(tokens, CharacterToken{'\xe2'});