srctree

Robin Linden parent 77d14921 bd51687a
html2: Add support for tokenizing numeric character entities

inlinesplit
html2/tokenizer.cpp added: 207, removed: 3, total 204
@@ -14,6 +14,7 @@
#include <cstring>
#include <exception>
#include <limits>
#include <map>
#include <sstream>
 
using namespace std::literals;
@@ -21,6 +22,23 @@ using namespace std::literals;
namespace html2 {
namespace {
 
// https://infra.spec.whatwg.org/#control
// A control is a C0 control (U+0000 NULL through U+001F INFORMATION SEPARATOR
// ONE) or a code point in the range U+007F DELETE through U+009F APPLICATION
// PROGRAM COMMAND.
constexpr bool is_control(int code_point) {
    return (code_point >= 0x00 && code_point <= 0x1F) || (code_point >= 0x7F && code_point <= 0x9F);
}
 
// True for TAB (0x09), LF (0x0A), FF (0x0C), CR (0x0D), and SPACE (0x20);
// false for every other code point.
constexpr bool is_ascii_whitespace(int code_point) {
    return code_point == 0x09 || code_point == 0x0A || code_point == 0x0C || code_point == 0x0D
            || code_point == 0x20;
}
 
// True when c is one of the ASCII capital letters 'A' through 'Z'.
constexpr bool is_ascii_upper_alpha(char c) {
    return !(c < 'A' || c > 'Z');
}
@@ -45,6 +63,58 @@ constexpr char to_lower(char c) {
return c + 0x20;
}
 
// https://infra.spec.whatwg.org/#surrogate
// True when code_point lies in the inclusive range U+D800..U+DFFF.
constexpr bool is_unicode_surrogate(int code_point) {
    constexpr int first_surrogate = 0xD800;
    constexpr int last_surrogate = 0xDFFF;
    return !(code_point < first_surrogate || code_point > last_surrogate);
}
 
// https://infra.spec.whatwg.org/#noncharacter
// A noncharacter is a code point in the range U+FDD0..U+FDEF, or one of the
// last two code points of any of the 17 Unicode planes (U+FFFE, U+FFFF,
// U+1FFFE, U+1FFFF, ..., U+10FFFE, U+10FFFF).
constexpr bool is_unicode_noncharacter(int code_point) {
    if (code_point >= 0xFDD0 && code_point <= 0xFDEF) {
        return true;
    }

    // Values outside the Unicode code space are never noncharacters. The
    // lower bound also keeps negative inputs away from the bit test below.
    if (code_point < 0 || code_point > 0x10FFFF) {
        return false;
    }

    // The final two code points of every plane have their low 16 bits equal
    // to 0xFFFE or 0xFFFF; masking off the lowest bit matches both at once.
    return (code_point & 0xFFFE) == 0xFFFE;
}
 
} // namespace
 
void Tokenizer::set_state(State state) {
@@ -1324,6 +1394,127 @@ void Tokenizer::run() {
continue;
}
 
case State::NumericCharacterReference: {
    // Every numeric reference starts accumulating from zero.
    character_reference_code_ = 0;

    auto next = consume_next_input_character();
    bool const is_hex_prefix = next && (*next == 'x' || *next == 'X');
    if (is_hex_prefix) {
        // Keep the 'x'/'X' in the temporary buffer and move on to the
        // hexadecimal start state.
        temporary_buffer_ += *next;
        state_ = State::HexadecimalCharacterReferenceStart;
        continue;
    }

    // End of input or any other character: hand it to the decimal start state.
    reconsume_in(State::DecimalCharacterReferenceStart);
    continue;
}
 
case State::DecimalCharacterReferenceStart: {
    auto next = consume_next_input_character();
    if (next && is_ascii_digit(*next)) {
        // At least one digit follows; the decimal state re-reads it.
        reconsume_in(State::DecimalCharacterReference);
        continue;
    }

    // This is an absence-of-digits-in-numeric-character-reference parse error.
    flush_code_points_consumed_as_a_character_reference();
    reconsume_in(return_state_);
    continue;
}
 
case State::DecimalCharacterReference: {
    // Accumulates decimal digits into character_reference_code_ until a ';',
    // a non-digit, or the end of input is reached.
    auto c = consume_next_input_character();
    if (!c) {
        // This is a missing-semicolon-after-character-reference parse error.
        reconsume_in(State::NumericCharacterReferenceEnd);
        continue;
    }

    if (is_ascii_digit(*c)) {
        // Stop accumulating once the value is past the last valid code point.
        // Otherwise a long run of digits wraps the 32-bit accumulator around
        // to a small, valid-looking code point (e.g. "&#4294967361;" would
        // decode as 'A'). Any value above 0x10FFFF is treated identically by
        // the end state, so freezing it here loses no information.
        if (character_reference_code_ <= 0x10FFFF) {
            character_reference_code_ *= 10;
            character_reference_code_ += *c - 0x30;
        }
        continue;
    }

    if (*c == ';') {
        state_ = State::NumericCharacterReferenceEnd;
        continue;
    }

    // This is a missing-semicolon-after-character-reference parse error.
    reconsume_in(State::NumericCharacterReferenceEnd);
    continue;
}
 
case State::NumericCharacterReferenceEnd: {
    // Validates the accumulated character reference code, substitutes
    // U+FFFD or a table replacement where required, then places the UTF-8
    // encoding of the result in the temporary buffer and flushes it.
    if (character_reference_code_ == 0) {
        // This is a null-character-reference parse error.
        character_reference_code_ = 0xFFFD;
    }

    if (character_reference_code_ > 0x10FFFF) {
        // This is a character-reference-outside-unicode-range parse error.
        character_reference_code_ = 0xFFFD;
    }

    if (is_unicode_surrogate(character_reference_code_)) {
        // This is a surrogate-character-reference parse error.
        character_reference_code_ = 0xFFFD;
    }

    if (is_unicode_noncharacter(character_reference_code_)) {
        // This is a noncharacter-character-reference parse error. Unlike the
        // cases above, the HTML Standard does not replace the code point
        // here: the noncharacter itself is emitted.
    }

    if (character_reference_code_ == 0x0D
            || (is_control(character_reference_code_) && !is_ascii_whitespace(character_reference_code_))) {
        // This is a control-character-reference parse error.
    }

    // Replacement table applied to code points in the 0x80..0x9F range.
    static std::map<std::uint32_t, std::uint32_t> const replacements{{0x80, 0x20AC},
            {0x82, 0x201A},
            {0x83, 0x0192},
            {0x84, 0x201E},
            {0x85, 0x2026},
            {0x86, 0x2020},
            {0x87, 0x2021},
            {0x88, 0x02C6},
            {0x89, 0x2030},
            {0x8A, 0x0160},
            {0x8B, 0x2039},
            {0x8C, 0x0152},
            {0x8E, 0x017D},
            {0x91, 0x2018},
            {0x92, 0x2019},
            {0x93, 0x201C},
            {0x94, 0x201D},
            {0x95, 0x2022},
            {0x96, 0x2013},
            {0x97, 0x2014},
            {0x98, 0x02DC},
            {0x99, 0x2122},
            {0x9A, 0x0161},
            {0x9B, 0x203A},
            {0x9C, 0x0153},
            {0x9E, 0x017E},
            {0x9F, 0x0178}};

    // Single lookup instead of contains() followed by at().
    if (auto it = replacements.find(character_reference_code_); it != replacements.end()) {
        character_reference_code_ = it->second;
    }

    temporary_buffer_ = util::unicode_to_utf8(character_reference_code_);
    flush_code_points_consumed_as_a_character_reference();
    state_ = return_state_;
    continue;
}
 
// Fallback for any value not matched by a case above: abort the process via
// std::terminate(). NOTE(review): presumably this catches tokenizer states
// that have no handler yet — confirm against the full switch.
default:
std::terminate();
}
 
html2/tokenizer.h added: 207, removed: 3, total 204
@@ -7,6 +7,7 @@
 
#include "html2/token.h"
 
#include <cstdint>
#include <functional>
#include <optional>
#include <string>
@@ -117,6 +118,8 @@ private:
std::string temporary_buffer_{};
std::string last_start_tag_name_{};
 
// Code point accumulated while tokenizing a numeric character reference.
// The tokenizer resets it to 0 when a numeric reference starts, adds to it
// digit by digit, and finally converts it to UTF-8 for emission.
std::uint32_t character_reference_code_{};
 
std::function<void(Tokenizer &, Token &&)> on_emit_{};
 
void emit(Token &&);
 
html2/tokenizer_test.cpp added: 207, removed: 3, total 204
@@ -15,6 +15,7 @@
 
using namespace std::literals;
 
using etest::expect;
using etest::expect_eq;
using etest::require;
 
@@ -487,5 +488,14 @@ int main() {
expect_token(tokens, EndOfFileToken{});
});
 
etest::test("numeric character reference", [] {
    // "&#9731;" refers to U+2603 SNOWMAN, which is E2 98 83 in UTF-8. The
    // tokenizer is expected to emit one character token per byte.
    constexpr char snowman_utf8[] = {'\xe2', '\x98', '\x83'};

    auto tokens = run_tokenizer("&#9731;"); // U+2603: SNOWMAN
    for (char const byte : snowman_utf8) {
        expect_token(tokens, CharacterToken{byte});
    }

    expect_token(tokens, EndOfFileToken{});
    expect(tokens.empty());
});
 
return etest::run_all_tests();
}