srctree

Robin Linden parent 6698b415 9a7cd004
css2: Support tokenizing escaped code points

inline split

css2/BUILD added: 116, removed: 14, total 102

@@ -12,6 +12,7 @@ cc_library(

visibility = ["//visibility:public"],

deps = [

"//util:string",

"//util:unicode",

)

css2/tokenizer.cpp added: 116, removed: 14, total 102

@@ -1,4 +1,4 @@

// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>

// SPDX-License-Identifier: BSD-2-Clause

@@ -8,13 +8,15 @@

#include "css2/token.h"

#include "util/string.h"

#include "util/unicode.h"

#include <cassert>

#include <charconv>

#include <exception>

#include <cstdint>

#include <optional>

#include <string>

#include <system_error>

#include <tuple>

#include <utility>

#include <variant>

@@ -226,8 +228,8 @@ void Tokenizer::run() {

}

if (*c == '\\') {

// TODO(mkiael): Handle escaped code point

std::terminate();

temporary_buffer_ += consume_an_escaped_code_point();

continue;

}

emit(AtKeywordToken{temporary_buffer_});

@@ -248,8 +250,8 @@ void Tokenizer::run() {

}

if (*c == '\\') {

// TODO(mkiael): Handle escaped code point

std::terminate();

temporary_buffer_ += consume_an_escaped_code_point();

continue;

}

// TODO(mkiael): Handle url and function token

@@ -275,8 +277,8 @@ void Tokenizer::run() {

switch (*c) {

case '\\':

// TODO(mkiael): Handle escaped code point

std::terminate();

std::get<StringToken>(current_token_).data += consume_an_escaped_code_point();

continue;

case '\n':

emit(ParseError::NewlineInString);

emit(BadStringToken{});

@@ -389,4 +391,45 @@ std::pair<std::variant<int, double>, NumericType> Tokenizer::consume_number(char

return {result, type};

}

// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
//
// Consumes the input following the backslash of an escape sequence and returns
// the escaped code point encoded as UTF-8. Per the linked algorithm, the
// backslash itself is expected to have been consumed by the caller already.
//
// - End of input: emits ParseError::EofInEscapeSequence and returns U+FFFD.
// - Hex digit: consumes up to 6 hex digits in total plus one trailing
//   whitespace character (if present), and returns the code point they name.
//   Zero, surrogates, and values above U+10FFFF are replaced by U+FFFD.
// - Anything else: the consumed character is returned unchanged.
std::string Tokenizer::consume_an_escaped_code_point() {
    static constexpr std::uint32_t kReplacementCharacter = 0xFFFD;
    // https://www.w3.org/TR/css-syntax-3/#maximum-allowed-code-point
    static constexpr std::uint32_t kMaximumAllowedCodePoint = 0x10FFFF;
    // One hex digit is consumed up front, so at most 5 more may follow.
    static constexpr int kMaxAdditionalHexDigits = 5;

    auto c = consume_next_input_character();
    if (!c) {
        emit(ParseError::EofInEscapeSequence);
        return util::unicode_to_utf8(kReplacementCharacter);
    }

    // Not a hex escape: the escaped character stands for itself.
    if (!util::is_hex_digit(*c)) {
        return std::string{*c};
    }

    std::string hex{*c};
    for (int i = 0; i < kMaxAdditionalHexDigits; ++i) {
        auto next_input = peek_input(0);
        if (!next_input || !util::is_hex_digit(*next_input)) {
            break;
        }

        hex += *next_input;
        std::ignore = consume_next_input_character();
    }

    // A single whitespace character directly after the hex digits terminates
    // the escape and is swallowed together with it.
    if (auto next_input = peek_input(0); next_input && util::is_whitespace(*next_input)) {
        std::ignore = consume_next_input_character();
    }

    // std::from_chars leaves its output untouched on failure, and the assert
    // below vanishes under NDEBUG. Starting from the replacement character
    // means an unexpected parse failure yields U+FFFD rather than a read of an
    // indeterminate value.
    std::uint32_t code_point = kReplacementCharacter;
    [[maybe_unused]] auto res = std::from_chars(hex.data(), hex.data() + hex.size(), code_point, 16);
    assert(res.ec == std::errc{} && res.ptr == hex.data() + hex.size());

    if (code_point == 0 || code_point > kMaximumAllowedCodePoint || util::is_unicode_surrogate(code_point)) {
        code_point = kReplacementCharacter;
    }

    return util::unicode_to_utf8(code_point);
}

} // namespace css2

css2/tokenizer.h added: 116, removed: 14, total 102

@@ -1,4 +1,4 @@

// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>

// SPDX-License-Identifier: BSD-2-Clause

@@ -33,6 +33,7 @@ enum class State {

// Recoverable problems the tokenizer reports via emit(ParseError) while it
// keeps producing tokens.
enum class ParseError {
    // NOTE(review): presumably the input ended inside a comment; the emitting
    // code is not visible here.
    EofInComment,
    // The input ended right after the backslash starting an escape sequence.
    // The escape is replaced by U+FFFD.
    EofInEscapeSequence,
    // NOTE(review): presumably the input ended inside a string; the emitting
    // code is not visible here.
    EofInString,
    // A '\n' was encountered while tokenizing a string; reported together with
    // a BadStringToken.
    NewlineInString,
};

@@ -66,6 +67,7 @@ private:

void reconsume_in(State);

std::pair<std::variant<int, double>, NumericType> consume_number(char first_byte);

// Consumes the remainder of an escape sequence from the input and returns the
// resulting code point as UTF-8. Invalid escapes (end of input, zero,
// surrogates, values above U+10FFFF) yield U+FFFD.
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
std::string consume_an_escaped_code_point();

};

} // namespace css2

css2/tokenizer_test.cpp added: 116, removed: 14, total 102

@@ -1,4 +1,4 @@

// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>

// SPDX-License-Identifier: BSD-2-Clause

@@ -10,6 +10,7 @@

#include "etest/cxx_compat.h"

#include "etest/etest.h"

#include <string>

#include <string_view>

#include <utility>

#include <vector>

@@ -19,9 +20,12 @@ using etest::expect_eq;

using etest::require;

using namespace css2;

using namespace std::literals;

namespace {

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, appended to expected token
// text in tests where the tokenizer substitutes an invalid escape.
constexpr char const *kReplacementCharacter = "\xef\xbf\xbd";

class TokenizerOutput {

public:

~TokenizerOutput() {

@@ -169,6 +173,11 @@ int main() {

expect_token(output, WhitespaceToken{});

});

etest::test("single quoted string with escaped code point", [] {

auto output = run_tokenizer("'foo\\40'");

expect_token(output, StringToken{"foo@"});

});

etest::test("ident token", [] {

auto output = run_tokenizer("foo");

@@ -199,6 +208,48 @@ int main() {

expect_token(output, IdentToken{"_foo-bar"});

});

etest::test("ident token with escaped code point", [] {

auto output = run_tokenizer("foo\\40");

expect_token(output, IdentToken{"foo@"});

});

etest::test("ident token with escaped code point, eof", [] {

auto output = run_tokenizer("foo\\");

expect_token(output, IdentToken{"foo"s + kReplacementCharacter});

expect_error(output, ParseError::EofInEscapeSequence);

});

etest::test("ident token with escaped code point, non-hex after", [] {

auto output = run_tokenizer("foo\\40Z");

expect_token(output, IdentToken{"foo@Z"});

});

etest::test("ident token with escaped code point, whitespace after", [] {

auto output = run_tokenizer("foo\\40 ");

expect_token(output, IdentToken{"foo@"});

});

etest::test("ident token with escaped code point, max characters in escape", [] {

auto output = run_tokenizer("foo\\10fffff");

// \u{10ffff} would've been nicer, but it's not yet supported by the compilers we support.

expect_token(output, IdentToken{"foo\U0010FFFFf"});

});

etest::test("ident token with escaped code point, outside the unicode range", [] {

auto output = run_tokenizer("foo\\110000");

expect_token(output, IdentToken{"foo"s + kReplacementCharacter});

});

etest::test("ident token with escaped code point, surrogate", [] {

auto output = run_tokenizer("foo\\d800");

expect_token(output, IdentToken{"foo"s + kReplacementCharacter});

});

etest::test("ident token with escaped code point, null", [] {

auto output = run_tokenizer("foo\\0");

expect_token(output, IdentToken{"foo"s + kReplacementCharacter});

});

etest::test("whitespace after ident", [] {

auto output = run_tokenizer("abc ");

@@ -212,6 +263,11 @@ int main() {

expect_token(output, AtKeywordToken{"foo"});

});

etest::test("at keyword token with escaped code point", [] {

auto output = run_tokenizer("@foo\\23");

expect_token(output, AtKeywordToken{"foo#"});

});

etest::test("at keyword token with digit", [] {

auto output = run_tokenizer("@b4z");