srctree

Robin Linden parent 6698b415 9a7cd004
css2: Support tokenizing escaped code points

inlinesplit
css2/BUILD added: 116, removed: 14, total 102
@@ -12,6 +12,7 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
"//util:string",
"//util:unicode",
],
)
 
 
css2/tokenizer.cpp added: 116, removed: 14, total 102
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>
//
// SPDX-License-Identifier: BSD-2-Clause
@@ -8,13 +8,15 @@
#include "css2/token.h"
 
#include "util/string.h"
#include "util/unicode.h"
 
#include <cassert>
#include <charconv>
#include <exception>
#include <cstdint>
#include <optional>
#include <string>
#include <system_error>
#include <tuple>
#include <utility>
#include <variant>
 
@@ -226,8 +228,8 @@ void Tokenizer::run() {
}
 
if (*c == '\\') {
// TODO(mkiael): Handle escaped code point
std::terminate();
temporary_buffer_ += consume_an_escaped_code_point();
continue;
}
 
emit(AtKeywordToken{temporary_buffer_});
@@ -248,8 +250,8 @@ void Tokenizer::run() {
}
 
if (*c == '\\') {
// TODO(mkiael): Handle escaped code point
std::terminate();
temporary_buffer_ += consume_an_escaped_code_point();
continue;
}
 
// TODO(mkiael): Handle url and function token
@@ -275,8 +277,8 @@ void Tokenizer::run() {
 
switch (*c) {
case '\\':
// TODO(mkiael): Handle escaped code point
std::terminate();
std::get<StringToken>(current_token_).data += consume_an_escaped_code_point();
continue;
case '\n':
emit(ParseError::NewlineInString);
emit(BadStringToken{});
@@ -389,4 +391,45 @@ std::pair<std::variant<int, double>, NumericType> Tokenizer::consume_number(char
return {result, type};
}
 
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
std::string Tokenizer::consume_an_escaped_code_point() {
static constexpr std::uint32_t kReplacementCharacter = 0xFFFD;
auto c = consume_next_input_character();
if (!c) {
emit(ParseError::EofInEscapeSequence);
return util::unicode_to_utf8(kReplacementCharacter);
}
 
if (util::is_hex_digit(*c)) {
std::string hex{*c};
for (int i = 0; i < 5; ++i) {
auto next_input = peek_input(0);
if (!next_input || !util::is_hex_digit(*next_input)) {
break;
}
 
hex += *next_input;
std::ignore = consume_next_input_character();
}
 
if (auto next_input = peek_input(0); next_input && util::is_whitespace(*next_input)) {
std::ignore = consume_next_input_character();
}
 
std::uint32_t code_point;
[[maybe_unused]] auto res = std::from_chars(hex.data(), hex.data() + hex.size(), code_point, 16);
assert(res.ec == std::errc{} && res.ptr == hex.data() + hex.size());
 
// https://www.w3.org/TR/css-syntax-3/#maximum-allowed-code-point
static constexpr std::uint32_t kMaximumAllowedCodePoint = 0x10FFFF;
if (code_point == 0 || code_point > kMaximumAllowedCodePoint || util::is_unicode_surrogate(code_point)) {
code_point = kReplacementCharacter;
}
 
return util::unicode_to_utf8(code_point);
}
 
return std::string{*c};
}
 
} // namespace css2
 
css2/tokenizer.h added: 116, removed: 14, total 102
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>
//
// SPDX-License-Identifier: BSD-2-Clause
@@ -33,6 +33,7 @@ enum class State {
 
// Parse errors the tokenizer can report through emit().
enum class ParseError {
EofInComment,
// The input ended where the character following an escape's '\\' was expected.
EofInEscapeSequence,
EofInString,
// A '\n' was found while tokenizing a string; emitted right before a BadStringToken.
NewlineInString,
};
@@ -66,6 +67,7 @@ private:
void reconsume_in(State);
 
std::pair<std::variant<int, double>, NumericType> consume_number(char first_byte);
std::string consume_an_escaped_code_point();
};
 
} // namespace css2
 
css2/tokenizer_test.cpp added: 116, removed: 14, total 102
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2021-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2021-2024 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2022 Mikael Larsson <c.mikael.larsson@gmail.com>
//
// SPDX-License-Identifier: BSD-2-Clause
@@ -10,6 +10,7 @@
#include "etest/cxx_compat.h"
#include "etest/etest.h"
 
#include <string>
#include <string_view>
#include <utility>
#include <vector>
@@ -19,9 +20,12 @@ using etest::expect_eq;
using etest::require;
 
using namespace css2;
using namespace std::literals;
 
namespace {
 
// U+FFFD (replacement character) encoded as UTF-8. This is what the tests
// below expect in place of escapes that are cut short by eof, are null, are
// surrogates, or are outside the Unicode range.
constexpr char const *kReplacementCharacter = "\xef\xbf\xbd";
 
class TokenizerOutput {
public:
~TokenizerOutput() {
@@ -169,6 +173,11 @@ int main() {
expect_token(output, WhitespaceToken{});
});
 
etest::test("single quoted string with escaped code point", [] {
auto output = run_tokenizer("'foo\\40'");
expect_token(output, StringToken{"foo@"});
});
 
etest::test("ident token", [] {
auto output = run_tokenizer("foo");
 
@@ -199,6 +208,48 @@ int main() {
expect_token(output, IdentToken{"_foo-bar"});
});
 
etest::test("ident token with escaped code point", [] {
auto output = run_tokenizer("foo\\40");
expect_token(output, IdentToken{"foo@"});
});
 
etest::test("ident token with escaped code point, eof", [] {
auto output = run_tokenizer("foo\\");
expect_token(output, IdentToken{"foo"s + kReplacementCharacter});
expect_error(output, ParseError::EofInEscapeSequence);
});
 
etest::test("ident token with escaped code point, non-hex after", [] {
auto output = run_tokenizer("foo\\40Z");
expect_token(output, IdentToken{"foo@Z"});
});
 
etest::test("ident token with escaped code point, whitespace after", [] {
auto output = run_tokenizer("foo\\40 ");
expect_token(output, IdentToken{"foo@"});
});
 
etest::test("ident token with escaped code point, max characters in escape", [] {
auto output = run_tokenizer("foo\\10fffff");
// \u{10ffff} would've been nicer, but it's not yet supported by the compilers we support.
expect_token(output, IdentToken{"foo\U0010FFFFf"});
});
 
etest::test("ident token with escaped code point, outside the unicode range", [] {
auto output = run_tokenizer("foo\\110000");
expect_token(output, IdentToken{"foo"s + kReplacementCharacter});
});
 
etest::test("ident token with escaped code point, surrogate", [] {
auto output = run_tokenizer("foo\\d800");
expect_token(output, IdentToken{"foo"s + kReplacementCharacter});
});
 
etest::test("ident token with escaped code point, null", [] {
auto output = run_tokenizer("foo\\0");
expect_token(output, IdentToken{"foo"s + kReplacementCharacter});
});
 
etest::test("whitespace after ident", [] {
auto output = run_tokenizer("abc ");
 
@@ -212,6 +263,11 @@ int main() {
expect_token(output, AtKeywordToken{"foo"});
});
 
etest::test("at keyword token with escaped code point", [] {
auto output = run_tokenizer("@foo\\23");
expect_token(output, AtKeywordToken{"foo#"});
});
 
etest::test("at keyword token with digit", [] {
auto output = run_tokenizer("@b4z");