srctree

Robin Linden parent 37913518 b3888c48
unicode: Adopt util/unicode.h

It fits better here now, and gets its own target as it has no dependencies.
css2/BUILD added: 116, removed: 111, total 5
@@ -11,8 +11,8 @@ cc_library(
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = [
"//unicode:util",
"//util:string",
"//util:unicode",
],
)
 
 
css2/tokenizer.cpp added: 116, removed: 111, total 5
@@ -7,8 +7,8 @@
 
#include "css2/token.h"
 
#include "unicode/util.h"
#include "util/string.h"
#include "util/unicode.h"
 
#include <cassert>
#include <charconv>
@@ -397,7 +397,7 @@ std::string Tokenizer::consume_an_escaped_code_point() {
auto c = consume_next_input_character();
if (!c) {
emit(ParseError::EofInEscapeSequence);
return util::unicode_to_utf8(kReplacementCharacter);
return unicode::to_utf8(kReplacementCharacter);
}
 
if (util::is_hex_digit(*c)) {
@@ -422,11 +422,11 @@ std::string Tokenizer::consume_an_escaped_code_point() {
 
// https://www.w3.org/TR/css-syntax-3/#maximum-allowed-code-point
static constexpr std::uint32_t kMaximumAllowedCodePoint = 0x10FFFF;
if (code_point == 0 || code_point > kMaximumAllowedCodePoint || util::is_unicode_surrogate(code_point)) {
if (code_point == 0 || code_point > kMaximumAllowedCodePoint || unicode::is_surrogate(code_point)) {
code_point = kReplacementCharacter;
}
 
return util::unicode_to_utf8(code_point);
return unicode::to_utf8(code_point);
}
 
return std::string{*c};
 
html2/BUILD added: 116, removed: 111, total 5
@@ -12,8 +12,8 @@ cc_library(
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = [
"//unicode:util",
"//util:string",
"//util:unicode",
"@fmt",
],
)
 
html2/tokenizer.cpp added: 116, removed: 111, total 5
@@ -7,8 +7,8 @@
#include "html2/character_reference.h"
#include "html2/token.h"
 
#include "unicode/util.h"
#include "util/string.h"
#include "util/unicode.h"
 
#include <algorithm>
#include <cassert>
@@ -2244,9 +2244,9 @@ void Tokenizer::run() {
}
 
temporary_buffer_.clear();
temporary_buffer_.append(util::unicode_to_utf8(maybe_reference->first_codepoint));
temporary_buffer_.append(unicode::to_utf8(maybe_reference->first_codepoint));
if (maybe_reference->second_codepoint) {
temporary_buffer_.append(util::unicode_to_utf8(*maybe_reference->second_codepoint));
temporary_buffer_.append(unicode::to_utf8(*maybe_reference->second_codepoint));
}
 
flush_code_points_consumed_as_a_character_reference();
@@ -2406,12 +2406,12 @@ void Tokenizer::run() {
character_reference_code_ = 0xFFFD;
}
 
if (util::is_unicode_surrogate(character_reference_code_)) {
if (unicode::is_surrogate(character_reference_code_)) {
emit(ParseError::SurrogateCharacterReference);
character_reference_code_ = 0xFFFD;
}
 
if (util::is_unicode_noncharacter(character_reference_code_)) {
if (unicode::is_noncharacter(character_reference_code_)) {
emit(ParseError::NoncharacterCharacterReference);
}
 
@@ -2452,7 +2452,7 @@ void Tokenizer::run() {
character_reference_code_ = it->second;
}
 
temporary_buffer_ = util::unicode_to_utf8(character_reference_code_);
temporary_buffer_ = unicode::to_utf8(character_reference_code_);
flush_code_points_consumed_as_a_character_reference();
state_ = return_state_;
continue;
 
idna/BUILD added: 116, removed: 111, total 5
@@ -26,7 +26,7 @@ cc_library(
],
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
deps = ["//unicode:util"],
)
 
[cc_test(
@@ -37,6 +37,6 @@ cc_library(
deps = [
":idna",
"//etest",
"//util:unicode",
"//unicode:util",
],
) for src in glob(["*_test.cpp"])]
 
idna/punycode.h added: 116, removed: 111, total 5
@@ -1,11 +1,11 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2023-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef IDNA_PUNYCODE_H_
#define IDNA_PUNYCODE_H_
 
#include "util/unicode.h"
#include "unicode/util.h"
 
#include <algorithm>
#include <functional>
@@ -134,7 +134,7 @@ private:
static constexpr std::string unicode_to_utf8(std::u32string const &code_points) {
std::string result{};
for (auto const code_point : code_points) {
result += util::unicode_to_utf8(code_point);
result += unicode::to_utf8(code_point);
}
 
return result;
 
idna/punycode_test.cpp added: 116, removed: 111, total 5
@@ -1,11 +1,11 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2023-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/punycode.h"
 
#include "etest/etest2.h"
#include "util/unicode.h"
#include "unicode/util.h"
 
#include <string>
#include <vector>
@@ -14,7 +14,7 @@ namespace {
std::string unicode_as_utf8_string(std::vector<int> const &code_points) {
std::string result{};
for (auto const code_point : code_points) {
result += util::unicode_to_utf8(code_point);
result += unicode::to_utf8(code_point);
}
 
return result;
 
idna/uts46.cpp added: 116, removed: 111, total 5
@@ -6,7 +6,7 @@
 
#include "idna/idna_data.h"
 
#include "util/unicode.h"
#include "unicode/util.h"
 
// NOLINTNEXTLINE(misc-include-cleaner): This is used for std::ranges::lower_bound.
#include <algorithm>
@@ -23,7 +23,7 @@ std::optional<std::string> Uts46::map(std::string_view input) {
// input.size is just an estimate, but probably good enough for now.
result.reserve(input.size());
 
for (auto const code_point : util::CodePointView{input}) {
for (auto const code_point : unicode::CodePointView{input}) {
// * clang-tidy thinks std::ranges::lower_bound is provided by
// <bits/ranges_algo.h> when it's actually provided by <algorithm>.
// * clang-tidy says this is pointer-ish, but msvc disagrees.
@@ -57,14 +57,14 @@ std::optional<std::string> Uts46::map(std::string_view input) {
 
// These would be mapped in transitional processing, but we don't support that.
if (std::holds_alternative<uts46::Deviation>(entry)) {
result += util::unicode_to_utf8(code_point);
result += unicode::to_utf8(code_point);
continue;
}
 
if (std::holds_alternative<uts46::Valid>(entry) //
|| std::holds_alternative<uts46::ValidNv8>(entry) //
|| std::holds_alternative<uts46::ValidXv8>(entry)) {
result += util::unicode_to_utf8(code_point);
result += unicode::to_utf8(code_point);
continue;
}
}
 
unicode/BUILD added: 116, removed: 111, total 5
@@ -16,15 +16,22 @@ genrule(
)
 
cc_library(
name = "unicode",
srcs = [":generate_unicode_data"] + glob(
include = ["*.cpp"],
exclude = ["*_test.cpp"],
),
hdrs = glob(["*.h"]),
name = "normalization",
srcs = [
"normalization.cpp",
":generate_unicode_data",
],
hdrs = ["normalization.h"],
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = [":util"],
)
 
cc_library(
name = "util",
hdrs = ["util.h"],
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
)
 
[cc_test(
@@ -33,8 +40,8 @@ cc_library(
srcs = [src],
copts = HASTUR_COPTS,
deps = [
":unicode",
":normalization",
":util",
"//etest",
"//util:unicode",
],
) for src in glob(["*_test.cpp"])]
 
unicode/normalization.cpp added: 116, removed: 111, total 5
@@ -5,8 +5,7 @@
#include "unicode/normalization.h"
 
#include "unicode/unicode_data.h"
 
#include "util/unicode.h"
#include "unicode/util.h"
 
// NOLINTNEXTLINE(misc-include-cleaner): This is used for std::ranges::lower_bound.
#include <algorithm>
@@ -29,13 +28,13 @@ void decompose_to(std::ostream &os, char32_t code_point) {
 
// This code point does not decompose.
if (maybe_decomposition->code_point != code_point) {
os << util::unicode_to_utf8(code_point);
os << to_utf8(code_point);
return;
}
 
// Recursively decompose the decomposition. This is needed as some code
// points decompose into code points that also decompose.
for (auto const decomposed : util::CodePointView{maybe_decomposition->decomposes_to}) {
for (auto const decomposed : CodePointView{maybe_decomposition->decomposes_to}) {
decompose_to(os, decomposed);
}
}
@@ -45,7 +44,7 @@ void decompose_to(std::ostream &os, char32_t code_point) {
std::string Normalization::decompose(std::string_view input) {
std::stringstream ss{};
 
for (auto const code_point : util::CodePointView{input}) {
for (auto const code_point : CodePointView{input}) {
decompose_to(ss, code_point);
}
 
 
util/unicode.h added: 116, removed: 111, total 5
@@ -2,8 +2,8 @@
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef UTIL_UNICODE_H_
#define UTIL_UNICODE_H_
#ifndef UNICODE_UTIL_H_
#define UNICODE_UTIL_H_
 
#include <cstddef>
#include <cstdint>
@@ -12,14 +12,14 @@
#include <string_view>
#include <utility>
 
namespace util {
namespace unicode {
 
constexpr bool unicode_is_ascii(std::uint32_t code_point) {
constexpr bool is_ascii(std::uint32_t code_point) {
return code_point <= 0x7f;
}
 
constexpr std::optional<std::uint8_t> unicode_utf8_byte_count(std::uint32_t code_point) {
if (unicode_is_ascii(code_point)) {
constexpr std::optional<std::uint8_t> utf8_byte_count(std::uint32_t code_point) {
if (is_ascii(code_point)) {
return std::uint8_t{1};
}
 
@@ -38,8 +38,8 @@ constexpr std::optional<std::uint8_t> unicode_utf8_byte_count(std::uint32_t code
return std::nullopt;
}
 
constexpr std::string unicode_to_utf8(std::uint32_t code_point) {
switch (unicode_utf8_byte_count(code_point).value_or(0)) {
constexpr std::string to_utf8(std::uint32_t code_point) {
switch (utf8_byte_count(code_point).value_or(0)) {
case 1:
return {static_cast<char>(code_point & 0x7F)};
case 2:
@@ -66,12 +66,12 @@ constexpr std::string unicode_to_utf8(std::uint32_t code_point) {
}
 
// https://infra.spec.whatwg.org/#surrogate
constexpr bool is_unicode_surrogate(std::uint32_t code_point) {
constexpr bool is_surrogate(std::uint32_t code_point) {
return code_point >= 0xD800 && code_point <= 0xDFFF;
}
 
// https://infra.spec.whatwg.org/#noncharacter
constexpr bool is_unicode_noncharacter(std::uint32_t code_point) {
constexpr bool is_noncharacter(std::uint32_t code_point) {
if (code_point >= 0xFDD0 && code_point <= 0xFDEF) {
return true;
}
@@ -242,6 +242,6 @@ private:
};
};
 
} // namespace util
} // namespace unicode
 
#endif
 
util/unicode_test.cpp added: 116, removed: 111, total 5
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "util/unicode.h"
#include "unicode/util.h"
 
#include "etest/etest.h"
 
@@ -13,70 +13,70 @@
#include <vector>
 
using namespace std::literals;
using namespace util;
using namespace unicode;
 
using etest::expect;
using etest::expect_eq;
 
int main() {
etest::test("unicode_utf8_byte_count", [] {
expect_eq(unicode_utf8_byte_count(0), 1);
expect_eq(unicode_utf8_byte_count(0x7f), 1);
etest::test("utf8_byte_count", [] {
expect_eq(utf8_byte_count(0), 1);
expect_eq(utf8_byte_count(0x7f), 1);
 
expect_eq(unicode_utf8_byte_count(0x80), 2);
expect_eq(unicode_utf8_byte_count(0x7ff), 2);
expect_eq(utf8_byte_count(0x80), 2);
expect_eq(utf8_byte_count(0x7ff), 2);
 
expect_eq(unicode_utf8_byte_count(0x800), 3);
expect_eq(unicode_utf8_byte_count(0xffff), 3);
expect_eq(utf8_byte_count(0x800), 3);
expect_eq(utf8_byte_count(0xffff), 3);
 
expect_eq(unicode_utf8_byte_count(0x100000), 4);
expect_eq(unicode_utf8_byte_count(0x10ffff), 4);
expect_eq(utf8_byte_count(0x100000), 4);
expect_eq(utf8_byte_count(0x10ffff), 4);
 
expect_eq(unicode_utf8_byte_count(0x110000), std::nullopt);
expect_eq(utf8_byte_count(0x110000), std::nullopt);
});
 
etest::test("unicode_to_utf8", [] {
expect_eq(unicode_to_utf8(0x002f), "/"sv);
etest::test("to_utf8", [] {
expect_eq(to_utf8(0x002f), "/"sv);
 
expect_eq(unicode_to_utf8(0x00a3), "£"sv);
expect_eq(unicode_to_utf8(0x07f9), "߹"sv);
expect_eq(to_utf8(0x00a3), "£"sv);
expect_eq(to_utf8(0x07f9), "߹"sv);
 
expect_eq(unicode_to_utf8(0x0939), "ह"sv);
expect_eq(unicode_to_utf8(0x20ac), "€"sv);
expect_eq(unicode_to_utf8(0xd55c), "한"sv);
expect_eq(unicode_to_utf8(0xfffd), "�"sv);
expect_eq(to_utf8(0x0939), "ह"sv);
expect_eq(to_utf8(0x20ac), "€"sv);
expect_eq(to_utf8(0xd55c), "한"sv);
expect_eq(to_utf8(0xfffd), "�"sv);
 
expect_eq(unicode_to_utf8(0x10348), "𐍈"sv);
expect_eq(to_utf8(0x10348), "𐍈"sv);
 
// Invalid code points return "".
expect_eq(unicode_to_utf8(0x110000), ""sv);
expect_eq(to_utf8(0x110000), ""sv);
});
 
etest::test("is_unicode_surrogate", [] {
expect(!is_unicode_surrogate(0xD799));
expect(is_unicode_surrogate(0xD800)); // First leading surrogate.
expect(is_unicode_surrogate(0xDBFF)); // Last leading surrogate.
expect(is_unicode_surrogate(0xDC00)); // First trailing surrogate.
expect(is_unicode_surrogate(0xDFFF)); // Last trailing surrogate.
expect(!is_unicode_surrogate(0xE000));
etest::test("is_surrogate", [] {
expect(!is_surrogate(0xD799));
expect(is_surrogate(0xD800)); // First leading surrogate.
expect(is_surrogate(0xDBFF)); // Last leading surrogate.
expect(is_surrogate(0xDC00)); // First trailing surrogate.
expect(is_surrogate(0xDFFF)); // Last trailing surrogate.
expect(!is_surrogate(0xE000));
});
 
etest::test("is_unicode_noncharacter", [] {
expect(!is_unicode_noncharacter(0xFDD0 - 1));
etest::test("is_noncharacter", [] {
expect(!is_noncharacter(0xFDD0 - 1));
 
for (std::uint32_t i = 0xFDD0; i <= 0xFDEF; ++i) {
expect(is_unicode_noncharacter(i));
expect(is_noncharacter(i));
}
 
expect(!is_unicode_noncharacter(0xFDEF + 1));
expect(!is_unicode_noncharacter(0xFFFE - 1));
expect(!is_noncharacter(0xFDEF + 1));
expect(!is_noncharacter(0xFFFE - 1));
 
// Every 0x10000 pair of values ending in FFFE and FFFF are noncharacters.
for (std::uint32_t i = 0xFFFE; i <= 0x10FFFE; i += 0x10000) {
expect(!is_unicode_noncharacter(i - 1));
expect(is_unicode_noncharacter(i));
expect(is_unicode_noncharacter(i + 1));
expect(!is_unicode_noncharacter(i + 2));
expect(!is_noncharacter(i - 1));
expect(is_noncharacter(i));
expect(is_noncharacter(i + 1));
expect(!is_noncharacter(i + 2));
}
});
 
 
url/BUILD added: 116, removed: 111, total 5
@@ -24,8 +24,8 @@ cc_library(
visibility = ["//visibility:public"],
deps = [
":rtti_hack",
"//unicode:util",
"//util:string",
"//util:unicode",
"//util:uuid",
"@icu//:common",
"@icu//:icudata",
 
url/url.cpp added: 116, removed: 111, total 5
@@ -7,8 +7,8 @@
 
#include "url/rtti_hack.h" // IWYU pragma: keep
 
#include "unicode/util.h"
#include "util/string.h"
#include "util/unicode.h"
#include "util/uuid.h"
 
// unicode/uclean is for u_cleanup, but icu does a lot of macro magic renaming
@@ -1085,7 +1085,7 @@ void UrlParser::state_path() {
state_ = ParserState::Fragment;
}
} else {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
if (!is_url_codepoint(unicode::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
@@ -1110,7 +1110,7 @@ void UrlParser::state_opaque_path() {
 
state_ = ParserState::Fragment;
} else {
if (!is_eof() && !is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
if (!is_eof() && !is_url_codepoint(unicode::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
@@ -1143,7 +1143,7 @@ void UrlParser::state_query() {
state_ = ParserState::Fragment;
}
} else if (!is_eof()) {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
if (!is_url_codepoint(unicode::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
@@ -1160,7 +1160,7 @@ void UrlParser::state_query() {
// https://url.spec.whatwg.org/#fragment-state
void UrlParser::state_fragment() {
if (auto c = peek(); !is_eof()) {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
if (!is_url_codepoint(unicode::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
@@ -1660,7 +1660,7 @@ std::optional<std::string> UrlParser::parse_opaque_host(std::string_view input)
std::string_view tmp = input;
 
while (!tmp.empty()) {
std::uint32_t cp = util::utf8_to_utf32(tmp);
std::uint32_t cp = unicode::utf8_to_utf32(tmp);
 
if (!is_url_codepoint(cp)) {
validation_error(ValidationError::InvalidUrlUnit);
@@ -1675,7 +1675,7 @@ std::optional<std::string> UrlParser::parse_opaque_host(std::string_view input)
// unicode::utf8_byte_count fails if the codepoint is larger than the
// maximum valid code point, 0x10ffff, meaning it'll have to take up at
// least 4 bytes.
int len = util::unicode_utf8_byte_count(cp).value_or(4);
int len = unicode::utf8_byte_count(cp).value_or(4);
tmp.remove_prefix(len);
}
 
@@ -1686,8 +1686,7 @@ bool UrlParser::is_url_codepoint(std::uint32_t cp) const {
return cp == '!' || cp == '$' || cp == '&' || cp == '\'' || cp == '(' || cp == ')' || cp == '*' || cp == '+'
|| cp == ',' || cp == '-' || cp == '.' || cp == '/' || cp == ':' || cp == ';' || cp == '=' || cp == '?'
|| cp == '@' || cp == '_' || cp == '~'
|| (cp >= 0x00a0 && cp <= 0x10fffd && !util::is_unicode_noncharacter(cp)
&& !util::is_unicode_surrogate(cp));
|| (cp >= 0x00a0 && cp <= 0x10fffd && !unicode::is_noncharacter(cp) && !unicode::is_surrogate(cp));
}
 
// NOLINTEND(bugprone-unchecked-optional-access)