srctree

Robin Linden parent 3ab18413 92762f85
util/unicode: Make invalid input to unicode_utf8_byte_count more obvious

inlinesplit
url/url.cpp added: 18, removed: 18, total 0
@@ -1673,10 +1673,11 @@ std::optional<std::string> UrlParser::parse_opaque_host(std::string_view input)
 
// I don't *think* this can remove > size(), but maybe I should clamp it anyway.
 
// len is 0 if the codepoint is larger than the maximum valid code
// point, 0x10ffff, meaning it'll have to take up at least 4 bytes.
int len = util::unicode_utf8_byte_count(cp);
tmp.remove_prefix(len == 0 ? 4 : len);
// unicode_utf8_byte_count fails if the code point is larger than the
// maximum valid code point, 0x10ffff, meaning it'll have to take up at
// least 4 bytes.
int len = util::unicode_utf8_byte_count(cp).value_or(4);
tmp.remove_prefix(len);
}
 
return util::percent_encode(input, PercentEncodeSet::c0_control);
 
util/unicode.h added: 18, removed: 18, total 0
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2022-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2022-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
@@ -18,28 +18,28 @@ constexpr bool unicode_is_ascii(std::uint32_t code_point) {
return code_point <= 0x7f;
}
 
constexpr int unicode_utf8_byte_count(std::uint32_t code_point) {
constexpr std::optional<std::uint8_t> unicode_utf8_byte_count(std::uint32_t code_point) {
if (unicode_is_ascii(code_point)) {
return 1;
return std::uint8_t{1};
}
 
if (code_point <= 0x7ff) {
return 2;
return std::uint8_t{2};
}
 
if (code_point <= 0xffff) {
return 3;
return std::uint8_t{3};
}
 
if (code_point <= 0x10ffff) {
return 4;
return std::uint8_t{4};
}
 
return 0;
return std::nullopt;
}
 
constexpr std::string unicode_to_utf8(std::uint32_t code_point) {
switch (unicode_utf8_byte_count(code_point)) {
switch (unicode_utf8_byte_count(code_point).value_or(0)) {
case 1:
return {static_cast<char>(code_point & 0x7F)};
case 2:
 
util/unicode_test.cpp added: 18, removed: 18, total 0
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: 2022-2023 Robin Lindén <dev@robinlinden.eu>
// SPDX-FileCopyrightText: 2022-2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
@@ -32,8 +32,7 @@ int main() {
expect_eq(unicode_utf8_byte_count(0x100000), 4);
expect_eq(unicode_utf8_byte_count(0x10ffff), 4);
 
// Invalid code points return 0.
expect_eq(unicode_utf8_byte_count(0x110000), 0);
expect_eq(unicode_utf8_byte_count(0x110000), std::nullopt);
});
 
etest::test("unicode_to_utf8", [] {