srctree

David Zero parent 24b75eee 79a89eee
url: Perform RFC3986 normalization on serialization

inlinesplit
url/url.cpp added: 165, removed: 7, total 158
@@ -221,7 +221,7 @@ std::string Url::serialize_path() const {
}
 
// https://url.spec.whatwg.org/#concept-url-serializer
std::string Url::serialize(bool exclude_fragment) const {
std::string Url::serialize(bool exclude_fragment, bool rfc3986_norm) const {
std::string output = scheme + ":";
 
if (host.has_value()) {
@@ -259,6 +259,14 @@ std::string Url::serialize(bool exclude_fragment) const {
output += "#" + *fragment;
}
 
// Slight deviation from the spec; perform some optional normalization to
// help with things like caching, remembering visited links, etc
// https://en.wikipedia.org/wiki/URI_normalization#Normalizations_that_preserve_semantics
if (rfc3986_norm) {
output = util::percent_encoded_triplets_to_upper(output);
output = util::percent_decode_unreserved(output);
}
 
return output;
}
 
 
url/url.h added: 165, removed: 7, total 158
@@ -97,7 +97,7 @@ struct Url {
std::optional<std::string> query;
std::optional<std::string> fragment;
 
std::string serialize(bool exclude_fragment = false) const;
std::string serialize(bool exclude_fragment = false, bool rfc3986_norm = false) const;
std::string serialize_path() const;
 
Origin origin() const;
 
url/url_test.cpp added: 165, removed: 7, total 158
@@ -700,6 +700,66 @@ int main() {
etest::expect_eq(errors, std::vector{url::ValidationError::MissingSchemeNonRelativeUrl});
});
 
etest::test("URL normalization: uppercasing percent-encoded triplets", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://example.com/foo%2a");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(false, true), "http://example.com/foo%2A");
});
 
etest::test("URL normalization: lowercasing scheme and host", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("HTTP://User@Example.COM/Foo");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(), "http://User@example.com/Foo");
});
 
etest::test("URL normalization: decoding percent-encoded triplets of unreserved characters", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://example.com/%7Efoo");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(false, true), "http://example.com/~foo");
});
 
etest::test("URL normalization: removing dot-segments", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://example.com/foo/./bar/baz/../qux");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(), "http://example.com/foo/bar/qux");
});
 
etest::test("URL normalization: converting empty path to '/'", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://example.com");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(), "http://example.com/");
});
 
etest::test("URL normalization: removing default port", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://example.com:80/");
 
etest::require(url.has_value());
 
etest::expect_eq(url->serialize(), "http://example.com/");
});
 
etest::test("Web Platform Tests", [] {
url::UrlParser p;
 
 
util/string.h added: 165, removed: 7, total 158
@@ -82,6 +82,14 @@ constexpr char lowercased(char c) {
return c + ('a' - 'A');
}
 
// Returns `c` shifted down by the 'a'-'A' offset when is_lower_alpha(c) holds;
// any other character is handed back unchanged.
constexpr char uppercased(char c) {
    return is_lower_alpha(c) ? static_cast<char>(c - ('a' - 'A')) : c;
}
 
[[nodiscard]] constexpr std::string lowercased(std::string s) {
std::ranges::for_each(s, [](char &c) { c = lowercased(c); });
return s;
@@ -260,6 +268,56 @@ constexpr std::string percent_decode(std::string_view input) {
return output;
}
 
// RFC3986 normalization; uppercase all percent-encoded triplets.
constexpr std::string percent_encoded_triplets_to_upper(std::string_view input) {
std::string output;
 
for (std::size_t i = 0; i < input.size(); i++) {
if (input[i] == '%' && (input.size() > i + 2 && is_hex_digit(input[i + 1]) && is_hex_digit(input[i + 2]))) {
output += input[i];
output += uppercased(input[i + 1]);
output += uppercased(input[i + 2]);
 
i += 2;
} else {
output += input[i];
}
}
 
return output;
}
 
// RFC3986 normalization; decode percent-encoded triplets that encode unreserved characters
constexpr std::string percent_decode_unreserved(std::string_view input) {
std::string output;
 
for (std::size_t i = 0; i < input.size(); i++) {
if (input[i] != '%' || (input.size() <= i + 2 || !is_hex_digit(input[i + 1]) || !is_hex_digit(input[i + 2]))) {
output += input[i];
} else {
std::string_view digits = input.substr(i + 1, 2);
std::uint8_t num;
 
[[maybe_unused]] auto res = std::from_chars(digits.data(), digits.data() + digits.size(), num, 16);
 
assert(res.ec != std::errc::invalid_argument && res.ec != std::errc::result_out_of_range);
 
if (num > 127
|| (!is_alpha(num) && !is_digit(num) && num != '-' && num != '.' && num != '_' && num != '~')) {
output += input[i];
 
continue;
}
 
output += static_cast<char>(num);
 
i += 2;
}
}
 
return output;
}
 
} // namespace util
 
#endif
 
util/string_test.cpp added: 165, removed: 7, total 158
@@ -303,5 +303,37 @@ int main() {
expect(util::ipv6_serialize(global) == "2001:db8:85a3::8a2e:370:7334");
});
 
etest::test("uppercase percent-encoded triplets", [] {
std::string foo{"https://example.com/%ff"};
std::string foo2{"%be%ee%ee%ff"};
std::string foo3;
std::string foo4{"%"};
std::string foo5{"%77"};
std::string foo6{"%EE"};
 
expect_eq(percent_encoded_triplets_to_upper(foo), "https://example.com/%FF");
expect_eq(percent_encoded_triplets_to_upper(foo2), "%BE%EE%EE%FF");
expect_eq(percent_encoded_triplets_to_upper(foo3), "");
expect_eq(percent_encoded_triplets_to_upper(foo4), "%");
expect_eq(percent_encoded_triplets_to_upper(foo5), "%77");
expect_eq(percent_encoded_triplets_to_upper(foo6), "%EE");
});
 
etest::test("percent-decode URL unreserved", [] {
std::string foo{"https://example.com/%7e"};
std::string foo2{"%7e%30%61%2D%2e%5F"};
std::string foo3;
std::string foo4{"%"};
std::string foo5{"%77"};
std::string foo6{"%7F"};
 
expect_eq(percent_decode_unreserved(foo), "https://example.com/~");
expect_eq(percent_decode_unreserved(foo2), "~0a-._");
expect_eq(percent_decode_unreserved(foo3), "");
expect_eq(percent_decode_unreserved(foo4), "%");
expect_eq(percent_decode_unreserved(foo5), "w");
expect_eq(percent_decode_unreserved(foo6), "%7F");
});
 
return etest::run_all_tests();
}