srctree

David Zero parent 243b7a8b ded99b12
url/url: Add URL parser

inlinesplit
.bazelrc added: 2390, removed: 46, total 2344
@@ -6,6 +6,8 @@ coverage --combined_report=lcov
test --test_output=errors
test --test_summary=terse
test --test_verbose_timeout_warnings
# Set ICU data directory for tests
test --test_env=HASTUR_ICU_DATA=external/icu-data/
 
# Bazel deprecations
# =========================================================
 
.clang-tidy added: 2390, removed: 46, total 2344
@@ -4,6 +4,8 @@
#
# -bugprone-narrowing-conversions: Very noisy for not much gain.
#
# -bugprone-unchecked-optional-access: Makes clang-tidy hang during CI.
#
# -clang-analyzer-cplusplus.NewDeleteLeaks: Lots of false positives w/
# -std=c++2b when calling std::make_shared in the JS AST.
# js/ast_executor_test.cpp:176:5: error: Potential leak of memory pointed to by
@@ -54,6 +56,7 @@ Checks: >
readability-qualified-auto,
-bugprone-exception-escape,
-bugprone-narrowing-conversions,
-bugprone-unchecked-optional-access,
-clang-analyzer-cplusplus.NewDeleteLeaks,
-clang-analyzer-optin.cplusplus.UninitializedObject,
-clang-diagnostic-builtin-macro-redefined,
 
.github/workflows/ci.yaml added: 2390, removed: 46, total 2344
@@ -266,7 +266,7 @@ jobs:
timeout-minutes: 30
steps:
- uses: actions/checkout@v3
- run: grep --recursive --no-filename --only-matching --exclude=WORKSPACE --exclude=*test.cpp --exclude=ci.yaml 'https://[^)(}{",# ]*' | grep -v '^https://$' | sort | uniq | xargs wget --spider
- run: grep --recursive --no-filename --only-matching --exclude-dir="*corpus" --exclude=WORKSPACE --exclude=*test.cpp --exclude=ci.yaml 'https://[^)(}{",# ]*' | grep -v '^https://$' | sort | uniq | xargs wget --spider
 
gitlint:
runs-on: ubuntu-22.04
 
WORKSPACE added: 2390, removed: 46, total 2344
@@ -119,6 +119,13 @@ http_archive(
)
 
# https://github.com/ocornut/imgui
http_archive(
name = "icu-data", # Unicode-DFS-2016
build_file_content = "exports_files([\"icudt72l.dat\"])",
sha256 = "1bc02487cbeaec3fc2d0dc941e8b243e7d35cd79899a201df88dc9ec9667a162",
url = "https://github.com/unicode-org/icu/releases/download/release-72-1/icu4c-72_1-data-bin-l.zip",
)
 
http_archive(
name = "imgui", # MIT
build_file = "//third_party:imgui.BUILD",
 
third_party/icu.BUILD added: 2390, removed: 46, total 2344
@@ -31,13 +31,20 @@ cc_library(
],
"//conditions:default": [],
}),
defines = [
"U_STATIC_IMPLEMENTATION",
"U_COMMON_IMPLEMENTATION",
"U_CHARSET_IS_UTF8=1",
"U_HIDE_OBSOLETE_UTF_OLD_H=1",
"UCONFIG_NO_CONVERSION=1",
],
linkopts = select({
"@platforms//os:windows": [],
"@platforms//os:windows": [
"-DEFAULTLIB:advapi32",
],
"//conditions:default": ["-ldl"],
}),
local_defines = [
"U_COMMON_IMPLEMENTATION",
],
linkstatic = True,
strip_include_prefix = "source/common/",
visibility = ["//visibility:public"],
)
 
url/BUILD added: 2390, removed: 46, total 2344
@@ -1,16 +1,33 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("//bzl:copts.bzl", "HASTUR_COPTS")
load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test")
load("//bzl:copts.bzl", "HASTUR_COPTS", "HASTUR_FUZZ_PLATFORMS")
 
cc_library(
name = "rtti_hack",
srcs = ["rtti_hack.cpp"],
hdrs = ["rtti_hack.h"],
copts = HASTUR_COPTS + select({
"@platforms//os:windows": ["/GR"],
"//conditions:default": ["-frtti"],
}),
deps = ["@icu//:common"],
)
 
cc_library(
name = "url",
srcs = ["url.cpp"],
hdrs = ["url.h"],
copts = HASTUR_COPTS,
data = ["@icu-data//:icudt72l.dat"],
visibility = ["//visibility:public"],
deps = [
":rtti_hack",
"//util:base_parser",
"//util:string",
"//util:unicode",
"//util:uuid",
"@icu//:common",
"@spdlog",
],
)
 
@@ -22,5 +39,16 @@ cc_test(
deps = [
":url",
"//etest",
"@icu//:common",
],
)
 
cc_fuzz_test(
name = "url_fuzz_test",
size = "small",
srcs = ["url_fuzz_test.cpp"],
copts = HASTUR_COPTS,
corpus = glob(["url_fuzz_test_corpus/**"]),
target_compatible_with = HASTUR_FUZZ_PLATFORMS,
deps = [":url"],
)
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1,7 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "url/rtti_hack.h"
 
template class icu::StringByteSink<std::string>;
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1,17 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef URL_RTTI_HACK_H_
#define URL_RTTI_HACK_H_
 
#include <unicode/bytestream.h>
 
#include <string>
 
// icu needs to be compiled w/ rtti, and that means that any templates of theirs
// that we instantiate also require rtti, so we instantiate them here to try to
// shield the rest of the codebase from that.
extern template class icu::StringByteSink<std::string>;
 
#endif
 
url/url.cpp added: 2390, removed: 46, total 2344
@@ -4,15 +4,155 @@
 
#include "url/url.h"

#include "url/rtti_hack.h"

#include "util/string.h"
#include "util/unicode.h"
#include "util/uuid.h"

#include <spdlog/spdlog.h>
#include <unicode/bytestream.h>
#include <unicode/idna.h>
#include <unicode/putil.h>
#include <unicode/uclean.h>

#include <array>
#include <atomic>
#include <cassert>
#include <charconv>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <filesystem>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>
 
namespace url {
// NOLINTBEGIN(misc-redundant-expression)
// NOLINTBEGIN(bugprone-unchecked-optional-access)
 
namespace {
// Special schemes and the default port stored for each one. A parsed port that
// equals the stored value is dropped from the URL; "file" is stored as 0.
const std::map<std::string, std::uint16_t> special_schemes = {
        {"file", 0},
        {"ftp", 21},
        {"http", 80},
        {"https", 443},
        {"ws", 80},
        {"wss", 443},
};
 
// Human-readable description for each UrlParser::ValidationError value.
// UrlParser::validation_error() looks entries up with .at(), so an enumerator
// that is missing from this table makes that lookup throw std::out_of_range.
const std::map<UrlParser::ValidationError, std::string> validation_error_str = {
{UrlParser::ValidationError::DomainToAscii, "Unicode ToASCII records an error or returns the empty string"},
{UrlParser::ValidationError::DomainToUnicode, "Unicode ToUnicode records an error"},
{UrlParser::ValidationError::DomainInvalidCodePoint, "The input's host contains a forbidden domain code point"},
{UrlParser::ValidationError::HostInvalidCodePoint,
"An opaque host (in a URL that is not special) contains a forbidden host code point"},
{UrlParser::ValidationError::IPv4EmptyPart, "An IPv4 address ends with a U+002E (.)"},
{UrlParser::ValidationError::IPv4TooManyParts, "An IPv4 address does not consist of exactly 4 parts"},
{UrlParser::ValidationError::IPv4NonNumericPart, "An IPv4 address part is not numeric"},
{UrlParser::ValidationError::IPv4NonDecimalPart,
"The IPv4 address contains numbers expressed using hexadecimal or octal digits"},
{UrlParser::ValidationError::IPv4OutOfRangePart, "An IPv4 address part exceeds 255"},
{UrlParser::ValidationError::IPv6Unclosed, "An IPv6 address is missing the closing U+005D (])"},
{UrlParser::ValidationError::IPv6InvalidCompression, "An IPv6 address begins with improper compression"},
{UrlParser::ValidationError::IPv6TooManyPieces, "An IPv6 address contains more than 8 pieces"},
{UrlParser::ValidationError::IPv6MultipleCompression, "An IPv6 address is compressed in more than one spot"},
{UrlParser::ValidationError::IPv6InvalidCodePoint,
"An IPv6 address contains a code point that is neither an ASCII hex digit nor a U+003A (:), or it "
"unexpectedly ends"},
{UrlParser::ValidationError::IPv6TooFewPieces, "An uncompressed IPv6 address contains fewer than 8 pieces"},
{UrlParser::ValidationError::IPv4InIPv6TooManyPieces,
"An IPv6 address with IPv4 address syntax: the IPv6 address has more than 6 pieces"},
{UrlParser::ValidationError::IPv4InIPv6InvalidCodePoint,
"An IPv6 address with IPv4 address syntax: An IPv4 part is empty or contains a non-ASCII digit, an "
"IPv4 part contains a leading 0, or there are too many IPv4 parts"},
{UrlParser::ValidationError::IPv4InIPv6OutOfRangePart,
"An IPv6 address with IPv4 address syntax: an IPv4 part exceeds 255"},
{UrlParser::ValidationError::IPv4InIPv6TooFewParts,
"An IPv6 address with IPv4 address syntax: an IPv4 address contains too few parts"},
{UrlParser::ValidationError::InvalidUrlUnit, "A code point is found that is not a URL unit"},
{UrlParser::ValidationError::SpecialSchemeMissingFollowingSolidus,
"The input's scheme is not followed by \"//\""},
{UrlParser::ValidationError::MissingSchemeNonRelativeUrl,
"The input is missing a scheme, because it does not begin with an ASCII alpha, and either no base "
"URL was provided or the base URL cannot be used as a base URL because it has an opaque path"},
{UrlParser::ValidationError::InvalidReverseSolidus,
"The URL has a special scheme and it uses U+005C (\\) instead of U+002F (/)"},
{UrlParser::ValidationError::InvalidCredentials, "The input includes credentials"},
{UrlParser::ValidationError::HostMissing, "The input has a special scheme, but does not contain a host"},
{UrlParser::ValidationError::PortOutOfRange, "The input's port is too big"},
{UrlParser::ValidationError::PortInvalid, "The input's port is invalid"},
{UrlParser::ValidationError::FileInvalidWindowsDriveLetter,
"The input is a relative-URL string that starts with a Windows drive letter and the base URL's "
"scheme is \"file\""},
{UrlParser::ValidationError::FileInvalidWindowsDriveLetterHost,
"A file: URL's host is a Windows drive letter"}};
 
// Membership predicates for the percent-encode sets used by the parser. Each
// function returns true when the given byte belongs to the named set. The sets
// are layered: every set contains the one it is built from.
struct PercentEncodeSet {
    // C0 controls (as decided by util::is_c0) plus every byte at or above 0x7f.
    static constexpr bool c0_control(char c) {
        return util::is_c0(c) || static_cast<std::uint8_t>(c) >= 0x7f;
    }

    // c0_control plus space, '"', '<', '>' and '`'.
    static constexpr bool fragment(char c) {
        switch (c) {
            case ' ':
            case '"':
            case '<':
            case '>':
            case '`':
                return true;
            default:
                return c0_control(c);
        }
    }

    // c0_control plus space, '"', '#', '<' and '>'.
    static constexpr bool query(char c) {
        switch (c) {
            case ' ':
            case '"':
            case '#':
            case '<':
            case '>':
                return true;
            default:
                return c0_control(c);
        }
    }

    // query plus the single quote.
    static constexpr bool special_query(char c) {
        return c == '\'' || query(c);
    }

    // query plus '?', '`', '{' and '}'.
    static constexpr bool path(char c) {
        switch (c) {
            case '?':
            case '`':
            case '{':
            case '}':
                return true;
            default:
                return query(c);
        }
    }

    // path plus '/', ':', ';', '=', '@', '[' through '^', and '|'.
    static constexpr bool userinfo(char c) {
        switch (c) {
            case '/':
            case ':':
            case ';':
            case '=':
            case '@':
            case '[':
            case '\\':
            case ']':
            case '^':
            case '|':
                return true;
            default:
                return path(c);
        }
    }

    // userinfo plus '$' through '&', '+' and ','.
    static constexpr bool component(char c) {
        switch (c) {
            case '$':
            case '%':
            case '&':
            case '+':
            case ',':
                return true;
            default:
                return userinfo(c);
        }
    }
};
} // namespace
 
// Thin wrapper around ICU's u_cleanup() so callers don't need to include ICU
// headers themselves.
// NOTE(review): presumably meant to be called once at shutdown, after all ICU
// use has finished - confirm against callers.
void icu_cleanup() {
u_cleanup();
}
 
static void icu_init() {
static std::atomic<bool> called_once = false;
 
if (called_once.exchange(true)) {
return;
}
 
char *data = std::getenv("HASTUR_ICU_DATA");
 
if (data != nullptr) {
std::filesystem::path env_path{data};
 
if (std::filesystem::is_directory(env_path)) {
u_setDataDirectory(env_path.string().c_str());
}
} else {
// Use current working directory as a last resort.
// TODO(zero-one): Look at engine config for paths.
u_setDataDirectory(std::filesystem::current_path().string().c_str());
}
 
UErrorCode err = U_ZERO_ERROR;
 
std::uint32_t opts =
UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_USE_STD3_RULES;
 
[[maybe_unused]] auto *uts = icu::IDNA::createUTS46Instance(opts, err);
 
assert(!U_FAILURE(err));
 
delete uts;
}
 
// https://w3c.github.io/FileAPI/#unicodeBlobURL
std::string blob_url_create(Origin const &origin) {
@@ -51,4 +191,1378 @@ std::string blob_url_create(Origin const &origin) {
return result;
}
 
// Reports a validation error at debug log level, together with the current
// input position and parser state. Parsing itself is not affected. Throws
// std::out_of_range if `err` has no entry in validation_error_str.
void UrlParser::validation_error(ValidationError err) const {
    std::string const &description = validation_error_str.at(err);
    auto const state_id = std::to_underlying(state_);
    auto const error_id = std::to_underlying(err);

    spdlog::debug("url: InputPos: {}, ParserState: {}, Validation Error: {} {}",
            current_pos(),
            state_id,
            error_id,
            description);
}
 
// https://url.spec.whatwg.org/#concept-url-parser
std::optional<Url> UrlParser::parse(std::string input, std::optional<Url> base) {
if (input.empty() && !base.has_value()) {
return std::nullopt;
}
 
std::optional<Url> url = parse_basic(std::move(input), std::move(base), std::nullopt, std::nullopt);
 
if (url.has_value() && url->scheme == "blob") {
// TODO(zero-one): Resolve blob URL
}
 
return url;
}
 
// https://url.spec.whatwg.org/#concept-basic-url-parser
std::optional<Url> UrlParser::parse_basic(std::string input,
std::optional<Url> base, // NOLINT(bugprone-easily-swappable-parameters)
std::optional<Url> url, // NOLINT(bugprone-easily-swappable-parameters)
std::optional<ParserState> state_override) {
base_ = std::move(base);
state_override_ = state_override;
 
if (!url.has_value()) {
// Set url to a new URL
url_ = Url();
url_.path = std::vector<std::string>{};
 
bool leading_trailing_c0 = false;
 
while (!input.empty() && util::is_c0_or_space(input.front())) {
input.erase(0, 1);
 
leading_trailing_c0 = true;
}
 
while (!input.empty() && util::is_c0_or_space(input.back())) {
input.pop_back();
 
leading_trailing_c0 = true;
}
 
if (leading_trailing_c0) {
validation_error(ValidationError::InvalidUrlUnit);
}
} else {
url_ = *url;
}
 
if (std::erase_if(input, util::is_tab_or_newline) > 0) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
state_ = state_override_.value_or(ParserState::SchemeStart);
 
buffer_.clear();
 
at_sign_seen_ = false;
inside_brackets_ = false;
password_token_seen_ = false;
 
// Initialize BaseParser with our modified input
reset(input);
 
while (true) {
switch (state_) {
case ParserState::SchemeStart:
state_scheme_start();
break;
case ParserState::Scheme:
state_scheme();
break;
case ParserState::NoScheme:
state_no_scheme();
break;
case ParserState::SpecialRelativeOrAuthority:
state_special_relative_or_authority();
break;
case ParserState::PathOrAuthority:
state_path_or_authority();
break;
case ParserState::Relative:
state_relative();
break;
case ParserState::RelativeSlash:
state_relative_slash();
break;
case ParserState::SpecialAuthoritySlashes:
state_special_authority_slashes();
break;
case ParserState::SpecialAuthorityIgnoreSlashes:
state_special_authority_ignore_slashes();
break;
case ParserState::Authority:
state_authority();
break;
case ParserState::Host:
case ParserState::Hostname:
state_host();
break;
case ParserState::Port:
state_port();
break;
case ParserState::File:
state_file();
break;
case ParserState::FileSlash:
state_file_slash();
break;
case ParserState::FileHost:
state_file_host();
break;
case ParserState::PathStart:
state_path_start();
break;
case ParserState::Path:
state_path();
break;
case ParserState::OpaquePath:
state_opaque_path();
break;
case ParserState::Query:
state_query();
break;
case ParserState::Fragment:
state_fragment();
break;
case ParserState::Failure:
return std::nullopt;
case ParserState::Terminate:
// I use this state where the spec returns "nothing" (i.e, the parser is modifying a given optional URL)
// Instead of modifying it in-place, I modify a copy and return that instead of nothing.
return url_;
}
 
// This check accomodates the one scenario (commented on in
// state_scheme_start, below) in which the parser position goes
// negative.
if (is_eof() && current_pos() != static_cast<std::size_t>(-1)) {
break;
}
 
advance(1);
}
 
return url_;
}
 
// https://url.spec.whatwg.org/#scheme-start-state
void UrlParser::state_scheme_start() {
if (auto c = peek(); c.has_value() && util::is_alpha(*c)) {
buffer_ += util::lowercased(*c);
 
state_ = ParserState::Scheme;
} else if (!state_override_.has_value()) {
state_ = ParserState::NoScheme;
 
// This can underflow pos_; that's ok, because it's incremented again before it's ever used.
back(1);
} else {
state_ = ParserState::Failure;
 
return;
}
}
 
// https://url.spec.whatwg.org/#scheme-state
void UrlParser::state_scheme() {
if (auto c = peek(); c.has_value() && (util::is_alphanumeric(*c) || c == '+' || c == '-' || c == '.')) {
buffer_ += util::lowercased(*c);
} else if (c == ':') {
if (state_override_.has_value()) {
if (special_schemes.contains(url_.scheme) && !special_schemes.contains(buffer_)) {
state_ = ParserState::Terminate;
 
return;
}
if (!special_schemes.contains(url_.scheme) && special_schemes.contains(buffer_)) {
state_ = ParserState::Terminate;
 
return;
}
if ((includes_credentials(url_) || url_.port.has_value()) && buffer_ == "file") {
state_ = ParserState::Terminate;
 
return;
}
if (url_.scheme == "file" && url_.host.has_value() && url_.host->type == HostType::Empty) {
state_ = ParserState::Terminate;
 
return;
}
}
 
url_.scheme = buffer_;
 
if (state_override_.has_value()) {
if (special_schemes.contains(url_.scheme) && url_.port == special_schemes.at(url_.scheme)) {
url_.port.reset();
}
 
state_ = ParserState::Terminate;
 
return;
}
 
buffer_.clear();
 
if (url_.scheme == "file") {
if (!remaining_from(1).starts_with("//")) {
validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
}
 
state_ = ParserState::File;
} else if (special_schemes.contains(url_.scheme) && base_.has_value() && base_->scheme == url_.scheme) {
assert(special_schemes.contains(base_->scheme));
 
state_ = ParserState::SpecialRelativeOrAuthority;
} else if (special_schemes.contains(url_.scheme)) {
state_ = ParserState::SpecialAuthoritySlashes;
} else if (remaining_from(1).starts_with('/')) {
state_ = ParserState::PathOrAuthority;
 
advance(1);
} else {
url_.path = "";
 
state_ = ParserState::OpaquePath;
}
} else if (!state_override_.has_value()) {
buffer_.clear();
 
state_ = ParserState::NoScheme;
 
reset();
} else {
state_ = ParserState::Failure;
 
return;
}
}
 
// https://url.spec.whatwg.org/#no-scheme-state
void UrlParser::state_no_scheme() {
if (auto c = peek(); !base_.has_value() || (has_opaque_path(*base_) && c != '#')) {
validation_error(ValidationError::MissingSchemeNonRelativeUrl);
 
state_ = ParserState::Failure;
 
return;
} else if (has_opaque_path(*base_) && c == '#') {
url_.scheme = base_->scheme;
url_.path = base_->path;
url_.query = base_->query;
url_.fragment = "";
 
state_ = ParserState::Fragment;
} else if (base_->scheme != "file") {
state_ = ParserState::Relative;
 
back(1);
} else {
state_ = ParserState::File;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#special-relative-or-authority-state
void UrlParser::state_special_relative_or_authority() {
if (peek() == '/' && remaining_from(1).starts_with('/')) {
state_ = ParserState::SpecialAuthorityIgnoreSlashes;
 
advance(1);
} else {
validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
 
state_ = ParserState::Relative;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#path-or-authority-state
void UrlParser::state_path_or_authority() {
if (peek() == '/') {
state_ = ParserState::Authority;
} else {
state_ = ParserState::Path;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#relative-state
void UrlParser::state_relative() {
assert(base_.has_value() && base_->scheme != "file");
 
url_.scheme = base_->scheme;
 
if (auto c = peek(); c == '/') {
state_ = ParserState::RelativeSlash;
} else if (special_schemes.contains(url_.scheme) && c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
 
state_ = ParserState::RelativeSlash;
} else {
url_.user = base_->user;
url_.passwd = base_->passwd;
url_.host = base_->host;
url_.port = base_->port;
url_.path = base_->path;
url_.query = base_->query;
 
if (c == '?') {
url_.query = "";
 
state_ = ParserState::Query;
} else if (c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
} else if (!is_eof()) {
url_.query.reset();
 
shorten_url_path(url_);
 
state_ = ParserState::Path;
 
back(1);
}
}
}
 
// https://url.spec.whatwg.org/#relative-slash-state
void UrlParser::state_relative_slash() {
if (auto c = peek(); special_schemes.contains(url_.scheme) && (c == '/' || c == '\\')) {
if (c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
}
 
state_ = ParserState::SpecialAuthorityIgnoreSlashes;
} else if (c == '/') {
state_ = ParserState::Authority;
} else {
url_.user = base_->user;
url_.passwd = base_->passwd;
url_.host = base_->host;
url_.port = base_->port;
 
state_ = ParserState::Path;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#special-authority-slashes-state
void UrlParser::state_special_authority_slashes() {
if (peek() == '/' && remaining_from(1).starts_with('/')) {
state_ = ParserState::SpecialAuthorityIgnoreSlashes;
 
advance(1);
} else {
validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
 
state_ = ParserState::SpecialAuthorityIgnoreSlashes;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
void UrlParser::state_special_authority_ignore_slashes() {
if (auto c = peek(); c != '/' && c != '\\') {
state_ = ParserState::Authority;
 
back(1);
} else {
validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
}
}
 
// https://url.spec.whatwg.org/#authority-state
void UrlParser::state_authority() {
if (auto c = peek(); c == '@') {
validation_error(ValidationError::InvalidCredentials);
 
if (at_sign_seen_) {
buffer_.insert(0, "%40");
}
 
at_sign_seen_ = true;
 
for (std::size_t i = 0; i < buffer_.size(); i++) {
if (buffer_[i] == ':' && !password_token_seen_) {
password_token_seen_ = true;
 
continue;
}
 
std::string encoded_code_points =
util::percent_encode(std::string_view{buffer_}.substr(i, 1), PercentEncodeSet::userinfo);
 
if (password_token_seen_) {
url_.passwd += encoded_code_points;
} else {
url_.user += encoded_code_points;
}
}
 
buffer_.clear();
} else if (is_eof() || c == '/' || c == '?' || c == '#' || (special_schemes.contains(url_.scheme) && c == '\\')) {
if (at_sign_seen_ && buffer_.empty()) {
validation_error(ValidationError::InvalidCredentials);
 
state_ = ParserState::Failure;
 
return;
}
 
// The spec says to use code-point length, but that causes the parser
// not to back up far enough; it will truncate characters going into
// the host state. It seems to only apply if you're parsing codepoint
// by codepoint instead of byte-by-byte like we are.
// back(util::utf8_length(buffer_) + 1);
back(buffer_.size() + 1);
 
buffer_.clear();
 
state_ = ParserState::Host;
} else {
buffer_ += *c;
}
}
 
// https://url.spec.whatwg.org/#host-state
// Host state (also used for the Hostname state): accumulates host characters
// in buffer_ and, at a delimiter, parses them with parse_host() and stores the
// result in url_.host. Sets state_ to Failure when the host is missing or does
// not parse.
void UrlParser::state_host() {
// A file URL under a state override is handled by the file-host state instead.
if (auto c = peek(); state_override_.has_value() && url_.scheme == "file") {
back(1);

state_ = ParserState::FileHost;
// A ':' outside of [] brackets ends the host and introduces a port.
} else if (c == ':' && !inside_brackets_) {
if (buffer_.empty()) {
validation_error(ValidationError::HostMissing);

state_ = ParserState::Failure;

return;
}

// When only the hostname is being set, a port delimiter ends parsing
// without touching the URL.
if (state_override_.has_value() && *state_override_ == ParserState::Hostname) {
state_ = ParserState::Terminate;

return;
}

// Second argument is "is not special": true for non-special schemes.
std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));

if (!host.has_value()) {
state_ = ParserState::Failure;

return;
}

url_.host = host;

buffer_.clear();

state_ = ParserState::Port;
// EOF or a path/query/fragment delimiter ends the host; '\' only counts as
// a delimiter for special schemes.
} else if ((is_eof() || c == '/' || c == '?' || c == '#') || (special_schemes.contains(url_.scheme) && c == '\\')) {
// Step back so the delimiter is seen again by the next state.
back(1);

if (special_schemes.contains(url_.scheme) && buffer_.empty()) {
validation_error(ValidationError::HostMissing);

state_ = ParserState::Failure;

return;
} else if (state_override_.has_value() && buffer_.empty()
&& (includes_credentials(url_) || url_.port.has_value())) {
// Under an override, an empty host is not applied to a URL that has
// credentials or a port.
state_ = ParserState::Terminate;

return;
}

std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));

if (!host.has_value()) {
state_ = ParserState::Failure;

return;
}

url_.host = host;

buffer_.clear();

state_ = ParserState::PathStart;

// With a state override, parsing stops once the host has been set.
if (state_override_.has_value()) {
state_ = ParserState::Terminate;

return;
}
} else {
// Track [] so that ':' inside an IPv6 literal isn't taken as the port
// delimiter.
if (c == '[') {
inside_brackets_ = true;
}
if (c == ']') {
inside_brackets_ = false;
}

buffer_ += *c;
}
}
 
// https://url.spec.whatwg.org/#port-state
void UrlParser::state_port() {
if (auto c = peek(); c.has_value() && util::is_digit(*c)) {
buffer_ += *c;
} else if ((is_eof() || c == '/' || c == '?' || c == '#') || (special_schemes.contains(url_.scheme) && c == '\\')
|| state_override_.has_value()) {
if (!buffer_.empty()) {
std::uint32_t port;
 
auto res = std::from_chars(buffer_.data(), buffer_.data() + buffer_.size(), port);
 
if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
spdlog::info("Invalid port given in URL");
 
state_ = ParserState::Failure;
 
return;
}
 
if (port > std::pow(2, 16) - 1) {
validation_error(ValidationError::PortOutOfRange);
 
state_ = ParserState::Failure;
 
return;
}
 
if (special_schemes.contains(url_.scheme) && port == special_schemes.at(url_.scheme)) {
url_.port = std::nullopt;
} else {
url_.port = static_cast<std::uint16_t>(port);
}
 
buffer_.clear();
}
if (state_override_.has_value()) {
state_ = ParserState::Terminate;
 
return;
}
 
state_ = ParserState::PathStart;
 
back(1);
} else {
validation_error(ValidationError::PortInvalid);
 
state_ = ParserState::Failure;
 
return;
}
}
 
// https://url.spec.whatwg.org/#file-state
void UrlParser::state_file() {
url_.scheme = "file";
url_.host = Host{HostType::Empty};
 
if (auto c = peek(); c == '/' || c == '\\') {
if (c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
}
 
state_ = ParserState::FileSlash;
} else if (base_.has_value() && base_->scheme == "file") {
url_.host = base_->host;
url_.path = base_->path;
url_.query = base_->query;
 
if (c == '?') {
url_.query = "";
 
state_ = ParserState::Query;
} else if (c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
} else if (!is_eof()) {
url_.query = std::nullopt;
 
if (!starts_with_windows_drive_letter(remaining_from(1))) {
shorten_url_path(url_);
} else {
validation_error(ValidationError::FileInvalidWindowsDriveLetter);
 
url_.path = std::vector<std::string>{};
}
 
state_ = ParserState::Path;
 
back(1);
}
} else {
state_ = ParserState::Path;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#file-slash-state
void UrlParser::state_file_slash() {
if (auto c = peek(); c == '/' || c == '\\') {
if (c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
}
 
state_ = ParserState::FileHost;
} else {
if (base_.has_value() && base_->scheme == "file") {
url_.host = base_->host;
 
if (!starts_with_windows_drive_letter(remaining_from(1))
&& is_normal_windows_drive_letter(std::get<1>(base_->path)[0])) {
std::get<1>(url_.path).push_back(std::get<1>(base_->path)[0]);
}
}
 
state_ = ParserState::Path;
 
back(1);
}
}
 
// https://url.spec.whatwg.org/#file-host-state
// File-host state: accumulates the host of a file URL in buffer_ until EOF or
// a delimiter, then decides whether the buffer was a drive letter, an empty
// host or a real host.
void UrlParser::state_file_host() {
if (auto c = peek(); is_eof() || c == '/' || c == '\\' || c == '?' || c == '#') {
// Step back so the delimiter is seen again by the next state.
back(1);

// What looked like a host is really a drive letter: report it and let the
// path state consume the buffer, which is deliberately left untouched here.
if (!state_override_.has_value() && is_windows_drive_letter(buffer_)) {
validation_error(ValidationError::FileInvalidWindowsDriveLetterHost);

state_ = ParserState::Path;
} else if (buffer_.empty()) {
url_.host = Host{HostType::Empty};

if (state_override_.has_value()) {
state_ = ParserState::Terminate;

return;
}

state_ = ParserState::PathStart;
} else {
// Second argument is "is not special": true for non-special schemes.
std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));

if (!host.has_value()) {
state_ = ParserState::Failure;

return;
}

// "localhost" is replaced by the empty string for file URLs.
if (auto *h = std::get_if<0>(&host->data); h != nullptr && *h == "localhost") {
*h = "";
}

url_.host = host;

if (state_override_.has_value()) {
state_ = ParserState::Terminate;

return;
}

buffer_.clear();

state_ = ParserState::PathStart;
}
} else {
buffer_ += *c;
}
}
 
// https://url.spec.whatwg.org/#path-start-state
void UrlParser::state_path_start() {
if (auto c = peek(); special_schemes.contains(url_.scheme)) {
if (c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
}
 
state_ = ParserState::Path;
 
if (c != '/' && c != '\\') {
back(1);
}
} else if (!state_override_.has_value() && c == '?') {
url_.query = "";
 
state_ = ParserState::Query;
} else if (!state_override_.has_value() && c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
} else if (!is_eof()) {
state_ = ParserState::Path;
 
if (c != '/') {
back(1);
}
} else if (state_override_.has_value() && !url_.host.has_value()) {
std::get<1>(url_.path).push_back("");
}
}
 
// https://url.spec.whatwg.org/#path-state
void UrlParser::state_path() {
if (auto c = peek(); is_eof() || c == '/' || (special_schemes.contains(url_.scheme) && c == '\\')
|| (!state_override_.has_value() && (c == '?' || c == '#'))) {
if (special_schemes.contains(url_.scheme) && c == '\\') {
validation_error(ValidationError::InvalidReverseSolidus);
}
 
if (buffer_ == ".." || util::lowercased(buffer_) == ".%2e" || util::lowercased(buffer_) == "%2e."
|| util::lowercased(buffer_) == "%2e%2e") {
shorten_url_path(url_);
 
if (c != '/' && !(special_schemes.contains(url_.scheme) && c == '\\')) {
std::get<1>(url_.path).push_back("");
}
} else if ((buffer_ == "." || util::lowercased(buffer_) == "%2e")
&& (c != '/' && !(special_schemes.contains(url_.scheme) && c == '\\'))) {
std::get<1>(url_.path).push_back("");
} else if (buffer_ != "." && util::lowercased(buffer_) != "%2e") {
if (url_.scheme == "file" && std::get<1>(url_.path).empty() && is_windows_drive_letter(buffer_)) {
buffer_[1] = ':';
}
 
std::get<1>(url_.path).push_back(buffer_);
}
 
buffer_.clear();
 
if (c == '?') {
url_.query = "";
 
state_ = ParserState::Query;
}
 
if (c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
}
} else {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (c == '%'
&& (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
|| !util::is_hex_digit(remaining_from(1)[1]))) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
buffer_ += util::percent_encode(*peek(1), PercentEncodeSet::path);
}
}
 
// https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
void UrlParser::state_opaque_path() {
if (auto c = peek(); c == '?') {
url_.query = "";
 
state_ = ParserState::Query;
} else if (c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
} else {
if (!is_eof() && !is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (c == '%'
&& (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
|| !util::is_hex_digit(remaining_from(1)[1]))) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (!is_eof()) {
std::get<0>(url_.path) += util::percent_encode(*peek(1), PercentEncodeSet::c0_control);
}
}
}
 
// https://url.spec.whatwg.org/#query-state
void UrlParser::state_query() {
if (auto c = peek(); (!state_override_.has_value() && c == '#') || is_eof()) {
if (special_schemes.contains(url_.scheme)) {
url_.query.value() += util::percent_encode(buffer_, PercentEncodeSet::special_query);
} else {
url_.query.value() += util::percent_encode(buffer_, PercentEncodeSet::query);
}
 
buffer_.clear();
 
if (c == '#') {
url_.fragment = "";
 
state_ = ParserState::Fragment;
}
} else if (!is_eof()) {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (c == '%'
&& (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
|| !util::is_hex_digit(remaining_from(1)[1]))) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
buffer_ += *c;
}
}
 
// https://url.spec.whatwg.org/#fragment-state
void UrlParser::state_fragment() {
if (auto c = peek(); !is_eof()) {
if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (c == '%'
&& (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
|| !util::is_hex_digit(remaining_from(1)[1]))) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
url_.fragment.value() += util::percent_encode(*peek(1), PercentEncodeSet::fragment);
}
}
 
// https://url.spec.whatwg.org/#concept-domain-to-ascii
std::optional<std::string> UrlParser::domain_to_ascii(std::string_view domain, bool be_strict) const {
icu_init();
 
std::string ascii_domain;
icu::StringByteSink<std::string> tmp{&ascii_domain};
 
icu::IDNAInfo inf;
UErrorCode err = U_ZERO_ERROR;
 
std::uint32_t opts = UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ;
 
if (be_strict) {
opts |= UIDNA_USE_STD3_RULES;
}
 
auto *uts = icu::IDNA::createUTS46Instance(opts, err);
 
if (U_FAILURE(err)) {
spdlog::info("Failed to create UTS46 instance, error {}; idna data probably missing from icu",
static_cast<std::int64_t>(err));
 
return std::nullopt;
}
 
err = U_ZERO_ERROR;
 
uts->nameToASCII_UTF8(domain, tmp, inf, err);
 
delete uts;
 
std::uint32_t proc_err = inf.getErrors();
 
// icu doesn't offer a flag to disable VerifyDnsLength or CheckHyphens, so just ignore those failures
proc_err &= ~UIDNA_ERROR_LEADING_HYPHEN;
proc_err &= ~UIDNA_ERROR_TRAILING_HYPHEN;
proc_err &= ~UIDNA_ERROR_HYPHEN_3_4;
 
if (!be_strict) {
proc_err &= ~UIDNA_ERROR_LABEL_TOO_LONG;
proc_err &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
}
 
// If domain or any label is empty, proc_err should contain UIDNA_ERROR_EMPTY_LABEL
if (U_FAILURE(err) || proc_err != 0) {
validation_error(ValidationError::DomainToAscii);
 
return std::nullopt;
}
 
return ascii_domain;
}
 
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
bool UrlParser::starts_with_windows_drive_letter(std::string_view input) const {
if (input.size() < 2) {
return false;
}
 
if (!util::is_alpha(input[0]) || !(input[1] == ':' || input[1] == '|')) {
return false;
}
 
if (input.size() == 2) {
return true;
}
 
if (input.size() > 2 && (input[2] == '/' || input[2] == '\\' || input[2] == '?' || input[2] == '#')) {
return true;
}
 
return false;
}
 
// https://url.spec.whatwg.org/#shorten-a-urls-path
//
// Drops the last segment of a (non-opaque) path. A "file" URL whose only
// segment is a normalized Windows drive letter is left untouched, as is an
// already-empty path.
void UrlParser::shorten_url_path(Url &url) const {
    assert(!std::holds_alternative<std::string>(url.path));

    auto &segments = std::get<1>(url.path);

    bool const lone_drive_letter =
            url.scheme == "file" && segments.size() == 1 && is_normal_windows_drive_letter(segments[0]);

    if (lone_drive_letter || segments.empty()) {
        return;
    }

    segments.pop_back();
}
 
// https://url.spec.whatwg.org/#concept-host-parser
std::optional<Host> UrlParser::parse_host(std::string_view input, bool is_not_special) const {
assert(!input.empty());
 
if (input.starts_with("[")) {
if (!input.ends_with("]")) {
validation_error(ValidationError::IPv6Unclosed);
 
return std::nullopt;
}
 
input.remove_prefix(1);
input.remove_suffix(1);
 
std::optional<std::array<std::uint16_t, 8>> addr = parse_ipv6(input);
 
if (!addr.has_value()) {
return std::nullopt;
}
 
return Host{HostType::Ip6Addr, *addr};
}
 
if (is_not_special) {
std::optional<std::string> host = parse_opaque_host(input);
 
if (!host.has_value()) {
return std::nullopt;
}
 
return Host{HostType::Opaque, *host};
}
 
std::string domain = util::percent_decode(input);
 
std::optional<std::string> ascii_domain = domain_to_ascii(domain, false);
 
if (!ascii_domain.has_value()) {
return std::nullopt;
}
 
std::string forbidden = "\t\n\r #/:<>?@[\\]^|";
 
for (std::size_t i = 0; i < ascii_domain->size(); i++) {
if (forbidden.find_first_of(ascii_domain.value()[i]) != std::string::npos || ascii_domain.value()[i] <= 0x1f
|| ascii_domain.value()[i] == '%' || ascii_domain.value()[i] == 0x7f) {
validation_error(ValidationError::DomainInvalidCodePoint);
 
return std::nullopt;
}
}
 
if (ends_in_number(*ascii_domain)) {
std::optional<std::uint32_t> ip = parse_ipv4(*ascii_domain);
 
if (!ip.has_value()) {
return std::nullopt;
}
 
return Host{HostType::Ip4Addr, *ip};
}
 
return Host{HostType::DnsDomain, *ascii_domain};
}
 
// https://url.spec.whatwg.org/#ends-in-a-number-checker
bool UrlParser::ends_in_number(std::string_view input) const {
// Let parts be the result of strictly splitting input on U+002E (.)
std::vector<std::string_view> parts = util::split(input, ".");
 
if (parts.back().empty()) {
if (parts.size() == 1) {
return false;
}
 
parts.pop_back();
}
 
// If last part is non-empty and contains only ASCII digits, return true
if (!parts.back().empty()) {
if (std::ranges::all_of(parts.back(), util::is_digit)) {
return true;
}
}
 
// If parsing last part as an IPv4 number does not return failure, then return true
if (parse_ipv4_number(parts.back()).has_value()) {
return true;
}
 
return false;
}
 
// https://url.spec.whatwg.org/#concept-ipv4-parser
std::optional<std::uint32_t> UrlParser::parse_ipv4(std::string_view input) const {
std::vector<std::string_view> parts = util::split(input, ".");
 
if (parts.back().empty()) {
validation_error(ValidationError::IPv4EmptyPart);
 
if (parts.size() > 1) {
parts.pop_back();
}
}
 
if (parts.size() > 4) {
validation_error(ValidationError::IPv4TooManyParts);
 
return std::nullopt;
}
 
std::vector<std::uint64_t> numbers;
 
for (auto part : parts) {
std::optional<std::tuple<std::uint64_t, bool>> result = parse_ipv4_number(part);
 
if (!result.has_value()) {
validation_error(ValidationError::IPv4NonNumericPart);
 
return std::nullopt;
}
 
if (std::get<1>(*result)) {
validation_error(ValidationError::IPv4NonDecimalPart);
}
 
numbers.emplace_back(std::get<0>(*result));
}
 
for (std::size_t i = 0; i < numbers.size(); i++) {
if (numbers[i] > 255) {
validation_error(ValidationError::IPv4OutOfRangePart);
 
if (i != numbers.size() - 1) {
return std::nullopt;
}
}
}
 
if (numbers.back() >= std::pow(256, 5 - numbers.size())) {
return std::nullopt;
}
 
auto ipv4 = static_cast<std::uint32_t>(numbers.back());
 
numbers.pop_back();
 
for (std::size_t i = 0; i < numbers.size(); i++) {
ipv4 += static_cast<std::uint32_t>(numbers[i] * std::pow(256, 3 - i));
}
 
return ipv4;
}
 
// https://url.spec.whatwg.org/#ipv4-number-parser
std::optional<std::tuple<std::uint64_t, bool>> UrlParser::parse_ipv4_number(std::string_view input) const {
if (input.empty()) {
return std::nullopt;
}
 
bool v_err = false;
int r = 10;
 
if (input.size() >= 2 && (input.starts_with("0X") || input.starts_with("0x"))) {
v_err = true;
 
input.remove_prefix(2);
 
r = 16;
} else if (input.size() >= 2 && input.starts_with("0")) {
v_err = true;
 
input.remove_prefix(1);
 
r = 8;
}
 
if (input.empty()) {
return {{0, true}};
}
 
for (char i : input) {
if ((r == 10 && !util::is_digit(i)) || (r == 16 && !util::is_hex_digit(i))
|| (r == 8 && !util::is_octal_digit(i))) {
return std::nullopt;
}
}
 
std::uint64_t out;
 
auto res = std::from_chars(input.data(), input.data() + input.size(), out, r);
 
if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
spdlog::info("Invalid ipv4 number");
 
return std::nullopt;
}
 
return {{out, v_err}};
}
 
// https://url.spec.whatwg.org/#concept-ipv6-parser
//
// Parses the textual form of an IPv6 address (parse_host() strips the
// surrounding brackets before calling this). Supports one "::" compression
// and a trailing dotted-decimal IPv4 part.
//
// Returns the address as eight 16-bit pieces, or std::nullopt after reporting
// a validation error if the input is malformed.
std::optional<std::array<std::uint16_t, 8>> UrlParser::parse_ipv6(std::string_view input) const {
std::array<std::uint16_t, 8> address = {0, 0, 0, 0, 0, 0, 0, 0};

// Index of the 16-bit piece currently being filled in.
std::size_t piece_index = 0;

// Piece index at which a "::" was seen, if any.
std::optional<std::size_t> compress;

// Offset of the character currently being examined.
std::size_t pointer = 0;

// A leading ':' is only valid as the start of "::".
if (!input.empty() && input[pointer] == ':') {
if (!input.substr(1).starts_with(":")) {
validation_error(ValidationError::IPv6InvalidCompression);

return std::nullopt;
}

pointer += 2;

piece_index++;

compress = piece_index;
}

while (pointer < input.size()) {
if (piece_index == 8) {
validation_error(ValidationError::IPv6TooManyPieces);

return std::nullopt;
}

// A ':' at the start of a piece is the second colon of a "::"; only one is allowed.
if (input[pointer] == ':') {
if (compress.has_value()) {
validation_error(ValidationError::IPv6MultipleCompression);

return std::nullopt;
}

pointer++;

piece_index++;

compress = piece_index;

continue;
}

std::uint64_t value = 0;
std::size_t length = 0;

// Accumulate up to four hex digits into value, one digit at a time.
for (; length < 4 && pointer < input.size() && util::is_hex_digit(input[pointer]); pointer++, length++) {
std::uint64_t out;

auto res = std::from_chars(input.data() + pointer, input.data() + pointer + 1, out, 16);

if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
spdlog::info("Invalid IPv6 input");

return std::nullopt;
}

value = value * 0x10 + out;
}

// A '.' means the digits just read begin an embedded dotted-decimal IPv4 address.
if (pointer < input.size() && input[pointer] == '.') {
if (length == 0) {
validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

return std::nullopt;
}

// Rewind to the start of the piece so it can be re-read as decimal.
pointer -= length;

// The IPv4 part needs two pieces, so at most six may precede it.
if (piece_index > 6) {
validation_error(ValidationError::IPv4InIPv6TooManyPieces);

return std::nullopt;
}

std::size_t numbers_seen = 0;

while (pointer < input.size()) {
std::optional<std::uint64_t> ipv4_piece;

// Every number after the first must be preceded by a '.', and there may be at most four.
if (numbers_seen > 0) {
if (pointer < input.size() && input[pointer] == '.' && numbers_seen < 4) {
pointer++;
} else {
validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

return std::nullopt;
}
}

if (pointer >= input.size() || !util::is_digit(input[pointer])) {
validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

return std::nullopt;
}

while (pointer < input.size() && util::is_digit(input[pointer])) {
std::uint64_t number;

auto res = std::from_chars(input.data() + pointer, input.data() + pointer + 1, number);

if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
spdlog::info("Invalid IPv6 input 2");

return std::nullopt;
}

// A further digit after a leading 0 is rejected (no leading zeros).
if (!ipv4_piece.has_value()) {
ipv4_piece = number;
} else if (ipv4_piece == 0) {
validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

return std::nullopt;
} else {
ipv4_piece = *ipv4_piece * 10 + number;
}

if (ipv4_piece > 255) {
validation_error(ValidationError::IPv4InIPv6OutOfRangePart);

return std::nullopt;
}

pointer++;
}

// Two IPv4 numbers are packed into each 16-bit piece, first number in the high byte.
address[piece_index] = static_cast<std::uint16_t>(address[piece_index] * 0x100ul + *ipv4_piece);

numbers_seen++;

if (numbers_seen == 2 || numbers_seen == 4) {
piece_index++;
}
}

if (numbers_seen != 4) {
validation_error(ValidationError::IPv4InIPv6TooFewParts);

return std::nullopt;
}

// The IPv4 part is always last, so parsing is done.
break;
} else if (pointer < input.size() && input[pointer] == ':') {
pointer++;

// A single trailing ':' is not allowed.
if (pointer >= input.size()) {
validation_error(ValidationError::IPv6InvalidCodePoint);

return std::nullopt;
}
} else if (pointer < input.size()) {
validation_error(ValidationError::IPv6InvalidCodePoint);

return std::nullopt;
}

address[piece_index] = static_cast<std::uint16_t>(value);

piece_index++;
}

// With a "::", move the pieces parsed after it to the end of the address,
// leaving the zeros in the compressed gap.
if (compress.has_value()) {
std::size_t swaps = piece_index - *compress;

piece_index = 7;

for (; piece_index != 0 && swaps > 0; piece_index--, swaps--) {
std::uint16_t tmp = address[piece_index];
address[piece_index] = address[*compress + swaps - 1];
address[*compress + swaps - 1] = tmp;
}
} else if (!compress.has_value() && piece_index != 8) {
validation_error(ValidationError::IPv6TooFewPieces);

return std::nullopt;
}

return address;
}
 
// https://url.spec.whatwg.org/#concept-opaque-host-parser
std::optional<std::string> UrlParser::parse_opaque_host(std::string_view input) const {
std::string forbidden = "\t\n\r #/:<>?@[\\]^|";
 
for (char i : input) {
if (forbidden.find_first_of(i) != std::string_view::npos || i == '\0') {
validation_error(ValidationError::HostInvalidCodePoint);
 
return std::nullopt;
}
}
 
std::string_view tmp = input;
int len = 0;
 
while (!tmp.empty()) {
std::uint32_t cp = util::utf8_to_utf32(tmp);
 
len = util::unicode_utf8_byte_count(cp);
 
if (!is_url_codepoint(cp)) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
if (tmp[0] == '%' && tmp.size() > 2 && (!util::is_hex_digit(tmp[1]) || !util::is_hex_digit(tmp[2]))) {
validation_error(ValidationError::InvalidUrlUnit);
}
 
// I don't *think* this can remove > size(), but maybe i should clamp it anyway
tmp.remove_prefix(len);
}
 
return util::percent_encode(input, PercentEncodeSet::c0_control);
}
 
// https://url.spec.whatwg.org/#url-code-points
//
// The URL code points are ASCII alphanumerics, a fixed set of ASCII
// punctuation, and U+00A0..U+10FFFD excluding surrogates and noncharacters.
bool UrlParser::is_url_codepoint(std::uint32_t cp) const {
    bool const is_ascii_alphanumeric =
            (cp >= '0' && cp <= '9') || (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');

    if (is_ascii_alphanumeric) {
        return true;
    }

    switch (cp) {
        case '!':
        case '$':
        case '&':
        case '\'':
        case '(':
        case ')':
        case '*':
        case '+':
        case ',':
        case '-':
        case '.':
        case '/':
        case ':':
        case ';':
        case '=':
        case '?':
        case '@':
        case '_':
        case '~':
            return true;
        default:
            break;
    }

    return cp >= 0x00a0 && cp <= 0x10fffd && !util::is_unicode_noncharacter(cp) && !util::is_unicode_surrogate(cp);
}
 
// NOLINTEND(bugprone-unchecked-optional-access)
// NOLINTEND(misc-redundant-expression)
} // namespace url
 
url/url.h added: 2390, removed: 46, total 2344
@@ -5,14 +5,21 @@
#ifndef URL_URL_H_
#define URL_URL_H_
 
#include "util/base_parser.h"
#include "util/string.h"
 
#include <array>
#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <string_view>
#include <variant>
 
namespace url {
 
void icu_cleanup();
 
enum class HostType { DnsDomain, Ip4Addr, Ip6Addr, Opaque, Empty };
 
struct Host {
@@ -31,10 +38,161 @@ struct Origin {
};
 
/**
* Generates a new Blob URL for the given origin
* Generates a new Blob URL for the given origin.
*/
std::string blob_url_create(Origin const &origin);
 
// The components of a parsed URL, as produced by UrlParser::parse().
// Member order matters: the struct is aggregate-initialized positionally.
struct Url {
// Scheme without the trailing ':', e.g. "https".
std::string scheme;
// Username from the authority part; empty if none was given.
std::string user;
// Password from the authority part; empty if none was given.
std::string passwd;
// Host, if the URL has one (e.g. "tel:" URLs parse without a host).
std::optional<Host> host;
// Port, if one was given explicitly.
std::optional<std::uint16_t> port;
// Either an opaque path (string) or a list of path segments (vector).
std::variant<std::string, std::vector<std::string>> path;
// Query without the leading '?', if present.
std::optional<std::string> query;
// Fragment without the leading '#', if present.
std::optional<std::string> fragment;
};
 
// This parser is current with the WHATWG URL specification as of 1 March 2023
class UrlParser final : util::BaseParser {
public:
UrlParser() : BaseParser{""} {}
 
std::optional<Url> parse(std::string input, std::optional<Url> base = std::nullopt);
 
enum class ValidationError {
// IDNA
DomainToAscii,
DomainToUnicode,
// Host parsing
DomainInvalidCodePoint,
HostInvalidCodePoint,
IPv4EmptyPart,
IPv4TooManyParts,
IPv4NonNumericPart,
IPv4NonDecimalPart,
IPv4OutOfRangePart,
IPv6Unclosed,
IPv6InvalidCompression,
IPv6TooManyPieces,
IPv6MultipleCompression,
IPv6InvalidCodePoint,
IPv6TooFewPieces,
IPv4InIPv6TooManyPieces,
IPv4InIPv6InvalidCodePoint,
IPv4InIPv6OutOfRangePart,
IPv4InIPv6TooFewParts,
// URL parsing
InvalidUrlUnit,
SpecialSchemeMissingFollowingSolidus,
MissingSchemeNonRelativeUrl,
InvalidReverseSolidus,
InvalidCredentials,
HostMissing,
PortOutOfRange,
PortInvalid,
FileInvalidWindowsDriveLetter,
FileInvalidWindowsDriveLetterHost
};
 
private:
enum class ParserState {
SchemeStart,
Scheme,
NoScheme,
SpecialRelativeOrAuthority,
PathOrAuthority,
Relative,
RelativeSlash,
SpecialAuthoritySlashes,
SpecialAuthorityIgnoreSlashes,
Authority,
Host,
Hostname,
Port,
File,
FileSlash,
FileHost,
PathStart,
Path,
OpaquePath,
Query,
Fragment,
Failure,
Terminate
};
 
// Main parser
std::optional<Url> parse_basic(std::string input,
std::optional<Url> base,
std::optional<Url> url,
std::optional<ParserState> state_override);
 
void state_scheme_start();
void state_scheme();
void state_no_scheme();
void state_special_relative_or_authority();
void state_path_or_authority();
void state_relative();
void state_relative_slash();
void state_special_authority_slashes();
void state_special_authority_ignore_slashes();
void state_authority();
void state_host();
void state_port();
void state_file();
void state_file_slash();
void state_file_host();
void state_path_start();
void state_path();
void state_opaque_path();
void state_query();
void state_fragment();
 
void validation_error(ValidationError) const;
 
// Host parsing
std::optional<Host> parse_host(std::string_view input, bool is_not_special = false) const;
bool ends_in_number(std::string_view) const;
std::optional<std::uint32_t> parse_ipv4(std::string_view) const;
std::optional<std::tuple<std::uint64_t, bool>> parse_ipv4_number(std::string_view) const;
std::optional<std::array<std::uint16_t, 8>> parse_ipv6(std::string_view) const;
std::optional<std::string> parse_opaque_host(std::string_view) const;
bool is_url_codepoint(std::uint32_t) const;
 
// IDNA
std::optional<std::string> domain_to_ascii(std::string_view domain, bool be_strict) const;
 
// Misc
bool starts_with_windows_drive_letter(std::string_view) const;
void shorten_url_path(Url &) const;
 
constexpr bool includes_credentials(Url &url) const { return !url.user.empty() || !url.user.empty(); }
 
constexpr bool has_opaque_path(Url &url) const { return std::holds_alternative<std::string>(url.path); }
 
constexpr bool is_windows_drive_letter(std::string_view input) const {
return input.size() == 2 && util::is_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
}
 
constexpr bool is_normal_windows_drive_letter(std::string_view input) const {
return input.size() == 2 && util::is_alpha(input[0]) && input[1] == ':';
}
 
// Parser state
Url url_;
std::optional<Url> base_;
std::optional<ParserState> state_override_;
 
ParserState state_ = ParserState::Failure;
 
std::string buffer_;
 
bool at_sign_seen_ = false;
bool inside_brackets_ = false;
bool password_token_seen_ = false;
};
 
} // namespace url
 
#endif
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1,18 @@
// SPDX-FileCopyrightText: 2023 David Zero <zero-one@zer0-one.net>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "url/url.h"
 
// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" int LLVMFuzzerTestOneInput(uint8_t const *data, size_t size);
 
// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" int LLVMFuzzerTestOneInput(uint8_t const *data, size_t size) {
url::UrlParser p;
std::optional<url::Url> url;
 
url = p.parse({reinterpret_cast<char const *>(data), size});
 
return 0;
}
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://example.com:8080/index.html
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://[2001:db8:85a3::8a2e:370:7334]:631
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://[0000:0000:0000:0000:0000:ffff:4ccb:8c22]:631
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://[::ffff:76.203.140.34]:631
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
file:///home/zero-one/repos/hastur/README.md
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
file:///home/zero-one/repos/../hastur/README.md
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
file:///home/zero-one/repos/./hastur/README.md
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
file://C:\\Users\\zero-one\\repos\\hastur\\README.md
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://zero-one:testpass123@example.com/login.php
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
http://bücher.de
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://√.com/i/itunes.gif
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://ar.wikipedia.org/wiki/نجيب_محفوظ
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
tel:+1-555-555-5555
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://www.youtube.com/watch?v=2g5xkLqIElU&list=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORz&index=3
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk/images/platformticket.gif
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://github.com/robinlinden/hastur/actions/runs/4441133331/jobs/7795829478?pr=476#step:7:31
No newline at end of file
 
filename was Deleted added: 2390, removed: 46, total 2344
@@ -0,0 +1 @@
https://127.0.0.1:631
No newline at end of file
 
url/url_test.cpp added: 2390, removed: 46, total 2344
@@ -9,9 +9,18 @@
#include <array>
#include <cstdint>
#include <iostream>
#include <optional>
#include <regex>
#include <variant>
 
int main() {
const url::Url base{"https",
"",
"",
url::Host{url::HostType::DnsDomain, "example.com"},
std::uint16_t{8080},
std::vector<std::string>{"test", "index.php"}};
 
etest::test("blob URL generation", [] {
std::string regex_uuid = "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}";
 
@@ -19,7 +28,7 @@ int main() {
url::Origin o = {"https", h, std::uint16_t{8080}, std::nullopt, false};
 
std::string blob = url::blob_url_create(o);
std::cout << "Generated Blob URL: " << blob << std::endl;
std::cout << std::endl << "Generated Blob URL: " << blob << std::endl;
 
etest::expect(std::regex_match(blob, std::regex("blob:https://example.com:8080/" + regex_uuid)));
 
@@ -36,11 +45,406 @@ int main() {
o = {"https", h, std::uint16_t{8080}, std::nullopt, false};
 
blob = url::blob_url_create(o);
std::cout << "Generated Blob URL: " << blob;
std::cout << "Generated Blob URL: " << blob << std::endl;
 
etest::expect(std::regex_match(
blob, std::regex("blob:https://\\[2001:db8:85a3::8a2e:370:7334\\]:8080/" + regex_uuid)));
});
 
return etest::run_all_tests();
etest::test("URL parsing: port and path", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("https://example.com:8080/index.html");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "example.com");
etest::expect_eq(url->port.value(), 8080);
etest::expect_eq(std::get<1>(url->path)[0], "index.html");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: 1 unicode char", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("http://bücher.de");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "http");
etest::expect_eq(std::get<0>(url->host->data), "xn--bcher-kva.de");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: 1 unicode char with path", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("https://√.com/i/itunes.gif");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "xn--19g.com");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "i");
etest::expect_eq(std::get<1>(url->path)[1], "itunes.gif");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: unicode path", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("https://ar.wikipedia.org/wiki/نجيب_محفوظ");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "ar.wikipedia.org");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "wiki");
etest::expect_eq(std::get<1>(url->path)[1], "%D9%86%D8%AC%D9%8A%D8%A8_%D9%85%D8%AD%D9%81%D9%88%D8%B8");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: tel URI", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("tel:+1-555-555-5555");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "tel");
etest::expect(!url->host.has_value());
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<0>(url->path), "+1-555-555-5555");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: username and passwd in authority", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("https://zero-one:testpass123@example.com/login.php");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(url->user, "zero-one");
etest::expect_eq(url->passwd, "testpass123");
etest::expect_eq(std::get<0>(url->host->data), "example.com");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "login.php");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: query", [] {
url::UrlParser p;
 
std::optional<url::Url> url =
p.parse("https://www.youtube.com/watch?v=2g5xkLqIElUlist=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORzindex=3");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "www.youtube.com");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "watch");
etest::expect_eq(url->query, "v=2g5xkLqIElUlist=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORzindex=3");
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: Welsh", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse(
"https://llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk/images/platformticket.gif");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(
std::get<0>(url->host->data), "llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "images");
etest::expect_eq(std::get<1>(url->path)[1], "platformticket.gif");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
// This domain exceeds the maximum length of both a domain component/label and a FQDN
etest::test("URL parsing: extreme Welsh", [] {
url::UrlParser p;
 
std::optional<url::Url> url =
p.parse("https://"
"llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgw"
"yngyllgogerychgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochllanfairpwllgwyngy"
"llgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgogerychgoge"
"rychwyrndrobwllllantysiliogogogochobwllllantysiliogogogoch.co.uk");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data),
"llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgo"
"gerychgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochllanfairpwllgwyngyllgogerychwyrndr"
"obwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgogerychgogerychwyrndrobwllllantysil"
"iogogogochobwllllantysiliogogogoch.co.uk");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: path, query, and fragment", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse(
"https://github.com/robinlinden/hastur/actions/runs/4441133331/jobs/7795829478?pr=476#step:7:31");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "github.com");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "robinlinden");
etest::expect_eq(std::get<1>(url->path)[1], "hastur");
etest::expect_eq(std::get<1>(url->path)[2], "actions");
etest::expect_eq(std::get<1>(url->path)[3], "runs");
etest::expect_eq(std::get<1>(url->path)[4], "4441133331");
etest::expect_eq(std::get<1>(url->path)[5], "jobs");
etest::expect_eq(std::get<1>(url->path)[6], "7795829478");
etest::expect_eq(url->query, "pr=476");
etest::expect_eq(url->fragment, "step:7:31");
});
 
etest::test("URL parsing: ipv4 and port", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("https://127.0.0.1:631");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<1>(url->host->data), 2130706433ul);
etest::expect_eq(url->port, 631);
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: ipv6 and port", [] {
url::UrlParser p;
 
const std::array<std::uint16_t, 8> addr{0x2001, 0xdb8, 0x85a3, 0, 0, 0x8a2e, 0x370, 0x7334};
 
std::optional<url::Url> url = p.parse("https://[2001:db8:85a3::8a2e:370:7334]:631");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<2>(url->host->data), addr);
etest::expect_eq(url->port, 631);
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: ipv6 v4-mapped with port", [] {
url::UrlParser p;
 
const std::array<std::uint16_t, 8> addr{0, 0, 0, 0, 0, 0xffff, 0x4ccb, 0x8c22};
 
std::optional<url::Url> url = p.parse("https://[0000:0000:0000:0000:0000:ffff:4ccb:8c22]:631");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<2>(url->host->data), addr);
etest::expect_eq(url->port, 631);
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: ipv6 v4-mapped compressed with dot-decimal", [] {
url::UrlParser p;
 
const std::array<std::uint16_t, 8> addr{0, 0, 0, 0, 0, 0xffff, 0x4ccb, 0x8c22};
 
std::optional<url::Url> url = p.parse("https://[::ffff:76.203.140.34]:631");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<2>(url->host->data), addr);
etest::expect_eq(url->port, 631);
etest::expect_eq(std::get<1>(url->path)[0], "");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: empty input", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("");
 
etest::expect(!url.has_value());
});
 
etest::test("URL parsing: empty input with base URL", [&base] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("", base);
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "example.com");
etest::expect_eq(url->port, 8080);
etest::expect_eq(std::get<1>(url->path)[0], "test");
etest::expect_eq(std::get<1>(url->path)[1], "index.php");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: query input with base URL", [&base] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("?view=table", base);
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "https");
etest::expect_eq(std::get<0>(url->host->data), "example.com");
etest::expect_eq(url->port, 8080);
etest::expect_eq(std::get<1>(url->path)[0], "test");
etest::expect_eq(std::get<1>(url->path)[1], "index.php");
etest::expect_eq(url->query, "view=table");
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/hastur/README.md");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "home");
etest::expect_eq(std::get<1>(url->path)[1], "zero-one");
etest::expect_eq(std::get<1>(url->path)[2], "repos");
etest::expect_eq(std::get<1>(url->path)[3], "hastur");
etest::expect_eq(std::get<1>(url->path)[4], "README.md");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL with double-dot", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/../hastur/README.md");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "home");
etest::expect_eq(std::get<1>(url->path)[1], "zero-one");
etest::expect_eq(std::get<1>(url->path)[2], "hastur");
etest::expect_eq(std::get<1>(url->path)[3], "README.md");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL with double-dot 2", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/../hastur/../README.md");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "home");
etest::expect_eq(std::get<1>(url->path)[1], "zero-one");
etest::expect_eq(std::get<1>(url->path)[2], "README.md");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL with double-dot 3", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("file:///../home/zero-one/repos/");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "home");
etest::expect_eq(std::get<1>(url->path)[1], "zero-one");
etest::expect_eq(std::get<1>(url->path)[2], "repos");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL with single-dot", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/./hastur/README.md");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "home");
etest::expect_eq(std::get<1>(url->path)[1], "zero-one");
etest::expect_eq(std::get<1>(url->path)[2], "repos");
etest::expect_eq(std::get<1>(url->path)[3], "hastur");
etest::expect_eq(std::get<1>(url->path)[4], "README.md");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
etest::test("URL parsing: file URL with windows path", [] {
url::UrlParser p;
 
std::optional<url::Url> url = p.parse(R"(file://C:\Users\zero-one\repos\hastur\README.md)");
 
etest::require(url.has_value());
 
etest::expect_eq(url->scheme, "file");
etest::expect_eq(std::get<0>(url->host->data), "");
etest::expect(!url->port.has_value());
etest::expect_eq(std::get<1>(url->path)[0], "C:");
etest::expect_eq(std::get<1>(url->path)[1], "Users");
etest::expect_eq(std::get<1>(url->path)[2], "zero-one");
etest::expect_eq(std::get<1>(url->path)[3], "repos");
etest::expect_eq(std::get<1>(url->path)[4], "hastur");
etest::expect_eq(std::get<1>(url->path)[5], "README.md");
etest::expect(!url->query.has_value());
etest::expect(!url->fragment.has_value());
});
 
// Run the tests and keep the result so that cleanup can happen before it is
// returned to the caller.
int ret = etest::run_all_tests();

// NOTE(review): presumably tears down ICU state used by the url library;
// confirm against the definition of url::icu_cleanup().
url::icu_cleanup();

return ret;
}
 
util/base_parser.h added: 2390, removed: 46, total 2344
@@ -10,6 +10,7 @@
 
#include <concepts>
#include <cstddef>
#include <optional>
#include <string_view>
 
namespace util {
@@ -17,13 +18,30 @@ namespace util {
template<typename T>
concept Predicate = std::predicate<T, char>;
 
// NOLINTBEGIN(bugprone-unchecked-optional-access)
class BaseParser {
public:
constexpr explicit BaseParser(std::string_view input) : input_{input} {}
 
constexpr char peek() const { return input_[pos_]; }
constexpr std::optional<char> peek() const {
if (is_eof()) {
return std::nullopt;
}
 
constexpr std::string_view peek(std::size_t chars) const { return input_.substr(pos_, chars); }
return input_[pos_];
}
 
constexpr std::optional<std::string_view> peek(std::size_t chars) const {
if (is_eof()) {
return std::nullopt;
}
 
return input_.substr(pos_, chars);
}
 
// Everything in the input after skipping `skip` characters from the current
// position, or an empty view if that would start at or past the end.
constexpr std::string_view remaining_from(std::size_t skip) const {
    if (pos_ + skip >= input_.size()) {
        return "";
    }

    return input_.substr(pos_ + skip);
}
 
// Whether the unconsumed input begins with `prefix`.
//
// At the end of the input peek() yields std::nullopt, which compares unequal
// to every prefix, including an empty one.
constexpr bool starts_with(std::string_view prefix) const {
    auto const head = peek(prefix.size());
    return head == prefix;
}
 
@@ -42,6 +60,8 @@ public:
pos_ = 0;
}
 
// Index into the input of the character that peek() would return next.
constexpr std::size_t current_pos() const {
    return pos_;
}
 
template<Predicate T>
constexpr std::string_view consume_while(T const &pred) {
std::size_t start = pos_;
@@ -52,7 +72,7 @@ public:
}
 
constexpr void skip_whitespace() {
while (!is_eof() && util::is_whitespace(peek())) {
while (!is_eof() && util::is_whitespace(*peek())) {
advance(1);
}
}
@@ -61,6 +81,7 @@ private:
std::string_view input_;
std::size_t pos_{0};
};
// NOLINTEND(bugprone-unchecked-optional-access)
 
} // namespace util
 
 
util/string.h added: 2390, removed: 46, total 2344
@@ -9,9 +9,14 @@
 
#include <algorithm>
#include <array>
#include <cassert>
#include <charconv>
#include <concepts>
#include <cstdint>
#include <iomanip>
#include <ios>
#include <iterator>
#include <optional>
#include <span>
#include <sstream>
#include <string>
@@ -65,6 +70,10 @@ constexpr bool is_hex_digit(char c) {
return is_upper_hex_digit(c) || is_lower_hex_digit(c);
}
 
// True for the digits '0' through '7'.
constexpr bool is_octal_digit(char c) {
    return !(c < '0' || c > '7');
}
 
constexpr char lowercased(char c) {
if (!is_upper_alpha(c)) {
return c;
@@ -196,6 +205,49 @@ inline std::string ipv6_serialize(std::span<std::uint16_t, 8> addr) {
return std::move(out).str();
}
 
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
inline std::string percent_encode(
std::string_view input, std::predicate<char> auto in_encode_set, bool space_as_plus = false) {
std::stringstream out;
 
for (char i : input) {
if (space_as_plus && i == ' ') {
out << '+';
} else if (in_encode_set(i)) {
out << '%' << std::setw(2) << std::uppercase << std::hex
<< static_cast<unsigned int>(static_cast<unsigned char>(i));
} else {
out << i;
}
}
 
return std::move(out).str();
}
 
// https://url.spec.whatwg.org/#percent-decode
constexpr std::string percent_decode(std::string_view input) {
std::string output;
 
for (std::size_t i = 0; i < input.size(); i++) {
if (input[i] != '%' || (input.size() <= i + 2 || !is_hex_digit(input[i + 1]) || !is_hex_digit(input[i + 2]))) {
output += input[i];
} else {
std::string_view digits = input.substr(i + 1, 2);
std::uint8_t num;
 
[[maybe_unused]] auto res = std::from_chars(digits.data(), digits.data() + digits.size(), num, 16);
 
assert(res.ec != std::errc::invalid_argument && res.ec != std::errc::result_out_of_range);
 
output += static_cast<char>(num);
 
i += 2;
}
}
 
return output;
}
 
} // namespace util
 
#endif
 
util/unicode.h added: 2390, removed: 46, total 2344
@@ -6,6 +6,7 @@
#define UTIL_UNICODE_H_
 
#include <cstdint>
#include <optional>
#include <string>
 
namespace util {
@@ -113,6 +114,70 @@ constexpr bool is_unicode_noncharacter(std::uint32_t code_point) {
}
}
 
// Decodes the first UTF-8 encoded codepoint in `input` and returns its value.
//
// Note: The input is assumed to be valid UTF-8, so continuation bytes are not
// validated. 0 is returned if the input is empty, shorter than the length
// announced by its first byte, or doesn't start with a recognized lead byte.
constexpr std::uint32_t utf8_to_utf32(std::string_view input) {
    if (input.empty()) {
        return 0;
    }

    auto const lead = static_cast<unsigned char>(input[0]);
    std::size_t const available = input.size();

    // 0xxxxxxx: single byte.
    if ((lead & 0b10000000u) == 0b00000000u) {
        return lead;
    }

    // 110xxxxx 10xxxxxx
    if ((lead & 0b11100000u) == 0b11000000u) {
        return available > 1 ? ((lead & 0b00011111u) << 6) | (input[1] & 0b00111111u) : 0u;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if ((lead & 0b11110000u) == 0b11100000u) {
        return available > 2
                ? ((lead & 0b00001111u) << 12) | ((input[1] & 0b00111111u) << 6) | (input[2] & 0b00111111u)
                : 0u;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if ((lead & 0b11111000u) == 0b11110000u) {
        return available > 3 ? ((lead & 0b00000111u) << 18) | ((input[1] & 0b00111111u) << 12)
                        | ((input[2] & 0b00111111u) << 6) | (input[3] & 0b00111111u)
                             : 0u;
    }

    return 0;
}
 
// Counts the codepoints in a UTF-8 encoded string.
//
// Note: The string is assumed to be valid UTF-8. Only the first byte of each
// codepoint is inspected; the bytes after it are skipped without checking
// that they are continuation bytes. Bytes that aren't a recognized lead byte
// are stepped over without being counted.
//
// If the string ends before a codepoint has as many bytes as its first byte
// announces, std::nullopt is returned.
constexpr std::optional<std::size_t> utf8_length(std::string_view input) {
    std::size_t count = 0;
    std::size_t idx = 0;

    while (idx < input.size()) {
        auto const unit = static_cast<unsigned char>(input[idx]);

        // Number of bytes expected after this one, if it is a lead byte.
        std::size_t trailing = 0;
        bool is_lead = true;

        if ((unit & 0b10000000) == 0b00000000) {
            trailing = 0;
        } else if ((unit & 0b11100000) == 0b11000000) {
            trailing = 1;
        } else if ((unit & 0b11110000) == 0b11100000) {
            trailing = 2;
        } else if ((unit & 0b11111000) == 0b11110000) {
            trailing = 3;
        } else {
            is_lead = false;
        }

        if (is_lead) {
            // idx < input.size() here, so the subtraction can't wrap around.
            if (input.size() - idx <= trailing) {
                return std::nullopt;
            }

            ++count;
        }

        idx += trailing + 1;
    }

    return count;
}
 
} // namespace util
 
#endif
 
util/unicode_test.cpp added: 2390, removed: 46, total 2344
@@ -78,5 +78,29 @@ int main() {
}
});
 
// Decodes one sample for each UTF-8 sequence length, from one byte up to four.
etest::test("utf8_to_utf32", [] {
    // 1 byte
    auto const one_byte = utf8_to_utf32("/"sv);
    expect_eq(one_byte, 0x002ful);

    // 2 bytes
    auto const two_bytes = utf8_to_utf32("Д"sv);
    expect_eq(two_bytes, 0x0414ul);

    // 3 bytes
    auto const three_bytes = utf8_to_utf32("ᛋ"sv);
    expect_eq(three_bytes, 0x16cbul);

    // 4 bytes
    auto const four_bytes = utf8_to_utf32("🫸"sv);
    expect_eq(four_bytes, 0x1faf8ul);

    // Only the first codepoint is decoded when several are passed.
    auto const first_of_many = utf8_to_utf32("🯷🯷🯷"sv);
    expect_eq(first_of_many, 0x1fbf7ul);
});
 
// Checks codepoint counts for ASCII, multi-byte and mixed strings, and that a
// truncated sequence is reported as std::nullopt.
etest::test("utf8_length", [] {
    // A single 4-byte codepoint counts as one.
    expect_eq(utf8_length("🮻"sv), 1ul);

    // Pure ASCII: one codepoint per byte.
    expect_eq(utf8_length("This string is 33 characters long"sv), 33ul);

    // Several 4-byte codepoints, without and with trailing ASCII.
    expect_eq(utf8_length("🤖🤖🤖"sv), 3ul);
    expect_eq(utf8_length("🆒🆒🆒🆒🆒🆒🆒!"sv), 8ul);

    // A lone lead byte of a 2-byte sequence, with its second byte missing.
    std::string truncated(1, static_cast<char>(0b11000000));
    expect_eq(utf8_length(truncated), std::nullopt);
});
 
return etest::run_all_tests();
}