srctree

David Zero parent 243b7a8b ded99b12
url/url: Add URL parser

inline split

.bazelrc added: 2390, removed: 46, total 2344

@@ -6,6 +6,8 @@ coverage --combined_report=lcov

test --test_output=errors

test --test_summary=terse

test --test_verbose_timeout_warnings

# Set ICU data directory for tests

test --test_env=HASTUR_ICU_DATA=external/icu-data/

# Bazel deprecations

# =========================================================

.clang-tidy added: 2390, removed: 46, total 2344

@@ -4,6 +4,8 @@

# -bugprone-narrowing-conversions: Very noisy for not much gain.

# -bugprone-unchecked-optional-access: Makes clang-tidy hang during CI.

# -clang-analyzer-cplusplus.NewDeleteLeaks: Lots of false positives w/

# -std=c++2b when calling std::make_shared in the JS AST.

# js/ast_executor_test.cpp:176:5: error: Potential leak of memory pointed to by

@@ -54,6 +56,7 @@ Checks: >

readability-qualified-auto,

-bugprone-exception-escape,

-bugprone-narrowing-conversions,

-bugprone-unchecked-optional-access,

-clang-analyzer-cplusplus.NewDeleteLeaks,

-clang-analyzer-optin.cplusplus.UninitializedObject,

-clang-diagnostic-builtin-macro-redefined,

.github/workflows/ci.yaml added: 2390, removed: 46, total 2344

@@ -266,7 +266,7 @@ jobs:

timeout-minutes: 30

steps:

- uses: actions/checkout@v3

- run: grep --recursive --no-filename --only-matching --exclude=WORKSPACE --exclude=*test.cpp --exclude=ci.yaml 'https://[^)(}{",# ]*' | grep -v '^https://$' | sort | uniq | xargs wget --spider

- run: grep --recursive --no-filename --only-matching --exclude-dir="*corpus" --exclude=WORKSPACE --exclude=*test.cpp --exclude=ci.yaml 'https://[^)(}{",# ]*' | grep -v '^https://$' | sort | uniq | xargs wget --spider

gitlint:

runs-on: ubuntu-22.04

WORKSPACE added: 2390, removed: 46, total 2344

@@ -119,6 +119,13 @@ http_archive(

)

# https://github.com/ocornut/imgui

http_archive(

name = "icu-data", # Unicode-DFS-2016

build_file_content = "exports_files([\"icudt72l.dat\"])",

sha256 = "1bc02487cbeaec3fc2d0dc941e8b243e7d35cd79899a201df88dc9ec9667a162",

url = "https://github.com/unicode-org/icu/releases/download/release-72-1/icu4c-72_1-data-bin-l.zip",

)

http_archive(

name = "imgui", # MIT

build_file = "//third_party:imgui.BUILD",

third_party/icu.BUILD added: 2390, removed: 46, total 2344

@@ -31,13 +31,20 @@ cc_library(

"//conditions:default": [],

}),

defines = [

"U_STATIC_IMPLEMENTATION",

"U_COMMON_IMPLEMENTATION",

"U_CHARSET_IS_UTF8=1",

"U_HIDE_OBSOLETE_UTF_OLD_H=1",

"UCONFIG_NO_CONVERSION=1",

linkopts = select({

"@platforms//os:windows": [],

"@platforms//os:windows": [

"-DEFAULTLIB:advapi32",

"//conditions:default": ["-ldl"],

}),

local_defines = [

"U_COMMON_IMPLEMENTATION",

linkstatic = True,

strip_include_prefix = "source/common/",

visibility = ["//visibility:public"],

)

url/BUILD added: 2390, removed: 46, total 2344

@@ -1,16 +1,33 @@

load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")

load("//bzl:copts.bzl", "HASTUR_COPTS")

load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test")

load("//bzl:copts.bzl", "HASTUR_COPTS", "HASTUR_FUZZ_PLATFORMS")

cc_library(

name = "rtti_hack",

srcs = ["rtti_hack.cpp"],

hdrs = ["rtti_hack.h"],

copts = HASTUR_COPTS + select({

"@platforms//os:windows": ["/GR"],

"//conditions:default": ["-frtti"],

}),

deps = ["@icu//:common"],

)

cc_library(

name = "url",

srcs = ["url.cpp"],

hdrs = ["url.h"],

copts = HASTUR_COPTS,

data = ["@icu-data//:icudt72l.dat"],

visibility = ["//visibility:public"],

deps = [

":rtti_hack",

"//util:base_parser",

"//util:string",

"//util:unicode",

"//util:uuid",

"@icu//:common",

"@spdlog",

)

@@ -22,5 +39,16 @@ cc_test(

deps = [

":url",

"//etest",

"@icu//:common",

)

cc_fuzz_test(

name = "url_fuzz_test",

size = "small",

srcs = ["url_fuzz_test.cpp"],

copts = HASTUR_COPTS,

corpus = glob(["url_fuzz_test_corpus/**"]),

target_compatible_with = HASTUR_FUZZ_PLATFORMS,

deps = [":url"],

)

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1,7 @@

// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#include "url/rtti_hack.h"

template class icu::StringByteSink<std::string>;

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1,17 @@

// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#ifndef URL_RTTI_HACK_H_

#define URL_RTTI_HACK_H_

#include <unicode/bytestream.h>

#include <string>

// icu needs to be compiled w/ rtti, and that means that any templates of theirs

// that we instantiate also require rtti, so we instantiate them here to try to

// shield the rest of the codebase from that.

extern template class icu::StringByteSink<std::string>;

#endif

url/url.cpp added: 2390, removed: 46, total 2344

@@ -4,15 +4,155 @@

#include "url/url.h"

#include "url/rtti_hack.h"

#include "util/string.h"
#include "util/unicode.h"
#include "util/uuid.h"

#include <spdlog/spdlog.h>
#include <unicode/bytestream.h>
#include <unicode/idna.h>
#include <unicode/putil.h>
#include <unicode/uclean.h>

#include <array>
#include <atomic>
#include <cassert>
#include <charconv>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <filesystem>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

namespace url {

// NOLINTBEGIN(misc-redundant-expression)

// NOLINTBEGIN(bugprone-unchecked-optional-access)

namespace {

// Schemes the URL parser treats as "special", mapped to the port that is dropped from a parsed URL
// when it is given explicitly. "file" is listed with 0 because it needs an entry to count as
// special.
const std::map<std::string, std::uint16_t> special_schemes = {
        {"file", std::uint16_t{0}},
        {"ftp", std::uint16_t{21}},
        {"http", std::uint16_t{80}},
        {"https", std::uint16_t{443}},
        {"ws", std::uint16_t{80}},
        {"wss", std::uint16_t{443}},
};

// Human-readable description for each UrlParser::ValidationError.
// UrlParser::validation_error() looks entries up with .at(), so an enumerator without an entry
// here makes that lookup throw std::out_of_range.
const std::map<UrlParser::ValidationError, std::string> validation_error_str = {
{UrlParser::ValidationError::DomainToAscii, "Unicode ToASCII records an error or returns the empty string"},
{UrlParser::ValidationError::DomainToUnicode, "Unicode ToUnicode records an error"},
{UrlParser::ValidationError::DomainInvalidCodePoint, "The input's host contains a forbidden domain code point"},
{UrlParser::ValidationError::HostInvalidCodePoint,
"An opaque host (in a URL that is not special) contains a forbidden host code point"},
{UrlParser::ValidationError::IPv4EmptyPart, "An IPv4 address ends with a U+002E (.)"},
{UrlParser::ValidationError::IPv4TooManyParts, "An IPv4 address does not consist of exactly 4 parts"},
{UrlParser::ValidationError::IPv4NonNumericPart, "An IPv4 address part is not numeric"},
{UrlParser::ValidationError::IPv4NonDecimalPart,
"The IPv4 address contains numbers expressed using hexadecimal or octal digits"},
{UrlParser::ValidationError::IPv4OutOfRangePart, "An IPv4 address part exceeds 255"},
{UrlParser::ValidationError::IPv6Unclosed, "An IPv6 address is missing the closing U+005D (])"},
{UrlParser::ValidationError::IPv6InvalidCompression, "An IPv6 address begins with improper compression"},
{UrlParser::ValidationError::IPv6TooManyPieces, "An IPv6 address contains more than 8 pieces"},
{UrlParser::ValidationError::IPv6MultipleCompression, "An IPv6 address is compressed in more than one spot"},
{UrlParser::ValidationError::IPv6InvalidCodePoint,
"An IPv6 address contains a code point that is neither an ASCII hex digit nor a U+003A (:), or it "
"unexpectedly ends"},
{UrlParser::ValidationError::IPv6TooFewPieces, "An uncompressed IPv6 address contains fewer than 8 pieces"},
{UrlParser::ValidationError::IPv4InIPv6TooManyPieces,
"An IPv6 address with IPv4 address syntax: the IPv6 address has more than 6 pieces"},
{UrlParser::ValidationError::IPv4InIPv6InvalidCodePoint,
"An IPv6 address with IPv4 address syntax: An IPv4 part is empty or contains a non-ASCII digit, an "
"IPv4 part contains a leading 0, or there are too many IPv4 parts"},
{UrlParser::ValidationError::IPv4InIPv6OutOfRangePart,
"An IPv6 address with IPv4 address syntax: an IPv4 part exceeds 255"},
{UrlParser::ValidationError::IPv4InIPv6TooFewParts,
"An IPv6 address with IPv4 address syntax: an IPv4 address contains too few parts"},
{UrlParser::ValidationError::InvalidUrlUnit, "A code point is found that is not a URL unit"},
{UrlParser::ValidationError::SpecialSchemeMissingFollowingSolidus,
"The input's scheme is not followed by \"//\""},
{UrlParser::ValidationError::MissingSchemeNonRelativeUrl,
"The input is missing a scheme, because it does not begin with an ASCII alpha, and either no base "
"URL was provided or the base URL cannot be used as a base URL because it has an opaque path"},
{UrlParser::ValidationError::InvalidReverseSolidus,
"The URL has a special scheme and it uses U+005C (\\) instead of U+002F (/)"},
{UrlParser::ValidationError::InvalidCredentials, "The input includes credentials"},
{UrlParser::ValidationError::HostMissing, "The input has a special scheme, but does not contain a host"},
{UrlParser::ValidationError::PortOutOfRange, "The input's port is too big"},
{UrlParser::ValidationError::PortInvalid, "The input's port is invalid"},
{UrlParser::ValidationError::FileInvalidWindowsDriveLetter,
"The input is a relative-URL string that starts with a Windows drive letter and the base URL's "
"scheme is \"file\""},
{UrlParser::ValidationError::FileInvalidWindowsDriveLetterHost,
"A file: URL's host is a Windows drive letter"}};

// Byte predicates handed to util::percent_encode(); each returns true when the byte belongs to
// the named set. The sets are layered: every set includes the one it delegates to in its default
// case.
struct PercentEncodeSet {
    // Bytes accepted by util::is_c0, plus 0x7f and every byte above it.
    static constexpr bool c0_control(char c) {
        return static_cast<std::uint8_t>(c) > 0x7f || c == 0x7f || util::is_c0(c);
    }

    static constexpr bool fragment(char c) {
        switch (c) {
            case ' ':
            case '"':
            case '<':
            case '>':
            case '`':
                return true;
            default:
                return c0_control(c);
        }
    }

    static constexpr bool query(char c) {
        switch (c) {
            case ' ':
            case '"':
            case '#':
            case '<':
            case '>':
                return true;
            default:
                return c0_control(c);
        }
    }

    static constexpr bool special_query(char c) {
        if (c == '\'') {
            return true;
        }

        return query(c);
    }

    static constexpr bool path(char c) {
        switch (c) {
            case '?':
            case '`':
            case '{':
            case '}':
                return true;
            default:
                return query(c);
        }
    }

    static constexpr bool userinfo(char c) {
        switch (c) {
            case '/':
            case ':':
            case ';':
            case '=':
            case '@':
            case '[': // '[' through '^' inclusive.
            case '\\':
            case ']':
            case '^':
            case '|':
                return true;
            default:
                return path(c);
        }
    }

    static constexpr bool component(char c) {
        switch (c) {
            case '$': // '$' through '&' inclusive.
            case '%':
            case '&':
            case '+':
            case ',':
                return true;
            default:
                return userinfo(c);
        }
    }
};

} // namespace

// Forwards to ICU's u_cleanup() (declared in <unicode/uclean.h>).
// NOTE(review): icu_init() in this file runs its setup at most once per process, so nothing here
// re-initialises ICU after a cleanup -- presumably this is only meant to be called at shutdown;
// confirm against callers.
void icu_cleanup() {
    u_cleanup();
}

static void icu_init() {

static std::atomic<bool> called_once = false;

if (called_once.exchange(true)) {

return;

}

char *data = std::getenv("HASTUR_ICU_DATA");

if (data != nullptr) {

std::filesystem::path env_path{data};

if (std::filesystem::is_directory(env_path)) {

u_setDataDirectory(env_path.string().c_str());

}

} else {

// Use current working directory as a last resort.

// TODO(zero-one): Look at engine config for paths.

u_setDataDirectory(std::filesystem::current_path().string().c_str());

}

UErrorCode err = U_ZERO_ERROR;

std::uint32_t opts =

UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_USE_STD3_RULES;

[[maybe_unused]] auto *uts = icu::IDNA::createUTS46Instance(opts, err);

assert(!U_FAILURE(err));

delete uts;

}

// https://w3c.github.io/FileAPI/#unicodeBlobURL

std::string blob_url_create(Origin const &origin) {

@@ -51,4 +191,1378 @@ std::string blob_url_create(Origin const &origin) {

return result;

}

// Logs a validation error at debug level together with the current input position and parser
// state. Throws std::out_of_range if `err` has no entry in validation_error_str.
void UrlParser::validation_error(ValidationError err) const {
    std::string const &description = validation_error_str.at(err);
    auto const state_id = std::to_underlying(state_);
    auto const error_id = std::to_underlying(err);

    spdlog::debug(
            "url: InputPos: {}, ParserState: {}, Validation Error: {} {}", current_pos(), state_id, error_id, description);
}

// https://url.spec.whatwg.org/#concept-url-parser
//
// Parses `input`, optionally relative to `base`. Returns std::nullopt right away when there is
// neither input nor a base URL; otherwise returns whatever parse_basic() produces.
std::optional<Url> UrlParser::parse(std::string input, std::optional<Url> base) {
    bool const nothing_to_parse = input.empty() && !base.has_value();
    if (nothing_to_parse) {
        return std::nullopt;
    }

    std::optional<Url> parsed = parse_basic(std::move(input), std::move(base), std::nullopt, std::nullopt);

    // TODO(zero-one): Resolve blob URL
    // (Applies when parsed.has_value() && parsed->scheme == "blob"; nothing is done for it yet.)

    return parsed;
}

// https://url.spec.whatwg.org/#concept-basic-url-parser

std::optional<Url> UrlParser::parse_basic(std::string input,

std::optional<Url> base, // NOLINT(bugprone-easily-swappable-parameters)

std::optional<Url> url, // NOLINT(bugprone-easily-swappable-parameters)

std::optional<ParserState> state_override) {

base_ = std::move(base);

state_override_ = state_override;

if (!url.has_value()) {

// Set url to a new URL

url_ = Url();

url_.path = std::vector<std::string>{};

bool leading_trailing_c0 = false;

while (!input.empty() && util::is_c0_or_space(input.front())) {

input.erase(0, 1);

leading_trailing_c0 = true;

}

while (!input.empty() && util::is_c0_or_space(input.back())) {

input.pop_back();

leading_trailing_c0 = true;

}

if (leading_trailing_c0) {

validation_error(ValidationError::InvalidUrlUnit);

}

} else {

url_ = *url;

}

if (std::erase_if(input, util::is_tab_or_newline) > 0) {

validation_error(ValidationError::InvalidUrlUnit);

}

state_ = state_override_.value_or(ParserState::SchemeStart);

buffer_.clear();

at_sign_seen_ = false;

inside_brackets_ = false;

password_token_seen_ = false;

// Initialize BaseParser with our modified input

reset(input);

while (true) {

switch (state_) {

case ParserState::SchemeStart:

state_scheme_start();

break;

case ParserState::Scheme:

state_scheme();

break;

case ParserState::NoScheme:

state_no_scheme();

break;

case ParserState::SpecialRelativeOrAuthority:

state_special_relative_or_authority();

break;

case ParserState::PathOrAuthority:

state_path_or_authority();

break;

case ParserState::Relative:

state_relative();

break;

case ParserState::RelativeSlash:

state_relative_slash();

break;

case ParserState::SpecialAuthoritySlashes:

state_special_authority_slashes();

break;

case ParserState::SpecialAuthorityIgnoreSlashes:

state_special_authority_ignore_slashes();

break;

case ParserState::Authority:

state_authority();

break;

case ParserState::Host:

case ParserState::Hostname:

state_host();

break;

case ParserState::Port:

state_port();

break;

case ParserState::File:

state_file();

break;

case ParserState::FileSlash:

state_file_slash();

break;

case ParserState::FileHost:

state_file_host();

break;

case ParserState::PathStart:

state_path_start();

break;

case ParserState::Path:

state_path();

break;

case ParserState::OpaquePath:

state_opaque_path();

break;

case ParserState::Query:

state_query();

break;

case ParserState::Fragment:

state_fragment();

break;

case ParserState::Failure:

return std::nullopt;

case ParserState::Terminate:

// I use this state where the spec returns "nothing" (i.e, the parser is modifying a given optional URL)

// Instead of modifying it in-place, I modify a copy and return that instead of nothing.

return url_;

}

// This check accomodates the one scenario (commented on in

// state_scheme_start, below) in which the parser position goes

// negative.

if (is_eof() && current_pos() != static_cast<std::size_t>(-1)) {

break;

}

advance(1);

}

return url_;

}

// https://url.spec.whatwg.org/#scheme-start-state
//
// A character accepted by util::is_alpha starts a scheme. Anything else moves on to the no-scheme
// state, or fails when a state override is active.
void UrlParser::state_scheme_start() {
    auto const c = peek();

    if (c.has_value() && util::is_alpha(*c)) {
        buffer_ += util::lowercased(*c);
        state_ = ParserState::Scheme;
        return;
    }

    if (state_override_.has_value()) {
        state_ = ParserState::Failure;
        return;
    }

    state_ = ParserState::NoScheme;
    // This can underflow pos_; that's ok, because it's incremented again before it's ever used.
    back(1);
}

// https://url.spec.whatwg.org/#scheme-state

void UrlParser::state_scheme() {

if (auto c = peek(); c.has_value() && (util::is_alphanumeric(*c) || c == '+' || c == '-' || c == '.')) {

buffer_ += util::lowercased(*c);

} else if (c == ':') {

if (state_override_.has_value()) {

if (special_schemes.contains(url_.scheme) && !special_schemes.contains(buffer_)) {

state_ = ParserState::Terminate;

return;

}

if (!special_schemes.contains(url_.scheme) && special_schemes.contains(buffer_)) {

state_ = ParserState::Terminate;

return;

}

if ((includes_credentials(url_) || url_.port.has_value()) && buffer_ == "file") {

state_ = ParserState::Terminate;

return;

}

if (url_.scheme == "file" && url_.host.has_value() && url_.host->type == HostType::Empty) {

state_ = ParserState::Terminate;

return;

}

url_.scheme = buffer_;

if (state_override_.has_value()) {

if (special_schemes.contains(url_.scheme) && url_.port == special_schemes.at(url_.scheme)) {

url_.port.reset();

}

state_ = ParserState::Terminate;

return;

}

buffer_.clear();

if (url_.scheme == "file") {

if (!remaining_from(1).starts_with("//")) {

validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);

}

state_ = ParserState::File;

} else if (special_schemes.contains(url_.scheme) && base_.has_value() && base_->scheme == url_.scheme) {

assert(special_schemes.contains(base_->scheme));

state_ = ParserState::SpecialRelativeOrAuthority;

} else if (special_schemes.contains(url_.scheme)) {

state_ = ParserState::SpecialAuthoritySlashes;

} else if (remaining_from(1).starts_with('/')) {

state_ = ParserState::PathOrAuthority;

advance(1);

} else {

url_.path = "";

state_ = ParserState::OpaquePath;

}

} else if (!state_override_.has_value()) {

buffer_.clear();

state_ = ParserState::NoScheme;

reset();

} else {

state_ = ParserState::Failure;

return;

}

// https://url.spec.whatwg.org/#no-scheme-state
//
// Input without a scheme can only be parsed relative to a base URL. Fails when there is no base,
// or when the base has an opaque path and the input is not just a fragment.
void UrlParser::state_no_scheme() {
    auto const c = peek();

    if (!base_.has_value() || (has_opaque_path(*base_) && c != '#')) {
        validation_error(ValidationError::MissingSchemeNonRelativeUrl);
        state_ = ParserState::Failure;
        return;
    }

    if (has_opaque_path(*base_) && c == '#') {
        // Only a fragment on top of an opaque-path base: copy the base and parse the fragment.
        url_.scheme = base_->scheme;
        url_.path = base_->path;
        url_.query = base_->query;
        url_.fragment = "";
        state_ = ParserState::Fragment;
        return;
    }

    state_ = base_->scheme != "file" ? ParserState::Relative : ParserState::File;
    // Let the next state look at the same character again.
    back(1);
}

// https://url.spec.whatwg.org/#special-relative-or-authority-state
//
// "//" introduces an authority; anything else is parsed relative to the base URL.
void UrlParser::state_special_relative_or_authority() {
    bool const double_slash = peek() == '/' && remaining_from(1).starts_with('/');

    if (double_slash) {
        state_ = ParserState::SpecialAuthorityIgnoreSlashes;
        advance(1);
        return;
    }

    validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
    state_ = ParserState::Relative;
    back(1);
}

// https://url.spec.whatwg.org/#path-or-authority-state
//
// A '/' here means an authority follows; otherwise the character is re-read in the path state.
void UrlParser::state_path_or_authority() {
    if (peek() == '/') {
        state_ = ParserState::Authority;
        return;
    }

    state_ = ParserState::Path;
    back(1);
}

// https://url.spec.whatwg.org/#relative-state
//
// Resolves scheme-less input against a non-file base URL: the scheme always comes from the base,
// and unless the input starts with a slash the remaining components are copied as well before
// the input's own path, query or fragment is parsed.
void UrlParser::state_relative() {
    assert(base_.has_value() && base_->scheme != "file");

    url_.scheme = base_->scheme;

    auto const c = peek();

    if (c == '/') {
        state_ = ParserState::RelativeSlash;
        return;
    }

    if (special_schemes.contains(url_.scheme) && c == '\\') {
        validation_error(ValidationError::InvalidReverseSolidus);
        state_ = ParserState::RelativeSlash;
        return;
    }

    url_.user = base_->user;
    url_.passwd = base_->passwd;
    url_.host = base_->host;
    url_.port = base_->port;
    url_.path = base_->path;
    url_.query = base_->query;

    if (c == '?') {
        url_.query = "";
        state_ = ParserState::Query;
    } else if (c == '#') {
        url_.fragment = "";
        state_ = ParserState::Fragment;
    } else if (!is_eof()) {
        // A relative path replaces the base's query and the last segment of its path.
        url_.query.reset();
        shorten_url_path(url_);
        state_ = ParserState::Path;
        back(1);
    }
}

// https://url.spec.whatwg.org/#relative-slash-state
//
// Entered after one leading slash of scheme-less input. A second slash introduces an authority;
// otherwise the authority parts are copied from the base URL and the rest is parsed as a path.
void UrlParser::state_relative_slash() {
    auto const c = peek();

    if (special_schemes.contains(url_.scheme) && (c == '/' || c == '\\')) {
        if (c == '\\') {
            validation_error(ValidationError::InvalidReverseSolidus);
        }

        state_ = ParserState::SpecialAuthorityIgnoreSlashes;
        return;
    }

    if (c == '/') {
        state_ = ParserState::Authority;
        return;
    }

    url_.user = base_->user;
    url_.passwd = base_->passwd;
    url_.host = base_->host;
    url_.port = base_->port;
    state_ = ParserState::Path;
    back(1);
}

// https://url.spec.whatwg.org/#special-authority-slashes-state
//
// Expects "//" after a special scheme. Either way parsing continues in the
// special-authority-ignore-slashes state; a missing "//" is only reported.
void UrlParser::state_special_authority_slashes() {
    bool const double_slash = peek() == '/' && remaining_from(1).starts_with('/');

    state_ = ParserState::SpecialAuthorityIgnoreSlashes;

    if (double_slash) {
        advance(1);
        return;
    }

    validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
    back(1);
}

// https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
//
// Skips any further slashes or backslashes (reporting each one) before the authority starts.
void UrlParser::state_special_authority_ignore_slashes() {
    auto const c = peek();
    bool const is_slash = c == '/' || c == '\\';

    if (is_slash) {
        validation_error(ValidationError::SpecialSchemeMissingFollowingSolidus);
        return;
    }

    state_ = ParserState::Authority;
    back(1);
}

// https://url.spec.whatwg.org/#authority-state
//
// Buffers characters until either '@' (what was buffered is credentials) or the end of the
// authority (what was buffered is the host, which is then re-read in the host state).
void UrlParser::state_authority() {
    auto const c = peek();

    if (c == '@') {
        validation_error(ValidationError::InvalidCredentials);

        // A second '@' means the previous one belonged to the credentials.
        if (at_sign_seen_) {
            buffer_.insert(0, "%40");
        }

        at_sign_seen_ = true;

        // Everything before the first ':' is the username, everything after it the password.
        for (std::size_t i = 0; i < buffer_.size(); i++) {
            if (buffer_[i] == ':' && !password_token_seen_) {
                password_token_seen_ = true;
                continue;
            }

            std::string const encoded =
                    util::percent_encode(std::string_view{buffer_}.substr(i, 1), PercentEncodeSet::userinfo);

            if (password_token_seen_) {
                url_.passwd += encoded;
            } else {
                url_.user += encoded;
            }
        }

        buffer_.clear();
        return;
    }

    bool const end_of_authority =
            is_eof() || c == '/' || c == '?' || c == '#' || (special_schemes.contains(url_.scheme) && c == '\\');

    if (!end_of_authority) {
        buffer_ += *c;
        return;
    }

    if (at_sign_seen_ && buffer_.empty()) {
        validation_error(ValidationError::InvalidCredentials);
        state_ = ParserState::Failure;
        return;
    }

    // The spec says to use code-point length, but that causes the parser
    // not to back up far enough; it will truncate characters going into
    // the host state. It seems to only apply if you're parsing codepoint
    // by codepoint instead of byte-by-byte like we are.
    // back(util::utf8_length(buffer_) + 1);
    back(buffer_.size() + 1);
    buffer_.clear();
    state_ = ParserState::Host;
}

// https://url.spec.whatwg.org/#host-state

void UrlParser::state_host() {

if (auto c = peek(); state_override_.has_value() && url_.scheme == "file") {

back(1);

state_ = ParserState::FileHost;

} else if (c == ':' && !inside_brackets_) {

if (buffer_.empty()) {

validation_error(ValidationError::HostMissing);

state_ = ParserState::Failure;

return;

}

if (state_override_.has_value() && *state_override_ == ParserState::Hostname) {

state_ = ParserState::Terminate;

return;

}

std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));

if (!host.has_value()) {

state_ = ParserState::Failure;

return;

}

url_.host = host;

buffer_.clear();

state_ = ParserState::Port;

} else if ((is_eof() || c == '/' || c == '?' || c == '#') || (special_schemes.contains(url_.scheme) && c == '\\')) {

back(1);

if (special_schemes.contains(url_.scheme) && buffer_.empty()) {

validation_error(ValidationError::HostMissing);

state_ = ParserState::Failure;

return;

} else if (state_override_.has_value() && buffer_.empty()

&& (includes_credentials(url_) || url_.port.has_value())) {

state_ = ParserState::Terminate;

return;

}

std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));

if (!host.has_value()) {

state_ = ParserState::Failure;

return;

}

url_.host = host;

buffer_.clear();

state_ = ParserState::PathStart;

if (state_override_.has_value()) {

state_ = ParserState::Terminate;

return;

}

} else {

if (c == '[') {

inside_brackets_ = true;

}

if (c == ']') {

inside_brackets_ = false;

}

buffer_ += *c;

}

// https://url.spec.whatwg.org/#port-state
//
// Buffers digits; at the end of the port the buffer is converted and stored in url_.port. A port
// equal to the special_schemes entry for the URL's scheme is stored as "no port". Ports above
// 65535 are rejected with a PortOutOfRange validation error.
void UrlParser::state_port() {
    auto const c = peek();

    if (c.has_value() && util::is_digit(*c)) {
        buffer_ += *c;
        return;
    }

    auto const scheme_entry = special_schemes.find(url_.scheme);
    bool const is_special = scheme_entry != special_schemes.end();

    bool const end_of_port =
            (is_eof() || c == '/' || c == '?' || c == '#') || (is_special && c == '\\') || state_override_.has_value();

    if (!end_of_port) {
        validation_error(ValidationError::PortInvalid);
        state_ = ParserState::Failure;
        return;
    }

    if (!buffer_.empty()) {
        constexpr std::uint32_t max_port = 0xFFFF;

        std::uint32_t port = 0;
        auto const res = std::from_chars(buffer_.data(), buffer_.data() + buffer_.size(), port);

        // Digits that don't even fit the 32-bit temporary are out of range as well.
        if (res.ec == std::errc::result_out_of_range || (res.ec == std::errc{} && port > max_port)) {
            validation_error(ValidationError::PortOutOfRange);
            state_ = ParserState::Failure;
            return;
        }

        if (res.ec != std::errc{}) {
            spdlog::info("Invalid port given in URL");
            state_ = ParserState::Failure;
            return;
        }

        if (is_special && port == scheme_entry->second) {
            url_.port = std::nullopt;
        } else {
            url_.port = static_cast<std::uint16_t>(port);
        }

        buffer_.clear();
    }

    if (state_override_.has_value()) {
        state_ = ParserState::Terminate;
        return;
    }

    state_ = ParserState::PathStart;
    back(1);
}

// https://url.spec.whatwg.org/#file-state
//
// Start of a file: URL (or of scheme-less input resolved against a file: base). A leading slash
// goes on to the file-slash state; otherwise host, path and query are taken from a file: base
// before the input's own path, query or fragment is parsed.
void UrlParser::state_file() {
    url_.scheme = "file";
    url_.host = Host{HostType::Empty};

    auto const c = peek();

    if (c == '/' || c == '\\') {
        if (c == '\\') {
            validation_error(ValidationError::InvalidReverseSolidus);
        }

        state_ = ParserState::FileSlash;
        return;
    }

    if (!base_.has_value() || base_->scheme != "file") {
        state_ = ParserState::Path;
        back(1);
        return;
    }

    url_.host = base_->host;
    url_.path = base_->path;
    url_.query = base_->query;

    if (c == '?') {
        url_.query = "";
        state_ = ParserState::Query;
    } else if (c == '#') {
        url_.fragment = "";
        state_ = ParserState::Fragment;
    } else if (!is_eof()) {
        url_.query = std::nullopt;

        // NOTE(review): the spec checks the input from the current character onwards; elsewhere in
        // this file remaining_from(1) appears to start *after* the current character. Confirm this
        // offset against BaseParser::remaining_from().
        if (!starts_with_windows_drive_letter(remaining_from(1))) {
            shorten_url_path(url_);
        } else {
            validation_error(ValidationError::FileInvalidWindowsDriveLetter);
            url_.path = std::vector<std::string>{};
        }

        state_ = ParserState::Path;
        back(1);
    }
}

// https://url.spec.whatwg.org/#file-slash-state
//
// Entered after one leading slash of a file: URL. A second slash introduces a host; otherwise the
// host (and possibly the drive letter) of a file: base URL is inherited and the rest is parsed as
// a path.
void UrlParser::state_file_slash() {
    auto const c = peek();

    if (c == '/' || c == '\\') {
        if (c == '\\') {
            validation_error(ValidationError::InvalidReverseSolidus);
        }

        state_ = ParserState::FileHost;
        return;
    }

    if (base_.has_value() && base_->scheme == "file") {
        url_.host = base_->host;

        // Only look at the base's first path segment if it has one; indexing an empty (or opaque)
        // base path would be undefined behaviour / throw.
        auto const *base_segments = std::get_if<1>(&base_->path);
        bool const base_has_segment = base_segments != nullptr && !base_segments->empty();

        // NOTE(review): the spec checks the input from the current character onwards; elsewhere in
        // this file remaining_from(1) appears to start *after* the current character. Confirm this
        // offset against BaseParser::remaining_from().
        if (base_has_segment && !starts_with_windows_drive_letter(remaining_from(1))
                && is_normal_windows_drive_letter(base_segments->front())) {
            // Keep the base's drive letter as the first path segment.
            std::get<1>(url_.path).push_back(base_segments->front());
        }
    }

    state_ = ParserState::Path;
    back(1);
}

// https://url.spec.whatwg.org/#file-host-state
//
// Buffers the host of a file: URL up to the next '/', '\\', '?', '#' or the end of the input. A
// buffered Windows drive letter is treated as the start of the path instead of a host, and the
// host "localhost" is stored as the empty string.
void UrlParser::state_file_host() {
    auto const c = peek();
    bool const end_of_host = is_eof() || c == '/' || c == '\\' || c == '?' || c == '#';

    if (!end_of_host) {
        buffer_ += *c;
        return;
    }

    back(1);

    if (!state_override_.has_value() && is_windows_drive_letter(buffer_)) {
        // buffer_ is deliberately kept: the path state picks it up as the first segment.
        validation_error(ValidationError::FileInvalidWindowsDriveLetterHost);
        state_ = ParserState::Path;
        return;
    }

    if (buffer_.empty()) {
        url_.host = Host{HostType::Empty};

        if (state_override_.has_value()) {
            state_ = ParserState::Terminate;
            return;
        }

        state_ = ParserState::PathStart;
        return;
    }

    std::optional<Host> host = parse_host(buffer_, !special_schemes.contains(url_.scheme));
    if (!host.has_value()) {
        state_ = ParserState::Failure;
        return;
    }

    if (auto *h = std::get_if<0>(&host->data); h != nullptr && *h == "localhost") {
        *h = "";
    }

    url_.host = host;

    if (state_override_.has_value()) {
        state_ = ParserState::Terminate;
        return;
    }

    buffer_.clear();
    state_ = ParserState::PathStart;
}

// https://url.spec.whatwg.org/#path-start-state
//
// Decides what follows the authority: a path, a query or a fragment. The slash that starts the
// path is consumed here; any other character is re-read in the path state.
void UrlParser::state_path_start() {
    auto const c = peek();

    if (special_schemes.contains(url_.scheme)) {
        if (c == '\\') {
            validation_error(ValidationError::InvalidReverseSolidus);
        }

        state_ = ParserState::Path;

        if (c != '/' && c != '\\') {
            back(1);
        }

        return;
    }

    if (!state_override_.has_value() && c == '?') {
        url_.query = "";
        state_ = ParserState::Query;
    } else if (!state_override_.has_value() && c == '#') {
        url_.fragment = "";
        state_ = ParserState::Fragment;
    } else if (!is_eof()) {
        state_ = ParserState::Path;

        if (c != '/') {
            back(1);
        }
    } else if (state_override_.has_value() && !url_.host.has_value()) {
        std::get<1>(url_.path).push_back("");
    }
}

// https://url.spec.whatwg.org/#path-state
//
// Buffers one path segment at a time (percent-encoded with the path set). At the end of a segment
// the buffer is appended to the path list, with ".." removing the previous segment and "." being
// dropped; both are also recognised in their "%2e" spellings.
void UrlParser::state_path() {
    auto const c = peek();
    bool const is_special = special_schemes.contains(url_.scheme);
    bool const at_separator = c == '/' || (is_special && c == '\\');

    bool const end_of_segment =
            is_eof() || at_separator || (!state_override_.has_value() && (c == '?' || c == '#'));

    if (!end_of_segment) {
        if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        // A '%' must be followed by two hex digits.
        if (c == '%'
                && (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
                        || !util::is_hex_digit(remaining_from(1)[1]))) {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        buffer_ += util::percent_encode(*peek(1), PercentEncodeSet::path);
        return;
    }

    if (is_special && c == '\\') {
        validation_error(ValidationError::InvalidReverseSolidus);
    }

    // Lowercase once instead of once per comparison.
    auto const lowered = util::lowercased(buffer_);
    bool const is_double_dot = buffer_ == ".." || lowered == ".%2e" || lowered == "%2e." || lowered == "%2e%2e";
    bool const is_single_dot = buffer_ == "." || lowered == "%2e";

    if (is_double_dot) {
        shorten_url_path(url_);

        if (!at_separator) {
            std::get<1>(url_.path).push_back("");
        }
    } else if (is_single_dot && !at_separator) {
        std::get<1>(url_.path).push_back("");
    } else if (!is_single_dot) {
        // Normalise a leading drive letter such as "C|" to "C:".
        if (url_.scheme == "file" && std::get<1>(url_.path).empty() && is_windows_drive_letter(buffer_)) {
            buffer_[1] = ':';
        }

        std::get<1>(url_.path).push_back(buffer_);
    }

    buffer_.clear();

    if (c == '?') {
        url_.query = "";
        state_ = ParserState::Query;
    }

    if (c == '#') {
        url_.fragment = "";
        state_ = ParserState::Fragment;
    }
}

// https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
//
// Appends input to the opaque (string) path, percent-encoded with the C0-control set, until a
// query or fragment starts.
void UrlParser::state_opaque_path() {
    auto const c = peek();

    if (c == '?') {
        url_.query = "";
        state_ = ParserState::Query;
        return;
    }

    if (c == '#') {
        url_.fragment = "";
        state_ = ParserState::Fragment;
        return;
    }

    if (!is_eof() && !is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
        validation_error(ValidationError::InvalidUrlUnit);
    }

    // A '%' must be followed by two hex digits.
    if (c == '%'
            && (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
                    || !util::is_hex_digit(remaining_from(1)[1]))) {
        validation_error(ValidationError::InvalidUrlUnit);
    }

    if (!is_eof()) {
        std::get<0>(url_.path) += util::percent_encode(*peek(1), PercentEncodeSet::c0_control);
    }
}

// https://url.spec.whatwg.org/#query-state
//
// Buffers the query; at '#' (without a state override) or at the end of the input the buffer is
// percent-encoded and appended to url_.query. Special schemes use the special-query set.
void UrlParser::state_query() {
    auto const c = peek();
    bool const end_of_query = (!state_override_.has_value() && c == '#') || is_eof();

    if (end_of_query) {
        if (special_schemes.contains(url_.scheme)) {
            url_.query.value() += util::percent_encode(buffer_, PercentEncodeSet::special_query);
        } else {
            url_.query.value() += util::percent_encode(buffer_, PercentEncodeSet::query);
        }

        buffer_.clear();

        if (c == '#') {
            url_.fragment = "";
            state_ = ParserState::Fragment;
        }

        return;
    }

    if (!is_eof()) {
        if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        // A '%' must be followed by two hex digits.
        if (c == '%'
                && (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
                        || !util::is_hex_digit(remaining_from(1)[1]))) {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        buffer_ += *c;
    }
}

// https://url.spec.whatwg.org/#fragment-state
//
// Appends the rest of the input to url_.fragment, percent-encoded with the fragment set.
void UrlParser::state_fragment() {
    auto const c = peek();

    if (is_eof()) {
        return;
    }

    if (!is_url_codepoint(util::utf8_to_utf32(remaining_from(0))) && c != '%') {
        validation_error(ValidationError::InvalidUrlUnit);
    }

    // A '%' must be followed by two hex digits.
    if (c == '%'
            && (remaining_from(1).size() < 2 || !util::is_hex_digit(remaining_from(1)[0])
                    || !util::is_hex_digit(remaining_from(1)[1]))) {
        validation_error(ValidationError::InvalidUrlUnit);
    }

    url_.fragment.value() += util::percent_encode(*peek(1), PercentEncodeSet::fragment);
}

// https://url.spec.whatwg.org/#concept-domain-to-ascii

std::optional<std::string> UrlParser::domain_to_ascii(std::string_view domain, bool be_strict) const {

icu_init();

std::string ascii_domain;

icu::StringByteSink<std::string> tmp{&ascii_domain};

icu::IDNAInfo inf;

UErrorCode err = U_ZERO_ERROR;

std::uint32_t opts = UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ;

if (be_strict) {

opts |= UIDNA_USE_STD3_RULES;

}

auto *uts = icu::IDNA::createUTS46Instance(opts, err);

if (U_FAILURE(err)) {

spdlog::info("Failed to create UTS46 instance, error {}; idna data probably missing from icu",

static_cast<std::int64_t>(err));

return std::nullopt;

}

err = U_ZERO_ERROR;

uts->nameToASCII_UTF8(domain, tmp, inf, err);

delete uts;

std::uint32_t proc_err = inf.getErrors();

// icu doesn't offer a flag to disable VerifyDnsLength or CheckHyphens, so just ignore those failures

proc_err &= ~UIDNA_ERROR_LEADING_HYPHEN;

proc_err &= ~UIDNA_ERROR_TRAILING_HYPHEN;

proc_err &= ~UIDNA_ERROR_HYPHEN_3_4;

if (!be_strict) {

proc_err &= ~UIDNA_ERROR_LABEL_TOO_LONG;

proc_err &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

}

// If domain or any label is empty, proc_err should contain UIDNA_ERROR_EMPTY_LABEL

if (U_FAILURE(err) || proc_err != 0) {

validation_error(ValidationError::DomainToAscii);

return std::nullopt;

}

return ascii_domain;

}

// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter

// Returns true when `input` begins with a Windows drive letter: an ASCII
// letter followed by ':' or '|', and then either nothing more or one of
// '/', '\', '?' or '#'.
bool UrlParser::starts_with_windows_drive_letter(std::string_view input) const {
    if (input.size() < 2 || !util::is_alpha(input[0])) {
        return false;
    }

    char const separator = input[1];
    if (separator != ':' && separator != '|') {
        return false;
    }

    // Exactly "X:" / "X|" is enough on its own.
    if (input.size() == 2) {
        return true;
    }

    // Otherwise the drive letter has to be terminated by a delimiter.
    switch (input[2]) {
        case '/':
        case '\\':
        case '?':
        case '#':
            return true;
        default:
            return false;
    }
}

// https://url.spec.whatwg.org/#shorten-a-urls-path

// Removes the last segment of `url`'s path, if there is one.
//
// The path must be a segment list (not an opaque string path); this is
// asserted. A file URL whose path consists of a single normalized Windows
// drive letter ("C:") is left untouched.
void UrlParser::shorten_url_path(Url &url) const {
    assert(!std::holds_alternative<std::string>(url.path));

    if (url.scheme == "file" && std::get<1>(url.path).size() == 1
            && is_normal_windows_drive_letter(std::get<1>(url.path)[0])) {
        return;
    }

    if (!std::get<1>(url.path).empty()) {
        std::get<1>(url.path).pop_back();
    }
}

// https://url.spec.whatwg.org/#concept-host-parser

// Parses `input` (asserted to be non-empty) into a Host.
//
// - Input starting with '[' must end with ']' and is parsed as an IPv6 address.
// - If `is_not_special` is set, the input is parsed as an opaque host.
// - Otherwise the input is percent-decoded, converted to ASCII (non-strict),
//   checked for forbidden domain code points, and then parsed as an IPv4
//   address if it ends in a number, or returned as a DNS domain.
//
// Returns std::nullopt on any failure.
std::optional<Host> UrlParser::parse_host(std::string_view input, bool is_not_special) const {
    assert(!input.empty());

    if (input.starts_with("[")) {
        if (!input.ends_with("]")) {
            validation_error(ValidationError::IPv6Unclosed);

            return std::nullopt;
        }

        // Strip the brackets before handing the literal to the IPv6 parser.
        input.remove_prefix(1);
        input.remove_suffix(1);

        std::optional<std::array<std::uint16_t, 8>> addr = parse_ipv6(input);

        if (!addr.has_value()) {
            return std::nullopt;
        }

        return Host{HostType::Ip6Addr, *addr};
    }

    if (is_not_special) {
        std::optional<std::string> host = parse_opaque_host(input);

        if (!host.has_value()) {
            return std::nullopt;
        }

        return Host{HostType::Opaque, *host};
    }

    std::string domain = util::percent_decode(input);

    std::optional<std::string> ascii_domain = domain_to_ascii(domain, false);

    if (!ascii_domain.has_value()) {
        return std::nullopt;
    }

    // Forbidden domain code points: the listed punctuation, C0 controls,
    // '%' and DEL.
    std::string forbidden = "\t\n\r #/:<>?@[\\]^|";

    for (std::size_t i = 0; i < ascii_domain->size(); i++) {
        if (forbidden.find_first_of(ascii_domain.value()[i]) != std::string::npos || ascii_domain.value()[i] <= 0x1f
                || ascii_domain.value()[i] == '%' || ascii_domain.value()[i] == 0x7f) {
            validation_error(ValidationError::DomainInvalidCodePoint);

            return std::nullopt;
        }
    }

    if (ends_in_number(*ascii_domain)) {
        std::optional<std::uint32_t> ip = parse_ipv4(*ascii_domain);

        if (!ip.has_value()) {
            return std::nullopt;
        }

        return Host{HostType::Ip4Addr, *ip};
    }

    return Host{HostType::DnsDomain, *ascii_domain};
}

// https://url.spec.whatwg.org/#ends-in-a-number-checker

// Returns true if the last dot-separated part of `input` (ignoring a single
// trailing empty part) looks like a number: either all ASCII digits, or
// something parse_ipv4_number() accepts.
bool UrlParser::ends_in_number(std::string_view input) const {
    // Let parts be the result of strictly splitting input on U+002E (.)
    std::vector<std::string_view> parts = util::split(input, ".");

    // A trailing empty part (input ending in '.') is dropped, unless it is
    // the only part.
    if (parts.back().empty()) {
        if (parts.size() == 1) {
            return false;
        }

        parts.pop_back();
    }

    // If last part is non-empty and contains only ASCII digits, return true
    if (!parts.back().empty()) {
        if (std::ranges::all_of(parts.back(), util::is_digit)) {
            return true;
        }
    }

    // If parsing last part as an IPv4 number does not return failure, then return true
    // (parse_ipv4_number() fails on an empty part, so that case yields false.)
    if (parse_ipv4_number(parts.back()).has_value()) {
        return true;
    }

    return false;
}

// https://url.spec.whatwg.org/#concept-ipv4-parser

// Parses `input` as a dotted IPv4 address with 1 to 4 parts, where each part
// may be decimal, octal ("0" prefix) or hexadecimal ("0x" prefix) and the
// last part fills all remaining low-order bytes.
//
// Returns the address as a 32-bit integer, or std::nullopt on failure.
// Non-fatal oddities (empty trailing part, non-decimal parts, an oversized
// last part that still fits) are only reported as validation errors.
std::optional<std::uint32_t> UrlParser::parse_ipv4(std::string_view input) const {
    std::vector<std::string_view> parts = util::split(input, ".");

    // A single trailing '.' is tolerated but reported.
    if (parts.back().empty()) {
        validation_error(ValidationError::IPv4EmptyPart);

        if (parts.size() > 1) {
            parts.pop_back();
        }
    }

    if (parts.size() > 4) {
        validation_error(ValidationError::IPv4TooManyParts);

        return std::nullopt;
    }

    std::vector<std::uint64_t> numbers;

    for (auto part : parts) {
        std::optional<std::tuple<std::uint64_t, bool>> result = parse_ipv4_number(part);

        if (!result.has_value()) {
            validation_error(ValidationError::IPv4NonNumericPart);

            return std::nullopt;
        }

        // The flag is set when the part was written in octal or hex.
        if (std::get<1>(*result)) {
            validation_error(ValidationError::IPv4NonDecimalPart);
        }

        numbers.emplace_back(std::get<0>(*result));
    }

    // Every part except the last must fit in one byte.
    for (std::size_t i = 0; i < numbers.size(); i++) {
        if (numbers[i] > 255) {
            validation_error(ValidationError::IPv4OutOfRangePart);

            if (i != numbers.size() - 1) {
                return std::nullopt;
            }
        }
    }

    // The last part covers the remaining (5 - size) bytes, i.e. it must be
    // below 256^(5 - size). numbers.size() is 1..4 here, so the shift is
    // 8..32 bits and cannot overflow a 64-bit value.
    if (numbers.back() >= (std::uint64_t{1} << (8 * (5 - numbers.size())))) {
        return std::nullopt;
    }

    auto ipv4 = static_cast<std::uint32_t>(numbers.back());

    numbers.pop_back();

    // Remaining parts are single bytes placed from the most significant end:
    // part i contributes numbers[i] * 256^(3 - i).
    for (std::size_t i = 0; i < numbers.size(); i++) {
        ipv4 += static_cast<std::uint32_t>(numbers[i] << (8 * (3 - i)));
    }

    return ipv4;
}

// https://url.spec.whatwg.org/#ipv4-number-parser

// Parses a single IPv4 part.
//
// A "0x"/"0X" prefix selects hexadecimal and a leading "0" (with more
// characters following) selects octal; everything else is decimal. Returns
// the value together with a flag that is true when a non-decimal notation
// was used, or std::nullopt if the input is empty, contains a digit that is
// invalid for the radix, or does not fit in 64 bits.
std::optional<std::tuple<std::uint64_t, bool>> UrlParser::parse_ipv4_number(std::string_view input) const {
    if (input.empty()) {
        return std::nullopt;
    }

    bool v_err = false;
    int r = 10;

    if (input.size() >= 2 && (input.starts_with("0X") || input.starts_with("0x"))) {
        v_err = true;

        input.remove_prefix(2);

        r = 16;
    } else if (input.size() >= 2 && input.starts_with("0")) {
        v_err = true;

        input.remove_prefix(1);

        r = 8;
    }

    // Only a prefix was given (e.g. "0x"): that counts as zero.
    if (input.empty()) {
        return {{0, true}};
    }

    // Reject anything that is not a digit of the selected radix up front, so
    // from_chars below cannot stop early on a partially numeric string.
    for (char i : input) {
        if ((r == 10 && !util::is_digit(i)) || (r == 16 && !util::is_hex_digit(i))
                || (r == 8 && !util::is_octal_digit(i))) {
            return std::nullopt;
        }
    }

    std::uint64_t out;

    auto res = std::from_chars(input.data(), input.data() + input.size(), out, r);

    if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
        spdlog::info("Invalid ipv4 number");

        return std::nullopt;
    }

    return {{out, v_err}};
}

// https://url.spec.whatwg.org/#concept-ipv6-parser

// Parses `input` (the text between the brackets of an IPv6 literal) into
// eight 16-bit pieces.
//
// Supports a single "::" compression and a trailing dotted-decimal IPv4
// address occupying the last two pieces. Returns std::nullopt, after
// reporting the matching validation error, on malformed input.
std::optional<std::array<std::uint16_t, 8>> UrlParser::parse_ipv6(std::string_view input) const {
    std::array<std::uint16_t, 8> address = {0, 0, 0, 0, 0, 0, 0, 0};

    // Index of the piece currently being filled.
    std::size_t piece_index = 0;

    // Piece index at which "::" was seen, if any.
    std::optional<std::size_t> compress;

    // Read position within input.
    std::size_t pointer = 0;

    // A leading ':' is only valid as part of a leading "::".
    if (!input.empty() && input[pointer] == ':') {
        if (!input.substr(1).starts_with(":")) {
            validation_error(ValidationError::IPv6InvalidCompression);

            return std::nullopt;
        }

        pointer += 2;

        piece_index++;

        compress = piece_index;
    }

    while (pointer < input.size()) {
        if (piece_index == 8) {
            validation_error(ValidationError::IPv6TooManyPieces);

            return std::nullopt;
        }

        // A ':' at the start of a piece is the second half of "::".
        if (input[pointer] == ':') {
            if (compress.has_value()) {
                validation_error(ValidationError::IPv6MultipleCompression);

                return std::nullopt;
            }

            pointer++;

            piece_index++;

            compress = piece_index;

            continue;
        }

        std::uint64_t value = 0;

        std::size_t length = 0;

        // Accumulate up to four hex digits, one digit at a time.
        for (; length < 4 && pointer < input.size() && util::is_hex_digit(input[pointer]); pointer++, length++) {
            std::uint64_t out;

            auto res = std::from_chars(input.data() + pointer, input.data() + pointer + 1, out, 16);

            if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
                spdlog::info("Invalid IPv6 input");

                return std::nullopt;
            }

            value = value * 0x10 + out;
        }

        if (pointer < input.size() && input[pointer] == '.') {
            // The digits just read were the first number of an embedded
            // IPv4 address; rewind and re-read them as decimal.
            if (length == 0) {
                validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

                return std::nullopt;
            }

            pointer -= length;

            // The IPv4 address needs two pieces.
            if (piece_index > 6) {
                validation_error(ValidationError::IPv4InIPv6TooManyPieces);

                return std::nullopt;
            }

            std::size_t numbers_seen = 0;

            while (pointer < input.size()) {
                std::optional<std::uint64_t> ipv4_piece;

                // Every number after the first must be preceded by a '.',
                // and there may be at most four numbers.
                if (numbers_seen > 0) {
                    if (pointer < input.size() && input[pointer] == '.' && numbers_seen < 4) {
                        pointer++;
                    } else {
                        validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

                        return std::nullopt;
                    }
                }

                if (pointer >= input.size() || !util::is_digit(input[pointer])) {
                    validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

                    return std::nullopt;
                }

                while (pointer < input.size() && util::is_digit(input[pointer])) {
                    std::uint64_t number;

                    auto res = std::from_chars(input.data() + pointer, input.data() + pointer + 1, number);

                    if (res.ec == std::errc::invalid_argument || res.ec == std::errc::result_out_of_range) {
                        spdlog::info("Invalid IPv6 input 2");

                        return std::nullopt;
                    }

                    if (!ipv4_piece.has_value()) {
                        ipv4_piece = number;
                    } else if (ipv4_piece == 0) {
                        // Leading zeros are not allowed.
                        validation_error(ValidationError::IPv4InIPv6InvalidCodePoint);

                        return std::nullopt;
                    } else {
                        ipv4_piece = *ipv4_piece * 10 + number;
                    }

                    if (ipv4_piece > 255) {
                        validation_error(ValidationError::IPv4InIPv6OutOfRangePart);

                        return std::nullopt;
                    }

                    pointer++;
                }

                // Two IPv4 numbers are packed into each 16-bit piece.
                address[piece_index] = static_cast<std::uint16_t>(address[piece_index] * 0x100ul + *ipv4_piece);

                numbers_seen++;

                if (numbers_seen == 2 || numbers_seen == 4) {
                    piece_index++;
                }
            }

            if (numbers_seen != 4) {
                validation_error(ValidationError::IPv4InIPv6TooFewParts);

                return std::nullopt;
            }

            break;
        } else if (pointer < input.size() && input[pointer] == ':') {
            pointer++;

            // A single trailing ':' is invalid.
            if (pointer >= input.size()) {
                validation_error(ValidationError::IPv6InvalidCodePoint);

                return std::nullopt;
            }
        } else if (pointer < input.size()) {
            validation_error(ValidationError::IPv6InvalidCodePoint);

            return std::nullopt;
        }

        address[piece_index] = static_cast<std::uint16_t>(value);

        piece_index++;
    }

    if (compress.has_value()) {
        // Move the pieces that followed "::" to the end of the address,
        // leaving the zeros in the compressed gap.
        std::size_t swaps = piece_index - *compress;

        piece_index = 7;

        for (; piece_index != 0 && swaps > 0; piece_index--, swaps--) {
            std::uint16_t tmp = address[piece_index];

            address[piece_index] = address[*compress + swaps - 1];

            address[*compress + swaps - 1] = tmp;
        }
    } else if (!compress.has_value() && piece_index != 8) {
        validation_error(ValidationError::IPv6TooFewPieces);

        return std::nullopt;
    }

    return address;
}

// https://url.spec.whatwg.org/#concept-opaque-host-parser

// Parses `input` as an opaque host.
//
// Fails (HostInvalidCodePoint, std::nullopt) if the input contains a
// forbidden host code point. Otherwise non-URL code points and malformed
// percent escapes are reported as non-fatal InvalidUrlUnit validation errors
// and the input is returned percent-encoded with the C0-control set.
std::optional<std::string> UrlParser::parse_opaque_host(std::string_view input) const {
    std::string forbidden = "\t\n\r #/:<>?@[\\]^|";

    for (char i : input) {
        if (forbidden.find_first_of(i) != std::string_view::npos || i == '\0') {
            validation_error(ValidationError::HostInvalidCodePoint);

            return std::nullopt;
        }
    }

    std::string_view tmp = input;

    int len = 0;

    while (!tmp.empty()) {
        std::uint32_t cp = util::utf8_to_utf32(tmp);

        len = util::unicode_utf8_byte_count(cp);

        // '%' is not a URL code point itself; it is checked separately below
        // so that a well-formed escape is not reported.
        if (!is_url_codepoint(cp) && tmp[0] != '%') {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        // A '%' must be followed by two hex digits; a '%' too close to the
        // end of the input is malformed as well.
        if (tmp[0] == '%' && (tmp.size() < 3 || !util::is_hex_digit(tmp[1]) || !util::is_hex_digit(tmp[2]))) {
            validation_error(ValidationError::InvalidUrlUnit);
        }

        // Always make progress and never step past the end, whatever the
        // decoder reports for malformed or truncated UTF-8.
        std::size_t step = len > 0 ? static_cast<std::size_t>(len) : 1;

        if (step > tmp.size()) {
            step = tmp.size();
        }

        tmp.remove_prefix(step);
    }

    return util::percent_encode(input, PercentEncodeSet::c0_control);
}

// Returns true if `cp` is a URL code point: an ASCII alphanumeric, one of
// ! $ & ' ( ) * + , - . / : ; = ? @ _ ~, or a code point in
// U+00A0..U+10FFFD that is neither a surrogate nor a noncharacter.
bool UrlParser::is_url_codepoint(std::uint32_t cp) const {
    bool const is_ascii_alphanumeric =
            (cp >= '0' && cp <= '9') || (cp >= 'A' && cp <= 'Z') || (cp >= 'a' && cp <= 'z');

    return is_ascii_alphanumeric || cp == '!' || cp == '$' || cp == '&' || cp == '\'' || cp == '(' || cp == ')'
            || cp == '*' || cp == '+' || cp == ',' || cp == '-' || cp == '.' || cp == '/' || cp == ':' || cp == ';'
            || cp == '=' || cp == '?' || cp == '@' || cp == '_' || cp == '~'
            || (cp >= 0x00a0 && cp <= 0x10fffd && !util::is_unicode_noncharacter(cp)
                    && !util::is_unicode_surrogate(cp));
}

// NOLINTEND(bugprone-unchecked-optional-access)

// NOLINTEND(misc-redundant-expression)

} // namespace url

url/url.h added: 2390, removed: 46, total 2344

@@ -5,14 +5,21 @@

#ifndef URL_URL_H_

#define URL_URL_H_

#include "util/base_parser.h"

#include "util/string.h"

#include <array>

#include <cstdint>

#include <map>

#include <optional>

#include <string>

#include <string_view>

#include <variant>

namespace url {

void icu_cleanup();

enum class HostType { DnsDomain, Ip4Addr, Ip6Addr, Opaque, Empty };

struct Host {

@@ -31,10 +38,161 @@ struct Origin {

};

/**
 * Generates a new Blob URL for the given origin.
 */

std::string blob_url_create(Origin const &origin);

// The components of a parsed URL, as produced by UrlParser::parse().
// Member order matters: callers aggregate-initialize this struct positionally.
struct Url {

// Scheme without the trailing ':' (e.g. "https", "file", "tel").
std::string scheme;

// Username from the authority; empty when absent.
std::string user;

// Password from the authority; empty when absent.
std::string passwd;

// Parsed host; unset for URLs without one (e.g. "tel:" URLs).
std::optional<Host> host;

// Explicit port, if one was given.
std::optional<std::uint16_t> port;

// Either an opaque path (the std::string alternative) or a list of path
// segments (the std::vector alternative).
std::variant<std::string, std::vector<std::string>> path;

// Query without the leading '?'; unset when the URL has no query.
std::optional<std::string> query;

// Fragment without the leading '#'; unset when the URL has no fragment.
std::optional<std::string> fragment;

};

// This parser is current with the WHATWG URL specification as of 1 March 2023

// State-machine URL parser. Each state_*() member handles one parser state
// and operates on the parser-state members at the bottom of the class.
class UrlParser final : util::BaseParser {
public:
    UrlParser() : BaseParser{""} {}

    // Parses `input`, optionally relative to `base`. Returns std::nullopt on
    // failure.
    std::optional<Url> parse(std::string input, std::optional<Url> base = std::nullopt);

    // Kinds of validation error the parser can report via validation_error().
    enum class ValidationError {
        // IDNA
        DomainToAscii,
        DomainToUnicode,
        // Host parsing
        DomainInvalidCodePoint,
        HostInvalidCodePoint,
        IPv4EmptyPart,
        IPv4TooManyParts,
        IPv4NonNumericPart,
        IPv4NonDecimalPart,
        IPv4OutOfRangePart,
        IPv6Unclosed,
        IPv6InvalidCompression,
        IPv6TooManyPieces,
        IPv6MultipleCompression,
        IPv6InvalidCodePoint,
        IPv6TooFewPieces,
        IPv4InIPv6TooManyPieces,
        IPv4InIPv6InvalidCodePoint,
        IPv4InIPv6OutOfRangePart,
        IPv4InIPv6TooFewParts,
        // URL parsing
        InvalidUrlUnit,
        SpecialSchemeMissingFollowingSolidus,
        MissingSchemeNonRelativeUrl,
        InvalidReverseSolidus,
        InvalidCredentials,
        HostMissing,
        PortOutOfRange,
        PortInvalid,
        FileInvalidWindowsDriveLetter,
        FileInvalidWindowsDriveLetterHost
    };

private:
    // States of the parser state machine.
    enum class ParserState {
        SchemeStart,
        Scheme,
        NoScheme,
        SpecialRelativeOrAuthority,
        PathOrAuthority,
        Relative,
        RelativeSlash,
        SpecialAuthoritySlashes,
        SpecialAuthorityIgnoreSlashes,
        Authority,
        Host,
        Hostname,
        Port,
        File,
        FileSlash,
        FileHost,
        PathStart,
        Path,
        OpaquePath,
        Query,
        Fragment,
        Failure,
        Terminate
    };

    // Main parser
    std::optional<Url> parse_basic(std::string input,
            std::optional<Url> base,
            std::optional<Url> url,
            std::optional<ParserState> state_override);

    // One handler per parser state.
    void state_scheme_start();
    void state_scheme();
    void state_no_scheme();
    void state_special_relative_or_authority();
    void state_path_or_authority();
    void state_relative();
    void state_relative_slash();
    void state_special_authority_slashes();
    void state_special_authority_ignore_slashes();
    void state_authority();
    void state_host();
    void state_port();
    void state_file();
    void state_file_slash();
    void state_file_host();
    void state_path_start();
    void state_path();
    void state_opaque_path();
    void state_query();
    void state_fragment();

    void validation_error(ValidationError) const;

    // Host parsing
    std::optional<Host> parse_host(std::string_view input, bool is_not_special = false) const;
    bool ends_in_number(std::string_view) const;
    std::optional<std::uint32_t> parse_ipv4(std::string_view) const;
    std::optional<std::tuple<std::uint64_t, bool>> parse_ipv4_number(std::string_view) const;
    std::optional<std::array<std::uint16_t, 8>> parse_ipv6(std::string_view) const;
    std::optional<std::string> parse_opaque_host(std::string_view) const;

    bool is_url_codepoint(std::uint32_t) const;

    // IDNA
    std::optional<std::string> domain_to_ascii(std::string_view domain, bool be_strict) const;

    // Misc
    bool starts_with_windows_drive_letter(std::string_view) const;
    void shorten_url_path(Url &) const;

    // A URL includes credentials if its username or its password is non-empty.
    constexpr bool includes_credentials(Url &url) const { return !url.user.empty() || !url.passwd.empty(); }

    // True when the path is stored as a single opaque string rather than a
    // list of segments.
    constexpr bool has_opaque_path(Url &url) const { return std::holds_alternative<std::string>(url.path); }

    // Exactly two characters: an ASCII letter followed by ':' or '|'.
    constexpr bool is_windows_drive_letter(std::string_view input) const {
        return input.size() == 2 && util::is_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
    }

    // Exactly two characters: an ASCII letter followed by ':'.
    constexpr bool is_normal_windows_drive_letter(std::string_view input) const {
        return input.size() == 2 && util::is_alpha(input[0]) && input[1] == ':';
    }

    // Parser state
    Url url_;
    std::optional<Url> base_;
    std::optional<ParserState> state_override_;
    ParserState state_ = ParserState::Failure;
    std::string buffer_;
    bool at_sign_seen_ = false;
    bool inside_brackets_ = false;
    bool password_token_seen_ = false;
};

} // namespace url

#endif

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1,18 @@

// SPDX-FileCopyrightText: 2023 David Zero <zero-one@zer0-one.net>

// SPDX-License-Identifier: BSD-2-Clause

#include "url/url.h"

// NOLINTNEXTLINE(readability-identifier-naming)

extern "C" int LLVMFuzzerTestOneInput(uint8_t const *data, size_t size);

// NOLINTNEXTLINE(readability-identifier-naming)

extern "C" int LLVMFuzzerTestOneInput(uint8_t const *data, size_t size) {

url::UrlParser p;

std::optional<url::Url> url;

url = p.parse({reinterpret_cast<char const *>(data), size});

return 0;

}

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://example.com:8080/index.html

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://[2001:db8:85a3::8a2e:370:7334]:631

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://[0000:0000:0000:0000:0000:ffff:4ccb:8c22]:631

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://[::ffff:76.203.140.34]:631

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

file:///home/zero-one/repos/hastur/README.md

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

file:///home/zero-one/repos/../hastur/README.md

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

file:///home/zero-one/repos/./hastur/README.md

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

file://C:\\Users\\zero-one\\repos\\hastur\\README.md

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://zero-one:testpass123@example.com/login.php

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

http://bücher.de

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://√.com/i/itunes.gif

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://ar.wikipedia.org/wiki/نجيب_محفوظ

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

tel:+1-555-555-5555

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://www.youtube.com/watch?v=2g5xkLqIElU&list=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORz&index=3

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk/images/platformticket.gif

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://github.com/robinlinden/hastur/actions/runs/4441133331/jobs/7795829478?pr=476#step:7:31

No newline at end of file

filename was Deleted added: 2390, removed: 46, total 2344

@@ -0,0 +1 @@

https://127.0.0.1:631

No newline at end of file

url/url_test.cpp added: 2390, removed: 46, total 2344

@@ -9,9 +9,18 @@

#include <array>

#include <cstdint>

#include <iostream>

#include <optional>

#include <regex>

#include <variant>

int main() {

const url::Url base{"https",

"",

url::Host{url::HostType::DnsDomain, "example.com"},

std::uint16_t{8080},

std::vector<std::string>{"test", "index.php"}};

etest::test("blob URL generation", [] {

std::string regex_uuid = "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}";

@@ -19,7 +28,7 @@ int main() {

url::Origin o = {"https", h, std::uint16_t{8080}, std::nullopt, false};

std::string blob = url::blob_url_create(o);

std::cout << "Generated Blob URL: " << blob << std::endl;

std::cout << std::endl << "Generated Blob URL: " << blob << std::endl;

etest::expect(std::regex_match(blob, std::regex("blob:https://example.com:8080/" + regex_uuid)));

@@ -36,11 +45,406 @@ int main() {

o = {"https", h, std::uint16_t{8080}, std::nullopt, false};

blob = url::blob_url_create(o);

std::cout << "Generated Blob URL: " << blob;

std::cout << "Generated Blob URL: " << blob << std::endl;

etest::expect(std::regex_match(

blob, std::regex("blob:https://\\[2001:db8:85a3::8a2e:370:7334\\]:8080/" + regex_uuid)));

});

return etest::run_all_tests();

etest::test("URL parsing: port and path", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("https://example.com:8080/index.html");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "example.com");

etest::expect_eq(url->port.value(), 8080);

etest::expect_eq(std::get<1>(url->path)[0], "index.html");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: 1 unicode char", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("http://bücher.de");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "http");

etest::expect_eq(std::get<0>(url->host->data), "xn--bcher-kva.de");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: 1 unicode char with path", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("https://√.com/i/itunes.gif");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "xn--19g.com");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "i");

etest::expect_eq(std::get<1>(url->path)[1], "itunes.gif");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: unicode path", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("https://ar.wikipedia.org/wiki/نجيب_محفوظ");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "ar.wikipedia.org");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "wiki");

etest::expect_eq(std::get<1>(url->path)[1], "%D9%86%D8%AC%D9%8A%D8%A8_%D9%85%D8%AD%D9%81%D9%88%D8%B8");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: tel URI", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("tel:+1-555-555-5555");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "tel");

etest::expect(!url->host.has_value());

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<0>(url->path), "+1-555-555-5555");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: username and passwd in authority", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("https://zero-one:testpass123@example.com/login.php");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(url->user, "zero-one");

etest::expect_eq(url->passwd, "testpass123");

etest::expect_eq(std::get<0>(url->host->data), "example.com");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "login.php");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: query", [] {

url::UrlParser p;

std::optional<url::Url> url =

p.parse("https://www.youtube.com/watch?v=2g5xkLqIElUlist=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORzindex=3");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "www.youtube.com");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "watch");

etest::expect_eq(url->query, "v=2g5xkLqIElUlist=PLHwvDXmNUa92NlFPooY1P5tfDo4T85ORzindex=3");

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: Welsh", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse(

"https://llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk/images/platformticket.gif");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(

std::get<0>(url->host->data), "llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch.co.uk");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "images");

etest::expect_eq(std::get<1>(url->path)[1], "platformticket.gif");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

// This domain exceeds the maximum length of both a domain component/label and a FQDN

etest::test("URL parsing: extreme Welsh", [] {

url::UrlParser p;

std::optional<url::Url> url =

p.parse("https://"

"llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgw"

"yngyllgogerychgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochllanfairpwllgwyngy"

"llgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgogerychgoge"

"rychwyrndrobwllllantysiliogogogochobwllllantysiliogogogoch.co.uk");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data),

"llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgo"

"gerychgogerychwyrndrobwllllantysiliogogogochobwllllantysiliogogogochllanfairpwllgwyngyllgogerychwyrndr"

"obwllllantysiliogogogochobwllllantysiliogogogochanfairpwllgwyngyllgogerychgogerychwyrndrobwllllantysil"

"iogogogochobwllllantysiliogogogoch.co.uk");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: path, query, and fragment", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse(

"https://github.com/robinlinden/hastur/actions/runs/4441133331/jobs/7795829478?pr=476#step:7:31");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "github.com");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "robinlinden");

etest::expect_eq(std::get<1>(url->path)[1], "hastur");

etest::expect_eq(std::get<1>(url->path)[2], "actions");

etest::expect_eq(std::get<1>(url->path)[3], "runs");

etest::expect_eq(std::get<1>(url->path)[4], "4441133331");

etest::expect_eq(std::get<1>(url->path)[5], "jobs");

etest::expect_eq(std::get<1>(url->path)[6], "7795829478");

etest::expect_eq(url->query, "pr=476");

etest::expect_eq(url->fragment, "step:7:31");

});

etest::test("URL parsing: ipv4 and port", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("https://127.0.0.1:631");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<1>(url->host->data), 2130706433ul);

etest::expect_eq(url->port, 631);

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: ipv6 and port", [] {

url::UrlParser p;

const std::array<std::uint16_t, 8> addr{0x2001, 0xdb8, 0x85a3, 0, 0, 0x8a2e, 0x370, 0x7334};

std::optional<url::Url> url = p.parse("https://[2001:db8:85a3::8a2e:370:7334]:631");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<2>(url->host->data), addr);

etest::expect_eq(url->port, 631);

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: ipv6 v4-mapped with port", [] {

url::UrlParser p;

const std::array<std::uint16_t, 8> addr{0, 0, 0, 0, 0, 0xffff, 0x4ccb, 0x8c22};

std::optional<url::Url> url = p.parse("https://[0000:0000:0000:0000:0000:ffff:4ccb:8c22]:631");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<2>(url->host->data), addr);

etest::expect_eq(url->port, 631);

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: ipv6 v4-mapped compressed with dot-decimal", [] {

url::UrlParser p;

const std::array<std::uint16_t, 8> addr{0, 0, 0, 0, 0, 0xffff, 0x4ccb, 0x8c22};

std::optional<url::Url> url = p.parse("https://[::ffff:76.203.140.34]:631");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<2>(url->host->data), addr);

etest::expect_eq(url->port, 631);

etest::expect_eq(std::get<1>(url->path)[0], "");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: empty input", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("");

etest::expect(!url.has_value());

});

etest::test("URL parsing: empty input with base URL", [&base] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("", base);

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "example.com");

etest::expect_eq(url->port, 8080);

etest::expect_eq(std::get<1>(url->path)[0], "test");

etest::expect_eq(std::get<1>(url->path)[1], "index.php");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: query input with base URL", [&base] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("?view=table", base);

etest::require(url.has_value());

etest::expect_eq(url->scheme, "https");

etest::expect_eq(std::get<0>(url->host->data), "example.com");

etest::expect_eq(url->port, 8080);

etest::expect_eq(std::get<1>(url->path)[0], "test");

etest::expect_eq(std::get<1>(url->path)[1], "index.php");

etest::expect_eq(url->query, "view=table");

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/hastur/README.md");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "home");

etest::expect_eq(std::get<1>(url->path)[1], "zero-one");

etest::expect_eq(std::get<1>(url->path)[2], "repos");

etest::expect_eq(std::get<1>(url->path)[3], "hastur");

etest::expect_eq(std::get<1>(url->path)[4], "README.md");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL with double-dot", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/../hastur/README.md");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "home");

etest::expect_eq(std::get<1>(url->path)[1], "zero-one");

etest::expect_eq(std::get<1>(url->path)[2], "hastur");

etest::expect_eq(std::get<1>(url->path)[3], "README.md");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL with double-dot 2", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/../hastur/../README.md");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "home");

etest::expect_eq(std::get<1>(url->path)[1], "zero-one");

etest::expect_eq(std::get<1>(url->path)[2], "README.md");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL with double-dot 3", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("file:///../home/zero-one/repos/");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "home");

etest::expect_eq(std::get<1>(url->path)[1], "zero-one");

etest::expect_eq(std::get<1>(url->path)[2], "repos");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL with single-dot", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse("file:///home/zero-one/repos/./hastur/README.md");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "home");

etest::expect_eq(std::get<1>(url->path)[1], "zero-one");

etest::expect_eq(std::get<1>(url->path)[2], "repos");

etest::expect_eq(std::get<1>(url->path)[3], "hastur");

etest::expect_eq(std::get<1>(url->path)[4], "README.md");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

etest::test("URL parsing: file URL with windows path", [] {

url::UrlParser p;

std::optional<url::Url> url = p.parse(R"(file://C:\Users\zero-one\repos\hastur\README.md)");

etest::require(url.has_value());

etest::expect_eq(url->scheme, "file");

etest::expect_eq(std::get<0>(url->host->data), "");

etest::expect(!url->port.has_value());

etest::expect_eq(std::get<1>(url->path)[0], "C:");

etest::expect_eq(std::get<1>(url->path)[1], "Users");

etest::expect_eq(std::get<1>(url->path)[2], "zero-one");

etest::expect_eq(std::get<1>(url->path)[3], "repos");

etest::expect_eq(std::get<1>(url->path)[4], "hastur");

etest::expect_eq(std::get<1>(url->path)[5], "README.md");

etest::expect(!url->query.has_value());

etest::expect(!url->fragment.has_value());

});

int ret = etest::run_all_tests();

url::icu_cleanup();

return ret;

}

util/base_parser.h added: 2390, removed: 46, total 2344

@@ -10,6 +10,7 @@

#include <concepts>

#include <cstddef>

#include <optional>

#include <string_view>

namespace util {

@@ -17,13 +18,30 @@ namespace util {

// Satisfied by any callable type T that can be invoked with a single `char`
// and whose result is usable as a boolean (the std::predicate requirements).
template<typename T>

concept Predicate = std::predicate<T, char>;

// NOLINTBEGIN(bugprone-unchecked-optional-access)

class BaseParser {

public:

// Creates a parser over `input`. Only the view itself is stored; the
// characters it refers to are not copied.
constexpr explicit BaseParser(std::string_view input)
    : input_{input} {}

// NOTE(review): this overload returns a bare `char` and differs from the
// std::optional<char> overload below only in its return type, which C++ does
// not permit. It looks like the pre-change line of the diff hunk rather than
// live code; confirm before relying on it.
constexpr char peek() const { return input_[pos_]; }

// Returns the character at the current position, or std::nullopt once
// is_eof() reports that no input is left.
constexpr std::optional<char> peek() const {

if (is_eof()) {

return std::nullopt;

}

// NOTE(review): a member function definition cannot appear inside another
// function body; the next line looks like the pre-change peek(std::size_t)
// left in place by the diff rendering. Confirm and drop it.
constexpr std::string_view peek(std::size_t chars) const { return input_.substr(pos_, chars); }

return input_[pos_];

}

// Returns a view of up to `chars` characters starting at the current
// position, or std::nullopt when is_eof() reports that no input is left.
// The view is shorter than `chars` when fewer characters remain.
constexpr std::optional<std::string_view> peek(std::size_t chars) const {
    if (!is_eof()) {
        return input_.substr(pos_, chars);
    }

    return std::nullopt;
}

// Returns everything from `skip` characters past the current position to the
// end of the input, or an empty view when that offset is at or beyond the end.
constexpr std::string_view remaining_from(std::size_t skip) const {
    std::size_t const start = pos_ + skip;
    if (start >= input_.size()) {
        return "";
    }

    return input_.substr(start);
}

// Reports whether the unconsumed input begins with `prefix`. When peek()
// yields no value (end of input) the answer is false.
constexpr bool starts_with(std::string_view prefix) const {
    auto const head = peek(prefix.size());
    return head.has_value() && *head == prefix;
}

@@ -42,6 +60,8 @@ public:

pos_ = 0;

}

// Returns the parser's current offset into the input.
constexpr std::size_t current_pos() const {
    return pos_;
}

template<Predicate T>

constexpr std::string_view consume_while(T const &pred) {

std::size_t start = pos_;

@@ -52,7 +72,7 @@ public:

}

constexpr void skip_whitespace() {

while (!is_eof() && util::is_whitespace(peek())) {

while (!is_eof() && util::is_whitespace(*peek())) {

advance(1);

}

@@ -61,6 +81,7 @@ private:

std::string_view input_;

std::size_t pos_{0};

};

// NOLINTEND(bugprone-unchecked-optional-access)

} // namespace util

util/string.h added: 2390, removed: 46, total 2344

@@ -9,9 +9,14 @@

#include <algorithm>

#include <array>

#include <cassert>

#include <charconv>

#include <concepts>

#include <cstdint>

#include <iomanip>

#include <ios>

#include <iterator>

#include <optional>

#include <span>

#include <sstream>

#include <string>

@@ -65,6 +70,10 @@ constexpr bool is_hex_digit(char c) {

return is_upper_hex_digit(c) || is_lower_hex_digit(c);

}

// Reports whether `c` is one of the ASCII octal digits '0' through '7'.
constexpr bool is_octal_digit(char c) {
    switch (c) {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            return true;
        default:
            return false;
    }
}

constexpr char lowercased(char c) {

if (!is_upper_alpha(c)) {

return c;

@@ -196,6 +205,49 @@ inline std::string ipv6_serialize(std::span<std::uint16_t, 8> addr) {

return std::move(out).str();

}

// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
//
// Returns a copy of `input` in which every byte accepted by `in_encode_set`
// is replaced by '%' followed by exactly two uppercase hexadecimal digits.
// When `space_as_plus` is set, a space is written as '+' before the encode
// set is consulted. Every other byte is copied through unchanged.
inline std::string percent_encode(
        std::string_view input, std::predicate<char> auto in_encode_set, bool space_as_plus = false) {
    constexpr std::string_view hex_digits{"0123456789ABCDEF"};

    std::string out;
    out.reserve(input.size());

    for (char c : input) {
        if (space_as_plus && c == ' ') {
            out += '+';
        } else if (in_encode_set(c)) {
            // Write both nibbles explicitly so bytes below 0x10 keep their
            // leading zero ("%0A", never "% A"). Going through unsigned char
            // keeps bytes >= 0x80 from being sign-extended.
            std::size_t const byte = static_cast<unsigned char>(c);
            out += '%';
            out += hex_digits[byte >> 4];
            out += hex_digits[byte & 0x0F];
        } else {
            out += c;
        }
    }

    return out;
}

// https://url.spec.whatwg.org/#percent-decode

constexpr std::string percent_decode(std::string_view input) {

std::string output;

for (std::size_t i = 0; i < input.size(); i++) {

if (input[i] != '%' || (input.size() <= i + 2 || !is_hex_digit(input[i + 1]) || !is_hex_digit(input[i + 2]))) {

output += input[i];

} else {

std::string_view digits = input.substr(i + 1, 2);

std::uint8_t num;

[[maybe_unused]] auto res = std::from_chars(digits.data(), digits.data() + digits.size(), num, 16);

assert(res.ec != std::errc::invalid_argument && res.ec != std::errc::result_out_of_range);

output += static_cast<char>(num);

i += 2;

}

return output;

}

} // namespace util

#endif

util/unicode.h added: 2390, removed: 46, total 2344

@@ -6,6 +6,7 @@

#define UTIL_UNICODE_H_

#include <cstdint>

#include <optional>

#include <string>

namespace util {

@@ -113,6 +114,70 @@ constexpr bool is_unicode_noncharacter(std::uint32_t code_point) {

}

// Decodes the first UTF-8 encoded code point in `input` and returns its value.
// Note: The input is assumed to be valid UTF-8; continuation bytes are not
// validated. If `input` is empty, holds fewer bytes than its lead byte calls
// for, or does not start with a lead byte, 0 is returned.
constexpr std::uint32_t utf8_to_utf32(std::string_view input) {
    if (input.empty()) {
        return 0;
    }

    // Masking happens on the promoted int, so the result is the same whether
    // or not plain char is signed.
    auto const bits = [input](std::size_t index, int mask) {
        return static_cast<std::uint32_t>(input[index] & mask);
    };

    // 0xxxxxxx: single byte.
    if (bits(0, 0b1000'0000) == 0) {
        return bits(0, 0b0111'1111);
    }

    // 110xxxxx 10xxxxxx
    if (bits(0, 0b1110'0000) == 0b1100'0000) {
        return input.size() > 1 ? (bits(0, 0b0001'1111) << 6) | bits(1, 0b0011'1111) : 0;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (bits(0, 0b1111'0000) == 0b1110'0000) {
        return input.size() > 2 ? (bits(0, 0b0000'1111) << 12) | (bits(1, 0b0011'1111) << 6) | bits(2, 0b0011'1111)
                                : 0;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (bits(0, 0b1111'1000) == 0b1111'0000) {
        return input.size() > 3 ? (bits(0, 0b0000'0111) << 18) | (bits(1, 0b0011'1111) << 12)
                        | (bits(2, 0b0011'1111) << 6) | bits(3, 0b0011'1111)
                                : 0;
    }

    return 0;
}

// Counts the code points in a UTF-8 string.
// Note: The string is assumed to be valid UTF-8. Only lead bytes are
// inspected; the bytes that follow are stepped over without being checked,
// and a byte that is not a lead byte where one is expected is skipped without
// being counted.
// If the final lead byte announces more bytes than the string still holds,
// std::nullopt is returned.
constexpr std::optional<std::size_t> utf8_length(std::string_view input) {
    std::size_t count = 0;
    std::size_t pos = 0;

    while (pos < input.size()) {
        auto const lead = input[pos];

        // Number of bytes the lead byte announces; 0 means "not a lead byte".
        std::size_t width = 0;
        if ((lead & 0b1000'0000) == 0b0000'0000) {
            width = 1;
        } else if ((lead & 0b1110'0000) == 0b1100'0000) {
            width = 2;
        } else if ((lead & 0b1111'0000) == 0b1110'0000) {
            width = 3;
        } else if ((lead & 0b1111'1000) == 0b1111'0000) {
            width = 4;
        }

        if (width == 0) {
            pos += 1;
            continue;
        }

        if (input.size() - pos < width) {
            return std::nullopt;
        }

        pos += width;
        count += 1;
    }

    return count;
}

} // namespace util

#endif

util/unicode_test.cpp added: 2390, removed: 46, total 2344

@@ -78,5 +78,29 @@ int main() {

}

});

// Decodes one code point of each encoded width (1, 2, 3 and 4 bytes) and
// checks the resulting scalar value.
etest::test("utf8_to_utf32", [] {

expect_eq(utf8_to_utf32("/"sv), 0x002ful);

expect_eq(utf8_to_utf32("Д"sv), 0x0414ul);

expect_eq(utf8_to_utf32("ᛋ"sv), 0x16cbul);

expect_eq(utf8_to_utf32("🫸"sv), 0x1faf8ul);

// Pass several codepoints, it should just decode the first one

expect_eq(utf8_to_utf32("🯷🯷🯷"sv), 0x1fbf7ul);

});

// Checks that utf8_length counts code points rather than bytes, and that a
// string cut off in the middle of a code point yields std::nullopt.
etest::test("utf8_length", [] {

// A single 4-byte code point counts as one.
expect_eq(utf8_length("🮻"sv), 1ul);

// ASCII only: one byte per code point.
expect_eq(utf8_length("This string is 33 characters long"sv), 33ul);

expect_eq(utf8_length("🤖🤖🤖"sv), 3ul);

// Seven multi-byte code points followed by one ASCII character.
expect_eq(utf8_length("🆒🆒🆒🆒🆒🆒🆒!"sv), 8ul);

// First byte suggests a 2-byte char, but we don't supply the 2nd byte

std::string invalid{static_cast<char>(0b11000000)};

expect_eq(utf8_length(invalid), std::nullopt);

});

return etest::run_all_tests();

}