srctree

Robin Linden parent e9a214f7 e0887114
idna: Implement decoding punycode to UTF-8

.clang-tidy added: 289, removed: 7, total 282

@@ -70,7 +70,7 @@ Checks: >

# clang-tidy-16.

WarningsAsErrors: "*,-clang-diagnostic-builtin-macro-redefined"

HeaderFilterRegex: "\\./(archive|browser|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm)/"

HeaderFilterRegex: "\\./(archive|browser|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm)/"

CheckOptions:

# readability-identifier-naming

.gitlint added: 289, removed: 7, total 282

@@ -4,4 +4,4 @@ ignore=body-is-missing

# TODO(robinlinden): Better way of documenting and setting this up.

# Each commit must start with the main area it affects.

[title-match-regex]

regex=^(archive|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?:

regex=^(archive|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?:

filename was Deleted added: 289, removed: 7, total 282

@@ -0,0 +1,22 @@

load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("//bzl:copts.bzl", "HASTUR_COPTS")

# Library target for this package. It declares only headers (no srcs), and
# exports every *.h file found in the package directory.
cc_library(
    name = "idna",
    hdrs = glob(["*.h"]),
    copts = HASTUR_COPTS,
    visibility = ["//visibility:public"],
    deps = ["//util:unicode"],
)

# One small cc_test per *_test.cpp file in the package. The target name is the
# file name with its last four characters (the ".cpp" suffix) removed.
[cc_test(
    name = src[:-4],
    size = "small",
    srcs = [src],
    copts = HASTUR_COPTS,
    deps = [
        ":idna",
        "//etest",
        "//util:unicode",
    ],
) for src in glob(["*_test.cpp"])]

filename was Deleted added: 289, removed: 7, total 282

@@ -0,0 +1,148 @@

// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#ifndef IDNA_PUNYCODE_H_

#define IDNA_PUNYCODE_H_

#include "util/unicode.h"

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <limits>
#include <optional>
#include <string>
#include <string_view>

namespace idna {

// https://datatracker.ietf.org/doc/html/rfc3492

class Punycode {

public:

// https://datatracker.ietf.org/doc/html/rfc3492#section-6.2

static constexpr std::optional<std::string> to_utf8(std::string_view encoded) {

int n = kInitialN;

int i = 0;

int bias = kInitialBias;

std::u32string output{};

if (auto last_delimiter = encoded.find_last_of(kDelimiter); last_delimiter != std::string_view::npos) {

// No need to turn these into code points, since they are all ASCII.

for (auto cp : encoded.substr(0, last_delimiter)) {

output.push_back(cp);

}

encoded.remove_prefix(last_delimiter + 1);

if (std::ranges::any_of(output, std::not_fn(&is_basic_code_point))) {

return std::nullopt;

}

// clang-tidy says this is pointer-ish, but msvc disagrees.

// NOLINTNEXTLINE(readability-qualified-auto)

auto input = encoded.begin();

while (input != encoded.end()) {

int oldi = i;

int w = 1;

for (int k = kBase; true; k += kBase) {

if (input == encoded.end()) {

return std::nullopt;

}

auto digit = digit_value(*input++).value_or(-1);

if (digit == -1) {

return std::nullopt;

}

// TODO(robinlinden): Fail on overflow.

i += digit * w;

int t = [&] {

if (k <= bias) {

return kTMin;

}

if (k >= bias + kTMax) {

return kTMax;

}

return k - bias;

}();

if (digit < t) {

break;

}

// TODO(robinlinden): Fail on overflow.

w *= kBase - t;

}

bias = adapt(i - oldi, static_cast<int>(output.size()) + 1, oldi == 0);

n += i / (static_cast<int>(output.size()) + 1);

i %= output.size() + 1;

output.insert(output.begin() + i, n);

i += 1;

}

return unicode_to_utf8(output);

}

private:

// Parameter values for Punycode

// https://datatracker.ietf.org/doc/html/rfc3492#section-5

static constexpr int kBase = 36;

static constexpr int kTMin = 1;

static constexpr int kTMax = 26;

static constexpr int kSkew = 38;

static constexpr int kDamp = 700;

static constexpr int kInitialBias = 72;

static constexpr int kInitialN = 128;

static constexpr bool is_basic_code_point(char32_t cp) { return cp < 0x80; }

static constexpr char32_t kDelimiter = '-';

static constexpr std::optional<int> digit_value(char32_t cp) {

if (cp >= 'A' && cp <= 'Z') {

return cp - 'A';

}

if (cp >= 'a' && cp <= 'z') {

return cp - 'a';

}

if (cp >= '0' && cp <= '9') {

return cp - '0' + 26;

}

return std::nullopt;

}

// https://datatracker.ietf.org/doc/html/rfc3492#section-6.1

static constexpr int adapt(int delta, int numpoints, bool firsttime) {

delta = firsttime ? delta / kDamp : delta / 2;

delta += delta / numpoints;

int k = 0;

while (delta > ((kBase - kTMin) * kTMax) / 2) {

delta /= kBase - kTMin;

k += kBase;

}

return k + ((kBase - kTMin + 1) * delta) / (delta + kSkew);

}

static constexpr std::string unicode_to_utf8(std::u32string const &code_points) {

std::string result{};

for (auto const code_point : code_points) {

result += util::unicode_to_utf8(code_point);

}

return result;

}

};

} // namespace idna

#endif

filename was Deleted added: 289, removed: 7, total 282

@@ -0,0 +1,112 @@

// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#include "idna/punycode.h"

#include "etest/etest2.h"

#include "util/unicode.h"

namespace {

// Builds a UTF-8 string by passing every entry of code_points, in order, to
// util::unicode_to_utf8 and appending what it returns.
std::string unicode_as_utf8_string(std::vector<int> const &code_points) {
    std::string utf8{};

    for (auto it = code_points.begin(); it != code_points.end(); ++it) {
        utf8 += util::unicode_to_utf8(*it);
    }

    return utf8;
}

} // namespace

int main() {

etest::Suite s{};

// https://datatracker.ietf.org/doc/html/rfc3492#section-7

s.add_test("(A) Arabic (Egyptian)", [](etest::IActions &a) {

// u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644

// u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F

// Punycode: egbpdaj6bu4bxfgehfvwxn

std::string expected = unicode_as_utf8_string({0x0644,

0x064A,

0x0647,

0x0645,

0x0627,

0x0628,

0x062A,

0x0643,

0x0644,

0x0645,

0x0648,

0x0634,

0x0639,

0x0631,

0x0628,

0x064A,

0x061F});

a.expect_eq(idna::Punycode::to_utf8("egbpdaj6bu4bxfgehfvwxn").value(), expected);

});

s.add_test("(M) <amuro><namie>-with-SUPER-MONKEYS", [](etest::IActions &a) {

// u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074

// u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D

// U+004F U+004E U+004B U+0045 U+0059 U+0053

// Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n

std::string expected = unicode_as_utf8_string({0x5B89,

0x5BA4,

0x5948,

0x7F8E,

0x6075,

'-',

'w',

'i',

't',

'h',

'-',

'S',

'U',

'P',

'E',

'R',

'-',

'M',

'O',

'N',

'K',

'E',

'Y',

'S'});

a.expect_eq(idna::Punycode::to_utf8("-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n").value(), expected);

});

s.add_test("(P) Maji<de>Koi<suru>5<byou><mae>", [](etest::IActions &a) {

// U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059

// u+308B u+0035 u+79D2 u+524D

// Punycode: MajiKoi5-783gue6qz075azm5e

std::string expected = unicode_as_utf8_string(

{'M', 'a', 'j', 'i', 0x3067, 'K', 'o', 'i', 0x3059, 0x308B, '5', 0x79D2, 0x524D});

a.expect_eq(idna::Punycode::to_utf8("MajiKoi5-783gue6qz075azm5e").value(), expected);

});

// Error handling.

s.add_test("non-ascii before separator", [](etest::IActions &a) {

a.expect_eq(idna::Punycode::to_utf8("\xF0-").has_value(), false); //

});

s.add_test("out of data", [](etest::IActions &a) {

a.expect_eq(idna::Punycode::to_utf8("-3").has_value(), false); //

});

s.add_test("non-ascii after separator", [](etest::IActions &a) {

a.expect_eq(idna::Punycode::to_utf8("-\xF0").has_value(), false); //

});

// Other functionality.

s.add_test("uppercase punycode", [](etest::IActions &a) {

// Same as (P) Maji<de>Koi<suru>5<byou><mae>, but with the punycode capitalized.

std::string expected = unicode_as_utf8_string(

{'M', 'a', 'j', 'i', 0x3067, 'K', 'o', 'i', 0x3059, 0x308B, '5', 0x79D2, 0x524D});

a.expect_eq(idna::Punycode::to_utf8("MajiKoi5-783GUE6QZ075AZM5E").value(), expected);

});

return s.run();

}