srctree

Robin Linden parent e9a214f7 e0887114
idna: Implement decoding punycode to UTF-8

inlinesplit
.clang-tidy added: 289, removed: 7, total 282
@@ -70,7 +70,7 @@ Checks: >
# clang-tidy-16.
WarningsAsErrors: "*,-clang-diagnostic-builtin-macro-redefined"
 
HeaderFilterRegex: "\\./(archive|browser|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm)/"
HeaderFilterRegex: "\\./(archive|browser|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm)/"
 
CheckOptions:
# readability-identifier-naming
 
.gitlint added: 289, removed: 7, total 282
@@ -4,4 +4,4 @@ ignore=body-is-missing
# TODO(robinlinden): Better way of documenting and setting this up.
# Each commit must start with the main area it affects.
[title-match-regex]
regex=^(archive|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?:
regex=^(archive|browser|bzl|css|css2|dom|dom2|engine|etest|geom|gfx|html|html2|idna|img|js|layout|net|os|protocol|render|style|tui|uri|url|util|wasm|all|build|ci|deps|doc|meta)(/.*|\+.*)?:
 
filename was Deleted added: 289, removed: 7, total 282
@@ -0,0 +1,22 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("//bzl:copts.bzl", "HASTUR_COPTS")
 
# Header-only library: every *.h file in this package is exported as a public header.
cc_library(
name = "idna",
hdrs = glob(["*.h"]),
copts = HASTUR_COPTS,
# Usable from any package in the workspace.
visibility = ["//visibility:public"],
# punycode.h includes util/unicode.h to encode code points as UTF-8.
deps = ["//util:unicode"],
)
 
# Declares one small cc_test target per *_test.cpp file found in this package.
[cc_test(
# Target name is the file name with the 4-character ".cpp" suffix removed.
name = src[:-4],
size = "small",
srcs = [src],
copts = HASTUR_COPTS,
deps = [
":idna",
"//etest",
"//util:unicode",
],
) for src in glob(["*_test.cpp"])]
 
filename was Deleted added: 289, removed: 7, total 282
@@ -0,0 +1,148 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef IDNA_PUNYCODE_H_
#define IDNA_PUNYCODE_H_
 
#include "util/unicode.h"

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <limits>
#include <optional>
#include <string>
#include <string_view>
 
namespace idna {
 
// https://datatracker.ietf.org/doc/html/rfc3492
class Punycode {
public:
// https://datatracker.ietf.org/doc/html/rfc3492#section-6.2
static constexpr std::optional<std::string> to_utf8(std::string_view encoded) {
int n = kInitialN;
int i = 0;
int bias = kInitialBias;
std::u32string output{};
 
if (auto last_delimiter = encoded.find_last_of(kDelimiter); last_delimiter != std::string_view::npos) {
// No need to turn these into code points, since they are all ASCII.
for (auto cp : encoded.substr(0, last_delimiter)) {
output.push_back(cp);
}
 
encoded.remove_prefix(last_delimiter + 1);
 
if (std::ranges::any_of(output, std::not_fn(&is_basic_code_point))) {
return std::nullopt;
}
}
 
// clang-tidy says this is pointer-ish, but msvc disagrees.
// NOLINTNEXTLINE(readability-qualified-auto)
auto input = encoded.begin();
while (input != encoded.end()) {
int oldi = i;
int w = 1;
for (int k = kBase; true; k += kBase) {
if (input == encoded.end()) {
return std::nullopt;
}
 
auto digit = digit_value(*input++).value_or(-1);
if (digit == -1) {
return std::nullopt;
}
 
// TODO(robinlinden): Fail on overflow.
i += digit * w;
int t = [&] {
if (k <= bias) {
return kTMin;
}
 
if (k >= bias + kTMax) {
return kTMax;
}
 
return k - bias;
}();
 
if (digit < t) {
break;
}
 
// TODO(robinlinden): Fail on overflow.
w *= kBase - t;
}
 
bias = adapt(i - oldi, static_cast<int>(output.size()) + 1, oldi == 0);
n += i / (static_cast<int>(output.size()) + 1);
i %= output.size() + 1;
output.insert(output.begin() + i, n);
i += 1;
}
 
return unicode_to_utf8(output);
}
 
private:
// Parameter values for Punycode
// https://datatracker.ietf.org/doc/html/rfc3492#section-5
static constexpr int kBase = 36;
static constexpr int kTMin = 1;
static constexpr int kTMax = 26;
static constexpr int kSkew = 38;
static constexpr int kDamp = 700;
static constexpr int kInitialBias = 72;
static constexpr int kInitialN = 128;
 
static constexpr bool is_basic_code_point(char32_t cp) { return cp < 0x80; }
 
static constexpr char32_t kDelimiter = '-';
 
static constexpr std::optional<int> digit_value(char32_t cp) {
if (cp >= 'A' && cp <= 'Z') {
return cp - 'A';
}
 
if (cp >= 'a' && cp <= 'z') {
return cp - 'a';
}
 
if (cp >= '0' && cp <= '9') {
return cp - '0' + 26;
}
 
return std::nullopt;
}
 
// https://datatracker.ietf.org/doc/html/rfc3492#section-6.1
static constexpr int adapt(int delta, int numpoints, bool firsttime) {
delta = firsttime ? delta / kDamp : delta / 2;
delta += delta / numpoints;
 
int k = 0;
while (delta > ((kBase - kTMin) * kTMax) / 2) {
delta /= kBase - kTMin;
k += kBase;
}
 
return k + ((kBase - kTMin + 1) * delta) / (delta + kSkew);
}
 
static constexpr std::string unicode_to_utf8(std::u32string const &code_points) {
std::string result{};
for (auto const code_point : code_points) {
result += util::unicode_to_utf8(code_point);
}
 
return result;
}
};
 
} // namespace idna
 
#endif
 
filename was Deleted added: 289, removed: 7, total 282
@@ -0,0 +1,112 @@
// SPDX-FileCopyrightText: 2023 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/punycode.h"
 
#include "etest/etest2.h"
#include "util/unicode.h"
 
namespace {
// Builds a UTF-8 string by encoding each of the given code points in order.
std::string unicode_as_utf8_string(std::vector<int> const &code_points) {
    std::string utf8{};

    for (auto const cp : code_points) {
        utf8 += util::unicode_to_utf8(cp);
    }

    return utf8;
}
} // namespace
 
int main() {
etest::Suite s{};
 
// https://datatracker.ietf.org/doc/html/rfc3492#section-7
s.add_test("(A) Arabic (Egyptian)", [](etest::IActions &a) {
// u+0644 u+064A u+0647 u+0645 u+0627 u+0628 u+062A u+0643 u+0644
// u+0645 u+0648 u+0634 u+0639 u+0631 u+0628 u+064A u+061F
// Punycode: egbpdaj6bu4bxfgehfvwxn
std::string expected = unicode_as_utf8_string({0x0644,
0x064A,
0x0647,
0x0645,
0x0627,
0x0628,
0x062A,
0x0643,
0x0644,
0x0645,
0x0648,
0x0634,
0x0639,
0x0631,
0x0628,
0x064A,
0x061F});
a.expect_eq(idna::Punycode::to_utf8("egbpdaj6bu4bxfgehfvwxn").value(), expected);
});
 
s.add_test("(M) <amuro><namie>-with-SUPER-MONKEYS", [](etest::IActions &a) {
// u+5B89 u+5BA4 u+5948 u+7F8E u+6075 u+002D u+0077 u+0069 u+0074
// u+0068 u+002D U+0053 U+0055 U+0050 U+0045 U+0052 u+002D U+004D
// U+004F U+004E U+004B U+0045 U+0059 U+0053
// Punycode: -with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n
std::string expected = unicode_as_utf8_string({0x5B89,
0x5BA4,
0x5948,
0x7F8E,
0x6075,
'-',
'w',
'i',
't',
'h',
'-',
'S',
'U',
'P',
'E',
'R',
'-',
'M',
'O',
'N',
'K',
'E',
'Y',
'S'});
a.expect_eq(idna::Punycode::to_utf8("-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n").value(), expected);
});
 
s.add_test("(P) Maji<de>Koi<suru>5<byou><mae>", [](etest::IActions &a) {
// U+004D u+0061 u+006A u+0069 u+3067 U+004B u+006F u+0069 u+3059
// u+308B u+0035 u+79D2 u+524D
// Punycode: MajiKoi5-783gue6qz075azm5e
std::string expected = unicode_as_utf8_string(
{'M', 'a', 'j', 'i', 0x3067, 'K', 'o', 'i', 0x3059, 0x308B, '5', 0x79D2, 0x524D});
a.expect_eq(idna::Punycode::to_utf8("MajiKoi5-783gue6qz075azm5e").value(), expected);
});
 
// Error handling.
s.add_test("non-ascii before separator", [](etest::IActions &a) {
a.expect_eq(idna::Punycode::to_utf8("\xF0-").has_value(), false); //
});
 
s.add_test("out of data", [](etest::IActions &a) {
a.expect_eq(idna::Punycode::to_utf8("-3").has_value(), false); //
});
 
s.add_test("non-ascii after separator", [](etest::IActions &a) {
a.expect_eq(idna::Punycode::to_utf8("-\xF0").has_value(), false); //
});
 
// Other functionality.
s.add_test("uppercase punycode", [](etest::IActions &a) {
// Same as (P) Maji<de>Koi<suru>5<byou><mae>, but with the punycode capitalized.
std::string expected = unicode_as_utf8_string(
{'M', 'a', 'j', 'i', 0x3067, 'K', 'o', 'i', 0x3059, 0x308B, '5', 0x79D2, 0x524D});
a.expect_eq(idna::Punycode::to_utf8("MajiKoi5-783GUE6QZ075AZM5E").value(), expected);
});
 
return s.run();
}