srctree

Robin Linden parent 32616ac2 6c0ac80d
idna: Add a class for applying the uts46 mappings

idna/BUILD added: 150, removed: 4, total 146
@@ -17,6 +17,10 @@ genrule(
 
cc_library(
name = "idna",
srcs = glob(
include = ["*.cpp"],
exclude = ["*_test.cpp"],
),
hdrs = [":generate_idna_data"] + glob(["*.h"]),
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
 
filename was Deleted added: 150, removed: 4, total 146
@@ -0,0 +1,80 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/uts46.h"
 
#include "idna/idna_data.h"
 
#include "util/unicode.h"
 
// NOLINTNEXTLINE(misc-include-cleaner): This is used for std::ranges::lower_bound.
#include <algorithm>
#include <cassert>
#include <optional>
#include <string>
#include <string_view>
#include <variant>
 
namespace idna {
 
std::optional<std::string> Uts46::map(std::string_view input) {
std::string result{};
// input.size is just an estimate, but probably good enough for now.
result.reserve(input.size());
 
for (auto const code_point : util::CodePointView{input}) {
// * clang-tidy thinks std::ranges::lower_bound is provided by
// <bits/ranges_algo.h> when it's actually provided by <algorithm>.
// * clang-tidy says this is pointer-ish, but msvc disagrees.
// NOLINTNEXTLINE(misc-include-cleaner,readability-qualified-auto)
auto mapping = std::ranges::lower_bound(
uts46::kMappings, code_point, {}, &decltype(uts46::kMappings)::value_type::first);
 
// TODO(robinlinden): Generate better mapping table.
if (mapping->first != code_point) {
mapping -= 1;
}
 
auto const &entry = mapping->second;
if (std::holds_alternative<uts46::Ignored>(entry)) {
continue;
}
 
if (std::holds_alternative<uts46::Disallowed>(entry)) {
return std::nullopt;
}
 
// tr46 strongly recommends using the std3 rules, so no opt-out for
// this.
if (std::holds_alternative<uts46::DisallowedStd3Valid>(entry)) {
return std::nullopt;
}
 
if (std::holds_alternative<uts46::DisallowedStd3Mapped>(entry)) {
return std::nullopt;
}
 
if (auto const *mapped = std::get_if<uts46::Mapped>(&entry)) {
result += mapped->maps_to;
continue;
}
 
// These would be mapped in transitional processing, but we don't support that.
if (std::holds_alternative<uts46::Deviation>(entry)) {
result += util::unicode_to_utf8(code_point);
continue;
}
 
if (std::holds_alternative<uts46::Valid>(entry) //
|| std::holds_alternative<uts46::ValidNv8>(entry) //
|| std::holds_alternative<uts46::ValidXv8>(entry)) {
result += util::unicode_to_utf8(code_point);
continue;
}
}
 
return result;
}
 
} // namespace idna
 
filename was Deleted added: 150, removed: 4, total 146
@@ -0,0 +1,21 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef IDNA_UTS46_H_
#define IDNA_UTS46_H_
 
#include <optional>
#include <string>
#include <string_view>
 
namespace idna {
 
// Entry point for the uts46 code point mapping.
class Uts46 {
public:
    // Maps `input` and returns the mapped string, or std::nullopt if the input
    // can't be mapped.
    static std::optional<std::string> map(std::string_view input);
};
 
} // namespace idna
 
#endif
 
filename was Deleted added: 150, removed: 4, total 146
@@ -0,0 +1,41 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/uts46.h"
 
#include "etest/etest2.h"
 
#include <optional>
#include <string_view>
 
using namespace std::literals;
 
// https://unicode.org/reports/tr46/#Table_Example_Processing
int main() {
etest::Suite s{};
 
s.add_test("disallowed", [](etest::IActions &a) {
a.expect_eq(idna::Uts46::map("\0"sv), std::nullopt);
a.expect_eq(idna::Uts46::map(","), std::nullopt);
a.expect_eq(idna::Uts46::map("\xc2\xa0"), std::nullopt);
a.expect_eq(idna::Uts46::map("a⒈com"), std::nullopt);
});
 
s.add_test("mapped", [](etest::IActions &a) {
a.expect_eq(idna::Uts46::map("ABCXYZ"), "abcxyz");
a.expect_eq(idna::Uts46::map("日本語。JP"), "日本語.jp");
a.expect_eq(idna::Uts46::map("☕.us"), "☕.us");
});
 
s.add_test("deviation", [](etest::IActions &a) {
a.expect_eq(idna::Uts46::map("Bloß.de"), "bloß.de");
a.expect_eq(idna::Uts46::map("BLOẞ.de"), "bloß.de");
});
 
s.add_test("ignored", [](etest::IActions &a) {
a.expect_eq(idna::Uts46::map("\xc2\xad"), ""); //
});
 
return s.run();
}