srctree

Robin Linden parent fd231f69 6e5458ca
idna: Add a helper for applying NFD-normalization to a string

inlinesplit
filename was Deleted added: 112, removed: 3, total 109
@@ -0,0 +1,55 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/unicode.h"
 
#include "idna/unicode_data.h"
 
#include "util/unicode.h"
 
// NOLINTNEXTLINE(misc-include-cleaner): This is used for std::ranges::lower_bound.
#include <algorithm>
#include <ostream>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
 
namespace idna {
namespace {
 
void decompose_to(std::ostream &os, char32_t code_point) {
// * clang-tidy thinks std::ranges::lower_bound is provided by
// <bits/ranges_algo.h> when it's actually provided by <algorithm>.
// * clang-tidy says this is pointer-ish, but msvc disagrees.
// NOLINTNEXTLINE(misc-include-cleaner,readability-qualified-auto)
auto maybe_decomposition = std::ranges::lower_bound(
unicode::kDecompositions, code_point, {}, &decltype(unicode::kDecompositions)::value_type::code_point);
 
// This code point does not decompose.
if (maybe_decomposition->code_point != code_point) {
os << util::unicode_to_utf8(code_point);
return;
}
 
// Recursively decompose the decomposition. This is needed as some code
// points decompose into code points that also decompose.
for (auto const decomposed : util::CodePointView{maybe_decomposition->decomposes_to}) {
decompose_to(os, decomposed);
}
}
 
} // namespace
 
std::string Unicode::decompose(std::string_view input) {
std::stringstream ss{};
 
for (auto const code_point : util::CodePointView{input}) {
decompose_to(ss, code_point);
}
 
return std::move(ss).str();
}
 
} // namespace idna
 
filename was Deleted added: 112, removed: 3, total 109
@@ -0,0 +1,21 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#ifndef IDNA_UNICODE_H_
#define IDNA_UNICODE_H_
 
#include <string>
#include <string_view>
 
namespace idna {
 
// Stateless collection of Unicode helpers used by the idna code.
class Unicode {
public:
    // Returns `input` normalized into its canonical decomposition (NFD).
    // The result is a new string; `input` is only read.
    static std::string decompose(std::string_view input);
};
 
} // namespace idna
 
#endif
 
filename was Deleted added: 112, removed: 3, total 109
@@ -0,0 +1,33 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "idna/unicode.h"
 
#include "etest/etest2.h"
 
int main() {
etest::Suite s{};
 
s.add_test("not decomposed", [](etest::IActions &a) {
a.expect_eq(idna::Unicode::decompose("abc123xyz"), "abc123xyz"); //
});
 
s.add_test("decomposed", [](etest::IActions &a) {
// A + COMBINING RING ABOVE
a.expect_eq(idna::Unicode::decompose("Å"), "A\xcc\x8a");
 
// s + COMBINING DOT BELOW + COMBINING DOT ABOVE
a.expect_eq(idna::Unicode::decompose("ṩ"), "s\xcc\xa3\xcc\x87");
});
 
s.add_test("mixed", [](etest::IActions &a) {
// s + COMBINING DOT BELOW + COMBINING DOT ABOVE
a.expect_eq(idna::Unicode::decompose("123ṩ567"),
"123"
"s\xcc\xa3\xcc\x87"
"567");
});
 
return s.run();
}